Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BigQuery copy method can convert dict column to JSON string #1143

Merged
merged 1 commit into from
Oct 10, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions parsons/google/google_bigquery.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import datetime
import logging
import json
import pickle
import random
import uuid
Expand Down Expand Up @@ -745,6 +746,7 @@ def copy(
allow_jagged_rows: bool = True,
quote: Optional[str] = None,
schema: Optional[List[dict]] = None,
convert_dict_columns_to_json: bool = True,
**load_kwargs,
):
"""
Expand Down Expand Up @@ -774,6 +776,8 @@ def copy(
template_table: str
Table name to be used as the load schema. Load operation will use the same
columns and data types as the template table.
convert_dict_columns_to_json: bool
If set to True, will convert any dict columns (which cannot by default be successfully loaded to BigQuery) to JSON strings
**load_kwargs: kwargs
Arguments to pass to the underlying load_table_from_uri call on the BigQuery
client.
Expand All @@ -796,6 +800,19 @@ def copy(
else:
csv_delimiter = ","

if convert_dict_columns_to_json:
# Convert dict columns to JSON strings
for field in tbl.get_columns_type_stats():
if "dict" in field["type"]:
new_petl = tbl.table.addfield(
field["name"] + "_replace", lambda row: json.dumps(row[field["name"]])
)
new_tbl = Table(new_petl)
new_tbl.remove_column(field["name"])
new_tbl.rename_column(field["name"] + "_replace", field["name"])
new_tbl.materialize()
tbl = new_tbl

job_config = self._process_job_config(
job_config=job_config,
destination_table_name=table_name,
Expand Down
Loading