From 99999495cf645de4f2f9ff8ce1d06b0f9b4d2da5 Mon Sep 17 00:00:00 2001 From: Michael Skarbek Date: Thu, 7 Dec 2023 16:15:49 -0500 Subject: [PATCH] [COST-4481] use StringDtype(storage="pyarrow") (#4826) --- koku/masu/external/downloader/aws/aws_report_downloader.py | 2 +- .../external/downloader/azure/azure_report_downloader.py | 5 ++++- koku/masu/external/downloader/gcp/gcp_report_downloader.py | 2 +- koku/masu/external/downloader/oci/oci_report_downloader.py | 2 +- koku/masu/external/kafka_msg_handler.py | 2 +- 5 files changed, 8 insertions(+), 5 deletions(-) diff --git a/koku/masu/external/downloader/aws/aws_report_downloader.py b/koku/masu/external/downloader/aws/aws_report_downloader.py index db05e90822..0b90cfbade 100644 --- a/koku/masu/external/downloader/aws/aws_report_downloader.py +++ b/koku/masu/external/downloader/aws/aws_report_downloader.py @@ -118,7 +118,7 @@ def create_daily_archives( local_file, chunksize=settings.PARQUET_PROCESSING_BATCH_SIZE, usecols=lambda x: x in use_cols, - dtype="str", + dtype=pd.StringDtype(storage="pyarrow"), ) as reader: for i, data_frame in enumerate(reader): if data_frame.empty: diff --git a/koku/masu/external/downloader/azure/azure_report_downloader.py b/koku/masu/external/downloader/azure/azure_report_downloader.py index daa16ea395..5fbb425272 100644 --- a/koku/masu/external/downloader/azure/azure_report_downloader.py +++ b/koku/masu/external/downloader/azure/azure_report_downloader.py @@ -109,7 +109,10 @@ def create_daily_archives( {"UsageDateTime", "Date", "date", "usagedatetime"} )[0] with pd.read_csv( - local_file, chunksize=settings.PARQUET_PROCESSING_BATCH_SIZE, parse_dates=[time_interval], dtype="str" + local_file, + chunksize=settings.PARQUET_PROCESSING_BATCH_SIZE, + parse_dates=[time_interval], + dtype=pd.StringDtype(storage="pyarrow"), ) as reader: for i, data_frame in enumerate(reader): if data_frame.empty: diff --git a/koku/masu/external/downloader/gcp/gcp_report_downloader.py b/koku/masu/external/downloader/gcp/gcp_report_downloader.py index 0c461a1111..ce6576ffc1 100644 --- a/koku/masu/external/downloader/gcp/gcp_report_downloader.py +++ b/koku/masu/external/downloader/gcp/gcp_report_downloader.py @@ -52,7 +52,7 @@ class GCPReportDownloaderError(Exception): def pd_read_csv(local_file_path): try: - return pd.read_csv(local_file_path, dtype="str") + return pd.read_csv(local_file_path, dtype=pd.StringDtype(storage="pyarrow")) except Exception as error: LOG.error(log_json(msg="file could not be parsed", file_path=local_file_path), exc_info=error) raise GCPReportDownloaderError(error) diff --git a/koku/masu/external/downloader/oci/oci_report_downloader.py b/koku/masu/external/downloader/oci/oci_report_downloader.py index b916ef3e4a..5e123e5eb5 100644 --- a/koku/masu/external/downloader/oci/oci_report_downloader.py +++ b/koku/masu/external/downloader/oci/oci_report_downloader.py @@ -40,7 +40,7 @@ def divide_csv_monthly(file_path, filename): directory = os.path.dirname(file_path) try: - data_frame = pd.read_csv(file_path, dtype="str") + data_frame = pd.read_csv(file_path, dtype=pd.StringDtype(storage="pyarrow")) except Exception as error: LOG.error(f"File {file_path} could not be parsed. Reason: {error}") raise error diff --git a/koku/masu/external/kafka_msg_handler.py b/koku/masu/external/kafka_msg_handler.py index 3ea8b07a34..fa1ef41052 100644 --- a/koku/masu/external/kafka_msg_handler.py +++ b/koku/masu/external/kafka_msg_handler.py @@ -75,7 +75,7 @@ def divide_csv_daily(file_path: os.PathLike, manifest_id: int): daily_files = [] try: - data_frame = pd.read_csv(file_path, dtype="str") + data_frame = pd.read_csv(file_path, dtype=pd.StringDtype(storage="pyarrow")) except Exception as error: LOG.error(f"File {file_path} could not be parsed. Reason: {str(error)}") raise error