diff --git a/src/koheesio/integrations/box.py b/src/koheesio/integrations/box.py index 149d9898..30a804e5 100644 --- a/src/koheesio/integrations/box.py +++ b/src/koheesio/integrations/box.py @@ -35,6 +35,7 @@ model_validator, ) from koheesio.spark.readers import Reader +from koheesio.spark.readers.memory import InMemoryDataReader from koheesio.utils import utc_now @@ -418,10 +419,8 @@ def execute(self) -> BoxReaderBase.Output: file = self.client.file(file_id=f) data = file.content().decode(self.file_encoding) - data_buffer = StringIO(data) - temp_df_pandas = pd.read_csv(data_buffer, header=0, dtype=str if not self.schema_ else None, **self.params) # type: ignore - temp_df = self.spark.createDataFrame(temp_df_pandas, schema=self.schema_) - + temp_df = InMemoryDataReader(data=data, schema=self.schema_, params=self.params, format="csv").read() + # type: ignore # noinspection PyUnresolvedReferences temp_df = ( diff --git a/tests/spark/integrations/box/test_box.py b/tests/spark/integrations/box/test_box.py index 42dfd7d7..e366a0d5 100644 --- a/tests/spark/integrations/box/test_box.py +++ b/tests/spark/integrations/box/test_box.py @@ -245,7 +245,7 @@ def test_execute_wo_schema(self, spark, dummy_box): assert bcr.df.count() == 1 assert bcr.df.dtypes == [ ("foo", "string"), - ("bar", "string"), + ("bar", "bigint"), ("meta_file_id", "string"), ("meta_file_name", "string"), ("meta_load_timestamp", "timestamp"),