Skip to content

Commit

Permalink
parquet
Browse files Browse the repository at this point in the history
  • Loading branch information
swuecho committed Jun 13, 2024
1 parent f22fc2d commit e5d149d
Show file tree
Hide file tree
Showing 5 changed files with 11 additions and 14 deletions.
3 changes: 1 addition & 2 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ name = "pypi"

pandas = "==1.3.3"
pytest = "==4.5.0"
pyarrow = "==3.0.0"


[dev-packages]
Expand All @@ -18,4 +17,4 @@ pyarrow = "==3.0.0"

[requires]

python_version = "3.7"
python_version = "3.11"
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,2 @@
pandas==1.5.1
pytest==6.0.2
pyarrow==11.0.0
17 changes: 8 additions & 9 deletions snapshottest_ext/dataframe.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,21 @@
import io
import pandas as pd
import pyarrow as pa
from snapshottest.formatter import Formatter
from snapshottest.formatters import BaseFormatter


class ArrowSerializer:

class ParquetSerializer:
@staticmethod
def pandas_to_bytes(df: pd.DataFrame) -> str:
df_bytestring = pa.serialize(df).to_buffer().to_pybytes()
buffer = io.BytesIO()
df.to_parquet(buffer)
df_bytestring = buffer.getvalue()
return df_bytestring

@staticmethod
def bytes_to_pandas(raw_bytes) -> pd.DataFrame:
original_df = pa.deserialize(raw_bytes)
original_df = pd.read_parquet(io.BytesIO(raw_bytes))
return original_df


class PandasSnapshot(object):
def __init__(self, value):
self.value = value
Expand All @@ -39,7 +38,7 @@ def get_formatter(value):

def store(self, formatter, pandas_snap: PandasSnapshot):
""" store pd.DataFrame as bytes in snapshot file"""
return ArrowSerializer.pandas_to_bytes(pandas_snap.value)
return ParquetSerializer.pandas_to_bytes(pandas_snap.value)

#def get_imports(self):
# """not useful in this one, because we do not deserialize from the bytes directly"""
Expand All @@ -51,7 +50,7 @@ def assert_value_matches_snapshot(self, test, test_value: PandasSnapshot, snapsh
:param test_value: the value in snapshot.assert_match(value)
:param snapshot_value: the value of format.store (after load)
"""
prev_df = ArrowSerializer.bytes_to_pandas(snapshot_value) # deserialize from bytes to pd.DataFrame
prev_df = ParquetSerializer.bytes_to_pandas(snapshot_value) # deserialize from bytes to pd.DataFrame
pd.testing.assert_frame_equal(test_value.value, prev_df)


Expand Down
Loading

0 comments on commit e5d149d

Please sign in to comment.