feat: write data to parquet files instead of csv
kieran-mackle committed Dec 8, 2023
1 parent 39fc1d0 commit f84599c
Showing 3 changed files with 7 additions and 6 deletions.
1 change: 0 additions & 1 deletion README.md
@@ -71,7 +71,6 @@ need to be revised. An option is to partition dynamically, for example
daily for 1 minutely data, weekly for 30 minutely, and so on.
- Support for private downloads to assist in accounting, account tracking
and analysis, etc.
- Save files to parquet format instead of compressed CSV.
- Centralise data store across machine.
- Handling of incomplete day data. If data is downloaded for the present day, it will
not contain 24 hours worth, despite being timestamped for the day. This means that in
6 changes: 3 additions & 3 deletions src/ccxt_download/public.py
@@ -306,7 +306,7 @@ async def candles(
df["symbol"] = symbol

# Save
df.to_csv(path_or_buf=filename, compression="gzip")
df.to_parquet(path=filename)

if verbose:
print(
@@ -410,7 +410,7 @@ async def trades(
df.drop("timestamp", inplace=True, axis=1)

# Save
df.to_csv(filename)
df.to_parquet(filename)

if verbose:
print(
@@ -516,7 +516,7 @@ async def funding(
df.drop("timestamp", inplace=True, axis=1)

# Save
df.to_csv(filename)
df.to_parquet(filename)

if verbose:
print(
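For context, a minimal sketch of the new save path in public.py, not taken verbatim from the repository: it assumes a pandas DataFrame with a DatetimeIndex, a parquet engine such as pyarrow or fastparquet installed, and a placeholder filename standing in for the one normally produced by filename_builder.

import pandas as pd

# Hypothetical candle-shaped frame: a DatetimeIndex plus OHLCV columns and the
# symbol tag added in the diff above.
df = pd.DataFrame(
    {
        "open": [42000.0],
        "high": [42100.0],
        "low": [41900.0],
        "close": [42050.0],
        "volume": [12.3],
    },
    index=pd.to_datetime(["2023-12-08 00:00:00"]),
)
df["symbol"] = "BTC/USDT"

# to_parquet stores the index alongside the data; with no compression argument,
# pandas applies its default (snappy) compression, so the explicit gzip flag
# used for the old CSV path is no longer needed.
filename = "example.parquet"  # placeholder; the real name comes from filename_builder
df.to_parquet(path=filename)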
6 changes: 4 additions & 2 deletions src/ccxt_download/utilities.py
@@ -38,7 +38,9 @@ def filename_builder(
dtid = f"{data_type_id}_" if data_type_id else ""
filename = os.path.join(
download_dir,
format_str(f"{exchange.lower()}_{dtid}{data_type}_{start_str}_{symbol}.csv.gz"),
format_str(
f"{exchange.lower()}_{dtid}{data_type}_{start_str}_{symbol}.parquet"
),
)
return filename

@@ -145,7 +147,7 @@ def filter(unfiltered_files: list[str], match_strs: list[str]):
df = pd.DataFrame()
for f in files:
try:
_df = pd.read_csv(f, index_col=0, parse_dates=True)
_df = pd.read_parquet(f)
_df = _df[~_df.index.duplicated(keep="first")]
df = pd.concat([df, _df])
except:
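And a minimal sketch of the matching read path in utilities.py, under the same assumptions (placeholder file name, pyarrow or fastparquet available). Unlike read_csv, read_parquet restores the stored index and its datetime dtype directly, which is why the index_col=0, parse_dates=True arguments are dropped in the hunk above.

import pandas as pd

files = ["example.parquet"]  # placeholder list; normally the filtered download files

df = pd.DataFrame()
for f in files:
    # The parquet file already carries the DatetimeIndex, so no parsing arguments.
    _df = pd.read_parquet(f)
    # Drop duplicate timestamps, keeping the first occurrence, then append.
    _df = _df[~_df.index.duplicated(keep="first")]
    df = pd.concat([df, _df])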
