Merge pull request #44 from gismart/depupd
Update dependencies
maxim-lisovsky-gismart authored Jan 25, 2024
2 parents 279d640 + 79f93c5 commit 307abcd
Showing 9 changed files with 25 additions and 14 deletions.
README.md (2 changes: 1 addition & 1 deletion)
@@ -15,7 +15,7 @@ Add `--upgrade` option to update existing package to a new version
Specify package link in your `requirements.txt`:

```txt
-git+https://github.com/gismart/[email protected]#egg=bi-utils-gismart
+git+https://github.com/gismart/[email protected]#egg=bi-utils-gismart
```

### Usage
bi_utils/aws/db.py (11 changes: 8 additions & 3 deletions)
@@ -6,8 +6,8 @@
import posixpath
import pandas as pd
import datetime as dt
-import fastparquet as fp
from typing import Any, Iterable, Iterator, Sequence, Optional, Union
+import pyarrow.parquet as pp

from .. import files, sql
from . import connection
@@ -44,7 +44,7 @@ def upload_file(
copy_options.append("PARQUET")
separator = None
if not columns:
-            columns = pp.ParquetFile(file_path).columns
+            columns = pp.ParquetFile(file_path).schema.names
else:
raise ValueError(f"{os.path.basename(file_path)} file extension is not supported")
table_name = f"{schema}.{table}"
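The hunk above swaps fastparquet's column lookup for pyarrow's schema accessor. A minimal sketch of the new call, assuming pyarrow >= 15 is installed (the file path is hypothetical):

```python
import pyarrow.parquet as pp

# schema.names returns the column names as a list of strings, the same
# shape of result that fastparquet's ParquetFile(...).columns provided.
columns = pp.ParquetFile("/tmp/example.parquet").schema.names
print(columns)
```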
@@ -175,7 +175,12 @@ def upload_data(
logger.warning(f"Partitions are not supported for csv files: {filename}")
data.to_csv(file_path, index=False, sep=separator)
elif file_path.lower().endswith(".parquet"):
-        data.to_parquet(file_path, partition_cols=partition_cols, times="int96", index=False)
+        data.to_parquet(
+            file_path,
+            partition_cols=partition_cols,
+            coerce_timestamps="us",
+            index=False,
+        )
else:
raise ValueError(f"{filename} file extension is not supported")
logger.info(f"Data is saved to {filename} ({len(data)} rows)")
bi_utils/queue_exporter.py (7 changes: 6 additions & 1 deletion)
@@ -172,7 +172,12 @@ def _export_df(
elif ".parquet" in file_path.lower():
if partition_cols:
logger.warning(f"Partitions are not supported for csv files: {filename}")
-        df.to_parquet(file_path, partition_cols=partition_cols, times="int96", index=False)
+        df.to_parquet(
+            file_path,
+            partition_cols=partition_cols,
+            coerce_timestamps="us",
+            index=False,
+        )
else:
df.to_pickle(file_path)
logger.info(f"Saved df to {filename} ({len(df)} rows)")
requirements.txt (6 changes: 3 additions & 3 deletions)
@@ -1,8 +1,8 @@
numpy<2.0.0,>=1.19.2
-pandas<2.0.0,>=1.1.0
+pandas<3.0.0,>=1.1.0
psutil<6.0.0,>=5.7.0
psycopg2-binary<3.0.0,>=2.9.0
scikit-learn<2.0.0,>=0.23.1
SQLAlchemy<2.0.0,>=1.4.46
-fastparquet==2023.2.0
-locopy==0.5.1
+pyarrow>=15.0.0
+locopy==0.5.7
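A quick way to check that an environment already satisfies the new pins, using only the standard library (package names as pinned above):

```python
from importlib.metadata import version

# Print the installed versions of the packages this commit re-pins.
for pkg in ["pandas", "pyarrow", "locopy"]:
    print(pkg, version(pkg))
```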
setup.py (2 changes: 1 addition & 1 deletion)
@@ -9,7 +9,7 @@

setuptools.setup(
name="bi-utils-gismart",
version="0.16.0",
version="0.16.1",
author="gismart",
author_email="[email protected]",
description="Utils for BI team",
tests/aws/test_db.py (5 changes: 3 additions & 2 deletions)
@@ -21,7 +21,7 @@ def test_delete_wo_conditions():
def test_upload_download_delete(file_format):
version = 1
db.delete(table, schema=schema, version=version)
-    timestamp = pd.Timestamp.now()
+    timestamp = pd.Timestamp.now().as_unit("ns")
data = pd.DataFrame(
{
"text": ["hello", "bye"],
@@ -31,6 +31,7 @@ def test_upload_download_delete(file_format):
}
)
data.predict_dt = pd.to_datetime(data.predict_dt)
+    data.load_dttm = pd.to_datetime(data.load_dttm)
db.upload_data(data, f"/tmp/data.{file_format}", schema=schema, table=table)
query = f"""
SELECT text, predict_dt, version, load_dttm
@@ -59,7 +60,7 @@ def test_upload_update_download(file_format):
new_version = 2
db.delete(table, schema=schema, version=version)
db.delete(table, schema=schema, version=new_version)
-    timestamp = pd.Timestamp.now()
+    timestamp = pd.Timestamp.now().as_unit("ns")
data = pd.DataFrame(
{
"text": ["hello", "bye"],
tests/transformers/test_hierarchical_encoder.py (2 changes: 1 addition & 1 deletion)
@@ -11,7 +11,7 @@
def test_hierarchical_encoder(cols, C, data):
data = data.dropna()
target_data = pd.read_csv(utils.data_path("hierarchical_encoder.csv"))
-    target_data = target_data[(target_data.cols == str(cols)) & (target_data.C == C)]
+    target_data = target_data[(target_data.cols.fillna("None") == str(cols)) & (target_data.C == C)]
clipper = transformers.HierarchicalEncoder(cols=cols, C=C)
X = data.drop(["conversion", "conversion_predict"], axis=1)
y = data["conversion"]
tests/transformers/test_quantile_clipper.py (2 changes: 1 addition & 1 deletion)
@@ -11,7 +11,7 @@
def test_quantile_clipper(cols, q, data):
data = data.dropna()
target_data = pd.read_csv(utils.data_path("quantile_clipper.csv"))
-    target_data = target_data[(target_data.cols == str(cols)) & (target_data.q == q)]
+    target_data = target_data[(target_data.cols.fillna("None") == str(cols)) & (target_data.q == q)]
clipper = transformers.QuantileClipper(cols=cols, q=q)
X = data.drop(["conversion", "conversion_predict"], axis=1)
y = data["conversion"]
tests/transformers/test_target_encoder.py (2 changes: 1 addition & 1 deletion)
@@ -11,7 +11,7 @@
def test_target_encoder(cols, C, data):
data = data.dropna()
target_data = pd.read_csv(utils.data_path("target_encoder.csv"))
-    target_data = target_data[(target_data.cols == str(cols)) & (target_data.C == C)]
+    target_data = target_data[(target_data.cols.fillna("None") == str(cols)) & (target_data.C == C)]
clipper = transformers.TargetEncoder(cols=cols, C=C)
X = data.drop(["conversion", "conversion_predict"], axis=1)
y = data["conversion"]
