From fc3dd7dbb739e8ea9fa76397ea4cd340fa0e7203 Mon Sep 17 00:00:00 2001 From: Devin Smith Date: Tue, 19 Dec 2023 12:04:06 -0800 Subject: [PATCH] Move test generation code into README This (hopefully) fixes a problem where GH security is alerting that we are using an insecure version of pyarrow (<14.0.1). --- extensions/parquet/table/README.md | 66 +++++++++++++++++++ extensions/parquet/table/src/test/e0.py | 23 ------- .../table/src/test/e0.requirements.txt | 6 -- extensions/parquet/table/src/test/e1.py | 23 ------- .../table/src/test/e1.requirements.txt | 6 -- extensions/parquet/table/src/test/e2.py | 23 ------- .../table/src/test/e2.requirements.txt | 9 --- 7 files changed, 66 insertions(+), 90 deletions(-) create mode 100644 extensions/parquet/table/README.md delete mode 100644 extensions/parquet/table/src/test/e0.py delete mode 100644 extensions/parquet/table/src/test/e0.requirements.txt delete mode 100644 extensions/parquet/table/src/test/e1.py delete mode 100644 extensions/parquet/table/src/test/e1.requirements.txt delete mode 100644 extensions/parquet/table/src/test/e2.py delete mode 100644 extensions/parquet/table/src/test/e2.requirements.txt diff --git a/extensions/parquet/table/README.md b/extensions/parquet/table/README.md new file mode 100644 index 00000000000..c880f322b86 --- /dev/null +++ b/extensions/parquet/table/README.md @@ -0,0 +1,66 @@ +# extensions-parquet-table + +## Test data + +Some of the test data under [src/test/resources](src/test/resources) was generated with the following snippet: + +```python +import pandas as pd +import numpy as np + +df = pd.DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.date_range("20130101", periods=3), + "g": pd.date_range("20130101", periods=3, tz="US/Eastern"), + "h": pd.Categorical(list("abc")), + "i": pd.Categorical(list("abc"), ordered=True), + } +) + +df.to_parquet("uncompressed.parquet", compression=None) +df.to_parquet("brotli.parquet", compression="brotli") +df.to_parquet("gzip.parquet", compression="gzip") +df.to_parquet("lz4.parquet", compression="lz4") +df.to_parquet("snappy.parquet", compression="snappy") +df.to_parquet("zstd.parquet", compression="zstd") +``` + +Using the following requirements: + +```requirements +# for src/test/resources/e0 +numpy==1.24.2 +pandas==1.5.3 +pyarrow==5.0.0 +python-dateutil==2.8.2 +pytz==2022.7.1 +six==1.16.0 +``` + +```requirements +# for src/test/resources/e1 +numpy==1.24.2 +pandas==1.5.3 +pyarrow==11.0.0 +python-dateutil==2.8.2 +pytz==2022.7.1 +six==1.16.0 +``` + +```requirements +# for src/test/resources/e2 +cramjam==2.6.2 +fastparquet==2023.2.0 +fsspec==2023.3.0 +numpy==1.24.2 +packaging==23.0 +pandas==1.5.3 +python-dateutil==2.8.2 +pytz==2022.7.1 +six==1.16.0 +``` diff --git a/extensions/parquet/table/src/test/e0.py b/extensions/parquet/table/src/test/e0.py deleted file mode 100644 index 09416337baa..00000000000 --- a/extensions/parquet/table/src/test/e0.py +++ /dev/null @@ -1,23 +0,0 @@ -import pandas as pd -import numpy as np - -df = pd.DataFrame( - { - "a": list("abc"), - "b": list(range(1, 4)), - "c": np.arange(3, 6).astype("u1"), - "d": np.arange(4.0, 7.0, dtype="float64"), - "e": [True, False, True], - "f": pd.date_range("20130101", periods=3), - "g": pd.date_range("20130101", periods=3, tz="US/Eastern"), - "h": pd.Categorical(list("abc")), - "i": pd.Categorical(list("abc"), ordered=True), - } -) - -df.to_parquet("resources/e0/uncompressed.parquet", compression=None) -df.to_parquet("resources/e0/brotli.parquet", compression="brotli") -df.to_parquet("resources/e0/gzip.parquet", compression="gzip") -df.to_parquet("resources/e0/lz4.parquet", compression="lz4") -df.to_parquet("resources/e0/snappy.parquet", compression="snappy") -df.to_parquet("resources/e0/zstd.parquet", compression="zstd") diff --git a/extensions/parquet/table/src/test/e0.requirements.txt b/extensions/parquet/table/src/test/e0.requirements.txt deleted file mode 100644 index bce4a36fa96..00000000000 --- a/extensions/parquet/table/src/test/e0.requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -numpy==1.24.2 -pandas==1.5.3 -pyarrow==5.0.0 -python-dateutil==2.8.2 -pytz==2022.7.1 -six==1.16.0 diff --git a/extensions/parquet/table/src/test/e1.py b/extensions/parquet/table/src/test/e1.py deleted file mode 100644 index 408c327f3a8..00000000000 --- a/extensions/parquet/table/src/test/e1.py +++ /dev/null @@ -1,23 +0,0 @@ -import pandas as pd -import numpy as np - -df = pd.DataFrame( - { - "a": list("abc"), - "b": list(range(1, 4)), - "c": np.arange(3, 6).astype("u1"), - "d": np.arange(4.0, 7.0, dtype="float64"), - "e": [True, False, True], - "f": pd.date_range("20130101", periods=3), - "g": pd.date_range("20130101", periods=3, tz="US/Eastern"), - "h": pd.Categorical(list("abc")), - "i": pd.Categorical(list("abc"), ordered=True), - } -) - -df.to_parquet("resources/e1/uncompressed.parquet", compression=None) -df.to_parquet("resources/e1/brotli.parquet", compression="brotli") -df.to_parquet("resources/e1/gzip.parquet", compression="gzip") -df.to_parquet("resources/e1/lz4.parquet", compression="lz4") -df.to_parquet("resources/e1/snappy.parquet", compression="snappy") -df.to_parquet("resources/e1/zstd.parquet", compression="zstd") diff --git a/extensions/parquet/table/src/test/e1.requirements.txt b/extensions/parquet/table/src/test/e1.requirements.txt deleted file mode 100644 index b5f6f59c296..00000000000 --- a/extensions/parquet/table/src/test/e1.requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -numpy==1.24.2 -pandas==1.5.3 -pyarrow==11.0.0 -python-dateutil==2.8.2 -pytz==2022.7.1 -six==1.16.0 diff --git a/extensions/parquet/table/src/test/e2.py b/extensions/parquet/table/src/test/e2.py deleted file mode 100644 index 446fb28519a..00000000000 --- a/extensions/parquet/table/src/test/e2.py +++ /dev/null @@ -1,23 +0,0 @@ -import pandas as pd -import numpy as np - -df = pd.DataFrame( - { - "a": list("abc"), - "b": list(range(1, 4)), - "c": np.arange(3, 6).astype("u1"), - "d": np.arange(4.0, 7.0, dtype="float64"), - "e": [True, False, True], - "f": pd.date_range("20130101", periods=3), - "g": pd.date_range("20130101", periods=3, tz="US/Eastern"), - "h": pd.Categorical(list("abc")), - "i": pd.Categorical(list("abc"), ordered=True), - } -) - -df.to_parquet("resources/e2/uncompressed.parquet", compression=None) -df.to_parquet("resources/e2/brotli.parquet", compression="brotli") -df.to_parquet("resources/e2/gzip.parquet", compression="gzip") -df.to_parquet("resources/e2/lz4.parquet", compression="lz4") -df.to_parquet("resources/e2/snappy.parquet", compression="snappy") -df.to_parquet("resources/e2/zstd.parquet", compression="zstd") diff --git a/extensions/parquet/table/src/test/e2.requirements.txt b/extensions/parquet/table/src/test/e2.requirements.txt deleted file mode 100644 index b1ccb61d6c7..00000000000 --- a/extensions/parquet/table/src/test/e2.requirements.txt +++ /dev/null @@ -1,9 +0,0 @@ -cramjam==2.6.2 -fastparquet==2023.2.0 -fsspec==2023.3.0 -numpy==1.24.2 -packaging==23.0 -pandas==1.5.3 -python-dateutil==2.8.2 -pytz==2022.7.1 -six==1.16.0