-
Notifications
You must be signed in to change notification settings - Fork 0
/
read_parquet.py
84 lines (58 loc) · 2.43 KB
/
read_parquet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
from pathlib import Path
import json
import pandas as pd
import pyarrow as pa
import pyarrow.parquet
import pyarrow.feather
# File to inspect. Later code branches on its suffix: ".parquet" and ".arrow"
# each enable an extra file-level section, and anything that is not
# ".parquet" is loaded through the Feather reader.
INPUT_FILENAME = "output/summary.parquet"

# Echo the chosen input, framed by one blank line above and one below.
print(f"\nINPUT_FILENAME: {INPUT_FILENAME}\n")
if Path(INPUT_FILENAME).suffix == ".parquet":
    print("===================================")
    print("QUERIES AGAINST PARQUET FILE")
    print("===================================")

    # Inspect the file-level metadata of the Parquet input.
    # See: https://arrow.apache.org/docs/python/parquet.html
    parquet_file = pyarrow.parquet.ParquetFile(INPUT_FILENAME)
    file_metadata = parquet_file.metadata

    # Show the statistics of the first two columns of the first row group.
    # The indices are guarded because asking for a row group or column that
    # does not exist raises instead of returning nothing.
    if file_metadata.num_row_groups > 0:
        first_row_group = file_metadata.row_group(0)
        for column_index in range(min(2, first_row_group.num_columns)):
            print(first_row_group.column(column_index).statistics)
    else:
        print("(file has no row groups; no column statistics to show)")

    print(file_metadata)
if Path(INPUT_FILENAME).suffix == ".arrow":
    print("===================================")
    print("QUERIES AGAINST ARROW FILE")
    print("===================================")

    # Memory-map the file read-only. The context manager closes the mapping
    # once the reader properties have been printed, so the handle is not
    # leaked for the remainder of the script.
    with pa.memory_map(INPUT_FILENAME, "r") as source:
        reader = pyarrow.ipc.RecordBatchFileReader(source)
        print(f"reader.num_record_batches={reader.num_record_batches}")
        print(f"reader.stats={reader.stats}")
# Choose the reader module from the file suffix: Parquet for ".parquet",
# Feather for every other suffix.
reader_module = pa.parquet if Path(INPUT_FILENAME).suffix == ".parquet" else pa.feather

# Load the whole file, then keep only a few columns for debugging.
table = reader_module.read_table(INPUT_FILENAME).select(["DATE", "FOPR", "TCPU"])
# Section banner for the queries made directly against the Arrow table.
print(
    "===================================",
    "QUERIES AGAINST ARROW TABLE",
    "===================================",
    sep="\n",
)

# Schema and (rows, columns) shape of the table.
print("\ntable.schema", table.schema, sep="\n")
print("\ntable.shape", table.shape)

# First DATE entry as an Arrow scalar: its Python type, Arrow type and value.
arrow_date = table["DATE"][0]
print("\narrow_date:", type(arrow_date), arrow_date.type, arrow_date)

# Print the JSON stored under the b"smry_meta" key of the field metadata,
# first for FOPR and then for whichever column comes last in the schema.
# NOTE: a disabled variant of this script read the same key from the
# schema-level metadata (table.schema.metadata[b"smry_meta"]) instead.
for col_name in ("FOPR", table.schema.names[-1]):
    field_meta = json.loads(table.field(col_name).metadata[b"smry_meta"])
    print(f" field metadata for {col_name}: {field_meta}")

print("\n")
print("===================================")
print("QUERIES AGAINST PANDAS DATAFRAME")
print("===================================")

# Convert the Arrow table to a pandas DataFrame.
# NOTE(review): timestamp_as_object=True presumably makes pyarrow return
# timestamps as Python datetime objects rather than pandas timestamps;
# confirm against the pyarrow Table.to_pandas documentation.
df = table.to_pandas(timestamp_as_object=True)
print(df.head())

# Take the first DATE value by position. Plain df["DATE"][0] is a label
# lookup and raises KeyError when the index has no label 0; .iloc[0] matches
# the positional table["DATE"][0] used for the Arrow value.
pandasdate = df["DATE"].iloc[0]
print("\npandasdate", type(pandasdate), pandasdate)