-
Notifications
You must be signed in to change notification settings - Fork 13
/
setup_data.py
92 lines (75 loc) · 2.61 KB
/
setup_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import urllib.request
from pathlib import Path
import psycopg2
import duckdb
def setup_data_directory():
print("Downloading data...")
# Create the data directory
data_dir = Path("data")
data_dir.mkdir(exist_ok=True)
paths = [
("penguins/0.10/palmer_penguins.ddb", "penguins"),
("pypi/2024-04-24/deps.parquet", "pypi"),
("pypi/2024-04-24/maintainers.parquet", "pypi"),
("pypi/2024-04-24/package_urls.parquet", "pypi"),
("pypi/2024-04-24/packages.parquet", "pypi"),
("pypi/2024-04-24/scorecard_checks.parquet", "pypi"),
("pypi/2024-04-24/wheels.parquet", "pypi"),
("imdb/2024-03-22/imdb_title_basics_sample_5.parquet", "imdb"),
("imdb/2024-03-22/imdb_title_basics.parquet", "imdb"),
("imdb/2024-03-22/imdb_title_ratings.parquet", "imdb"),
]
for suffix, subdir in paths:
folder = data_dir / subdir
folder.mkdir(exist_ok=True)
target = folder / suffix.rsplit("/", 1)[-1]
if not target.exists():
print(f"- {target}")
urllib.request.urlretrieve(
f"https://storage.googleapis.com/ibis-tutorial-data/{suffix}",
target,
)
def setup_postgres_database():
create_tables = """
DROP TABLE IF EXISTS imdb_title_basics;
CREATE TABLE imdb_title_basics (
tconst TEXT,
"titleType" TEXT,
"primaryTitle" TEXT,
"originalTitle" TEXT,
"isAdult" BIGINT,
"startYear" BIGINT,
"endYear" BIGINT,
"runtimeMinutes" BIGINT,
genres TEXT
);
DROP TABLE IF EXISTS imdb_title_ratings;
CREATE TABLE imdb_title_ratings (
tconst TEXT,
"averageRating" DOUBLE PRECISION,
"numVotes" BIGINT
);
"""
load_data = """
ATTACH 'dbname=postgres user=postgres' AS pg (TYPE POSTGRES);
INSERT INTO pg.imdb_title_basics FROM 'data/imdb/imdb_title_basics.parquet';
INSERT INTO pg.imdb_title_ratings FROM 'data/imdb/imdb_title_ratings.parquet';
"""
validate_data = """
SELECT 'imdb_title_basics' table, COUNT(*) n FROM imdb_title_basics UNION
SELECT 'imdb_title_ratings' table, COUNT(*) n FROM imdb_title_ratings
ORDER BY "table";
"""
print("Creating postgres tables...")
with psycopg2.connect(dbname="postgres", user="postgres") as pg:
with pg.cursor() as cur:
cur.execute(create_tables)
print("Loading data into postgres...")
with duckdb.connect() as ddb:
ddb.execute(load_data)
def setup():
setup_data_directory()
setup_postgres_database()
print("Done!")
if __name__ == "__main__":
setup()