-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsql_queries.py
126 lines (99 loc) · 3.35 KB
/
sql_queries.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# DROP TABLES
# All DROP statements share one shape; IF EXISTS makes each of them safe to run
# even when the table has not been created yet.
_DROP_TABLE_TEMPLATE = "DROP TABLE IF EXISTS {}"

songplay_table_drop = _DROP_TABLE_TEMPLATE.format("songplays")
user_table_drop = _DROP_TABLE_TEMPLATE.format("users")
song_table_drop = _DROP_TABLE_TEMPLATE.format("songs")
artist_table_drop = _DROP_TABLE_TEMPLATE.format("artists")
time_table_drop = _DROP_TABLE_TEMPLATE.format("time")
# CREATE TABLES
# Fact table. It holds foreign keys into time, users, songs and artists, so it
# can only be created once those four tables exist.
songplay_table_create = "\n".join((
    "",
    "CREATE TABLE IF NOT EXISTS songplays (",
    "songplay_id BIGSERIAL PRIMARY KEY,",
    # start_time and user_id are the only mandatory references.
    "start_time TIMESTAMP REFERENCES time(start_time) NOT NULL,",
    "user_id BIGINT REFERENCES users(user_id) NOT NULL,",
    "level VARCHAR(30),",
    # song_id / artist_id are nullable foreign keys.
    "song_id VARCHAR(100) REFERENCES songs(song_id),",
    "artist_id VARCHAR(100) REFERENCES artists(artist_id),",
    "session_id BIGINT,",
    "location VARCHAR(200),",
    "user_agent VARCHAR(250)",
    ");",
    "",
))
# Dimension table keyed by user_id; every other column is optional.
user_table_create = "\n".join((
    "",
    "CREATE TABLE IF NOT EXISTS users (",
    "user_id BIGINT PRIMARY KEY,",
    "first_name VARCHAR(50),",
    "last_name VARCHAR(50),",
    "gender CHAR(1),",
    "level varchar(30)",
    ");",
    "",
))
# Artists are defined before songs, because songs.artist_id references
# artists(artist_id).
# latitude/longitude are DOUBLE PRECISION: in PostgreSQL FLOAT(p) with p <= 24
# is single-precision REAL (roughly 6 significant decimal digits), which rounds
# geographic coordinates on insert.
artist_table_create = ("""
CREATE TABLE IF NOT EXISTS artists (
artist_id VARCHAR(100) PRIMARY KEY,
name VARCHAR(100),
location VARCHAR(100),
latitude DOUBLE PRECISION,
longitude DOUBLE PRECISION
);
""")
# The songs table references artists(artist_id) through a NOT NULL foreign key,
# so the artists table has to exist before this statement runs.
# duration is DOUBLE PRECISION rather than FLOAT(10): in PostgreSQL FLOAT(p)
# with p <= 24 is single-precision REAL, and song_select matches rows with
# `songs.duration = %s`. A REAL column is widened to double precision for that
# comparison, so a stored value such as 218.93179 would no longer equal the
# supplied literal and the lookup would silently find nothing.
song_table_create = ("""
CREATE TABLE IF NOT EXISTS songs (
song_id VARCHAR(100) PRIMARY KEY,
title VARCHAR(150),
artist_id VARCHAR(100) REFERENCES artists(artist_id) NOT NULL,
year INTEGER,
duration DOUBLE PRECISION,
CONSTRAINT year check (year >= 0)
);
""")
# Dimension table keyed by the timestamp itself, with integer columns for its
# calendar parts.
time_table_create = "\n".join((
    "",
    "CREATE TABLE IF NOT EXISTS time (",
    "start_time TIMESTAMP PRIMARY KEY,",
    "hour INTEGER,",
    "day INTEGER,",
    "week INTEGER,",
    "month INTEGER,",
    "year INTEGER,",
    # NOTE(review): spelled `weekdaY` here but `weekday` in time_table_insert;
    # assuming PostgreSQL, unquoted identifiers are folded to lower case, so
    # both refer to the same column.
    "weekdaY INTEGER",
    ");",
    "",
))
# INSERT RECORDS
# Every statement carries an ON CONFLICT clause, so inserting a row whose
# primary key already exists does not raise an error:
#   * songplays, songs, artists, time -> the conflicting row is skipped
#   * users                          -> upsert: the stored level is replaced by
#                                       the level of the incoming row
# Values are supplied through %s placeholders, one per listed column.
songplay_table_insert = "\n".join((
    "",
    "INSERT INTO songplays (songplay_id, start_time, user_id, level, song_id, "
    "artist_id, session_id, location, user_agent)",
    "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)",
    "ON CONFLICT(songplay_id) DO NOTHING;",
    "",
))

user_table_insert = "\n".join((
    "",
    "INSERT INTO users (user_id, first_name, last_name, gender, level)",
    "VALUES (%s, %s, %s, %s, %s) "
    "ON CONFLICT (user_id) DO UPDATE SET level = EXCLUDED.level;",
    "",
))

song_table_insert = "\n".join((
    "",
    "INSERT INTO songs (song_id, title, artist_id, year, duration)",
    "VALUES (%s, %s, %s, %s, %s) ON CONFLICT DO NOTHING;",
    "",
))

artist_table_insert = "\n".join((
    "",
    "INSERT INTO artists (artist_id, name, location, latitude, longitude)",
    "VALUES (%s, %s, %s, %s, %s) ON CONFLICT DO NOTHING;",
    "",
))

time_table_insert = "\n".join((
    "",
    "INSERT INTO time (start_time, hour, day, week, month, year, weekday)",
    "VALUES (%s, %s, %s, %s, %s, %s, %s) ON CONFLICT DO NOTHING;",
    "",
))
# FIND SONGS
# Returns (song_id, artist_id) for the song whose title, artist name and
# duration all match the three supplied parameters, in that order.
# NOTE(review): duration is matched with exact equality on a floating-point
# column, so a row is found only when the stored value compares equal to the
# supplied one bit for bit.
song_select = "\n".join((
    "",
    "SELECT songs.song_id, songs.artist_id FROM songs",
    "JOIN artists ON songs.artist_id = artists.artist_id",
    "WHERE songs.title = %s",
    "AND artists.name = %s",
    "AND songs.duration = %s",
    ";",
    "",
))
# QUERY LISTS
# Creation order matters: the fact table (songplays) has foreign keys into
# every dimension table, and songs has a foreign key into artists, so each
# referenced table is created before the table that references it.
create_table_queries = [
    user_table_create,
    artist_table_create,
    song_table_create,
    time_table_create,
    songplay_table_create,
]
# Drops run with the referencing tables first: songplays before everything it
# points at, and songs before artists.
drop_table_queries = [
    songplay_table_drop,
    user_table_drop,
    song_table_drop,
    artist_table_drop,
    time_table_drop,
]