-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmerge_yt_twitch.py
210 lines (161 loc) · 8.95 KB
/
merge_yt_twitch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
import pandas as pd
import os
import re
from db_interface import *
import nltk
def clean_names(df):
# Remove columns containing 'V'
df = df.loc[:, ~df.columns.str.contains('V')]
# Replace spaces with underscores and remove non-alphanumeric characters except underscores
df.columns = [re.sub('[^a-z_]', '', col.lower().replace(' ', '_')) for col in df.columns]
return df
def load_all_files(folder):
streamer_files = [f for f in os.listdir(folder) if f.endswith('.csv')] # List CSV files
dt = pd.DataFrame() # Empty DataFrame
for f in streamer_files:
game = f.split(" - ")[0]
tmp = pd.read_csv(os.path.join(folder, f))
tmp['game'] = game # Create a column with the name of the game
dt = pd.concat([dt, tmp], ignore_index=True) # Append the game data to the big table
dt = clean_names(dt)
return dt
def process_twitch():
dt = load_all_files("twitch")
# Convert minutes to hours
dt['watch_time_hours'] = dt['watch_time_mins'] / 60
dt['stream_time_hours'] = dt['stream_time_mins'] / 60
return dt
def load_all_files(folder):
streamer_files = [f for f in os.listdir(folder) if f.endswith('.csv')]
dt = pd.DataFrame()
for f in streamer_files:
game = f.split(" - ")[0]
tmp = pd.read_csv(os.path.join(folder, f))
tmp['game'] = game
dt = pd.concat([dt, tmp], ignore_index=True)
dt = clean_names(dt)
return dt
def clean_names(df):
df.columns = [re.sub('[^a-z_]', '', col.lower().replace(' ', '_')) for col in df.columns]
return df
def get_youtube():
dt = GetVideos()
# https://www.researchgate.net/figure/Left-Average-percentage-viewed-of-a-video-versus-video-length-measured-in-minutes_fig3_343355041
dt['stream_time_hours'] = dt['duration']
dt['watch_time_hours'] = (20 / (dt['duration'] + 30) + 0.05).clip(lower=.1, upper=1) * dt['views']
return dt
def fix_game_names(twitch, yt):
twitch["game"] = twitch.game.str.lower().replace("[^a-z0-9 ]", "").str.strip()
yt["game"] = yt.game.str.lower().replace("[^a-z0-9 ]", "").str.strip()
games = pd.read_csv("games.csv", names=["game"])
names = pd.DataFrame({"game2": twitch.game.unique().tolist() + yt.game.unique().tolist()})
names = names.merge(games, how="cross")
names["game"] = names.game.str.lower().replace("[^a-z0-9 ]", "").str.strip()
match = names[names.game == names.game2].drop_duplicates().reset_index()
print("games matching: " + ", ".join(match.game.unique().tolist()))
leftovers = names[(~names.game.isin(match.game)) & (~names.game2.isin(match.game2))]
if (leftovers.shape[0] > 0):
leftovers["dist"] = leftovers.apply(lambda x: nltk.edit_distance(x.game, x.game2), axis=1)
leftovers['r'] = leftovers.groupby('game2')['dist'].rank(method="first", ascending=False)
match2 = leftovers[(leftovers['r'] == 1) & (leftovers.dist < 2)][['game', 'game2']]
match = pd.concat([match, match2])
print("games not matching: " + ", ".join(leftovers[~leftovers.game2.isin(match.game2)].game2.unique().tolist()))
twitch.rename(columns = {"game": "game2"}, inplace=True)
twitch = twitch.merge(match, on="game2", how="inner")
twitch.drop(columns=['game2'], inplace=True)
yt.rename(columns = {"game": "game2"}, inplace=True)
yt = yt.merge(match, on="game2", how="inner")
yt.drop(columns=['game2'], inplace=True)
return twitch, yt
def fix_channel_names(twitch, yt, streamer_emails):
names = pd.DataFrame({"channel": twitch.channel.unique().tolist() + yt.channel.unique().tolist()})
names = names.merge(streamer_emails, on="channel", how="left")
names = names[~names.email.isna()]
names['n'] = names['channel'].str.len()
names['r'] = names.groupby('email')['n'].rank(method="first", ascending=False)
names = names[names['r'] == 1][['channel', 'email']]
streamer_emails.rename(columns = {"channel": "ch"}, inplace=True)
names = names.merge(streamer_emails, on="email", how="left")
streamer_emails.rename(columns = {"ch": "channel"}, inplace=True)
twitch.rename(columns = {"channel": "ch"}, inplace=True)
twitch = twitch.merge(names[["channel", "ch"]], on="ch", how="left")
twitch.channel.fillna(twitch.ch, inplace=True)
twitch.drop(columns=['ch'], inplace=True)
yt.rename(columns = {"channel": "ch"}, inplace=True)
yt = yt.merge(names[["channel", "ch"]], on="ch", how="left")
yt.channel.fillna(yt.ch, inplace=True)
yt.drop(columns=['ch'], inplace=True)
return twitch, yt
def merge_yt_twitch(twitch, youtube):
streamer_emails = load_streamer_emails()
twitch['channel'] = twitch['channel'].str.lower()
youtube['channel'] = youtube['channel'].str.lower()
# Set additional columns for twitch
twitch['views'] = twitch['average_viewers']
twitch['origin'] = 'twitch'
twitch.drop('watch_time_hours', axis=1, inplace=True) # Remove existing watch_time_hours
twitch['watch_time_hours'] = twitch['stream_time_hours'] * twitch['views']
twitch_channel = twitch.groupby(["channel"]).agg({"watch_time_hours":"sum"}).reset_index()
twitch_channel.columns = ["channel", "watch_hours_twitch"]
# Set column for youtube
youtube['origin'] = 'youtube'
print(f"yt: {youtube.watch_time_hours.sum()}; {youtube.watch_time_hours.mean()}")
print(f"tw: {twitch.watch_time_hours.sum()}; {twitch.watch_time_hours.mean()}")
twitch, youtube = fix_game_names(twitch, youtube)
twitch, youtube = fix_channel_names(twitch, youtube, streamer_emails)
yt_channel = youtube.groupby(["channel"]).agg({"watch_time_hours":"sum"}).reset_index()
yt_channel.columns = ["channel", "watch_hours_yt"]
# Combine files
selected_cols = ['channel', 'game', 'watch_time_hours', 'stream_time_hours', 'views']
full_dt = pd.concat([twitch[selected_cols], youtube[selected_cols]])
# Aggregate at channel-game level
channel_game = full_dt.groupby(['channel', 'game']).agg({
'watch_time_hours': 'sum',
'stream_time_hours': 'sum',
'views': 'sum'
}).reset_index()
channel_game['rank_var'] = channel_game['watch_time_hours']
channel_game['game_rank'] = channel_game.groupby('channel')['rank_var'].rank(ascending=False, method='dense')
channel_game.sort_values('game_rank', inplace=True)
rank1 = channel_game[channel_game.game_rank==1][["channel", "game", "watch_time_hours"]]
rank1.columns = ["channel", "top_game", "top_game_watch_hours"]
# Top streamer games
top_games = channel_game.groupby('game').size().reset_index(name='N').sort_values(by='N', ascending=False)
# Assuming channel_game DataFrame is already created and populated
# Compute various sums and conditions
channel = channel_game.groupby(['channel']).agg({
'watch_time_hours': 'sum',
'stream_time_hours': 'sum',
'game': lambda x: ", ".join(x)
}).reset_index()
channel.columns = ['channel', 'watch_hours', 'stream_hours', 'games']
channel["avg_viewers"] = channel.watch_hours / channel.stream_hours
channel = channel.merge(yt_channel, on="channel", how = "left")
channel = channel.merge(twitch_channel, on="channel", how = "left")
channel["watch_hours_yt"].fillna(0, inplace=True)
channel["pct_yt"] = channel["watch_hours_yt"] / channel["watch_hours"]
channel.pct_yt.fillna(0, inplace=True)
channel = channel.merge(rank1, on="channel", how="left")
channel["pct_top_game"] = channel["top_game_watch_hours"] / channel["watch_hours"]
channel.pct_top_game.fillna(1, inplace=True)
channel = merge_streamer_emails(channel[['channel', 'avg_viewers', 'watch_hours', 'stream_hours', "pct_yt", "top_game", "pct_top_game", "games"]])
channel["channel"] = channel["channel"].str.replace(",", "")
return channel
def load_streamer_emails():
# Load and clean streamer_emails data
streamer_emails = pd.read_csv("streamer_emails.csv", header=0, encoding="latin1")
streamer_emails['language'] = streamer_emails['language'].str.lower()
streamer_emails['email'] = streamer_emails['email'].str.lower()
streamer_emails = streamer_emails[streamer_emails['email'] != '']
# Compute additional columns in streamer_emails
streamer_emails['n'] = streamer_emails['language'].str.len() + 10 * streamer_emails['email'].str.contains('@')
streamer_emails['r'] = streamer_emails.groupby('channel')['n'].rank(method="first", ascending=False)
streamer_emails = streamer_emails[streamer_emails['r'] == 1][['channel', 'email', 'language']]
return streamer_emails
def merge_streamer_emails(channel):
streamer_emails = load_streamer_emails()
# Merge with emails
channel = pd.merge(channel, streamer_emails, on='channel', how='left')
channel.email.fillna("", inplace=True)
channel.language.fillna("", inplace=True)
return channel