generated from 10xac/Twitter-Data-Analysis-Template
-
Notifications
You must be signed in to change notification settings - Fork 0
/
twitter_data_explorator.py
125 lines (90 loc) · 3.66 KB
/
twitter_data_explorator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import numpy as np
import pandas as pd
import re
class TweeterDataExplorator:
    """Exploratory helpers over a DataFrame of tweets.

    Depending on the method called, the wrapped frame is read through the
    columns ``place``, ``polarity``, ``clean_text``, ``lang`` and
    ``original_author``.
    """

    # Hashtag pattern: '#', one or more letters, then one or more letters,
    # digits, '-' or '_' (so at least two characters must follow the '#').
    # Compiled once here instead of on every tweet.
    _HASHTAG_PATTERN = re.compile(r'(#[A-Za-z]+[A-Za-z0-9-_]+)')

    def __init__(self, df):
        """Keep a reference to *df*; the frame is not copied."""
        self.df = df

    def read_head(self):
        """Return the first rows of the wrapped frame (``DataFrame.head()``)."""
        return self.df.head()

    def get_info(self):
        """Print the row/column counts and the frame summary.

        Returns:
            A tuple ``((row_count, col_count), None)``. The second element is
            the return value of ``DataFrame.info()``, which prints its
            summary and returns ``None``.
        """
        row_count, col_count = self.df.shape
        print(f"Number of rows: {row_count}")
        print(f"Number of columns: {col_count}")
        return (row_count, col_count), self.df.info()

    def get_count(self, column_name):
        """Return how often each distinct value occurs in *column_name*."""
        return self.df[column_name].value_counts()

    def get_polarities_count(self, places=None):
        """Count tweets per sentiment category.

        Args:
            places: optional collection of ``place`` values to restrict the
                count to; ``None`` or an empty collection means all rows.

        Returns:
            A DataFrame with columns ``Polarity`` and ``Count`` and one row
            each for ``negative``, ``positive`` and ``neutral`` (in that
            order); categories that do not occur get a count of 0.
        """
        df = self._filter_by_places(places)
        # Classify into a standalone Series: writing a 'score' column would
        # either mutate the caller's frame or assign onto a filtered slice.
        scores = df['polarity'].apply(self.text_category)
        counts = scores.value_counts()
        labels = ['negative', 'positive', 'neutral']
        return pd.DataFrame({
            'Polarity': labels,
            'Count': [int(counts.get(label, 0)) for label in labels],
        })

    def get_hash_tag_df(self, places=None):
        """Return a one-column frame (``hashtag``) with one row per hashtag use.

        Args:
            places: optional collection of ``place`` values to restrict the
                search to; ``None`` or an empty collection means all rows.
        """
        df = self._filter_by_places(places)
        tags_per_tweet = df['clean_text'].apply(self.__find_hashtags)
        all_tags = [tag for tags in tags_per_tweet for tag in tags]
        return pd.DataFrame(all_tags, columns=['hashtag'])

    def most_used_hash_tag(self, top=None, places=None):
        """Return hashtag usage counts, most frequent first.

        Args:
            top: number of hashtags to keep; ``None`` keeps every hashtag.
            places: optional collection of ``place`` values to restrict to.

        Returns:
            A DataFrame indexed by hashtag (index name ``hashtags``) with a
            single ``counts`` column.
        """
        counts = self._top_value_counts(
            self.get_hash_tag_df(places)['hashtag'], top)
        return counts.rename_axis('hashtags').to_frame('counts')

    def visualze_polarity(self):
        """Placeholder: not implemented, returns ``None``."""
        return

    def most_used_language(self, top=None):
        """Return value counts of the ``lang`` column.

        Args:
            top: number of languages to keep; ``None`` keeps all of them.
        """
        return self._top_value_counts(self.df['lang'], top)

    def authors(self, top=None, places=None):
        """Return value counts of the ``original_author`` column.

        Args:
            top: number of authors to keep; ``None`` keeps all of them.
            places: optional collection of ``place`` values to restrict to.
        """
        df = self._filter_by_places(places)
        return self._top_value_counts(df['original_author'], top)

    def most_retweeted_tweet(self):
        """Placeholder: not implemented, returns ``None``."""
        pass

    def text_category(self, p: float) -> str:
        """Map a polarity score to ``positive``, ``neutral`` or ``negative``.

        NOTE: a NaN score compares false against 0 both ways and therefore
        falls through to ``negative``.
        """
        if p > 0:
            return "positive"
        elif p == 0:
            return "neutral"
        else:
            return "negative"

    def _filter_by_places(self, places):
        """Return the rows whose ``place`` is in *places*.

        ``None`` or an empty collection disables the filter and returns the
        wrapped frame itself.
        """
        if not places:
            return self.df
        # isin always yields a boolean mask, including for an empty frame.
        return self.df[self.df['place'].isin(places)]

    @staticmethod
    def _top_value_counts(series, top):
        """Return ``series.value_counts()``, truncated to *top* rows if given."""
        counts = series.value_counts()
        return counts if top is None else counts.head(top)

    def __find_hashtags(self, tweet):
        """Return the hashtags found in *tweet*; ``[]`` for non-text input.

        Missing values (``None``/NaN) reach this method when the text column
        has gaps, so anything that is not a ``str`` yields an empty list.
        """
        if not isinstance(tweet, str):
            return []
        return self._HASHTAG_PATTERN.findall(tweet)