-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path2_ExtractAndParse2.py
61 lines (56 loc) · 2.08 KB
/
2_ExtractAndParse2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import treetaggerwrapper
import json
import pprint
import myUtils
# Script purpose: for every politician listed in ./d3/politicians.json, read
# that politician's tweets from ./allTweets/<twetterName>.json, run each tweet
# through TreeTagger, keep the lowercased lemmas of the selected tag types,
# and write the result to allTweetsAndWords/<twetterName>.json.

# Italian TreeTagger instance shared by every tweet.
# NOTE(review): TAGDIR is a machine-specific absolute path; adjust it when
# running on another machine.
tagger = treetaggerwrapper.TreeTagger(
    TAGLANG="it",
    TAGDIR="/home/boffa/PycharmProjects/tweets/venv1/bin/tree-tagger",
)
# Values of the second field of a tagger entry whose lemmas are kept.
# NOTE(review): presumably TreeTagger's noun / adjective / proper-noun tags
# for Italian -- confirm against the parameter file's tagset.
typesToConsider = ["NOM", "ADJ", "NPR"]


def _clean_tweet_text(text):
    """Return *text* reduced to the words that should be sent to the tagger.

    The typographic apostrophe is replaced by a space, then the text is split
    on whitespace. Words containing "http", words starting with "@" and the
    literal word "RT" are dropped; a leading "#" is stripped from the
    remaining words. Every kept word is prefixed with a single space, so a
    non-empty result always starts with a space (same as the original
    concatenation loop).
    """
    text = text.replace("’", " ")
    kept = []
    for word in text.split():
        # str.split() never yields empty strings, so word[0] is always valid.
        if "http" in word or word[0] == "@" or word == "RT":
            continue
        kept.append(word[1:] if word[0] == "#" else word)
    return "".join(" " + word for word in kept)


def _extract_lemmas(tags, wanted_types):
    """Return the lowercased lemmas of the entries in *tags* worth keeping.

    Only entries with exactly three fields are considered; they are read as
    (word, tag type, lemma). NOTE(review): this matches the Tag namedtuple
    documented by treetaggerwrapper.make_tags -- confirm for the installed
    version. An entry is kept when its tag type is in *wanted_types* and its
    word is purely alphabetic and longer than one character.
    """
    lemmas = []
    for tag in tags:
        if len(tag) != 3:
            # Entries of any other length are ignored.
            continue
        word, pos, lemma = tag
        if pos in wanted_types and len(word) > 1 and word.isalpha():
            lemmas.append(lemma.lower())
    return lemmas


# JSON is read and written as UTF-8 explicitly: the output is serialized with
# ensure_ascii=False, so relying on the locale's default encoding could fail
# on non-ASCII tweet text.
with open('./d3/politicians.json', 'r', encoding="utf-8") as jsonPoliticians:
    politiciansObject = json.load(jsonPoliticians)

for name, politicianValue in politiciansObject.items():
    print(name)
    # 'twetterName' (sic) is the key used by the input data; do not "fix" it.
    twitterName = politicianValue['twetterName']

    with open("./allTweets/" + twitterName + ".json", "r", encoding="utf-8") as f:
        tweetsObject = json.load(f)

    idsAndWords = {}
    nTweets = 0
    nTotalWords = 0
    for tweet in tweetsObject:
        tags = tagger.tag_text(_clean_tweet_text(tweet['text']))
        lemmas = _extract_lemmas(treetaggerwrapper.make_tags(tags), typesToConsider)
        if not lemmas:
            # Tweets without any retained lemma are left out of the output
            # and do not count towards nTweets.
            continue
        # Each lemma is prefixed with a space (the stored string therefore
        # starts with a space), matching the format written previously.
        idsAndWords[tweet["id"]] = "".join(" " + lemma for lemma in lemmas)
        nTweets += 1
        nTotalWords += len(lemmas)

    # Key order is kept as idsAndWords, nTweets, nTotalWords.
    toWrite = {
        "idsAndWords": idsAndWords,
        "nTweets": nTweets,
        "nTotalWords": nTotalWords,
    }
    with open('allTweetsAndWords/' + twitterName + ".json", "w", encoding="utf-8") as fw:
        fw.write(json.dumps(toWrite, ensure_ascii=False))