-
Notifications
You must be signed in to change notification settings - Fork 0
/
ner_main.py
59 lines (48 loc) · 1.72 KB
/
ner_main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import re
from collections import defaultdict, Counter
import Algorithmia
# Shared Algorithmia client used by every algorithm call below.
# NOTE(review): placeholder key — must be replaced with a real API key to run.
client = Algorithmia.client("your_algorithmia_api_key")
def pull_tweets(test_query, num_tweets="700"):
    """Fetch tweet texts matching *test_query* via Algorithmia's Twitter algorithm.

    Args:
        test_query: Keyword to search Twitter for.
        num_tweets: How many tweets to request, as a string (the algorithm's
            API takes it as a string; default "700" matches the original).

    Returns:
        List of the 'text' field of each returned tweet.
    """
    # Renamed from 'input' (shadowed the builtin). Placeholder credentials
    # must be replaced with real Twitter API keys before running.
    payload = {
        "query": test_query,
        "numTweets": num_tweets,
        "auth": {
            "app_key": 'your_consumer_key',
            "app_secret": 'your_consumer_secret_key',
            "oauth_token": 'your_access_token',
            "oauth_token_secret": 'your_access_token_secret'
        }
    }
    twitter_algo = client.algo("twitter/RetrieveTweetsWithKeyword/0.1.3")
    result = twitter_algo.pipe(payload).result
    return [tweet['text'] for tweet in result]
def process_text(test_query):
    """Remove mentions, URLs, non-alphanumerics etc. from fetched tweets.

    Bug fix: the original took no parameter yet was called as
    process_text(test_query) by get_ner, and called pull_tweets() without
    its required query argument — both raised TypeError at runtime.

    Args:
        test_query: Keyword forwarded to pull_tweets.

    Returns:
        A single string of cleaned tweets joined by '. '.
    """
    data = pull_tweets(test_query)
    # Raw string: identical pattern, avoids invalid-escape-sequence warnings.
    regex_remove = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^RT|http.+?"
    stripped_text = [re.sub(regex_remove, '', tweet).strip() for tweet in data]
    return '. '.join(stripped_text)
def get_ner(test_query):
    """Run Stanford NER over the cleaned tweet text and return its result."""
    cleaned = process_text(test_query)
    # Long timeout: NER over hundreds of tweets can exceed the default limit.
    ner_algo = client.algo('StanfordNLP/NamedEntityRecognition/0.1.1').set_options(timeout=600)
    return ner_algo.pipe(cleaned).result
def group_data(test_query):
    """Collect ORGANIZATION-tagged tokens from the NER output and count them.

    Returns a list of one-entry dicts mapping each matching tag to a Counter
    of its tokens.
    """
    # NOTE(review): assumes each NER result entry iterates as (token, tag)
    # pairs — confirm against the algorithm's output format.
    grouped = defaultdict(list)
    for entry in get_ner(test_query):
        for token, tag in entry:
            if 'ORGANIZATION' in tag:
                grouped[tag].append(token)
    return [{tag: Counter(tokens)} for tag, tokens in grouped.items()]
if __name__ == '__main__':
    # Demo run: group organization entities found in tweets about "Google".
    query = "Google"
    group_data(query)