-
Notifications
You must be signed in to change notification settings - Fork 0
/
filterURLs.py
107 lines (91 loc) · 3.36 KB
/
filterURLs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 22 19:40:09 2017
@author: hagen
(Modified by Partha Das on 29-06-2017)
"""
import json
import os
import time
import pandas as pd
import re
import datetime
def cleaned_url(s):
""" Function to clean up multiple trailing ')', '.', '*' """
l = 0
s = s.strip()
for c in reversed(s):
if c == ')' or c == '.' or c == '*' or c == ',':
l += 1
else:
break
if l > 0:
tmp = s[:-l]
return tmp
else:
return s
def filterURLs(inputfile, outputfile, searchfor):
""" Function to parse through the input csv and filter only the URLs
specified by the filter.
@Params:
inputfile: The relative location of the input csv
outputfile: The relative of where the filtered output should be saved
searchfor: The filter list to be searched for in the urls
"""
with open(inputfile, 'r') as file:
csvobject = pd.read_csv(file)
urls = csvobject.body.str.contains('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', na=False)
urlcomments = pd.DataFrame(csvobject[urls])
newurlcomments = urlcomments.body.str.extractall("(?P<imageurl>http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)")
columns = ['index','match','imageurl','name', 'author', 'ups', 'name']
contextdf = pd.DataFrame(columns=columns)
indices = []
matches = []
dates = []
formatteddates = []
imageurls = []
bodies = []
authors = []
subreddits = []
scores = []
for index, line in newurlcomments.iterrows():
tmp = cleaned_url(line.imageurl)
if len(re.findall(r'|'.join(searchfor), tmp)) > 0:
newindex=index[0]
newdate = csvobject.created_utc[newindex]
formatteddate = datetime.datetime.fromtimestamp(int(newdate)).strftime('%Y-%m-%d %H:%M:%S')
newbody = csvobject.body[newindex]
newauthor = csvobject.author[newindex]
newsubreddit = csvobject.subreddit[newindex]
newscore= csvobject.score[newindex]
indices.append(newindex)
matches.append(index[1])
dates.append(newdate)
formatteddates.append(formatteddate)
imageurls.append(tmp)
bodies.append(newbody)
authors.append(newauthor)
subreddits.append(newsubreddit)
scores.append(newscore)
contextdf['index'] = indices
contextdf['match'] = matches
contextdf['created_unix'] = dates
contextdf['created_time'] = formatteddates
contextdf['imageurl'] = imageurls
contextdf['name'] = bodies
contextdf['author'] = authors
contextdf['reactionscount'] = scores
contextdf['subreddit'] = subreddits
contextdf.to_csv(outputfile, index=False)
if __name__ == '__main__':
starttime = time.time()
print('code started')
#ENTER SEARCH TERMS
#searchfor = ['.png', '.gif', '.jpg']
searchfor =['']
#inputfile = str('RC_2008-05.csv')
inputfile = str('wikipedialinks-the_donald-28-06-2017-12-10-2017.csv')
outputfile = str(inputfile[:-4]) + '-imageurl.csv'
print("\n\n\nJob parameters:\n\tInput ---> %s\n\tFilters ---> %s\n\tOutput --->%s\n\n\n" % (inputfile, '|'.join(searchfor), outputfile))
filterURLs(inputfile, outputfile, searchfor)
print(time.time() - starttime)