'''
Download the corresponding Wikipedia entry for each reddit submission.
'''
import glob, json, os, time, codecs
import shutil
import logging

import urllib2
import wikipedia

os.system("mkdir -p data")
os.system("mkdir -p data/wikipedia")

# Move problematic files here
os.system("mkdir -p data/broken_reddit")

logging.basicConfig(filename='broken_json.log', level=logging.DEBUG)
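
# Each data/reddit/*.json file is assumed to hold a single submission
# object with at least "url", "title", and "name" (the reddit fullname)
# keys; this sketch of the layout is inferred from how the fields are
# used below:
#
#   {"name": "t3_abc123",
#    "title": "TIL ...",
#    "url": "https://en.wikipedia.org/wiki/Some_Article"}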
def reddit_json():
    '''Yield each saved reddit submission, tagged with its source filename.'''
    F_REDDIT = sorted(glob.glob("data/reddit/*.json"))
    for f_json in F_REDDIT:
        with open(f_json) as FIN:
            js = json.load(FIN)
            js["filename"] = f_json
            yield js

def save_page(name, f_html):
    '''Fetch the Wikipedia page for `name` and write its plain text to f_html.'''
    r = wikipedia.page(name)
    text = r.content
    with codecs.open(f_html, 'w', 'utf-8') as FOUT:
        print "Downloaded ", f_html
        print
        FOUT.write(text)
    # Rate-limit requests to the Wikipedia API
    time.sleep(.2)
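
# wikipedia.page() raises exceptions such as
# wikipedia.exceptions.DisambiguationError and PageError for ambiguous or
# missing titles; the broad `except Exception` in the loop below treats
# any such failure as a broken submission.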

for js in reddit_json():
    url, title = js["url"], js["title"]
    r_id = js["name"]

    # Decode percent-escapes so the article title is readable
    url = urllib2.unquote(url.encode('utf-8'))
    name = url.split('/')[-1]

    # Handle /w/index.php?title=... style wiki links
    search_bit = "index.php?title="
    if search_bit in name:
        url = url.replace(search_bit, '')
        name = url.split('/')[-1]

    print js["filename"]
    f_html = "data/wikipedia/{}.txt".format(r_id)

    # Skip pages we've already downloaded
    if not os.path.exists(f_html):
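        # Strip fragments and query strings, then de-mangle the title,
        # e.g. "Albert_Einstein#Biography" -> "albert einstein".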
        name = name.split('#')[0].split('?')[0].split("&")[0]
        name = name.replace('_', ' ')
        name = name.lower()

        try:
            save_page(name, f_html)
        except Exception:
            # Record the failure in broken_json.log and quarantine the
            # offending reddit file so later runs skip it
            logging.exception("Failed to download %s", js["filename"])
            name = js["filename"]
            print name, "broken, moving out."
            shutil.move(name,
                        "data/broken_reddit/{}".format(os.path.basename(name)))