# parse_raw.py
# Forked from karpathy/arxiv-sanity-preserver.
import os
import feedparser
import cPickle as pickle
def encode(d):
    """
    Recursively rebuild a parsed feed structure out of plain containers.

    feedparser.FeedParserDict instances and dicts come back as plain dicts,
    lists come back as new lists, and every contained value is converted the
    same way. Any other value is returned as-is (it is not copied).
    """
    mapping_types = (feedparser.FeedParserDict, dict)
    if isinstance(d, mapping_types):
        # Values are read through d[key] on purpose, so the container's own
        # item lookup is what supplies each value.
        return {key: encode(d[key]) for key in d.keys()}
    if isinstance(d, list):
        return [encode(item) for item in d]
    return d
# Parse every file in the 'raw' directory with feedparser, convert each entry
# to plain containers with encode(), and pickle the result to db.p as a dict
# keyed by each entry's raw id.
files = os.listdir('raw')
out = {}
for f in files:
    p = 'raw/' + f
    print('reading  %s' % p)
    # Read inside a with-block so each input handle is closed right away
    # instead of being left open until garbage collection.
    with open(p, 'r') as fh:
        txt = fh.read()
    parse = feedparser.parse(txt)
    for e in parse.entries:
        j = encode(e)
        s = j['id']
        # Keep only what follows the last '/': just the id (and the version).
        # If there is no '/', rfind returns -1 and the whole string is kept.
        ix = s.rfind('/')
        rawid = s[ix + 1:]
        j['rawid'] = rawid
        # Keyed by rawid, so a repeated rawid overwrites the earlier entry.
        out[rawid] = j

print('read %d unique papers' % (len(out), ))
# Close (and thereby flush) the output file deterministically once the dump
# has finished.
with open("db.p", "wb") as fh:
    pickle.dump(out, fh)