-
Notifications
You must be signed in to change notification settings - Fork 2
/
start.py
69 lines (52 loc) · 1.87 KB
/
start.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# -*- encoding: utf-8 -*-
import codecs
from accesspredict.pdfpredictor import *
from accesspredict.zoteropredictor import *
from accesspredict.scraperpredictor import *
from accesspredict.forest import URLForest
from accesspredict.spider import *
from accesspredict.urldataset import URLDataset
from accesspredict.combinedpredictor import P
from accesspredict.statistics import CrawlingStatistics
from urltheory.smoothing import ExponentialDirichlet
from gevent.pool import Pool
from config import redis_client
import gevent
import redis
#redis_client.flushall()
# Build the URL forest with one decision tree per predictor channel.
# Commented-out trees ('zotero', 'diff') match the disabled predictors below.
uf = URLForest()
uf.add_tree('pdf')
uf.add_tree('custom')
#uf.add_tree('zotero')
#uf.add_tree('diff')
# Dataset backed by redis; presumably caches earlier crawl results — TODO confirm.
ud = URLDataset(redis_client)
# this loads up all the cached URLs we have in redis
ud.feed_to_forest(uf)
stats = CrawlingStatistics()
# Dump/run name: controls the data/<dumpname>/ input and output paths
# and the www/stats_<dumpname>.html report filename.
dumpname = 'crossref.train'
#dumpname = 'pdftest'
# Spider ties the forest, dataset and statistics together; each predictor
# is registered against the tree of the same name with Dirichlet smoothing.
spider = Spider(forest=uf, dataset=ud, stats=stats)
spider.add_predictor('pdf', PDFPredictor(), ExponentialDirichlet())
spider.add_predictor('custom', ScraperFullTextPredictor(), ExponentialDirichlet())
#spider.add_predictor('zotero', ZoteroFullTextPredictor())
#spider.add_predictor('diff', P('custom') != (P('zotero') | P('pdf')))
def update_stats(for_greenlet):
    """Block until *for_greenlet* finishes, logging statistics every 2 minutes.

    Every 120 seconds while the greenlet is still running, the crawling
    statistics are logged and dumped to an HTML report at
    ``www/stats_<dumpname>.html``.  A final snapshot is written after the
    greenlet completes, so the report reflects the finished crawl — the
    original loop exited without a last write, leaving the report up to
    120 seconds stale.

    :param for_greenlet: the gevent greenlet whose completion ends the loop
    """
    def _snapshot():
        # log to the configured logger and refresh the HTML report
        stats.log_all()
        stats.write('www/stats_%s.html' % dumpname)

    while not for_greenlet.ready():
        gevent.sleep(120)
        _snapshot()
    # one last snapshot so the final state of the crawl is persisted
    _snapshot()
pool = Pool(1)
def urls():
    """Yield the URL from each line of ``data/<dumpname>/urls.txt``.

    Each line is tab-separated; only the first field (the URL) is yielded.
    Blank lines are skipped — the original version yielded an empty string
    for them, which would have been fed to the spider as a bogus URL.

    :return: generator of unicode URL strings
    """
    with codecs.open('data/%s/urls.txt' % dumpname, 'r', 'utf-8') as f:
        for line in f:
            fields = line.strip().split('\t')
            if fields[0]:
                yield fields[0]
def crawler():
    """Feed every URL from urls() through the 'custom' predictor via the
    gevent pool, printing each prediction as it completes (unordered)."""
    run_custom = lambda target: spider.predict('custom', target)
    for outcome in pool.imap_unordered(run_custom, urls()):
        print("-- final result: {}".format(outcome))
# Run the crawl in a background greenlet while the main greenlet handles
# the periodic statistics reporting loop.
crawler_greenlet = gevent.Greenlet(crawler)
crawler_greenlet.start()
update_stats(crawler_greenlet)  # blocks until the crawl greenlet is done
# Persist the crawl results once everything has finished.
ud.save('data/%s/dataset.tsv'% dumpname)
uf.save('data/%s/forest.pkl'% dumpname)