scrape.py
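
# Fetch articles with goose, extract noun-phrase topics and outgoing links,
# and merge them into a Neo4j graph of Article, Domain and Topic nodes via the
# shared `session` from db.py. Cited sources are queued (rq_add_job) for
# further crawling, up to MAX_CRAWL_DEPTH levels deep.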
from tqdm import tqdm
import dateparser
from urlparse import urlparse, urljoin
from db import *  # presumably provides session, rq_add_job, insert_article, MAX_CRAWL_DEPTH
from models import get_article_pg
from topics import get_nounphrases
import goose
import newspaper
from lxml.etree import tostring
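

# Return the network location (e.g. 'example.com') of `url`, resolving it
# against `indomain` first when a base URL is supplied.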
def get_domain(url, indomain=None):
    if indomain is not None:
        url = urljoin(indomain, url)
    return urlparse(url).netloc
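

# Record that the article at `url` cites `link` (a dict with 'url' and 'text'
# keys): merge the cited Article node, connect it with a CITES relationship,
# and attach it to its Domain node.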
def consume_source(url, link):
    domain = get_domain(link['url'], url)
    session.run('''
        MATCH (n:Article {url:$url})
        MERGE (m:Article {url:$lurl})
        ON CREATE SET m.domain = $domain
        WITH n, m
        MERGE (n)-[:CITES {text:$text}]->(m)
        WITH m
        MATCH (d:Domain {name:$domain})
        MERGE (m)-[:FROM]->(d)
    ''', {
        'url': url,
        'lurl': link['url'],
        'text': link['text'],
        'domain': domain
    }).consume()
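

# Attach a (topic name, score) pair to an existing Article node as a
# MENTIONS relationship.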
def consume_topic(url, topic):
    session.run('''
        MATCH (n:Article {url:$url})
        MERGE (m:Topic {name:$topic})
        MERGE (n)-[r:MENTIONS]->(m)
        SET r.score = $score
    ''', {'url': url, 'topic': topic[0], 'score': topic[1]}).consume()
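

# Merge an Article node from the scraped fields, link it to its Domain node,
# then record its citations and topics. Returns the internal Neo4j id of the
# Article node.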
def consume_article(**kwargs):
    topics = kwargs.pop('topics', [])
    links = kwargs.pop('links', [])
    # Domain nodes are keyed on `name` so that consume_source can match them.
    article_id = list(session.run('''
        MERGE (n:Article {url:$map.url})
        SET n += $map
        WITH n
        MERGE (m:Domain {name:$map.domain})
        MERGE (n)-[:FROM]->(m)
        RETURN id(n)
    ''', {'map': kwargs}))[0]['id(n)']
    for link in links:
        if link['url'].startswith('http') and not link['url'].endswith('.pdf'):
            consume_source(kwargs['url'], link)
    for topic in topics:
        consume_topic(kwargs['url'], topic)
    return article_id
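

# Topic extraction: returns (noun phrase, score) pairs for the article text.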
def parse_text(text):
    return get_nounphrases(text)
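

# Extract an article with goose and return the fields consumed elsewhere in
# this module: metadata, cleaned text, outgoing links, noun-phrase topics and
# the cleaned/raw HTML.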
def parse_article(url):
    g = goose.Goose()
    a = g.extract(url)
    return {
        'url': url,
        'author': ','.join(a.authors),
        'date': str(dateparser.parse(a.publish_date)) if a.publish_date else None,
        'text': a.cleaned_text.replace('. ', '.\n '),
        'title': a.title,
        'links': a.links,
        'topics': parse_text(a.cleaned_text),
        'html': tostring(a.doc),
        'domain': get_domain(url),
        'raw_html': a.raw_html
    }
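

# Queue a merge_article job for every distinct link, as long as the current
# crawl depth is still below MAX_CRAWL_DEPTH.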
def crawl_article_sources(links, crawl_depth=0):
    if crawl_depth < MAX_CRAWL_DEPTH:
        for i in set(links):
            rq_add_job(
                func=merge_article,
                kwargs={
                    'article': i,
                    'crawl_depth': crawl_depth + 1
                },
                queue='default'
            )
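

# Parse one article, merge it into the graph, persist its text/HTML via
# insert_article, then enqueue its cited links for deeper crawling.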
def merge_article(article, crawl_depth=0):
    if isinstance(article, list):
        article = article[0]
    data = parse_article(article)
    if data is not None:
        text = data.pop('text')
        html = data.pop('html')
        raw_html = data.pop('raw_html')
        article_id = consume_article(**data)
        insert_article(article_id, article, html, text, raw_html)
        # Resolve each cited link against the article URL, strip fragments,
        # and drop anything that comes back empty.
        links = []
        for link in data['links']:
            resolved = urljoin(article, link['url']).decode('ascii', 'ignore').split('#')[0]
            if resolved:
                links.append(resolved)
        crawl_article_sources(links, crawl_depth)
        return article_id
    else:
        raise Exception('Failed to merge article')
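

# Build a newspaper source for a domain and merge up to `limit` of its
# same-domain articles.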
def merge_domain(domain, limit=100):
    if not domain.startswith(('http://', 'https://')):
        domain = 'http://' + domain
    paper = newspaper.build(domain)
    articles = [a.url for a in paper.articles
                if paper.url in a.url and paper.url != a.url]
    print 'Consuming %d Articles' % len(articles)
    for ind, article in enumerate(articles[:limit]):
        print 'Article %d - %s' % (ind, article)
        try:
            merge_article(article)
        except Exception as e:
            print e
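

# A minimal usage sketch, not part of the original module: one way merge_domain
# might be invoked from the command line, assuming db.py has already configured
# the Neo4j session and the job queue. The argument handling is illustrative only.
if __name__ == '__main__':
    import sys
    if len(sys.argv) > 1:
        # e.g. python scrape.py example.com 50
        merge_domain(sys.argv[1],
                     limit=int(sys.argv[2]) if len(sys.argv) > 2 else 100)
    else:
        print 'usage: python scrape.py <domain> [limit]'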