ebi_scraper.py
#!/usr/bin/env python
# Sources for this example script:
# http://docs.python-guide.org/en/latest/scenarios/scrape/
# http://docs.ckan.org/en/latest/api/index.html#example-importing-datasets-with-the-ckan-api
# BROKEN: HTTP error 409 caused by stuff on line 118
# The breakage is caused by attempting to upload an existing record; this requires that a check be performed
# first. Unfortunately, the check is broken.
from lxml import html
from bs4 import BeautifulSoup
from training import *
import requests
import re
import urllib2
import urllib
import json
import pprint
root_url = 'http://www.ebi.ac.uk'
owner_org = 'european-bioinformatics-institute-ebi'
lessons = {}
# There doesn't seem to be much need to parse this at the moment - CKAN seems to be expecting the URL of a resource
# rather than the resource itself.
def parse_data(page):
    response = urllib2.urlopen(root_url + page)
    tree = BeautifulSoup(response.read())
    links = tree.find("div", {"class": "item-list"}).find_all('ul')[0].find_all('li')
    count = 1
    # Each <li> in the course list holds a title link, a description, and a topic list.
    for link in links:
        item = link.find("div", {"class": "views-field-title"}).find('a')
        description = link.find("div", {"class": "views-field-field-course-desc-value"}).get_text()
        count += 1
        try:
            topics = link.find("div", {"class": "views-field-tid"}).get_text()
            href = item['href']
            lessons[href] = {}
            text = item.get_text()
            lessons[href]['text'] = text
            lessons[href]['description'] = description
            lessons[href]['topics'] = extract_keywords(topics)
        except Exception as e:
            print "Failed to parse link: " + str(e)
# example:
# text = "\n\n Topic: \n\n Ontologies\n\n System\n\n"
def extract_keywords(text):
    # Remove 'Topic:', newlines, and extra white space.
    # Split into an array of topics, rejecting empty or very short fragments.
    # Format for the CKAN uploader.
    text = text.replace('Topic:', '')
    text = re.sub('\n', '', text)
    text = re.sub(' +', ' ', text)
    keywords = text.split(' ')
    tags = []
    for keyword in keywords:
        # Skip rather than remove: mutating the list while iterating over it skips entries.
        if keyword == '' or len(keyword) < 3:
            continue
        tags.append({"name": re.sub(r'\W+', '', keyword)})
    return tags
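# A minimal usage sketch, using the example string documented above:
#   extract_keywords("\n\n Topic: \n\n Ontologies\n\n System\n\n")
#   => [{"name": "Ontologies"}, {"name": "System"}]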
# upload_dataset must return an id which is then passed to upload_resource, so that the resource can be
# linked to the dataset. It therefore returns None if nothing is created, letting us detect whether the
# upload worked. For upload_resource the error can be reported here rather than elsewhere in the script,
# as with upload_dataset.
# Stub - to complete: find the li holding the URL of the last page and return its page number as an int.
def last_page_number():
    return 2
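# A possible way to complete the stub above: fetch the first listing page and read the highest page
# index out of the pager. The 'pager-last' class and the '?page=N' query parameter are assumptions
# about the EBI listing markup rather than verified facts, so this is only a sketch.
def last_page_number_sketch():
    response = urllib2.urlopen(root_url + '/training/online/course-list')
    tree = BeautifulSoup(response.read())
    last_link = tree.find('li', {'class': 'pager-last'}).find('a')
    match = re.search(r'page=(\d+)', last_link['href'])
    return int(match.group(1)) if match else 0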
def setup_organisation():
    organisation = OrgUnit()
    organisation.title = 'European Bioinformatics Institute (EBI)'
    organisation.name = 'european-bioinformatics-institute-ebi'
    organisation.description = 'EMBL-EBI provides freely available data from life science experiments, performs basic research in computational biology and offers an extensive user training programme, supporting researchers in academia and industry.'
    organisation.image_url = 'http://www.theconsultants-e.com/Libraries/Clients/European_Bioinformatics_Institute.sflb.ashx'
    OrgUnit.upload_organisation(organisation)
def scrape_page(page):
    # Note: lessons is module-level, so each call re-uploads everything collected so far as well as
    # the lessons found on this page.
    parse_data(page)
    for key in lessons:
        course = Tutorial()
        course.url = root_url + key
        course.notes = lessons[key]['description']
        course.title = lessons[key]['text']
        course.set_name(owner_org, lessons[key]['text'])
        course.tags = lessons[key]['topics']
        course.owning_org = owner_org
        course.format = 'html'
        # Before attempting to create anything we need to check whether the resource/dataset already
        # exists, updating it as and where necessary.
        CKANUploader.create_or_update(course)
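# One way the "does it already exist?" check could look (this relates to the HTTP 409 noted at the top
# of the file): ask the CKAN action API for the dataset by name before creating it. This is only a
# sketch; 'ckan_base_url' is a hypothetical placeholder - the real instance URL lives in the training
# module's uploader configuration, not in this script.
def dataset_exists(dataset_name, ckan_base_url):
    response = requests.get(ckan_base_url + '/api/3/action/package_show', params={'id': dataset_name})
    return response.status_code == 200 and response.json().get('success', False)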
#setup_organisation()
# Scrape the course list pages; each entry links to an individual tutorial.
first_page = '/training/online/course-list'
scrape_page(first_page)
for page_no in range(1, last_page_number() + 1):
    page = first_page + '?page=' + str(page_no)
    print "\n\n\n\n\n" + page + "\n\n\n\n\n"
    print "Scraping page: " + str(page_no)
    scrape_page(page)