-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenome3d_scraper.py
executable file
·112 lines (88 loc) · 3.09 KB
/
genome3d_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/usr/bin/env python
# Sources for this example script:
# http://docs.python-guide.org/en/latest/scenarios/scrape/
# http://docs.ckan.org/en/latest/api/index.html#example-importing-datasets-with-the-ckan-api
# BROKEN: HTTP error 409 caused by stuff on line 118
from lxml import html
from bs4 import BeautifulSoup
from training import *
import requests
import re
import urllib2
import urllib
import json
import pprint
root_url = 'http://genome3d.eu/'
owner_org = 'genome3d'
lessons = {}
# There doesn't seem to be much need to parse this at the moment - CKAN seems to be expecting the URL of a resource
# rather than the resource itself.
def parse_data(page):
    """Scrape the tutorial menu at ``root_url + page`` into ``lessons``.

    The page is fetched, the first ``<ul>`` inside the element
    ``<div id="context-menu">`` is located, and every ``<li>`` link in it is
    recorded in the module-level ``lessons`` dict as ``href -> link text``
    with any ``'Tutorial: '`` text removed. The ``'Tutorials Home'`` entry is
    skipped, as are list items that carry no usable link.

    Args:
        page: Path of the index page, appended to ``root_url``.

    Returns:
        None. Results are written into the module-level ``lessons`` dict.

    Raises:
        ValueError: If the page contains no ``context-menu`` list to read.
        urllib2.URLError: If the page cannot be fetched.
    """
    from contextlib import closing

    # urllib2 responses are not context managers on Python 2, so wrap the
    # handle in closing() to guarantee the connection is released.
    with closing(urllib2.urlopen(root_url + page)) as response:
        markup = response.read()

    # Name the parser explicitly: without it bs4 picks whichever parser
    # happens to be installed, so results can differ between machines. lxml is
    # already a dependency of this script (see the imports at the top).
    tree = BeautifulSoup(markup, 'lxml')

    menu = tree.find("div", {"id": "context-menu"})
    menu_lists = menu.find_all('ul') if menu is not None else []
    if not menu_lists:
        # Fail with a readable message instead of an AttributeError on None
        # (or an IndexError) when the site layout changes.
        raise ValueError("No 'context-menu' list found at " + root_url + page)

    for entry in menu_lists[0].find_all('li'):
        anchor = entry.find('a')
        if anchor is None or not anchor.get('href'):
            # A menu item without a link cannot be turned into a lesson.
            continue
        text = anchor.get_text()
        if text == 'Tutorials Home':
            continue
        lessons[anchor['href']] = text.replace('Tutorial: ', '')
# NOTE: everything between the triple quotes below is a bare string literal,
# so none of these helpers is actually defined or run. The script body calls
# CKANUploader.create_or_update() instead; this text looks like an earlier
# hand-rolled upload path kept for reference (TODO confirm it can be removed).
#
# do_upload_dataset must return an id which has to be passed to
# do_upload_resource, so the resource can be linked to the dataset.
# Therefore, the former returns None if nothing is created so that we can
# detect whether it has worked or not. For do_upload_resource the error is
# reported inside the helper itself rather than in the rest of the script.
"""
def do_upload_dataset(course):
try:
dataset = CKANUploader.create_dataset(course.dump())
return str(dataset['id'])
except:
return None
def do_upload_resource(course,package_id):
try:
course.package_id = package_id
course.name = course.name + "-link"
CKANUploader.create_resource(course.dump())
except Exception as e:
print "Error whilst uploading! Details: " + str(e)
def check_data(course):
result = CKANUploader.check_dataset(course.dump())
if result:
name = result['name']
print "Got dataset: " + name
return result
else:
return None
"""
# Fill `lessons` (href -> title) from the tutorial index page.
parse_data('tutorials/page/Public/Page/Tutorial/Index')

# Build one Tutorial record per scraped link and hand it to
# CKANUploader.create_or_update().
for href, title in lessons.items():
    course = Tutorial()
    course.url = root_url + href
    course.title = title
    course.set_name(owner_org, title)
    course.owning_org = owner_org
    course.format = 'html'
    CKANUploader.create_or_update(course)
# NOTE: the triple-quoted text below is a bare string literal and is never
# executed. It holds a per-tutorial flow (create the dataset, attach a
# resource to it, then compare the course against the existing record) built
# on do_upload_dataset / do_upload_resource / check_data, which are themselves
# only present inside a string literal in this file. Presumably superseded by
# the CKANUploader.create_or_update() call -- TODO confirm before deleting.
"""
#print "COURSE: "
#pprint.pprint(course.dump())
# Upload at present with no checking.
dataset_id = do_upload_dataset(course)
print "ID: " + str(dataset_id)
if dataset_id:
do_upload_resource(course,dataset_id)
else:
print "Failed to create dataset so could not create resource: " + course.name
existing = check_data(course)
#print "EXISTING:"
#pprint.pprint(existing)
check = TuitionUnit.compare(course.dump(),existing)
print "CHECK:"
pprint.pprint(check)
"""