scrape_ninja.py (forked from Vaishaal/ninjascraper)
import os
import re
import sys
import urllib
import urllib2

from bs4 import BeautifulSoup
# Candidate semesters, years, and filename suffixes tried when brute-forcing
# exam URLs in scrape_ninja
sem_list = ["Fall", "Spring", "Summer"]
year_list = [str(year) for year in range(2000, 2013)]
ways = ["", "(solution)", "solution"]  # suffix variants for solution files
tests = ['Midterm', 'Midterm 1', 'Midterm 2', 'Midterm 3', 'Final']
# Department codes split for URL encoding: the list a code belongs to
# determines whether '%20' is inserted after its 2-, 3-, or 4-letter prefix
# (used by change_dep_match_url, which scrape_prof calls)
two_list = ['CYPLAN', 'EALANG', 'ELENG', 'LDARCH', 'NESTUD', 'PBHLTH']
three_list = ['AGRCHM', 'BIOENG', 'CHMENG', 'CIVENG', 'COGSCI', 'COMLIT', 'DEVSTD',
'ENVDES', 'ENVSCI', 'ETHSTD', 'ETHGRP', 'INDENG', 'LANPRO', 'MATSCI',
'MEDST', 'MECENG', 'MILAFF', 'MILSCI', 'NATRES', 'NAVSCI', 'NUCENG',
'POLSCI', 'PUBPOL', 'SOCWEL', 'VISSCI', 'VISSTD']
four_list = ['CRITTH', 'EURAST', 'PHYSED']
def change_dep_match_url(dep="MECENG"):
    # URL-encode a department code the way ninjacourses expects, inserting
    # '%20' between the letter prefix and the rest of the name
    if dep == 'LS':
        new_dep = 'L%20&%20S'
    elif dep == 'SASIAN':
        new_dep = 'S%20ASIAN'
    elif dep == 'MESTU':
        new_dep = 'M%20E%20STU'
    elif dep in two_list:
        new_dep = dep[:2] + '%20' + dep[2:]
    elif dep in three_list:
        new_dep = dep[:3] + '%20' + dep[3:]
    elif dep in four_list:
        new_dep = dep[:4] + '%20' + dep[4:]
    else:
        new_dep = dep
    return new_dep
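# Illustrative expansions (these follow directly from the lists above):
# >>> change_dep_match_url("MECENG")    # in three_list
# 'MEC%20ENG'
# >>> change_dep_match_url("COMPSCI")   # in no list, so returned unchanged
# 'COMPSCI'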
def is_semester(string):
    # True if the link text looks like a semester heading, e.g. "Fall 2012"
    return "Fall " in string or "Spring " in string or "Summer " in string
def scrape_prof(department="COMPSCI", course="70"):
    # Collect the last name of every instructor listed for a course on ninjacourses
    base_url = "http://ninjacourses.com/explore/1/course/"
    output_set = set()
    department = change_dep_match_url(department)
    url = base_url + department + '/' + course + '/'
    html = urllib2.urlopen(url).read()
    soup = BeautifulSoup(html)
    scrape_result = soup.find_all(href=re.compile("/ratings/view/instructor/"))
    # Each link reads "Lastname, Firstname"; keep the last name, strip the
    # trailing comma, and add it to the set
    for result in scrape_result:
        prof_name = result.text.split()[0]
        if prof_name[-1] == ',':
            prof_name = prof_name[:-1]
        if re.match("^[A-Za-z_-]+$", prof_name):  # keep only plain alphabetic names
            output_set.add(prof_name)
    return output_set
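# Sketch of expected usage (requires network access; the names shown are
# hypothetical placeholders, not real scrape results):
# >>> scrape_prof("COMPSCI", "70")
# set(['Doe', 'Smith'])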
def scrape_hkn(abv="CS", course="70"):
    # Map each semester heading on the HKN course-survey page to the tuple
    # of instructors who taught the course that semester
    prof_year = {}
    html = urllib2.urlopen("https://hkn.eecs.berkeley.edu/coursesurveys/course/{0}/{1}".format(abv, course))
    soup = BeautifulSoup(html)
    table = soup.find_all("table")[1]  # the second table lists semesters and instructors
    links = table.find_all("a")
    current_semester = None
    current_tuple = ()
    for link in links:
        text = str(link.text)
        if is_semester(text):
            # A new semester heading: flush the instructors collected so far
            if current_semester:
                prof_year[current_semester] = current_tuple
            current_tuple = ()
            current_semester = text
        else:
            current_tuple += (text,)
    if current_semester:  # flush the final semester (guards against an empty page)
        prof_year[current_semester] = current_tuple
    return prof_year
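# Sketch of the returned mapping (hypothetical instructors, real key format):
# >>> scrape_hkn("CS", "70")
# {'Fall 2012': ('John Doe', 'Jane Roe'), 'Spring 2012': ('John Doe',)}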
# CS courses only: the semester/instructor pairs come from HKN survey data,
# so only the course number needs to be supplied.
def scrape_ninja_cs(course="70", department="COMPSCI", abv="CS"):
    prof_year = scrape_hkn(abv, course)
    base_url = "http://media.ninjacourses.com/var/exams/1/{0}/{1}%20{2}%20-%20{3}%20{4}%20-%20{5}%20-%20{6}%20{7}.pdf"
    exists = {}
    for semester, profs in prof_year.iteritems():
        sem, year = tuple(semester.split())
        for test in tests:
            for prof in profs:
                for way in ways:
                    try:
                        last_name = prof.split()[-1]
                        test_split = test.split()
                        if len(test_split) == 1:
                            url = base_url.format(department, abv, course, sem, year, last_name, test, way)
                            if not way:
                                # Drop the trailing "%20" left by the empty suffix
                                url = url[:-7] + ".pdf"
                        elif len(test_split) == 2:
                            url = base_url.format(department, abv, course, sem, year, last_name, test_split[0], test_split[1])
                            if way:
                                url = url[:-4] + "%20" + way + url[-4:]
                        urllib2.urlopen(url)  # raises if the PDF doesn't exist
                        exists[str((last_name, test, way)) + " " + semester] = url
                    except Exception:
                        pass  # URL not found; try the next combination
    # Download every exam that was found into the local <abv><course>/ folder
    for info, url in exists.iteritems():
        print url
        urllib.urlretrieve(url, abv + course + '/' + info + ".pdf")
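# Example of a URL the template expands to for a two-word test with no
# suffix (the instructor name is a hypothetical placeholder):
# http://media.ninjacourses.com/var/exams/1/COMPSCI/CS%2070%20-%20Fall%202012%20-%20Doe%20-%20Midterm%201.pdf
# Expected usage, assuming a ./CS70/ folder exists for the downloads:
# >>> scrape_ninja_cs("70")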
# Generalized version: works for any department. Instructor names come from
# ninjacourses ratings, and every semester/year combination is tried.
def scrape_ninja(department="ECON", abv="ECON", course="100B"):
    base_url = "http://media.ninjacourses.com/var/exams/1/{0}/{1}%20{2}%20-%20{3}%20{4}%20-%20{5}%20-%20{6}%20{7}.pdf"
    exists = {}
    prof_list = scrape_prof(department, course)
    for prof in prof_list:
        for year in year_list:
            for sem in sem_list:
                for test in tests:
                    for way in ways:
                        try:
                            test_split = test.split()
                            if len(test_split) == 1:
                                url = base_url.format(department, abv, course, sem, year, prof, test, way)
                                if not way:
                                    # Drop the trailing "%20" left by the empty suffix
                                    url = url[:-7] + ".pdf"
                            elif len(test_split) == 2:
                                url = base_url.format(department, abv, course, sem, year, prof, test_split[0], test_split[1])
                                if way:
                                    url = url[:-4] + "%20" + way + url[-4:]
                            urllib2.urlopen(url)  # raises if the PDF doesn't exist
                            exists[str((str(prof), test, way)) + " " + sem + " " + year] = url
                        except Exception:
                            pass  # URL not found; try the next combination
    # Download every exam that was found into the local <abv><course>/ folder
    for info, url in exists.iteritems():
        print url
        urllib.urlretrieve(url, abv + course + '/' + info + ".pdf")
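# Expected usage, assuming a ./ECON100B/ folder exists for the downloads:
# >>> scrape_ninja("ECON", "ECON", "100B")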
if __name__ == "__main__":
    if len(sys.argv) == 4:
        # python scrape_ninja.py <department> <abv> <course>
        dir_name = str(sys.argv[2] + sys.argv[3])
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)
        scrape_ninja(sys.argv[1], sys.argv[2], sys.argv[3])
    elif len(sys.argv) == 2:
        # python scrape_ninja.py <course>  (CS courses only)
        dir_name = 'CS' + str(sys.argv[1])
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)
        scrape_ninja_cs(sys.argv[1])
    else:
        print "Wrong number of arguments. Please try again."