This repository has been archived by the owner on Apr 14, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathobtain_csv.py
25 lines (19 loc) · 4.7 KB
/
obtain_csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
# Names of the pages to process. The loop further down removes the ".html"
# ending from each entry to get both the path of the input file it opens and
# the prefix of the CSV files it writes ("<name>-<table index>.csv").
# NOTE(review): entries look like course codes; confirm against the data source.
links = ['0020.html', '0028.html', '0029.html', '0047.html', '0057.html', '0069.html', '0079.html', '0100.html', '0190.html', '0498.html', '1322.html', '2098.html', '2116.html', '2602.html', '2603.html', '2625.html', '2627.html', '2695.html', '2700.html', '2710.html', '2932.html', '2953.html', '2977.html', '3111.html', '3194.html', '3204.html', '3262.html', '3291.html', '3292.html', '3337.html', '3379.html', '3438.html', '3443.html', '3521.html', '3736.html', '3863.html', '3940.html', '3954.html', '3971.html', '4066.html', '4067.html', '4071.html', '4080.html', '4086.html', '4087.html', '4088.html', '4091.html', '4102.html', '4103.html', '4119.html', '4307.html', '4308.html', '4320.html', '4413.html', '4414.html', '4415.html', '4502.html', '4520.html', '4530.html', '4533.html', 'A0001.html', 'A0501.html', 'A0502.html', 'A0503.html', 'A2000.html', 'A2001.html', 'A2003.html', 'A2004.html', 'A2005.html', 'A2006.html', 'A2007.html', 'A3701.html', 'A3702.html', 'A6001.html', 'A6002.html', 'A6003.html', 'A6004.html', 'A6006.html', 'A6007.html', 'A6008.html', 'A6009.html', 'A6010.html', 'A6011.html', 'A6012.html', 'A6013.html', 'A6014.html', 'A6015.html', 'B2000.html', 'B2001.html', 'B2003.html', 'B2004.html', 'B2006.html', 'B2007.html', 'B2008.html', 'B2009.html', 'B2012.html', 'B2013.html', 'B2014.html', 'B2015.html', 'B2016.html', 'B2017.html', 'B2018.html', 'B2019.html', 'B2020.html', 'B2021.html', 'B2022.html', 'B2023.html', 'B2024.html', 'B2025.html', 'B2026.html', 'B2027.html', 'B3701.html', 'B3702.html', 'B4001.html', 'B5001.html', 'B5002.html', 'B5003.html', 'B6001.html', 'B6002.html', 'B6003.html', 'B6004.html', 'B6005.html', 'B6006.html', 'B6007.html', 'B6009.html', 'B6010.html', 'B6011.html', 'B6012.html', 'B6013.html', 'B6014.html', 'B6015.html', 'B6016.html', 'C2000.html', 'C2001.html', 'C2002.html', 'C2003.html', 'C3001.html', 'C3701.html', 'C3702.html', 'C4006.html', 'C5003.html', 'C6001.html', 'C6002.html', 'C6003.html', 'C6004.html', 'D0001.html', 
'D0501.html', 'D0502.html', 'D2002.html', 'D3001.html', 'D3002.html', 'D3003.html', 'D3004.html', 'D3005.html', 'D3006.html', 'D3007.html', 'D3008.html', 'D3009.html', 'D3701.html', 'D4001.html', 'D4004.html', 'D4005.html', 'D5002.html', 'D6001.html', 'D6002.html', 'D6003.html', 'D6004.html', 'D6005.html', 'D6006.html', 'D6007.html', 'D6008.html', 'E3001.html', 'E3002.html', 'E3003.html', 'E3004.html', 'E3005.html', 'E3006.html', 'E3007.html', 'E3008.html', 'E3009.html', 'E3010.html', 'E3011.html', 'E6001.html', 'E6002.html', 'F2001.html', 'F2002.html', 'F2003.html', 'F2004.html', 'F2005.html', 'F2006.html', 'F2007.html', 'F3001.html', 'F3701.html', 'F3702.html', 'F6001.html', 'F6002.html', 'L3001.html', 'L3002.html', 'L3003.html', 'L3004.html', 'L3005.html', 'L3006.html', 'L3007.html', 'L3009.html', 'L5001.html', 'L6001.html', 'L6002.html', 'L6003.html', 'L6004.html', 'L6005.html', 'L6006.html', 'L6007.html', 'L6011.html', 'L6012.html', 'M2001.html', 'M2002.html', 'M2003.html', 'M2004.html', 'M2006.html', 'M3001.html', 'M3002.html', 'M3003.html', 'M3004.html', 'M3005.html', 'M3006.html', 'M3007.html', 'M3701.html', 'M3702.html', 'M3703.html', 'M3704.html', 'M4002.html', 'M4006.html', 'M4017.html', 'M5003.html', 'M5004.html', 'M5007.html', 'M5010.html', 'M5013.html', 'M5017.html', 'M5018.html', 'M6001.html', 'M6002.html', 'M6003.html', 'M6004.html', 'M6005.html', 'M6006.html', 'M6007.html', 'M6008.html', 'M6009.html', 'M6010.html', 'M6011.html', 'M6012.html', 'M6014.html', 'M6015.html', 'M6016.html', 'M6017.html', 'M6018.html', 'M6021.html', 'M6022.html', 'M6023.html', 'M6024.html', 'M6025.html', 'M6026.html', 'M6027.html', 'M6028.html', 'P2001.html', 'P3001.html', 'P3002.html', 'P3701.html', 'P4001.html', 'P6001.html', 'P6002.html', 'P6003.html', 'S2000.html', 'S2003.html', 'S2004.html', 'S2005.html', 'S2006.html', 'S2007.html', 'S2008.html', 'S2009.html', 'S3001.html', 'S3002.html', 'S3701.html', 'S6001.html', 'S6002.html']
import re
import csv
# Matches three uppercase ASCII letters followed by four ASCII digits
# (e.g. "ABC1234") anywhere in a string; the processing loop uses it to pull
# such codes out of table cell text. Presumably these are unit codes, per the
# variable name.
unit_code = re.compile(r"[A-Z]{3}[0-9]{4}")
from bs4 import BeautifulSoup
# For every entry in `links`: parse the saved page and write one CSV file per
# <table class="course-page__course-structure"> found in it. Each CSV row
# corresponds to one <tr> of the table and holds the first `unit_code` match
# from every <td> cell that contains one.
html_suffix = ".html"
for link in links:
    # Remove the literal suffix. str.rstrip(".html") would instead strip any
    # trailing run of the characters ".", "h", "t", "m", "l", which eats into
    # names that happen to end in one of those letters.
    if link.endswith(html_suffix):
        page_name = link[:-len(html_suffix)]
    else:
        page_name = link

    # The page is read from the suffix-less path, as before; the context
    # manager makes sure the handle is closed once the markup is parsed.
    with open(page_name, "r") as page_file:
        soup = BeautifulSoup(page_file.read(), "lxml")
    print(link)

    tables = soup.find_all("table", {"class": "course-page__course-structure"})
    for i, table in enumerate(tables):
        file_name = page_name + "-" + str(i) + ".csv"
        # newline="" hands line-ending control to the csv module, which
        # otherwise produces "\r\r\n" row endings on Windows.
        with open(file_name, "w", newline="") as csv_file:
            print("Writing:", file_name)
            course_csv = csv.writer(csv_file)
            for row in table.find_all("tr"):
                # One regex search per cell; the pattern has no groups, so
                # group(0) is exactly what findall(...)[0] returned.
                matches = (
                    unit_code.search(cell.get_text())
                    for cell in row.find_all("td")
                )
                # A row with no matching cell (for example one without any
                # <td>) is still written, as an empty CSV row.
                course_csv.writerow(
                    [match.group(0) for match in matches if match is not None]
                )