forked from Annihilatexv/qlu-cr
-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_course_on_table.py
221 lines (174 loc) · 7.63 KB
/
get_course_on_table.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
import requests
from lxml import etree
#from bs4 import BeautifulSoup
import joblib
import json
# unhashable type: 'list'
# Cause: a list is a mutable type, whereas anything hashed by a dict must be an immutable type, e.g. a tuple.
# <<<<<!!! A proxy is required, and a Cookie must be supplied to fetch the data !!!>>>>>
Cookie = "抓取提供"
def get_table():
    """Download the classroom timetable page and return its rows as nested lists.

    The request is sent with the module-level ``Cookie`` value as the
    ``Cookie`` header.

    Returns:
        list: One entry per table row after the two header rows. Each entry
        is a list whose first element is the list of text nodes of the
        classroom-name cell and whose remaining elements are the lists of
        ``div`` text nodes of each following cell.

    Raises:
        requests.RequestException: If the request times out, cannot be sent,
            or the server answers with an HTTP error status.
        IndexError: If a data row contains no ``td/nobr`` cell.
    """
    # url2 is the address that works from the campus intranet.
    # VPN-proxied variants of the same endpoint, kept for reference:
    #   http://jwxt-qlu-edu-cn.vpn.qlu.edu.cn/jsxsd/kbcx/kbxx_classroom_ifr
    #   http://jwxt-qlu-edu-cn.vpn.qlu.edu.cn:8118/jsxsd/kbcx/kbxx_classroom_ifr
    url2 = 'http://jwxt.qlu.edu.cn/jsxsd/kbcx/kbxx_classroom_ifr'
    # -------- may need to become an external variable!
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4412.0 Safari/537.36 Edg/90.0.796.0',
        'Cookie': Cookie,
    }
    # -------- may need to become an external variable!
    # NOTE(review): 'xnxqh' looks like the academic year/term selector - confirm.
    data = {
        'xnxqh': '2022-2023-2',
        'skyx': '',
    }
    # Without a timeout requests waits forever on an unresponsive server.
    course = requests.post(url2, headers=headers, data=data, timeout=30)
    # Fail loudly on 4xx/5xx instead of parsing an error page as a timetable.
    course.raise_for_status()
    html = etree.HTML(course.text)
    rows = html.xpath('//tr')

    table = []
    # Start at the third row: the first two rows hold the weekday and
    # period headers.
    for row in rows[2:]:
        cells = row.xpath('.//td/nobr')
        # The first cell of every row carries the classroom name.
        tr_cell = [cells[0].xpath('./text()')]
        tr_cell.extend(cell.xpath('./div/text()') for cell in cells[1:])
        table.append(tr_cell)
    return table
def get_course_on_table(table):
    """Index the scraped timetable by week, weekday, period and classroom.

    Args:
        table: Rows in the shape returned by ``get_table``. ``row[0][0]`` is
            the classroom name and the remaining 42 cells (7 days x 6
            periods, day-major) each hold the text lines of one time slot.

    Returns:
        tuple: ``(course_on_table, all_classroom)``.

        ``course_on_table[week][weekday][period][classroom]`` holds the
        slot content, where ``weekday`` is 1-7 and ``period`` is 1-6. A slot
        with a single course stores the triple
        ``(course_name, teacher_name, class_name)``; a slot shared by several
        distinct courses stores a tuple of such triples in the order they
        were encountered.

        ``all_classroom`` is the list of classroom names in row order.

    Raises:
        IndexError: If a row has fewer than 42 slot cells.
    """
    course_on_table = multidict()
    all_week = 7
    # -------- may need to become an external variable!
    day_course = 6
    # Every classroom name, recorded so callers can iterate over them.
    all_classroom = []

    for row in table:
        # The first cell of each row is the classroom name.
        classroom_name = row[0][0]
        all_classroom.append(classroom_name)
        # Drop the name and keep only the 42 (= 6 * 7) slot cells.
        slots = row[1:]

        for week_i in range(all_week):  # weekday index 0-6
            for course_i in range(day_course):  # period index 0-5
                # Cells are laid out day-major: six periods per day.
                cell = slots[week_i * day_course + course_i]
                # Every returned value is a list; week_on is a list of lists
                # holding the active weeks of each course in the cell.
                course_name_ls, tcher_name_ls, week_on, class_name_ls = cell_parse(cell)

                for kind_i, weeks in enumerate(week_on):
                    entry = (course_name_ls[kind_i], tcher_name_ls[kind_i], class_name_ls[kind_i])
                    for each_week in weeks:
                        # Keys: week number, weekday, period, classroom.
                        slot = course_on_table[each_week][week_i + 1][course_i + 1]
                        occupied = slot[classroom_name]
                        if not occupied:
                            slot[classroom_name] = entry
                            continue
                        # A lone course is stored as a flat triple of strings;
                        # several courses as a tuple of triples. Normalise to
                        # a list of triples before merging so that a third
                        # course does not nest one level deeper and the order
                        # does not depend on set hashing.
                        if isinstance(occupied[0], tuple):
                            entries = list(occupied)
                        else:
                            entries = [occupied]
                        if entry not in entries:
                            entries.append(entry)
                            slot[classroom_name] = tuple(entries)
    return course_on_table, all_classroom
def cell_parse(cell):
    """Split one timetable cell into per-course parallel lists.

    A cell is read three lines at a time: the course name, a line holding
    the teacher name and the week description separated by a newline, and
    the class name.

    Args:
        cell: List of text lines of one timetable cell.

    Returns:
        tuple: ``(course_name_ls, tcher_name_ls, week_on, class_name_ls)``.
        All four lists have one item per course; ``week_on`` holds, for each
        course, the list of week numbers produced by ``week_name_parse``.
    """
    courses, teachers, weeks_per_course, classes = [], [], [], []

    # offset always points at the first line of the current 3-line group.
    offset = 0
    while offset < len(cell):
        course = cell[offset]
        teacher, week_text = cell[offset + 1].split('\n')
        class_label = cell[offset + 2]

        weeks_per_course.append(week_name_parse(week_text, cell))
        courses.append(course)
        teachers.append(teacher)
        classes.append(class_label)
        offset += 3

    return courses, teachers, weeks_per_course, classes
def week_name_parse(week_name, cell):
    """Expand a week description into the list of week numbers it covers.

    Supported forms: ``(1-16周)``, ``(5,12周)``, ``(1-8,10-17双周)``,
    ``(1-10单周)``, ``(2,4,6,8,10,14,16,18双周)``.

    The odd-week (单) / even-week (双) marker only filters ``start-end``
    ranges; weeks that are listed individually are kept as written.

    Args:
        week_name: The week description text.
        cell: The cell the description came from; only used in the
            diagnostic printed for a token that cannot be parsed.

    Returns:
        list[int]: Week numbers in the order they appear in the description.
        Tokens that cannot be parsed are reported on stdout and skipped.
    """
    odd_only = '单' in week_name
    even_only = not odd_only and '双' in week_name

    # Strip the Chinese markers and parentheses so only digits, commas and
    # dashes remain.
    for marker in ('单', '双', '周', '(', ')'):
        week_name = week_name.replace(marker, '')

    week_on = []
    for each_num in week_name.split(','):
        try:
            if '-' in each_num:
                # Consecutive weeks, optionally filtered by parity.
                start, end = each_num.split('-')
                weeks = range(int(start), int(end) + 1)
                if odd_only:
                    weeks = [week for week in weeks if week % 2]
                elif even_only:
                    weeks = [week for week in weeks if not week % 2]
                week_on.extend(weeks)
            else:
                # An isolated week is added as-is.
                week_on.append(int(each_num))
        except ValueError:
            # Raised by int() on non-numeric text and by unpacking a range
            # that does not have exactly two ends. Report and carry on so
            # one bad token does not abort the whole scrape.
            print('该单元可能有问题:', week_name, each_num, cell, sep='\n')
    return week_on
# A dictionary that grows nested levels on demand
class multidict(dict):
    """Dict whose missing keys are filled with a new, empty ``multidict``.

    Subscripting with an absent key stores and returns a fresh instance of
    the same class, so ``d[a][b][c] = value`` works without creating the
    intermediate levels first. Note that merely reading a missing key
    inserts it.
    """

    def __missing__(self, item):
        # Invoked by dict.__getitem__ when ``item`` is absent.
        nested = type(self)()
        self[item] = nested
        return nested
# Encoder class used when saving the school-wide timetable
class JsonEncoder(json.JSONEncoder):
    """JSON encoder that also accepts NumPy scalars/arrays and datetimes."""

    def default(self, obj):
        """Convert an object the stock encoder cannot serialise.

        Args:
            obj: The value ``json`` could not encode by itself.

        Returns:
            ``int`` for NumPy integers, ``float`` for NumPy floats, a list
            for NumPy arrays, and ``str(obj)`` for ``datetime`` instances.

        Raises:
            TypeError: For any other type (raised by the base class).
        """
        import sys
        from datetime import datetime

        # A value can only be a NumPy object if NumPy has already been
        # imported somewhere, so look the module up instead of importing it.
        # This keeps NumPy an optional dependency of this file.
        np = sys.modules.get('numpy')
        if np is not None:
            if isinstance(obj, np.integer):
                return int(obj)
            if isinstance(obj, np.floating):
                return float(obj)
            if isinstance(obj, np.ndarray):
                return obj.tolist()
        if isinstance(obj, datetime):
            return str(obj)
        return super().default(obj)
# Save a dictionary
def save_dict(filename, dic):
    """Write ``dic`` to ``filename`` as UTF-8 JSON.

    Non-ASCII characters are written as-is rather than escaped, and values
    the stock encoder rejects are passed through ``JsonEncoder``.

    Args:
        filename: Path of the file to create or overwrite.
        dic: The object to serialise.
    """
    with open(filename, mode='w', encoding='utf-8') as out:
        json.dump(dic, out, cls=JsonEncoder, ensure_ascii=False)
# Load a dictionary
def load_dict(filename):
    """Read the UTF-8 JSON file ``filename`` and return the decoded object.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The value produced by ``json.load`` for the file content.
    """
    with open(filename, mode='r', encoding='utf-8') as source:
        return json.load(source)
if __name__ == '__main__':
    # Scrape the timetable and index it as
    # schedule[week][weekday][period][classroom].
    schedule, classrooms = get_course_on_table(get_table())

    # Persist the timetable dictionary as JSON ...
    save_dict("./static/data/course_on_table.json", schedule)
    # ... and the list of classroom names as a joblib pickle.
    joblib.dump(classrooms, r'./static/data/all_classroom.pkl')