1
|
|
|
# -*- coding: utf-8 -*- |
2
|
|
|
''' |
3
|
|
|
@Time : 2016 - 2018 |
4
|
|
|
@Author : dairoot |
5
|
|
|
@Email : [email protected] |
6
|
|
|
@description: 课表解析 |
7
|
|
|
''' |
8
|
|
|
from __future__ import absolute_import, unicode_literals |
9
|
|
|
|
10
|
|
|
import re |
11
|
|
|
import six |
12
|
|
|
from bs4 import BeautifulSoup |
13
|
|
|
|
14
|
|
|
|
15
|
|
|
class BaseScheduleParse(object):
    """Parse a timetable HTML page into per-day, per-section course lists.

    After construction ``schedule_list`` holds one list per weekday
    (index 0 is the first day column of the table). Each weekday list holds
    one entry per parsed table row, and each entry is a list of course
    records shaped like
    ``[name, weeks_text, teacher, place, section_count, weeks_arr]``.
    """

    # Colour names handed out to sections by ``get_schedule_dict``.
    # The attribute name is misspelled (lower-case "l"); it is kept as-is so
    # existing references to ``COlOR`` keep working.
    COlOR = ['green', 'blue', 'purple', 'red', 'yellow']

    def __init__(self, html, time_list, schedule_type):
        """Parse ``html`` straight away and fill ``schedule_list``.

        :param html: markup of the timetable page
        :param time_list: lookup used by ``get_schedule_dict`` as
            ``time_list[section_count][section_index]``
        :param schedule_type: ``1`` reads the table with id ``Table6``, any
            other value reads ``Table1``; it also selects how the weeks text
            is extracted in ``_get_weeks_text``
        """
        self.schedule_year = ''
        self.schedule_term = ''
        self.time_list = time_list
        self.schedule_type = schedule_type
        self.schedule_list = [[] for _ in range(7)]
        self.schedule_dict = [[] for _ in range(7)]

        soup = BeautifulSoup(html, "html.parser")
        option_args = soup.find_all("option", {"selected": "selected"})
        # The first selected <option> is the year and the second the term.
        # Read each one only if it exists, so a page with a single selected
        # option no longer raises IndexError.
        if option_args:
            self.schedule_year = option_args[0].text
        if len(option_args) > 1:
            self.schedule_term = option_args[1].text

        table_id = "Table6" if schedule_type == 1 else "Table1"
        table = soup.find("table", {"id": table_id})
        trs = table.find_all('tr')
        self.html_parse(trs)

    def html_parse(self, trs):
        """Walk the table rows and collect the courses of every weekday.

        Rows 2, 4, 6, 8 and 10 of ``trs`` are read; the rows in between are
        not fetched. For every cell of such a row one list (possibly empty)
        is appended to ``schedule_list[day]``.

        :param trs: the ``<tr>`` elements of the timetable table
        :return: None, the result is stored in ``self.schedule_list``
        """
        pattern = r'^\([\u2E80-\u9FFF]{1,3}\d+\)'
        for i in range(2, 12, 2):
            tds = trs[i].find_all("td")
            # Drop the leading label cells (e.g. "morning", "1st period").
            # Rows 2, 6 and 10 carry one label cell more than the others.
            if i in [2, 6, 10]:
                tds.pop(0)
            tds.pop(0)

            # The remaining cells are the weekday columns.
            for day, day_c in enumerate(tds):
                row_arr = []
                # A cell with only whitespace (including a non-breaking
                # space) holds no course. Skipping it here stops blank
                # cells from reaching the course parsing below.
                if day_c.text.strip():
                    td_str = day_c.__unicode__()
                    rowspan = 2 if 'rowspan="2"' in td_str else 1
                    # Remove the opening tag and the trailing "</td>".
                    td_main = re.sub(r'<td align="Center".*?>', '', td_str)[:-5]

                    # Courses sharing one cell are separated by a blank line.
                    for text in td_main.split('<br/><br/>'):
                        course_arr = self._get_td_course_info(text)
                        if course_arr[0] and not re.match(pattern, course_arr[0]):
                            course_arr[1] = self._get_weeks_text(course_arr[1])
                            weeks_arr = self._get_weeks_arr(course_arr[1])
                            row_arr.append(course_arr + [rowspan, weeks_arr])
                self.schedule_list[day].append(row_arr)

    def get_schedule_dict(self):
        """Return the timetable as dictionaries.

        The result is rebuilt from ``schedule_list`` on every call, so
        calling the method repeatedly does not duplicate entries.

        :return: dict with the keys ``schedule_term``, ``schedule_year`` and
            ``schedule``; ``schedule`` holds one list per weekday, each with
            one list of course dicts per section
        """
        color_count = len(self.COlOR)
        schedule_days = []
        for day, day_schedule in enumerate(self.schedule_list):
            day_sections = []
            for section, section_schedule in enumerate(day_schedule):
                color = self.COlOR[(day * 3 + section + 1) % color_count]
                section_schedule_dict = []
                for schedule in section_schedule:
                    # Records emptied by a merge step are skipped.
                    if schedule:
                        section_schedule_dict.append({
                            "color": color,
                            "name": schedule[0],
                            "weeks_text": schedule[1],
                            "teacher": schedule[2],
                            "place": schedule[3],
                            "section": schedule[4],
                            "weeks_arr": schedule[5],
                            "time": self.time_list[schedule[4]][section]
                        })
                day_sections.append(section_schedule_dict)
            schedule_days.append(day_sections)

        self.schedule_dict = schedule_days
        return {
            'schedule_term': self.schedule_term,
            'schedule_year': self.schedule_year,
            'schedule': self.schedule_dict
        }

    def _get_weeks_text(self, class_time):
        """Extract the weeks description from a course time string.

        With a falsy ``schedule_type`` the text between ``{`` and ``}`` is
        returned; an ``IndexError`` is raised when there are no braces.
        Otherwise the inputs look like ``2节/周``, ``2节/单周(7-7)`` or
        ``1-10,13-18(1,2)``: a ``2节/`` string is returned unchanged, or
        with ``(1-18)`` appended when it has no parenthesis, and any other
        string is cut at its first ``(``.

        :param class_time: raw time string taken from the table cell
        :return: the weeks text
        """
        if not self.schedule_type:
            return re.findall(r"{(.*)}", class_time)[0]

        if '2节/' in class_time:
            return class_time if '(' in class_time else class_time + '(1-18)'
        return class_time.split('(')[0]

    @staticmethod
    def _get_weeks_arr(weeks_text):
        """Expand a weeks description into a list of week numbers.

        The text is split on ``,``. A part containing ``a-b`` expands to the
        weeks ``a`` to ``b``; any other part must be a plain number. When
        the text contains ``单`` (odd weeks) or ``双`` (even weeks) only
        every second week is taken.

        :param weeks_text: weeks description, e.g. ``1-10,13-18``
        :return: list of week numbers
        :raises ValueError: if a part has no ``a-b`` range and is not a
            plain integer
        """
        odd_only = '单' in weeks_text
        even_only = '双' in weeks_text
        step = 2 if odd_only or even_only else 1
        # Remainder (week % 2) the first week must have, if exactly one
        # marker is present.
        if odd_only != even_only:
            parity = 1 if odd_only else 0
        else:
            parity = None

        weeks_arr = []
        for split_text in weeks_text.split(','):
            weeks = re.findall(r'(\d{1,2})-(\d{1,2})', split_text)
            if not weeks:
                weeks_arr.append(int(split_text))
                continue

            start, end = int(weeks[0][0]), int(weeks[0][1])
            # Stepping by 2 only gives the right weeks when the first week
            # has the requested parity, e.g. "双周(1-18)" must start at 2.
            # A one-week range names its week explicitly and is kept as-is.
            if parity is not None and start < end and start % 2 != parity:
                start += 1
            weeks_arr.extend(range(start, end + 1, step))

        return weeks_arr

    @staticmethod
    def _get_td_course_info(text):
        """Split the markup of one course into its fields.

        ``<font>`` tags and a leading ``<br/>`` are removed, then the text
        is split on ``<br/>``. The course-type labels 选修, 公选 and 必修
        are dropped and at most four fields are kept. A three-field result
        (no place given) is padded with an empty place.

        :param text: markup of a single course inside a table cell
        :return: list of fields, normally ``[name, time, teacher, place]``
        """
        text = re.sub(r'<[/]{0,1}font[^>]*?>', '', text)
        text = re.sub(r'^<br/>', '', text)

        if six.PY2:
            # Python 2 compatibility: the serialised markup differs there.
            text = re.sub(r'</br></br></br>$', '', text)
            text = text.replace('<br>', '<br/>')

        skipped = ['选修', '公选', '必修']
        info_arr = [k for k in text.split('<br/>') if k not in skipped][:4]
        if len(info_arr) == 3:
            # The course has no place.
            info_arr.append('')
        return info_arr
154
|
|
|
|
155
|
|
|
|
156
|
|
|
class ScheduleParse(BaseScheduleParse):
    """Timetable parser that also merges duplicated course records."""

    def __init__(self, html, time_list, schedule_type=0):
        """Parse the page, then merge the records of every weekday.

        :param html: markup of the timetable page
        :param time_list: forwarded unchanged to ``BaseScheduleParse``
        :param schedule_type: forwarded unchanged to ``BaseScheduleParse``
        """
        BaseScheduleParse.__init__(self, html, time_list, schedule_type)
        self.merger_same_schedule()

    def merger_same_schedule(self):
        """Run the merge on each weekday list of ``schedule_list``.

        :return: None, ``schedule_list`` is modified in place
        """
        for one_day in self.schedule_list:
            self._merger_day_schedule(one_day)

    def _merger_day_schedule(self, day_schedule):
        """Merge equal courses found in neighbouring sections of one day.

        Every section is first merged on its own. After that, when a record
        of one section and a record of the following section agree on name,
        weeks text, teacher and place, the section count of the later record
        is added to the earlier one and the later record becomes ``[]``.

        Example: ``["English", "...", "name", "1-301", 2, [7, 8]]`` present
        in two neighbouring sections ends up as one record with a section
        count of 4 in the first section and ``[]`` in the second.

        :param day_schedule: the sections of one day, modified in place
        """
        # Step 1: merge equal courses inside each section.
        for one_section in day_schedule:
            self._merger_section_schedule(one_section)

        # Step 2: merge equal courses of each pair of adjacent sections.
        for current, following in zip(day_schedule, day_schedule[1:]):
            for last_schedule in current:
                for next_i, next_schedule in enumerate(following):
                    if not (last_schedule and next_schedule):
                        continue
                    # name, weeks text, teacher and place must all match
                    same_course = all(
                        last_schedule[field] == next_schedule[field]
                        for field in range(4)
                    )
                    if same_course:
                        last_schedule[4] += next_schedule[4]
                        following[next_i] = []

    @staticmethod
    def _merger_section_schedule(section_schedule):
        """Merge records of the same course inside one section.

        For two records with the same name, the earlier one is folded into
        the later one and replaced by ``[]``:

        * different weeks text but same place: the later record gets the
          earlier record's weeks appended to its weeks list, and the earlier
          weeks text appended to its own after a ``,``;
        * same weeks text: the earlier record is simply dropped.

        :param section_schedule: records of one section, modified in place
        """
        total = len(section_schedule)
        for earlier in range(total):
            for later in range(earlier + 1, total):
                current = section_schedule[earlier]
                if not current:
                    # Already folded into a later record; nothing left to do.
                    break

                other = section_schedule[later]
                if not other or current[0] != other[0]:
                    continue

                if current[1] == other[1]:
                    # Same name and same weeks text: drop the earlier one.
                    section_schedule[earlier] = []
                elif current[3] == other[3]:
                    # Same name and place, different weeks: combine them.
                    other[5] += current[5]
                    other[1] += ',' + current[1]
                    section_schedule[earlier] = []
225
|
|
|
|