1
|
|
|
#!/usr/bin/env python |
2
|
|
|
# -*- encoding: utf-8 -*- |
3
|
|
|
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: |
4
|
|
|
# Author: Binux<[email protected]> |
5
|
|
|
# http://binux.me |
6
|
|
|
# Created on 2014-02-23 00:19:06 |
7
|
|
|
|
8
|
|
|
|
9
|
|
|
import sys |
10
|
|
|
import time |
11
|
|
|
import socket |
12
|
|
|
import inspect |
13
|
|
|
import datetime |
14
|
|
|
import traceback |
15
|
|
|
from flask import render_template, request, json |
16
|
|
|
from flask.ext import login |
17
|
|
|
|
18
|
|
|
from pyspider.libs import utils, sample_handler, dataurl |
19
|
|
|
from pyspider.libs.response import rebuild_response |
20
|
|
|
from pyspider.processor.project_module import ProjectManager, ProjectFinder |
21
|
|
|
from .app import app |
22
|
|
|
|
23
|
|
|
# Task offered by the debug page when no stored task is requested: it invokes
# the handler's `on_start` callback through a `data:` URL.  The empty
# `project` value is a placeholder.
default_task = dict(
    taskid='data:,on_start',
    project='',
    url='data:,on_start',
    process=dict(
        callback='on_start',
    ),
)
31
|
|
|
# Source text of the sample handler module.  debug() serves it for projects
# that have no stored script, after substituting the __DATE__,
# __PROJECT_NAME__ and __START_URL__ markers.
default_script = inspect.getsource(sample_handler)
32
|
|
|
|
33
|
|
|
|
34
|
|
|
@app.route('/debug/<project>', methods=['GET', 'POST'])
def debug(project):
    """Render the debug page for ``project``.

    The script shown is the one stored in projectdb.  When projectdb has no
    entry for the project, the sample handler source is used instead, with
    its ``__DATE__``, ``__PROJECT_NAME__`` and ``__START_URL__`` markers
    replaced (the start URL comes from the ``start-urls`` request value and
    the marker is left untouched when that value is missing or empty).

    The task shown is the one named by the ``taskid`` query argument, looked
    up in taskdb; without that argument a fresh ``on_start`` task for this
    project is used.

    Returns a plain-text 400 response when projectdb rejects the name.
    """
    projectdb = app.config['projectdb']
    if not projectdb.verify_project_name(project):
        return 'project name is not allowed!', 400

    info = projectdb.get(project, fields=['name', 'script'])
    if info:
        script = info['script']
    else:
        script = (default_script
                  .replace('__DATE__', datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
                  .replace('__PROJECT_NAME__', project)
                  .replace('__START_URL__', request.values.get('start-urls') or '__START_URL__'))

    taskid = request.args.get('taskid')
    if taskid:
        taskdb = app.config['taskdb']
        task = taskdb.get_task(
            project, taskid, ['taskid', 'project', 'url', 'fetch', 'process'])
    else:
        # Work on a per-request copy: writing the project name into the
        # module-level default_task would share state between requests.
        task = dict(default_task, project=project)
        task['process'] = dict(default_task['process'])

    return render_template("debug.html", task=task, script=script, project_name=project)
58
|
|
|
|
59
|
|
|
|
60
|
|
|
@app.before_first_request
def enable_projects_import():
    """Append a ProjectFinder built on the configured projectdb to ``sys.meta_path``."""
    finder = ProjectFinder(app.config['projectdb'])
    sys.meta_path.append(finder)
63
|
|
|
|
64
|
|
|
|
65
|
|
|
def _json_response(result):
    """Serialize ``result`` as the JSON body of a 200 response tuple."""
    return json.dumps(utils.unicode_obj(result)), \
        200, {'Content-Type': 'application/json'}


def _failed_result(logs, start_time, fetch_result=""):
    """Build the payload for a run that produced no follows, messages or result."""
    return {
        'fetch_result': fetch_result,
        'logs': logs,
        'follows': [],
        'messages': [],
        'result': None,
        'time': time.time() - start_time,
    }


def _current_traceback():
    """Format the exception currently being handled as a string.

    The traceback is first passed through ``utils.hide_me`` together with
    this module's globals.  Must be called from inside an ``except`` block.
    """
    exc_type, exc_value, tb = sys.exc_info()
    tb = utils.hide_me(tb, globals())
    return ''.join(traceback.format_exception(exc_type, exc_value, tb))


@app.route('/debug/<project>/run', methods=['POST', ])
def run(project):
    """Run one task against the posted script and report the outcome as JSON.

    Form fields read: ``task`` (JSON-encoded task), ``script`` (handler
    source) and optionally ``webdav_mode``; when the latter equals ``'true'``
    the script stored in projectdb replaces the posted one.

    The response is always a 200 JSON object with the keys ``fetch_result``,
    ``logs``, ``follows``, ``messages``, ``result`` and ``time`` (seconds
    spent in this view).  Failures while building the module, fetching or
    running the task are reported through ``logs`` as a formatted traceback
    rather than as an HTTP error.
    """
    start_time = time.time()
    try:
        task = utils.decode_unicode_obj(json.loads(request.form['task']))
    except Exception:
        return _json_response(_failed_result(u'task json error', start_time))

    project_info = {
        'name': project,
        'status': 'DEBUG',
        'script': request.form['script'],
    }

    if request.form.get('webdav_mode') == 'true':
        projectdb = app.config['projectdb']
        info = projectdb.get(project, fields=['name', 'script'])
        if not info:
            return _json_response(_failed_result(
                u' in wevdav mode, cannot load script', start_time))
        project_info['script'] = info['script']

    # Kept outside the try block so that whatever was fetched before a
    # failure can still be returned to the frontend.
    fetch_result = {}
    try:
        module = ProjectManager.build_module(project_info, {
            'debugger': True,
            'process_time_limit': app.config['process_time_limit'],
        })

        # crawl_config is deliberately not joined into the task here, although
        # the scheduler does so when it selects a task: BaseHandler.crawl
        # performs the join itself when `is_debugger is True`, which gives a
        # better view of the joined tasks.

        fetch_result = app.config['fetch'](task)
        response = rebuild_response(fetch_result)

        ret = module['instance'].run_task(module['module'], task, response)
    except Exception:
        result = _failed_result(_current_traceback(), start_time, fetch_result)
    else:
        result = {
            'fetch_result': fetch_result,
            'logs': ret.logstr(),
            'follows': ret.follows,
            'messages': ret.messages,
            'result': ret.result,
            'time': time.time() - start_time,
        }
        result['fetch_result']['content'] = response.text
        if response.headers.get('content-type', '').startswith('image'):
            result['fetch_result']['dataurl'] = dataurl.encode(
                response.content, response.headers['content-type'])

    try:
        # binary data can't be encoded to JSON, so the result is converted to
        # a unicode object before it is sent to the frontend
        return _json_response(result)
    except Exception:
        # Serialization itself failed: report that traceback instead.
        return _json_response(_failed_result(_current_traceback(), start_time))
163
|
|
|
|
164
|
|
|
|
165
|
|
|
@app.route('/debug/<project>/save', methods=['POST', ])
def save(project):
    """Store the posted ``script`` for ``project`` and notify the scheduler.

    An existing project gets its script updated, and its status set to
    ``CHECKING`` when it was ``DEBUG`` or ``RUNNING``.  An unknown project is
    inserted with status ``TODO`` and the configured ``max_rate`` /
    ``max_burst`` (defaults 1 and 3).

    Returns a 400 response when projectdb rejects the name,
    ``app.login_response`` when the project's group contains ``lock`` and the
    current user is not active, ``'rpc error'`` when the scheduler RPC call
    raises ``socket.error``, and ``'ok'`` otherwise.
    """
    projectdb = app.config['projectdb']
    if not projectdb.verify_project_name(project):
        return 'project name is not allowed!', 400

    script = request.form['script']
    existing = projectdb.get(project, fields=['name', 'status', 'group'])

    if existing:
        groups = projectdb.split_group(existing.get('group'))
        if 'lock' in groups and not login.current_user.is_active():
            return app.login_response

        changes = {'script': script}
        if existing.get('status') in ('DEBUG', 'RUNNING'):
            changes['status'] = 'CHECKING'
        projectdb.update(project, changes)
    else:
        projectdb.insert(project, {
            'name': project,
            'script': script,
            'status': 'TODO',
            'rate': app.config.get('max_rate', 1),
            'burst': app.config.get('max_burst', 3),
        })

    scheduler = app.config['scheduler_rpc']
    if scheduler is None:
        return 'ok', 200

    try:
        scheduler.update_project()
    except socket.error as err:
        app.logger.warning('connect to scheduler rpc error: %r', err)
        return 'rpc error', 200
    return 'ok', 200
202
|
|
|
|
203
|
|
|
|
204
|
|
|
@app.route('/debug/<project>/get')
def get_script(project):
    """Return the ``name`` and ``script`` fields of ``project`` as JSON.

    Responds with a plain-text 400 when projectdb rejects the project name;
    otherwise with a 200 JSON body holding whatever projectdb returned.
    """
    db = app.config['projectdb']
    if db.verify_project_name(project):
        record = db.get(project, fields=['name', 'script'])
        body = json.dumps(utils.unicode_obj(record))
        return body, 200, {'Content-Type': 'application/json'}
    return 'project name is not allowed!', 400
212
|
|
|
|
213
|
|
|
|
214
|
|
|
@app.route('/blank.html')
def blank_html():
    """Serve an empty response body at ``/blank.html``."""
    return ''
217
|
|
|
|