|
1
|
|
|
#!/usr/bin/env python |
|
2
|
|
|
# -*- encoding: utf-8 -*- |
|
3
|
|
|
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: |
|
4
|
|
|
# Author: Binux<[email protected]> |
|
5
|
|
|
# http://binux.me |
|
6
|
|
|
# Created on 2014-02-23 00:19:06 |
|
7
|
|
|
|
|
8
|
|
|
|
|
9
|
|
|
import sys |
|
10
|
|
|
import time |
|
11
|
|
|
import socket |
|
12
|
|
|
import inspect |
|
13
|
|
|
import datetime |
|
14
|
|
|
import traceback |
|
15
|
|
|
from flask import render_template, request, json |
|
16
|
|
|
from flask.ext import login |
|
17
|
|
|
|
|
18
|
|
|
from pyspider.libs import utils, sample_handler, dataurl |
|
19
|
|
|
from pyspider.libs.response import rebuild_response |
|
20
|
|
|
from pyspider.processor.project_module import ProjectManager, ProjectFinder |
|
21
|
|
|
from .app import app |
|
22
|
|
|
|
|
23
|
|
|
# Task handed to the debug page when the request carries no `taskid`:
# it names the `on_start` callback and uses a `data:` URL as both id and url.
default_task = dict(
    taskid='data:,on_start',
    project='',
    url='data:,on_start',
    process=dict(callback='on_start'),
)
# Source text of the sample handler module; the debug view substitutes its
# placeholders when a project has no stored script yet.
default_script = inspect.getsource(sample_handler)
|
32
|
|
|
|
|
33
|
|
|
|
|
34
|
|
|
@app.route('/debug/<project>', methods=['GET', 'POST'])
def debug(project):
    """Render the debug page for `project`.

    Returns ``('project name is not allowed!', 400)`` when projectdb rejects
    the name. The script passed to the template is the stored one when the
    project exists; otherwise it is the sample handler source with the
    ``__DATE__`` and ``__PROJECT_NAME__`` placeholders replaced, and
    ``__START_URL__`` replaced by the ``start-urls`` request value when one
    is supplied.

    The task passed to the template is loaded from taskdb when a ``taskid``
    query argument is present; otherwise it is a copy of `default_task`
    with its ``project`` field set to `project`.
    """
    projectdb = app.config['projectdb']
    if not projectdb.verify_project_name(project):
        return 'project name is not allowed!', 400
    info = projectdb.get(project, fields=['name', 'script'])
    if info:
        script = info['script']
    else:
        script = (default_script
                  .replace('__DATE__', datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
                  .replace('__PROJECT_NAME__', project)
                  .replace('__START_URL__', request.values.get('start-urls') or '__START_URL__'))

    taskid = request.args.get('taskid')
    if taskid:
        taskdb = app.config['taskdb']
        task = taskdb.get_task(
            project, taskid, ['taskid', 'project', 'url', 'fetch', 'process'])
    else:
        # Build a per-request copy instead of writing the project name into
        # the shared module-level dict, so concurrent requests for different
        # projects cannot overwrite each other's value. The copy is shallow:
        # the nested 'process' dict is shared but is not modified here.
        task = dict(default_task, project=project)

    return render_template("debug.html", task=task, script=script, project_name=project)
|
58
|
|
|
|
|
59
|
|
|
|
|
60
|
|
|
@app.before_first_request
def enable_projects_import():
    """Append a ProjectFinder built from the configured projectdb to sys.meta_path."""
    finder = ProjectFinder(app.config['projectdb'])
    sys.meta_path.append(finder)
|
63
|
|
|
|
|
64
|
|
|
|
|
65
|
|
|
def _debug_failure_result(start_time, logs, fetch_result=""):
    """Build the payload sent back when a debug run could not complete.

    `follows` and `messages` are empty, `result` is None and `time` is the
    number of seconds elapsed since `start_time`.
    """
    return {
        'fetch_result': fetch_result,
        'logs': logs,
        'follows': [],
        'messages': [],
        'result': None,
        'time': time.time() - start_time,
    }


def _debug_json_response(result):
    """Serialise `result` into a ``(body, 200, headers)`` JSON response tuple."""
    return json.dumps(utils.unicode_obj(result)), \
        200, {'Content-Type': 'application/json'}


def _format_current_exception():
    """Return the formatted traceback of the exception currently being handled.

    The traceback is passed through ``utils.hide_me`` with this module's
    globals before formatting.
    """
    exc_type, exc_value, tb = sys.exc_info()
    tb = utils.hide_me(tb, globals())
    return ''.join(traceback.format_exception(exc_type, exc_value, tb))


@app.route('/debug/<project>/run', methods=['POST', ])
def run(project):
    """Run one task against a script in debug mode and return the outcome as JSON.

    Reads the ``task`` (JSON) and ``script`` form fields. When the
    ``webdav_mode`` form field equals ``'true'`` the script stored in
    projectdb is used instead of the posted one. The response is always a
    200 JSON document with the keys ``fetch_result``, ``logs``, ``follows``,
    ``messages``, ``result`` and ``time``; failures are reported through
    ``logs`` rather than through the HTTP status.
    """
    start_time = time.time()
    try:
        task = utils.decode_unicode_obj(json.loads(request.form['task']))
    except Exception:
        return _debug_json_response(
            _debug_failure_result(start_time, u'task json error'))

    project_info = {
        'name': project,
        'status': 'DEBUG',
        'script': request.form['script'],
    }

    if request.form.get('webdav_mode') == 'true':
        projectdb = app.config['projectdb']
        info = projectdb.get(project, fields=['name', 'script'])
        if not info:
            return _debug_json_response(_debug_failure_result(
                start_time, u' in wevdav mode, cannot load script'))
        project_info['script'] = info['script']

    # Initialised before the try block so the failure payload can still
    # report whatever was fetched (or {}) if a later step raises.
    fetch_result = {}
    try:
        module = ProjectManager.build_module(project_info, {
            'debugger': True,
            'process_time_limit': app.config['process_time_limit'],
        })

        # The code below is to mock the behavior that crawl_config been joined when selected by scheduler.
        # but to have a better view of joined tasks, it has been done in BaseHandler.crawl when `is_debugger is True`
        # crawl_config = module['instance'].crawl_config
        # task = module['instance'].task_join_crawl_config(task, crawl_config)

        fetch_result = app.config['fetch'](task)
        response = rebuild_response(fetch_result)

        ret = module['instance'].run_task(module['module'], task, response)
    except Exception:
        result = _debug_failure_result(
            start_time, _format_current_exception(), fetch_result)
    else:
        result = {
            'fetch_result': fetch_result,
            'logs': ret.logstr(),
            'follows': ret.follows,
            'messages': ret.messages,
            'result': ret.result,
            'time': time.time() - start_time,
        }
        # `response` is only guaranteed to be bound on the success path.
        result['fetch_result']['content'] = response.text
        if response.headers.get('content-type', '').startswith('image'):
            result['fetch_result']['dataurl'] = dataurl.encode(
                response.content, response.headers['content-type'])

    try:
        # binary data can't encode to JSON, encode result as unicode obj
        # before send it to frontend
        return _debug_json_response(result)
    except Exception:
        # Serialising the real result failed; report that failure instead.
        return _debug_json_response(
            _debug_failure_result(start_time, _format_current_exception()))
|
163
|
|
|
|
|
164
|
|
|
|
|
165
|
|
|
@app.route('/debug/<project>/save', methods=['POST', ])
def save(project):
    """Store the posted script for `project` and notify the scheduler.

    Returns a 400 response when projectdb rejects the project name, and
    `app.login_response` when the existing project's group contains 'lock'
    and the current user is not active. An existing project gets its script
    updated (and its status set to 'CHECKING' if it was 'DEBUG' or
    'RUNNING'); an unknown project is inserted with status 'TODO'.
    Afterwards, if a scheduler RPC is configured, `update_project()` is
    called on it; a socket error there is logged and answered with
    'rpc error'. Otherwise the response is 'ok'.
    """
    projectdb = app.config['projectdb']
    if not projectdb.verify_project_name(project):
        return 'project name is not allowed!', 400
    script = request.form['script']
    existing = projectdb.get(project, fields=['name', 'status', 'group'])

    if existing:
        # Locked projects may only be changed by an active user.
        is_locked = 'lock' in projectdb.split_group(existing.get('group'))
        if is_locked and not login.current_user.is_active():
            return app.login_response

        changes = {'script': script}
        if existing.get('status') in ('DEBUG', 'RUNNING'):
            changes['status'] = 'CHECKING'
        projectdb.update(project, changes)
    else:
        projectdb.insert(project, {
            'name': project,
            'script': script,
            'status': 'TODO',
            'rate': app.config.get('max_rate', 1),
            'burst': app.config.get('max_burst', 3),
        })

    scheduler_rpc = app.config['scheduler_rpc']
    if scheduler_rpc is None:
        return 'ok', 200

    try:
        scheduler_rpc.update_project()
    except socket.error as err:
        app.logger.warning('connect to scheduler rpc error: %r', err)
        return 'rpc error', 200

    return 'ok', 200
|
202
|
|
|
|
|
203
|
|
|
|
|
204
|
|
|
@app.route('/debug/<project>/get')
def get_script(project):
    """Return the `name` and `script` fields of `project` as a JSON response.

    Answers with a 400 response when projectdb rejects the project name.
    """
    db = app.config['projectdb']
    if db.verify_project_name(project):
        record = db.get(project, fields=['name', 'script'])
        body = json.dumps(utils.unicode_obj(record))
        return body, 200, {'Content-Type': 'application/json'}
    return 'project name is not allowed!', 400
|
212
|
|
|
|
|
213
|
|
|
|
|
214
|
|
|
@app.route('/blank.html')
def blank_html():
    """Respond with an empty body."""
    return ''
|
217
|
|
|
|