#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<[email protected]>
# http://binux.me
# Created on 2014-11-21 22:32:35

from __future__ import print_function

import os
import sys
import six
import time
import json
import signal
import shutil
import inspect
import requests
import unittest2 as unittest

from pyspider import run
from pyspider.libs import utils
from tests import data_sample_handler

class TestRun(unittest.TestCase):

    @classmethod
    def setUpClass(self):
        shutil.rmtree('./data/tests', ignore_errors=True)
        os.makedirs('./data/tests')

        import tests.data_test_webpage
        import httpbin
        self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887, passthrough_errors=False)
        self.httpbin = 'http://127.0.0.1:14887'

    @classmethod
    def tearDownClass(self):
        self.httpbin_thread.terminate()
        self.httpbin_thread.join()

        assert not utils.check_port_open(5000)
        assert not utils.check_port_open(23333)
        assert not utils.check_port_open(24444)
        assert not utils.check_port_open(25555)
        assert not utils.check_port_open(14887)

        shutil.rmtree('./data/tests', ignore_errors=True)

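    # With no arguments, the CLI should wire up default databases and message
    # queues in testing mode without starting any components.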
    def test_10_cli(self):
        ctx = run.cli.make_context('test', [], None, obj=dict(testing_mode=True))
        ctx = run.cli.invoke(ctx)
        self.assertEqual(ctx.obj.debug, False)
        for db in ('taskdb', 'projectdb', 'resultdb'):
            self.assertIsNotNone(getattr(ctx.obj, db))
        for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher',
                     'fetcher2processor', 'processor2result'):
            self.assertIsNotNone(getattr(ctx.obj, name))
        self.assertEqual(len(ctx.obj.instances), 0)

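    # Settings from a JSON config file should override the defaults; the
    # unreachable MySQL/AMQP endpoints only fail once the lazy ctx.obj
    # properties are actually accessed.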
    def test_20_cli_config(self):
        with open('./data/tests/config.json', 'w') as fp:
            json.dump({
                'debug': True,
                'taskdb': 'mysql+taskdb://localhost:23456/taskdb',
                'amqp-url': 'amqp://guest:guest@localhost:23456/%%2F'
            }, fp)
        ctx = run.cli.make_context('test',
                                   ['--config', './data/tests/config.json'],
                                   None, obj=dict(testing_mode=True))
        ctx = run.cli.invoke(ctx)
        self.assertEqual(ctx.obj.debug, True)

        import mysql.connector
        with self.assertRaises(mysql.connector.Error):
            ctx.obj.taskdb

        with self.assertRaises(Exception):
            ctx.obj.newtask_queue

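    # Database URLs given on the command line should be honored; connecting
    # to the unreachable MongoDB is expected to fail.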
    def test_30_cli_command_line(self):
        ctx = run.cli.make_context(
            'test',
            ['--projectdb', 'mongodb+projectdb://localhost:23456/projectdb'],
            None,
            obj=dict(testing_mode=True)
        )
        ctx = run.cli.invoke(ctx)

        from pymongo.errors import ConnectionFailure
        with self.assertRaises(ConnectionFailure):
            ctx.obj.projectdb

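    # Database URLs can also be supplied through environment variables.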
    def test_40_cli_env(self):
        try:
            os.environ['RESULTDB'] = 'sqlite+resultdb://'
            ctx = run.cli.make_context('test', [], None,
                                       obj=dict(testing_mode=True))
            ctx = run.cli.invoke(ctx)

            from pyspider.database.sqlite import resultdb
            self.assertIsInstance(ctx.obj.resultdb, resultdb.ResultDB)
        finally:
            del os.environ['RESULTDB']

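    # Docker link-style environment variables should redirect the message
    # queues to the linked RabbitMQ server.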
    @unittest.skipIf(os.environ.get('IGNORE_RABBITMQ') or os.environ.get('IGNORE_ALL'), 'no rabbitmq server for test.')
    def test_50_docker_rabbitmq(self):
        try:
            os.environ['RABBITMQ_NAME'] = 'rabbitmq'
            os.environ['RABBITMQ_PORT_5672_TCP_ADDR'] = 'localhost'
            os.environ['RABBITMQ_PORT_5672_TCP_PORT'] = '5672'
            ctx = run.cli.make_context('test', [], None,
                                       obj=dict(testing_mode=True))
            ctx = run.cli.invoke(ctx)
            queue = ctx.obj.newtask_queue
            queue.put('abc')
            queue.delete()
        except Exception as e:
            self.assertIsNone(e)
        finally:
            del os.environ['RABBITMQ_NAME']
            del os.environ['RABBITMQ_PORT_5672_TCP_ADDR']
            del os.environ['RABBITMQ_PORT_5672_TCP_PORT']

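    # Same pattern for a linked MongoDB container: resultdb should be built
    # against it without raising.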
    @unittest.skipIf(os.environ.get('IGNORE_MONGODB') or os.environ.get('IGNORE_ALL'), 'no mongodb server for test.')
    def test_60_docker_mongodb(self):
        try:
            os.environ['MONGODB_NAME'] = 'mongodb'
            os.environ['MONGODB_PORT_27017_TCP_ADDR'] = 'localhost'
            os.environ['MONGODB_PORT_27017_TCP_PORT'] = '27017'
            ctx = run.cli.make_context('test', [], None,
                                       obj=dict(testing_mode=True))
            ctx = run.cli.invoke(ctx)
            ctx.obj.resultdb
        except Exception as e:
            self.assertIsNone(e)
        finally:
            del os.environ['MONGODB_NAME']
            del os.environ['MONGODB_PORT_27017_TCP_ADDR']
            del os.environ['MONGODB_PORT_27017_TCP_PORT']

    @unittest.skip('only available in docker')
    @unittest.skipIf(os.environ.get('IGNORE_MYSQL') or os.environ.get('IGNORE_ALL'), 'no mysql server for test.')
    def test_70_docker_mysql(self):
        try:
            os.environ['MYSQL_NAME'] = 'mysql'
            os.environ['MYSQL_PORT_3306_TCP_ADDR'] = 'localhost'
            os.environ['MYSQL_PORT_3306_TCP_PORT'] = '3306'
            ctx = run.cli.make_context('test', [], None,
                                       obj=dict(testing_mode=True))
            ctx = run.cli.invoke(ctx)
            ctx.obj.resultdb
        except Exception as e:
            self.assertIsNone(e)
        finally:
            del os.environ['MYSQL_NAME']
            del os.environ['MYSQL_PORT_3306_TCP_ADDR']
            del os.environ['MYSQL_PORT_3306_TCP_PORT']

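    # A linked phantomjs container should set phantomjs_proxy from its
    # *_PORT_25555_TCP address.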
    def test_80_docker_phantomjs(self):
        try:
            os.environ['PHANTOMJS_NAME'] = 'phantomjs'
            os.environ['PHANTOMJS_PORT_25555_TCP'] = 'tcp://binux:25678'
            ctx = run.cli.make_context('test', [], None,
                                       obj=dict(testing_mode=True))
            ctx = run.cli.invoke(ctx)
            self.assertEqual(ctx.obj.phantomjs_proxy, 'binux:25678')
        except Exception as e:
            self.assertIsNone(e)
        finally:
            del os.environ['PHANTOMJS_NAME']
            del os.environ['PHANTOMJS_PORT_25555_TCP']

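    # A linked scheduler container should become the webui's scheduler
    # XML-RPC endpoint.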
    def test_90_docker_scheduler(self):
        try:
            os.environ['SCHEDULER_NAME'] = 'scheduler'
            os.environ['SCHEDULER_PORT_23333_TCP'] = 'tcp://binux:25678'
            ctx = run.cli.make_context('test', [], None,
                                       obj=dict(testing_mode=True))
            ctx = run.cli.invoke(ctx)
            webui = run.cli.get_command(ctx, 'webui')
            webui_ctx = webui.make_context('webui', [], ctx)
            app = webui.invoke(webui_ctx)
            rpc = app.config['scheduler_rpc']
            self.assertEqual(rpc._ServerProxy__host, 'binux:25678')
        except Exception as e:
            self.assertIsNone(e)
        finally:
            del os.environ['SCHEDULER_NAME']
            del os.environ['SCHEDULER_PORT_23333_TCP']

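    # End-to-end test: run `pyspider all` in a subprocess (under coverage),
    # poll the webui on port 5000, trigger a run of data_sample_handler and
    # wait until results show up.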
    def test_a100_all(self):
        import subprocess
        #cmd = [sys.executable]
        cmd = ['coverage', 'run']
        p = subprocess.Popen(cmd+[
            inspect.getsourcefile(run),
            '--taskdb', 'sqlite+taskdb:///data/tests/all_test_task.db',
            '--resultdb', 'sqlite+resultdb:///data/tests/all_test_result.db',
            '--projectdb', 'local+projectdb://'+inspect.getsourcefile(data_sample_handler),
            'all',
        ], close_fds=True, preexec_fn=os.setsid)

        try:
            limit = 30
            while limit >= 0:
                time.sleep(3)
                # click run
                try:
                    requests.post('http://localhost:5000/run', data={
                        'project': 'data_sample_handler',
                    })
                except requests.exceptions.ConnectionError:
                    limit -= 1
                    continue
                break

            limit = 30
            data = requests.get('http://localhost:5000/counter')
            self.assertEqual(data.status_code, 200)
            while data.json().get('data_sample_handler', {}).get('5m', {}).get('success', 0) < 5:
                time.sleep(1)
                data = requests.get('http://localhost:5000/counter')
                limit -= 1
                if limit <= 0:
                    break

            self.assertGreater(limit, 0)
            rv = requests.get('http://localhost:5000/results?project=data_sample_handler')
            self.assertIn('<th>url</th>', rv.text)
            self.assertIn('class=url', rv.text)
        except:
            raise
        finally:
            time.sleep(1)
            os.killpg(p.pid, signal.SIGTERM)
            p.wait()

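    # Run `pyspider one` interactively through a pseudo-terminal and drive
    # the debug shell by writing commands to the child's fd.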
    def test_a110_one(self):
        pid, fd = os.forkpty()
        #cmd = [sys.executable]
        cmd = ['coverage', 'run']
        cmd += [
            inspect.getsourcefile(run),
            'one',
            '-i',
            inspect.getsourcefile(data_sample_handler)
        ]

        if pid == 0:
            # child
            os.execvp(cmd[0], cmd)
        else:
            # parent
            def wait_text(timeout=1):
                import select
                text = []
                while True:
                    rl, wl, xl = select.select([fd], [], [], timeout)
                    if not rl:
                        break
                    try:
                        t = os.read(fd, 1024)
                    except OSError:
                        break
                    if not t:
                        break
                    t = utils.text(t)
                    text.append(t)
                    print(t, end='')
                return ''.join(text)

            text = wait_text(3)
            self.assertIn('new task data_sample_handler:on_start', text)
            self.assertIn('pyspider shell', text)

            os.write(fd, utils.utf8('run()\n'))
            text = wait_text()
            self.assertIn('task done data_sample_handler:on_start', text)

            os.write(fd, utils.utf8('crawl("%s/pyspider/test.html")\n' % self.httpbin))
            text = wait_text()
            self.assertIn('/robots.txt', text)

            os.write(fd, utils.utf8('crawl("%s/links/10/0")\n' % self.httpbin))
            text = wait_text()
            if '"title": "Links"' not in text:
                os.write(fd, utils.utf8('crawl("%s/links/10/1")\n' % self.httpbin))
                text = wait_text()
            self.assertIn('"title": "Links"', text)

            os.write(fd, utils.utf8('crawl("%s/404")\n' % self.httpbin))
            text = wait_text()
            self.assertIn('task retry', text)

            os.write(fd, b'quit_pyspider()\n')
            text = wait_text()
            self.assertIn('scheduler exiting...', text)
            os.close(fd)
            os.kill(pid, signal.SIGINT)

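# Tests for the `send_message` subcommand against a live scheduler backed by
# sqlite databases.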
class TestSendMessage(unittest.TestCase):

    @classmethod
    def setUpClass(self):
        shutil.rmtree('./data/tests', ignore_errors=True)
        os.makedirs('./data/tests')

        ctx = run.cli.make_context('test', [
            '--taskdb', 'sqlite+taskdb:///data/tests/task.db',
            '--projectdb', 'sqlite+projectdb:///data/tests/projectdb.db',
            '--resultdb', 'sqlite+resultdb:///data/tests/resultdb.db',
        ], None, obj=dict(testing_mode=True))
        self.ctx = run.cli.invoke(ctx)

        ctx = run.scheduler.make_context('scheduler', [], self.ctx)
        scheduler = run.scheduler.invoke(ctx)
        self.xmlrpc_thread = utils.run_in_thread(scheduler.xmlrpc_run)
        self.scheduler_thread = utils.run_in_thread(scheduler.run)

        time.sleep(1)

    @classmethod
    def tearDownClass(self):
        for each in self.ctx.obj.instances:
            each.quit()
        self.xmlrpc_thread.join()
        self.scheduler_thread.join()
        time.sleep(1)

        assert not utils.check_port_open(5000)
        assert not utils.check_port_open(23333)
        assert not utils.check_port_open(24444)
        assert not utils.check_port_open(25555)

        shutil.rmtree('./data/tests', ignore_errors=True)

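    # send_message should be accepted by the scheduler and produce an
    # on_message task with the _on_message callback.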
    def test_10_send_message(self):
        ctx = run.send_message.make_context('send_message', [
            'test_project', 'test_message'
        ], self.ctx)
        self.assertTrue(run.send_message.invoke(ctx))
        while True:
            task = self.ctx.obj.scheduler2fetcher.get(timeout=1)
            if task['url'] == 'data:,on_message':
                break
        self.assertEqual(task['process']['callback'], '_on_message')