#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<[email protected]>
# http://binux.me
# Created on 2014-12-08 22:23:10

import time
import logging
logger = logging.getLogger('bench')

from six.moves import queue as Queue
from pyspider.scheduler import ThreadBaseScheduler as Scheduler
from pyspider.fetcher.tornado_fetcher import Fetcher
from pyspider.processor import Processor
from pyspider.result import ResultWorker
from pyspider.libs.utils import md5string


def bench_test_taskdb(taskdb):
    project_name = '__bench_test__'
    task = {
        "fetch": {
            "fetch_type": "js",
            "headers": {
                "User-Agent": "BaiDuSpider"
            }
        },
        "process": {
            "callback": "detail_page"
        },
        "project": project_name,
        "taskid": "553300d2582154413b4982c00c34a2d5",
        "url": "http://www.sciencedirect.com/science/article/pii/S1674200109000704"
    }

    track = {
        "fetch": {
            "content": None,
            "encoding": "unicode",
            "error": None,
            "headers": {
                "last-modified": "Wed, 04 Mar 2015 09:24:33 GMT"
            },
            "ok": True,
            "redirect_url": None,
            "status_code": 200,
            "time": 5.543
        },
        "process": {
            "exception": None,
            "follows": 4,
            "logs": "",
            "ok": True,
            "result": "{'url': u'",
            "time": 0.07105398178100586
        }
    }

    def test_insert(n, start=0):
        logger.info("taskdb insert %d", n)
        start_time = time.time()
        for i in range(n):
            task['url'] = 'http://bench.pyspider.org/?l=%d' % (i + start)
            task['taskid'] = md5string(task['url'])
            task['track'] = {}
            taskdb.insert(task['project'], task['taskid'], task)
        end_time = time.time()
        cost_time = end_time - start_time
        logger.info("cost %.2fs, %.2f/s %.2fms",
                    cost_time, n * 1.0 / cost_time, cost_time / n * 1000)

    def test_update(n, start=0):
        logger.info("taskdb update %d", n)
        start_time = time.time()
        for i in range(n):
            task['url'] = 'http://bench.pyspider.org/?l=%d' % (i + start)
            task['taskid'] = md5string(task['url'])
            task['track'] = track
            taskdb.update(task['project'], task['taskid'], task)
        end_time = time.time()
        cost_time = end_time - start_time
        logger.info("cost %.2fs, %.2f/s %.2fms",
                    cost_time, n * 1.0 / cost_time, cost_time / n * 1000)

    request_task_fields = [
        'taskid',
        'project',
        'url',
        'status',
        'fetch',
        'process',
        'track',
        'lastcrawltime'
    ]

    def test_get(n, start=0, random=True, fields=request_task_fields):
        logger.info("taskdb get %d %s", n, "randomly" if random else "")
        range_n = list(range(n))
        if random:
            from random import shuffle
            shuffle(range_n)
        start_time = time.time()
        for i in range_n:
            task['url'] = 'http://bench.pyspider.org/?l=%d' % (i + start)
            task['taskid'] = md5string(task['url'])
            task['track'] = track
            taskdb.get_task(task['project'], task['taskid'], fields=fields)
        end_time = time.time()
        cost_time = end_time - start_time
        logger.info("cost %.2fs, %.2f/s %.2fms",
                    cost_time, n * 1.0 / cost_time, cost_time / n * 1000)

    try:
        test_insert(1000)
        test_update(1000)
        test_get(1000)
        test_insert(10000, 1000)
        test_update(10000, 1000)
        test_get(10000, 1000)
    except Exception as e:
        logger.exception(e)
    finally:
        taskdb.drop(project_name)
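

# A usage sketch, not part of the original module: bench_test_taskdb only
# relies on the insert/update/get_task/drop interface, so it can be pointed
# at any taskdb backend. The connection URL below is an assumption for a
# quick local run against an in-memory sqlite taskdb:
#
#     from pyspider.database import connect_database
#     bench_test_taskdb(connect_database('sqlite+taskdb://'))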


def bench_test_message_queue(queue):
    task = {
        "fetch": {
            "fetch_type": "js",
            "headers": {
                "User-Agent": "BaiDuSpider"
            }
        },
        "process": {
            "callback": "detail_page"
        },
        "project": "__bench_test__",
        "taskid": "553300d2582154413b4982c00c34a2d5",
        "url": "http://www.sciencedirect.com/science/article/pii/S1674200109000704"
    }

    def test_put(n):
        logger.info("message queue put %d", n)
        start_time = time.time()
        for i in range(n):
            task['url'] = 'http://bench.pyspider.org/?l=%d' % i
            task['taskid'] = md5string(task['url'])
            queue.put(task, block=True, timeout=1)
        end_time = time.time()
        cost_time = end_time - start_time
        logger.info("cost %.2fs, %.2f/s %.2fms",
                    cost_time, n * 1.0 / cost_time, cost_time / n * 1000)

    def test_get(n):
        logger.info("message queue get %d", n)
        start_time = time.time()
        for i in range(n):
            try:
                queue.get(True, 1)
            except Queue.Empty:
                logger.error('message queue empty while get %d', i)
                raise
        end_time = time.time()
        cost_time = end_time - start_time
        logger.info("cost %.2fs, %.2f/s %.2fms",
                    cost_time, n * 1.0 / cost_time, cost_time / n * 1000)

    try:
        test_put(1000)
        test_get(1000)
        test_put(10000)
        test_get(10000)
    except Exception as e:
        logger.exception(e)
    finally:
        if hasattr(queue, 'channel'):
            queue.channel.queue_purge(queue.name)

        # clear message queue
        try:
            while queue.get(False):
                continue
        except Queue.Empty:
            pass
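

# A usage sketch, not part of the original module: the benchmark only needs
# the put/get subset of the Queue interface, so an in-process standard
# library queue (already imported above via six.moves) is enough for a
# quick smoke test:
#
#     bench_test_message_queue(Queue.Queue())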


class BenchMixin(object):
    """Report to logger for bench test"""
    def _bench_init(self):
        self.done_cnt = 0
        self.start_time = time.time()
        self.last_cnt = 0
        self.last_report = 0

    def _bench_report(self, name, prefix=0, rjust=0):
        self.done_cnt += 1
        now = time.time()
        # throttle: emit at most one log line per second
        if now - self.last_report >= 1:
            rps = float(self.done_cnt - self.last_cnt) / (now - self.last_report)
            output = ''
            if prefix:
                output += " " * prefix
            output += ("%s %s pages (at %d pages/min)" % (
                name, self.done_cnt, rps * 60.0)).rjust(rjust)
            logger.info(output)
            self.last_cnt = self.done_cnt
            self.last_report = now
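

# A report line rendered by _bench_report looks like the following (derived
# from the format string above; the counts are illustrative only):
#
#     Crawled 1000 pages (at 600 pages/min)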


class BenchScheduler(Scheduler, BenchMixin):
    def __init__(self, *args, **kwargs):
        super(BenchScheduler, self).__init__(*args, **kwargs)
        self._bench_init()
        self.trigger_on_start('__bench_test__')

    def on_task_status(self, task):
        self._bench_report('Crawled')
        return super(BenchScheduler, self).on_task_status(task)


class BenchFetcher(Fetcher, BenchMixin):
    def __init__(self, *args, **kwargs):
        super(BenchFetcher, self).__init__(*args, **kwargs)
        self._bench_init()

    def on_result(self, type, task, result):
        self._bench_report("Fetched", 0, 75)
        return super(BenchFetcher, self).on_result(type, task, result)


class BenchProcessor(Processor, BenchMixin):
    def __init__(self, *args, **kwargs):
        super(BenchProcessor, self).__init__(*args, **kwargs)
        self._bench_init()

    def on_task(self, task, response):
        self._bench_report("Processed", 75)
        return super(BenchProcessor, self).on_task(task, response)


class BenchResultWorker(ResultWorker, BenchMixin):
    def __init__(self, *args, **kwargs):
        super(BenchResultWorker, self).__init__(*args, **kwargs)
        self._bench_init()

    def on_result(self, task, result):
        self._bench_report("Saved", 0, 150)
        super(BenchResultWorker, self).on_result(task, result)
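

# Note (inferred from the prefix/rjust values passed above, not stated in the
# original source): when all four components log to one console, 'Crawled',
# 'Fetched'/'Processed', and 'Saved' land in separate columns, keeping the
# combined bench output readable.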


bench_script = '''
from pyspider.libs.base_handler import *

class Handler(BaseHandler):
    def on_start(self):
        self.crawl('http://127.0.0.1:5000/bench',
                   params={'total': %(total)d, 'show': %(show)d},
                   callback=self.index_page)

    def index_page(self, response):
        for each in response.doc('a[href^="http://"]').items():
            self.crawl(each.attr.href, callback=self.index_page)
        return response.url
'''
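
# bench_script is a %%-format template: a runner is expected to substitute
# the counters before registering the project, e.g. (values illustrative):
#
#     script = bench_script % {'total': 10000, 'show': 20}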