@@ 9-26 (lines=18) @@ | ||
6 | from pyspider.libs.base_handler import * |
|
7 | ||
8 | ||
class Handler(BaseHandler):
    """Crawl a start page, follow its absolute links, and report each page's URL and title."""

    # No crawl options are configured for this handler.
    crawl_config = {}

    # Decorated with an interval of 24 * 60 = 1440 minutes
    # (presumably a once-a-day re-run; confirm against the pyspider docs).
    @every(minutes=24 * 60)
    def on_start(self):
        """Queue the start URL and route its response to ``index_page``."""
        start_url = '__START_URL__'
        self.crawl(start_url, callback=self.index_page)

    # age is 10 * 24 * 60 * 60 = 864000
    # (presumably seconds, i.e. ten days; confirm against the pyspider docs).
    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Queue every anchor whose ``href`` starts with ``http`` for ``detail_page``."""
        anchors = response.doc('a[href^="http"]')
        for anchor in anchors.items():
            target = anchor.attr.href
            self.crawl(target, callback=self.detail_page)

    @config(priority=2)
    def detail_page(self, response):
        """Return a dict holding the response URL and the text of its ``title`` element."""
        # Read the URL before the title, keeping the original evaluation order.
        page_url = response.url
        page_title = response.doc('title').text()
        return {"url": page_url, "title": page_title}
|
28 |
@@ 9-26 (lines=18) @@ | ||
6 | from pyspider.libs.base_handler import * |
|
7 | ||
8 | ||
class Handler(BaseHandler):
    """Crawl a local test page, follow its absolute links, and report each page's URL and title."""

    # No crawl options are configured for this handler.
    crawl_config = {}

    # Decorated with an interval of 24 * 60 = 1440 minutes
    # (presumably a once-a-day re-run; confirm against the pyspider docs).
    @every(minutes=24 * 60)
    def on_start(self):
        """Queue the start URL and route its response to ``index_page``."""
        start_url = 'http://127.0.0.1:14887/pyspider/test.html'
        self.crawl(start_url, callback=self.index_page)

    # age is 10 * 24 * 60 * 60 = 864000
    # (presumably seconds, i.e. ten days; confirm against the pyspider docs).
    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Queue every anchor whose ``href`` starts with ``http`` for ``detail_page``."""
        anchors = response.doc('a[href^="http"]')
        for anchor in anchors.items():
            target = anchor.attr.href
            self.crawl(target, callback=self.detail_page)

    @config(priority=2)
    def detail_page(self, response):
        """Return a dict holding the response URL and the text of its ``title`` element."""
        # Read the URL before the title, keeping the original evaluation order.
        page_url = response.url
        page_title = response.doc('title').text()
        return {"url": page_url, "title": page_title}
|
28 |