1 | <?php |
||
22 | class Crawler |
||
23 | { |
||
24 | |||
25 | /** |
||
26 | * @var Url Base URL; same name and type as the first constructor parameter. |
||
27 | */ |
||
28 | private $baseUrl; |
||
29 | |||
30 | /** |
||
31 | * @var LinkParser Link parser; same name and type as the second constructor parameter. |
||
32 | */ |
||
33 | private $parser; |
||
34 | |||
35 | /** |
||
36 | * @var ClientInterface HTTP client; same name and type as the third constructor parameter. |
||
37 | */ |
||
38 | private $httpClient; |
||
39 | |||
40 | /** |
||
41 | * @var array Crawler policies; initialised to an empty array. NOTE(review): presumably keyed by the $key given to setPolicy() - confirm. |
||
42 | */ |
||
43 | private $policies = []; |
||
44 | |||
45 | /** |
||
46 | * @var array Crawler collectors; initialised to an empty array. NOTE(review): presumably keyed by the $key given to setCollector() - confirm. |
||
47 | */ |
||
48 | private $collectors = []; |
||
49 | |||
50 | /** |
||
51 | * Create a crawler from a base URL, a link parser and an HTTP client. |
||
52 | * |
||
53 | * @param Url $baseUrl Base URL for the crawler. |
||
54 | * @param LinkParser $parser Link parser for the crawler. |
||
55 | * @param ClientInterface $httpClient HTTP client for the crawler. |
||
56 | */ |
||
57 | public function __construct(Url $baseUrl, LinkParser $parser, ClientInterface $httpClient) |
||
63 | |||
64 | /** |
||
65 | * Add a new crawler policy under the given key. |
||
66 | * |
||
67 | * @param mixed $key Key for the policy (no type is declared in the signature). |
||
68 | * @param Policy $policy Policy to add. |
||
69 | */ |
||
70 | public function setPolicy($key, Policy $policy) |
||
74 | |||
75 | /** |
||
76 | * Set the crawler policies used to follow the URLs |
||
77 | * of a webpage. |
||
78 | * |
||
79 | * @param array $policies Policies to set. NOTE(review): presumably key => Policy pairs as accepted by setPolicy() - confirm. |
||
80 | */ |
||
81 | public function setPolicies(array $policies) |
||
91 | |||
92 | /** |
||
93 | * Set a crawler collector under the given key. |
||
94 | * |
||
95 | * @param mixed $key Key for the collector (no type is declared in the signature). |
||
96 | * @param Collector $collector Collector to set. |
||
97 | */ |
||
98 | public function setCollector($key, Collector $collector) |
||
102 | |||
103 | /** |
||
104 | * Return the crawler collector previously set under the given key. |
||
105 | * |
||
106 | * @param mixed $key Key the collector was set under (no type is declared in the signature). |
||
107 | * @return Collector|null The collector, or null. NOTE(review): presumably null when nothing is set for $key - confirm. |
||
108 | */ |
||
109 | public function getCollector($key) |
||
115 | |||
116 | /** |
||
117 | * Set the crawler collectors from an array. |
||
118 | * |
||
119 | * @param array $collectors Collectors to set. NOTE(review): presumably key => Collector pairs as accepted by setCollector() - confirm. |
||
120 | */ |
||
121 | public function setCollectors(array $collectors) |
||
131 | |||
132 | /** |
||
133 | * Tell whether the URL passed as argument should |
||
134 | * be visited by the crawler, based upon the policies. |
||
135 | * |
||
136 | * @param Url $url URL to check. |
||
137 | * @return bool True if the URL should be visited, false otherwise. |
||
138 | */ |
||
139 | public function shouldVisit(Url $url) |
||
149 | |||
150 | /** |
||
151 | * Collect the data based on the added collector rules. |
||
152 | * |
||
153 | * @param Url $url NOTE(review): presumably the URL that $content was fetched from - confirm. |
||
154 | * @param mixed $content Content to collect from (no type is declared in the signature). |
||
155 | */ |
||
156 | public function shouldCollect(Url $url, $content) |
||
164 | |||
165 | /** |
||
166 | * Visit a webpage and collect its data. |
||
167 | * |
||
168 | * @TODO handle the exception |
||
169 | * @param HttpResource $httpResource Resource to visit. |
||
170 | * @return array NOTE(review): contents are undocumented; presumably the links found on the visited page - confirm. |
||
171 | */ |
||
172 | private function visitAndCollect(HttpResource $httpResource) |
||
187 | |||
188 | /** |
||
189 | * Return the array of URLs visited by the crawler, |
||
190 | * based upon the specified scan depth and the policies. |
||
191 | * |
||
192 | * @param int $maxDeep Depth limit of the scan, 1 by default (untyped in the signature; int assumed from the default). |
||
193 | * @return array|mixed The visited URLs. |
||
194 | */ |
||
195 | public function crawl($maxDeep = 1) |
||
218 | |||
219 | /** |
||
220 | * @param array $links Links to process; defaults to an empty array. |
||
221 | * @return array NOTE(review): presumably the URLs built from $links - confirm. |
||
222 | */ |
||
223 | protected function getUrlArray(array $links = array()) |
||
229 | } |