Complex classes like Crawl often do many different things. To break such a class down, we need to identify cohesive components within it. A common way to find such a component is to look for fields and methods that share the same prefixes or suffixes. You can also look at the cohesion graph to spot unconnected or weakly connected components.
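A quick way to get a first impression of such clusters in Crawl is to bucket its public setters by a shared prefix. The snippet below is only a rough diagnostic sketch (the regular expression and the prefix choices are illustrative assumptions), and it assumes the Crawl class from the listing below is loaded or autoloadable:

<?php

// Rough diagnostic sketch: bucket Crawl's public methods by a shared prefix
// to spot candidate components (setUrl*, setMax*, setPage*). Illustrative
// only; not part of the library.
$groups = [];
foreach (get_class_methods(Crawl::class) as $method) {
    if (preg_match('/^set(Url|Max|Page)/', $method, $match)) {
        $groups[$match[1]][] = $method;
    }
}
print_r($groups);

Based on the listing below, this would cluster setUrlCrawlPatterns, setUrlCrawlRegEx, setUrlProcessPatterns and setUrlProcessRegEx together, and setMaxHops, setMaxToCrawl, setMaxToProcess and setMaxRounds together; both groups are good starting points for a cohesive component.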
Once you have determined which fields belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often faster.
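As a sketch of what Extract Class could look like here, the setMaxHops/setMaxToCrawl/setMaxToProcess settings shown in the listing below could move into a small dedicated class that Crawl delegates to. The CrawlLimits name and its applyTo() helper are hypothetical illustrations, not part of the library:

<?php

// Hypothetical Extract Class sketch: not part of the Diffbot client.
// The setter names and defaults mirror those in the Crawl listing below;
// everything else (class name, applyTo()) is illustrative only.
class CrawlLimits
{
    /** @var int */
    protected $maxHops = -1;

    /** @var int */
    protected $maxToCrawl = 100000;

    /** @var int */
    protected $maxToProcess = 100000;

    public function setMaxHops($input = -1)
    {
        $this->maxHops = (int)$input;
        return $this;
    }

    public function setMaxToCrawl($input = 100000)
    {
        $this->maxToCrawl = (int)$input;
        return $this;
    }

    public function setMaxToProcess($input = 100000)
    {
        $this->maxToProcess = (int)$input;
        return $this;
    }

    /** Merge these limits into the option array that the crawl job sends along. */
    public function applyTo(array $options)
    {
        return array_merge($options, [
            'maxHops'      => $this->maxHops,
            'maxToCrawl'   => $this->maxToCrawl,
            'maxToProcess' => $this->maxToProcess,
        ]);
    }
}

Crawl would then keep thin delegating setters (or expose the component directly), and its URL-building code could ask the component for its options instead of tracking each field itself.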
While breaking up the class, it is also a good idea to analyze how other classes use Crawl and, based on those observations, apply Extract Interface as well.
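For example, if consumers mostly start, pause, resume and delete crawl jobs, Crawl could implement a narrow interface that captures just those operations. The CrawlJobControls interface below is a hypothetical sketch; only the method signatures are taken from the listing that follows:

<?php

// Hypothetical Extract Interface sketch: not part of the Diffbot client.
// Signatures match the job-control methods of the Crawl class shown below.
interface CrawlJobControls
{
    /** @return EntityIterator */
    public function roundStart($commit = true);

    /** @return EntityIterator */
    public function pause($commit = true);

    /** @return EntityIterator */
    public function unpause($commit = true);

    /** @return EntityIterator */
    public function restart($commit = true);

    /** @return EntityIterator */
    public function delete($commit = true);
}

Classes that only manage running jobs could then type-hint against CrawlJobControls rather than the concrete Crawl class, leaving them untouched while Crawl itself is being reorganized.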
<?php

class Crawl
{
    use DiffbotAware;

    /** @var string API URL to which to send the request */
    protected $apiUrl = 'https://api.diffbot.com/v3/crawl';

    /** @var string */
    protected $name;

    /** @var Api Api which should be used to process the pages */
    protected $api;

    /** @var array Options to set while initiating the API call */
    protected $otherOptions = [];

    /** @var array Array of seed URLs to crawl */
    protected $seeds = [];

    /**
     * @see getName
     * @param string|null $name
     * @param null|Api $api
     */
    public function __construct($name = null, Api $api = null)

    /**
     * Returns the unique name of the crawljob
     * This name is later used to download datasets, or to modify the job
     *
     * @return string
     */
    public function getName()

    /**
     * API which should be used to process the pages
     *
     * Accepts a fully formed instance of any other API. Will use it to build
     * and auto-encode the URL. To satisfy the required $url param of the API
     * classes, use the string 'crawl' which prepares the API for Crawlbot
     * consumption internally.
     *
     * @see https://www.diffbot.com/dev/docs/crawl/api.jsp ApiUrl docs
     * @param Api $api
     * @return $this
     */
    public function setApi(Api $api)

    /**
     * An array of URLs (seeds) which to crawl for matching links
     *
     * By default Crawlbot will restrict spidering to the entire domain
     * ("http://blog.diffbot.com" will include URLs at "http://www.diffbot.com").
     *
     * @param array $seeds
     * @return $this
     */
    public function setSeeds(array $seeds)

    /**
     * Array of strings to limit pages crawled to those whose URLs
     * contain any of the content strings.
     *
     * You can use the exclamation point to specify a negative string, e.g.
     * !product to exclude URLs containing the string "product," and the ^ and
     * $ characters to limit matches to the beginning or end of the URL.
     *
     * The use of a urlCrawlPattern will allow Crawlbot to spider outside of
     * the seed domain; it will follow all matching URLs regardless of domain.
     *
     * @param array $pattern
     * @return $this
     */
    public function setUrlCrawlPatterns(array $pattern = null)

    /**
     * Specify a regular expression to limit pages crawled to those URLs that
     * match your expression. This will override any urlCrawlPattern value.
     *
     * The use of a urlCrawlRegEx will allow Crawlbot to spider outside of the
     * seed domain; it will follow all matching URLs regardless of domain.
     *
     * @param string $regex
     * @return $this
     */
    public function setUrlCrawlRegEx($regex)

    /**
     * Specify ||-separated strings to limit pages processed to those whose
     * URLs contain any of the content strings.
     *
     * You can use the exclamation point to specify a negative string, e.g.
     * !/category to exclude URLs containing the string "/category," and the ^
     * and $ characters to limit matches to the beginning or end of the URL.
     *
     * @param array $pattern
     * @return $this
     */
    public function setUrlProcessPatterns(array $pattern = null)

    /**
     * Specify a regular expression to limit pages processed to those URLs that
     * match your expression. This will override any urlProcessPattern value.
     *
     * @param string $regex
     * @return $this
     */
    public function setUrlProcessRegEx($regex)

    /**
     * Specify ||-separated strings to limit pages processed to those whose
     * HTML contains any of the content strings.
     *
     * @param array $pattern
     * @return $this
     */
    public function setPageProcessPatterns(array $pattern)

    /**
     * Specify the depth of your crawl. A maxHops=0 will limit processing to
     * the seed URL(s) only -- no other links will be processed; maxHops=1 will
     * process all (otherwise matching) pages whose links appear on seed URL(s);
     * maxHops=2 will process pages whose links appear on those pages; and so on
     *
     * By default, Crawlbot will crawl and process links at any depth.
     *
     * @param int $input
     * @return $this
     */
    public function setMaxHops($input = -1)

    /**
     * Specify max pages to spider. Default: 100,000.
     *
     * @param int $input
     * @return $this
     */
    public function setMaxToCrawl($input = 100000)

    /**
     * Specify max pages to process through Diffbot APIs. Default: 100,000.
     *
     * @param int $input
     * @return $this
     */
    public function setMaxToProcess($input = 100000)

    /**
     * If input is an email address, send a message to this email address when
     * the crawl hits the maxToCrawl or maxToProcess limit, or when the crawl
     * completes.
     *
     * If input is a URL, you will receive a POST with X-Crawl-Name and
     * X-Crawl-Status in the headers, and the full JSON response in the
     * POST body.
     *
     * @param string $string
     * @return $this
     * @throws InvalidArgumentException
     */
    public function notify($string)

    /**
     * Wait this many seconds between each URL crawled from a single IP address.
     * Specify the number of seconds as an integer or floating-point number.
     *
     * @param float $input
     * @return $this
     * @throws InvalidArgumentException
     */
    public function setCrawlDelay($input = 0.25)

    /**
     * Specify the number of days as a floating-point (e.g. repeat=7.0) to
     * repeat this crawl. By default crawls will not be repeated.
     *
     * @param int|float $input
     * @return $this
     * @throws \InvalidArgumentException
     */
    public function setRepeat($input)

    /**
     * By default repeat crawls will only process new (previously unprocessed)
     * pages. Set to 0 to process all content on repeat crawls.
     *
     * @param int $int
     * @return $this
     */
    public function setOnlyProcessIfNew($int = 1)

    /**
     * Specify the maximum number of crawl repeats. By default (maxRounds=0)
     * repeating crawls will continue indefinitely.
     *
     * @param int $input
     * @return $this
     */
    public function setMaxRounds($input = 0)

    /**
     * Ignores robots.txt if set to 0/false
     *
     * @param bool $bool
     * @return $this
     */
    public function setObeyRobots($bool = true)

    /**
     * Set value to 1 to force the use of proxy IPs for the crawl.
     *
     * @param bool $bool
     * @return $this
     */
    public function setUseProxies($bool = true)

    /**
     * Force the start of a new crawl "round" (manually repeat the crawl).
     * If onlyProcessIfNew is set to 1 (default), only newly-created pages will
     * be processed.
     *
     * @param bool $commit
     * @return EntityIterator
     * @throws DiffbotException
     */
    public function roundStart($commit = true)

    /**
     * Pause a crawl.
     *
     * @param bool $commit
     * @return EntityIterator
     * @throws DiffbotException
     */
    public function pause($commit = true)

    /**
     * Unpause (resume) a paused crawl.
     *
     * @param bool $commit
     * @return EntityIterator
     * @throws DiffbotException
     */
    public function unpause($commit = true)

    /**
     * Restart removes all crawled data while maintaining crawl settings.
     *
     * @param bool $commit
     * @return EntityIterator
     * @throws DiffbotException
     */
    public function restart($commit = true)

    /**
     * Delete a crawl, and all associated data, completely.
     *
     * @param bool $commit
     * @return EntityIterator
     * @throws DiffbotException
     */
    public function delete($commit = true)

    public function call()

    /**
     * Builds out the URL string that gets requested once `call()` is called
     *
     * @return string
     */
    public function buildUrl()

    /**
     * Sets the request type to "urls" to retrieve the URL Report
     * URL for understanding diagnostic data of URLs
     *
     * @return $this
     */
    public function getUrlReportUrl($num = null)

    /**
     * @return string
     */
    protected function getApiString()
}