Complex classes like CrawlerApi often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use CrawlerApi, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
43 | class CrawlerApi |
||
44 | { |
||
45 | /** |
||
46 | * @var CrawlerController |
||
47 | */ |
||
48 | private $crawlerController; |
||
49 | |||
50 | /** |
||
51 | * @var QueueRepository |
||
52 | */ |
||
53 | protected $queueRepository; |
||
54 | |||
55 | /** |
||
56 | * @var array $allowedConfigurations |
||
57 | */ |
||
58 | protected $allowedConfigurations = []; |
||
59 | |||
60 | /** |
||
61 | * @var QueryBuilder |
||
62 | */ |
||
63 | protected $queryBuilder; |
||
64 | |||
65 | /** |
||
66 | * @var string |
||
67 | */ |
||
68 | protected $tableName = 'tx_crawler_queue'; |
||
69 | |||
70 | 11 | public function __construct() |
|
75 | |||
76 | /** |
||
77 | * Each crawler run has a setid, this facade method delegates |
||
78 | * it to the crawler object |
||
79 | * |
||
80 | * @param int $id |
||
81 | */ |
||
82 | 1 | public function overwriteSetId($id) |
|
86 | |||
87 | /** |
||
88 | * This method is used to limit the configuration selection to |
||
89 | * a set of configurations. |
||
90 | * |
||
91 | * @param array $allowedConfigurations |
||
92 | */ |
||
93 | 1 | public function setAllowedConfigurations(array $allowedConfigurations) |
|
97 | |||
98 | /** |
||
99 | * @return array |
||
100 | */ |
||
101 | 1 | public function getAllowedConfigurations() |
|
105 | |||
106 | /** |
||
107 | * Returns the setID of the crawler |
||
108 | * |
||
109 | * @return int |
||
110 | */ |
||
111 | 1 | public function getSetId() |
|
115 | |||
116 | /** |
||
117 | * Method to get an instance of the internal crawler singleton |
||
118 | * |
||
119 | * @return CrawlerController Instance of the crawler lib |
||
120 | * |
||
121 | * @throws \Exception |
||
122 | */ |
||
123 | 2 | protected function findCrawler() |
|
136 | |||
137 | /** |
||
138 | * Adds a page to the crawlerqueue by uid |
||
139 | * |
||
140 | * @param int $uid uid |
||
141 | */ |
||
142 | public function addPageToQueue($uid) |
||
148 | |||
149 | /** |
||
150 | * This method is used to limit the processing instructions to the processing instructions |
||
151 | * that are allowed. |
||
152 | * |
||
153 | * @return array |
||
154 | */ |
||
155 | 2 | protected function filterUnallowedConfigurations($configurations) |
|
168 | |||
169 | /** |
||
170 | * Adds a page to the crawlerqueue by uid and sets a |
||
171 | * timestamp when the page should be crawled. |
||
172 | * |
||
173 | * @param int $uid pageid |
||
174 | * @param int $time timestamp |
||
175 | */ |
||
176 | 2 | public function addPageToQueueTimed($uid, $time) |
|
211 | |||
212 | /** |
||
213 | * Counts all entries in the database which are scheduled for a given page id and a schedule timestamp. |
||
214 | * |
||
215 | * @param int $page_uid |
||
216 | * @param int $schedule_timestamp |
||
217 | * |
||
218 | * @return int |
||
219 | */ |
||
220 | 1 | protected function countEntriesInQueueForPageByScheduleTime($page_uid, $schedule_timestamp) |
|
248 | |||
249 | /** |
||
250 | * Determines if a page is queued |
||
251 | * |
||
252 | * @param $uid |
||
253 | * @param bool $unprocessed_only |
||
254 | * @param bool $timed_only |
||
255 | * @param bool $timestamp |
||
256 | * |
||
257 | * @return bool |
||
258 | * |
||
259 | * @deprecated since crawler v7.0.0, will be removed in crawler v8.0.0. |
||
260 | */ |
||
261 | public function isPageInQueue($uid, $unprocessed_only = true, $timed_only = false, $timestamp = false) |
||
297 | |||
298 | /** |
||
299 | * Method to return the latest crawl timestamp for a page. |
||
300 | * |
||
301 | * @param int $uid uid id of the page |
||
302 | * @param bool $future_crawldates_only |
||
303 | * @param bool $unprocessed_only |
||
304 | * |
||
305 | * @return int |
||
306 | */ |
||
307 | 1 | public function getLatestCrawlTimestampForPage($uid, $future_crawldates_only = false, $unprocessed_only = false) |
|
340 | |||
341 | /** |
||
342 | * Returns an array with timestamps when the page has been scheduled for crawling and |
||
343 | * at what time the scheduled crawl has been executed. The array also contains items that are |
||
344 | * scheduled but have not been crawled yet. |
||
345 | * |
||
346 | * @param int $uid uid of the page |
||
347 | * @param bool $limit |
||
348 | * |
||
349 | * @return array array with the crawl-history of a page => 0 : scheduled time , 1 : executed_time, 2 : set_id |
||
350 | */ |
||
351 | 1 | public function getCrawlHistoryForPage($uid, $limit = 0) |
|
369 | |||
370 | /** |
||
371 | * Method to determine unprocessed Items in the crawler queue. |
||
372 | * |
||
373 | * @return array |
||
374 | * |
||
375 | * @deprecated since crawler v7.0.0, will be removed in crawler v8.0.0. |
||
376 | */ |
||
377 | public function getUnprocessedItems() |
||
391 | |||
392 | /** |
||
393 | * Method to get the number of unprocessed items in the crawler |
||
394 | * |
||
395 | * @return int number of unprocessed items in the queue |
||
396 | * |
||
397 | * @deprecated since crawler v7.0.0, will be removed in crawler v8.0.0. |
||
398 | */ |
||
399 | public function countUnprocessedItems() |
||
411 | |||
412 | /** |
||
413 | * Method to check if a page is in the queue which is timed for a |
||
414 | * date when it should be crawled |
||
415 | * |
||
416 | * @param int $uid uid of the page |
||
417 | * @param boolean $show_unprocessed only respect unprocessed pages |
||
418 | * |
||
419 | * @return boolean |
||
420 | * |
||
421 | * @deprecated since crawler v7.0.0, will be removed in crawler v8.0.0. |
||
422 | */ |
||
423 | public function isPageInQueueTimed($uid, $show_unprocessed = true) |
||
429 | |||
430 | /** |
||
431 | * Reads the registered processingInstructions of the crawler |
||
432 | * |
||
433 | * @return array |
||
434 | */ |
||
435 | 2 | private function getCrawlerProcInstructions() |
|
443 | |||
444 | /** |
||
445 | * Removes a queue entry with a given queue id |
||
446 | * |
||
447 | * @param int $qid |
||
448 | * |
||
449 | * @deprecated since crawler v7.0.0, will be removed in crawler v8.0.0. |
||
450 | */ |
||
451 | public function removeQueueEntrie($qid) |
||
462 | |||
463 | /** |
||
464 | * Get queue statistics |
||
465 | * |
||
466 | * @param void |
||
467 | * |
||
468 | * @return array array('assignedButUnprocessed' => <>, 'unprocessed' => <>); |
||
469 | */ |
||
470 | 1 | public function getQueueStatistics() |
|
477 | |||
478 | /** |
||
479 | * Get queue repository |
||
480 | * |
||
481 | * @return QueueRepository |
||
482 | */ |
||
483 | 2 | protected function getQueueRepository() |
|
491 | |||
492 | /** |
||
493 | * Get queue statistics by configuration |
||
494 | * |
||
495 | * @return array array of array('configuration' => <>, 'assignedButUnprocessed' => <>, 'unprocessed' => <>) |
||
496 | */ |
||
497 | public function getQueueStatisticsByConfiguration() |
||
512 | |||
513 | /** |
||
514 | * Get active processes count |
||
515 | * |
||
516 | * @param void |
||
517 | * |
||
518 | * @return int |
||
519 | */ |
||
520 | public function getActiveProcessesCount() |
||
526 | |||
527 | /** |
||
528 | * Get last processed entries |
||
529 | * |
||
530 | * @param int $limit |
||
531 | * |
||
532 | * @return array |
||
533 | */ |
||
534 | public function getLastProcessedQueueEntries($limit) |
||
538 | |||
539 | /** |
||
540 | * Get current crawling speed |
||
541 | * |
||
542 | * @param float|false page speed in pages per minute |
||
543 | * |
||
544 | * @return int |
||
545 | */ |
||
546 | public function getCurrentCrawlingSpeed() |
||
581 | |||
582 | /** |
||
583 | * Get some performance data |
||
584 | * |
||
585 | * @param integer $start |
||
586 | * @param integer $end |
||
587 | * @param integer $resolution |
||
588 | * |
||
589 | * @return array data |
||
590 | * |
||
591 | * @throws \Exception |
||
592 | */ |
||
593 | public function getPerformanceData($start, $end, $resolution) |
||
642 | } |
||
643 |
This method has been deprecated. The supplier of the class has supplied an explanatory message.
The explanatory message should give you some clue as to whether and when the method will be removed from the class and what other method or class to use instead.