Complex classes like CrawlerApi often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields and methods that share the same prefixes or suffixes. You can also look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use CrawlerApi, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
36 | class CrawlerApi |
||
37 | { |
||
38 | /** |
||
39 | * @var \tx_crawler_lib |
||
40 | */ |
||
41 | private $crawlerObj; |
||
42 | |||
43 | /** |
||
44 | * @var \tx_crawler_domain_queue_repository queue repository |
||
45 | */ |
||
46 | protected $queueRepository; |
||
47 | |||
48 | /** |
||
49 | * @var $allowedConfigrations array |
||
50 | */ |
||
51 | protected $allowedConfigrations = array(); |
||
52 | |||
53 | /** |
||
54 | * Each crawler run has a setid, this facade method delegates |
||
55 | * it to the crawler object |
||
56 | * |
||
57 | * @param int |
||
58 | */ |
||
59 | public function overwriteSetId($id) |
||
63 | |||
64 | /** |
||
65 | * This method is used to limit the configuration selection to |
||
66 | * a set of configurations. |
||
67 | * |
||
68 | * @param array $allowedConfigurations |
||
69 | */ |
||
70 | public function setAllowedConfigurations(array $allowedConfigurations) |
||
74 | |||
75 | /** |
||
76 | * Returns the setID of the crawler |
||
77 | * |
||
78 | * @return int |
||
79 | */ |
||
80 | public function getSetId() |
||
84 | |||
85 | /** |
||
86 | * Method to get an instance of the internal crawler singleton |
||
87 | * |
||
88 | * @return \tx_crawler_lib Instance of the crawler lib |
||
89 | * |
||
90 | * @throws \Exception |
||
91 | */ |
||
92 | protected function findCrawler() |
||
105 | |||
106 | /** |
||
107 | * Adds a page to the crawlerqueue by uid |
||
108 | * |
||
109 | * @param int $uid uid |
||
110 | */ |
||
111 | public function addPageToQueue($uid) |
||
117 | |||
118 | /** |
||
119 | * This method is used to limit the processing instructions to the processing instructions |
||
120 | * that are allowed. |
||
121 | * |
||
122 | * @return array |
||
123 | */ |
||
124 | protected function filterUnallowedConfigurations($configurations) |
||
137 | |||
138 | /** |
||
139 | * Adds a page to the crawlerqueue by uid and sets a |
||
140 | * timestamp when the page should be crawled. |
||
141 | * |
||
142 | * @param int $uid pageid |
||
143 | * @param int $time timestamp |
||
144 | */ |
||
145 | public function addPageToQueueTimed($uid, $time) |
||
181 | |||
182 | /** |
||
183 | * Counts all entries in the database which are scheduled for a given page id and a schedule timestamp. |
||
184 | * |
||
185 | * @param int $page_uid |
||
186 | * @param int $schedule_timestamp |
||
187 | * |
||
188 | * @return int |
||
189 | */ |
||
190 | protected function countEntriesInQueueForPageByScheduletime($page_uid, $schedule_timestamp) |
||
208 | |||
209 | /** |
||
210 | * Determines if a page is queued |
||
211 | * |
||
212 | * @param $uid |
||
213 | * @param bool $unprocessed_only |
||
214 | * @param bool $timed_only |
||
215 | * @param bool $timestamp |
||
216 | * |
||
217 | * @return bool |
||
218 | */ |
||
219 | public function isPageInQueue($uid, $unprocessed_only = true, $timed_only = false, $timestamp = false) |
||
253 | |||
254 | /** |
||
255 | * Method to return the latest crawl timestamp for a page. |
||
256 | * |
||
257 | * @param int $uid uid id of the page |
||
258 | * @param bool $future_crawldates_only |
||
259 | * @param bool $unprocessed_only |
||
260 | * |
||
261 | * @return int |
||
262 | */ |
||
263 | public function getLatestCrawlTimestampForPage($uid, $future_crawldates_only = false, $unprocessed_only = false) |
||
286 | |||
287 | /** |
||
288 | * Returns an array with timestamps when the page has been scheduled for crawling and |
||
289 | * at what time the scheduled crawl has been executed. The array also contains items that are |
||
290 | * scheduled but have not been crawled yet. |
||
291 | * |
||
292 | * @param int $uid uid of the page |
||
293 | * @param bool $limit |
||
294 | * |
||
295 | * @return array array with the crawlhistory of a page => 0 : scheduled time , 1 : execuded_time, 2 : set_id |
||
296 | */ |
||
297 | public function getCrawlHistoryForPage($uid, $limit = false) |
||
311 | |||
312 | /** |
||
313 | * Method to determine unprocessed Items in the crawler queue. |
||
314 | * |
||
315 | * @return array |
||
316 | */ |
||
317 | public function getUnprocessedItems() |
||
325 | |||
326 | /** |
||
327 | * Method to get the number of unprocessed items in the crawler |
||
328 | * |
||
329 | * @return int number of unprocessed items in the queue |
||
330 | */ |
||
331 | public function countUnprocessedItems() |
||
340 | |||
341 | /** |
||
342 | * Method to check if a page is in the queue which is timed for a |
||
343 | * date when it should be crawled |
||
344 | * |
||
345 | * @param int $uid uid of the page |
||
346 | * @param boolean $show_unprocessed only respect unprocessed pages |
||
347 | * |
||
348 | * @return boolean |
||
349 | */ |
||
350 | public function isPageInQueueTimed($uid, $show_unprocessed = true) |
||
356 | |||
357 | /** |
||
358 | * Reads the registered processingInstructions of the crawler |
||
359 | * |
||
360 | * @return array |
||
361 | */ |
||
362 | private function getCrawlerProcInstructions() |
||
371 | |||
372 | /** |
||
373 | * Removes a queue entry with a given queue id |
||
374 | * |
||
375 | * @param int $qid |
||
376 | */ |
||
377 | public function removeQueueEntrie($qid) |
||
384 | |||
385 | /** |
||
386 | * Get queue statistics |
||
387 | * |
||
388 | * @param void |
||
389 | * |
||
390 | * @return array array('assignedButUnprocessed' => <>, 'unprocessed' => <>); |
||
391 | */ |
||
392 | public function getQueueStatistics() |
||
399 | |||
400 | /** |
||
401 | * Get queue repository |
||
402 | * |
||
403 | * @param void |
||
404 | * |
||
405 | * @return \tx_crawler_domain_queue_repository queue repository |
||
406 | */ |
||
407 | protected function getQueueRepository() |
||
415 | |||
416 | /** |
||
417 | * Get queue statistics by configuration |
||
418 | * |
||
419 | * @param void |
||
420 | * |
||
421 | * @return array array of array('configuration' => <>, 'assignedButUnprocessed' => <>, 'unprocessed' => <>) |
||
422 | */ |
||
423 | public function getQueueStatisticsByConfiguration() |
||
438 | |||
439 | /** |
||
440 | * Get active processes count |
||
441 | * |
||
442 | * @param void |
||
443 | * |
||
444 | * @return int |
||
445 | * @author Fabrizio Branca <[email protected]> |
||
446 | * @since 2009-09-03 |
||
447 | */ |
||
448 | public function getActiveProcessesCount() |
||
454 | |||
455 | /** |
||
456 | * Get last processed entries |
||
457 | * |
||
458 | * @param int limit |
||
459 | * |
||
460 | * @return array |
||
461 | */ |
||
462 | public function getLastProcessedQueueEntries($limit) |
||
466 | |||
467 | /** |
||
468 | * Get current crawling speed |
||
469 | * |
||
470 | * @param float|false page speed in pages per minute |
||
471 | * |
||
472 | * @return int |
||
473 | */ |
||
474 | public function getCurrentCrawlingSpeed() |
||
509 | |||
510 | /** |
||
511 | * Get some performance data |
||
512 | * |
||
513 | * @param integer $start |
||
514 | * @param integer $end |
||
515 | * @param integer $resolution |
||
516 | * |
||
517 | * @return array data |
||
518 | * |
||
519 | * @throws \Exception |
||
520 | */ |
||
521 | public function getPerformanceData($start, $end, $resolution) |
||
571 | |||
572 | /** |
||
573 | * Wrapper to support old and new method to test integer value. |
||
574 | * |
||
575 | * @param integer $value |
||
576 | * @param integer $min |
||
577 | * @param integer $max |
||
578 | * @param integer $default |
||
579 | * |
||
580 | * @return integer |
||
581 | */ |
||
582 | static public function forceIntegerInRange($value, $min, $max = 2000000, $default = 0) |
||
588 | |||
589 | /** |
||
590 | * Wrapper to support old and new method to test integer value. |
||
591 | * |
||
592 | * @param integer $value |
||
593 | * |
||
594 | * @return bool |
||
595 | */ |
||
596 | static public function canBeInterpretedAsInteger($value) |
||
602 | } |
||
603 |
This check looks for assignments to variables using the
list(...)
construct, where not all assigned variables are subsequently used. Consider the following code example:
list($a, $b, $c) = returnThreeValues();
Only the variables
$a
and $c
are used. There was no need to assign $b.
Instead, the list call could have been written with the unused position left empty:
list($a, , $c) = returnThreeValues();