Complex classes like Spider often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes or suffixes. You can also have a look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use Spider, and based on these observations, apply Extract Interface, too.
| 1 | <?php |
||
| 25 | class Spider |
||
| 26 | { |
||
| 27 | /** @var RequestHandler */ |
||
| 28 | private $requestHandler; |
||
| 29 | |||
| 30 | /** @var PersistenceHandler */ |
||
| 31 | private $persistenceHandler; |
||
| 32 | |||
| 33 | /** @var QueueManager */ |
||
| 34 | private $queueManager; |
||
| 35 | |||
| 36 | /** @var EventDispatcherInterface */ |
||
| 37 | private $dispatcher; |
||
| 38 | |||
| 39 | /** @var DiscovererSet */ |
||
| 40 | private $discovererSet; |
||
| 41 | |||
| 42 | /** @var PostFetchFilter[] */ |
||
| 43 | private $postFetchFilters = array(); |
||
| 44 | |||
| 45 | /** @var FilterableUri The URI of the site to spider */ |
||
| 46 | private $seed = array(); |
||
| 47 | |||
| 48 | /** @var string the unique id of this spider instance */ |
||
| 49 | private $spiderId; |
||
| 50 | |||
| 51 | /** @var array the list of already visited URIs with the depth they were discovered on as value */ |
||
| 52 | private $alreadySeenUris = array(); |
||
| 53 | |||
| 54 | /** @var int the maximum number of downloaded resources. 0 means no limit */ |
||
| 55 | public $downloadLimit = 0; |
||
| 56 | |||
| 57 | /** |
||
| 58 | * @param string $seed the URI to start crawling |
||
| 59 | * @param string $spiderId |
||
|
|
|||
| 60 | */ |
||
| 61 | public function __construct($seed, $spiderId = null) |
||
| 81 | |||
| 82 | /** |
||
| 83 | * Starts crawling the URI provided on instantiation |
||
| 84 | * |
||
| 85 | * @return array |
||
| 86 | */ |
||
| 87 | public function crawl() |
||
| 94 | |||
| 95 | /** |
||
| 96 | * @param PostFetchFilter $filter |
||
| 97 | */ |
||
| 98 | public function addPostFetchFilter(PostFetchFilter $filter) |
||
| 102 | |||
| 103 | /** |
||
| 104 | * @param RequestHandler $requestHandler |
||
| 105 | */ |
||
| 106 | public function setRequestHandler(RequestHandler $requestHandler) |
||
| 110 | |||
| 111 | /** |
||
| 112 | * @return RequestHandler |
||
| 113 | */ |
||
| 114 | public function getRequestHandler() |
||
| 122 | |||
| 123 | /** |
||
| 124 | * @param DiscovererSet $discovererSet |
||
| 125 | */ |
||
| 126 | public function setDiscovererSet(DiscovererSet $discovererSet) |
||
| 130 | |||
| 131 | /** |
||
| 132 | * @return DiscovererSet |
||
| 133 | */ |
||
| 134 | public function getDiscovererSet() |
||
| 142 | |||
| 143 | /** |
||
| 144 | * @param QueueManager $queueManager |
||
| 145 | */ |
||
| 146 | public function setQueueManager(QueueManager $queueManager) |
||
| 150 | |||
| 151 | /** |
||
| 152 | * @return QueueManager |
||
| 153 | */ |
||
| 154 | public function getQueueManager() |
||
| 162 | |||
| 163 | /** |
||
| 164 | * @param PersistenceHandler $persistenceHandler |
||
| 165 | */ |
||
| 166 | public function setPersistenceHandler(PersistenceHandler $persistenceHandler) |
||
| 170 | |||
| 171 | /** |
||
| 172 | * @return PersistenceHandler |
||
| 173 | */ |
||
| 174 | public function getPersistenceHandler() |
||
| 182 | |||
| 183 | /** |
||
| 184 | * @param EventDispatcherInterface $eventDispatcher |
||
| 185 | * @return $this |
||
| 186 | */ |
||
| 187 | public function setDispatcher(EventDispatcherInterface $eventDispatcher) |
||
| 193 | |||
| 194 | /** |
||
| 195 | * @return EventDispatcherInterface |
||
| 196 | */ |
||
| 197 | public function getDispatcher() |
||
| 204 | |||
| 205 | public function handleSignal($signal) |
||
| 215 | |||
| 216 | /** |
||
| 217 | * @param Resource $resource |
||
| 218 | * @return bool |
||
| 219 | */ |
||
| 220 | private function matchesPostfetchFilter(Resource $resource) |
||
| 233 | |||
| 234 | private function isDownLoadLimitExceeded() |
||
| 241 | /** |
||
| 242 | * Function that crawls each provided URI |
||
| 243 | * It applies all processors and listeners set on the Spider |
||
| 244 | * |
||
| 245 | * This is either a depth-first algorithm, as explained here: |
||
| 246 | * https://en.wikipedia.org/wiki/Depth-first_search#Example |
||
| 247 | * Note that because we don't do it recursively, but iteratively, |
||
| 248 | * results will be in a different order from the example, because |
||
| 249 | * we always take the right-most child first, whereas a recursive |
||
| 250 | * variant would always take the left-most child first |
||
| 251 | * |
||
| 252 | * or |
||
| 253 | * |
||
| 254 | * a breadth first algorithm |
||
| 255 | * |
||
| 256 | * @return void |
||
| 257 | */ |
||
| 258 | private function doCrawl() |
||
| 304 | |||
| 305 | /** |
||
| 306 | * @param UriInterface $uri |
||
| 307 | * @return bool|Resource |
||
| 308 | */ |
||
| 309 | protected function fetchResource(UriInterface $uri) |
||
| 335 | |||
| 336 | /** |
||
| 337 | * A shortcut for EventDispatcher::dispatch() |
||
| 338 | * |
||
| 339 | * @param string $eventName |
||
| 340 | * @param Event $event |
||
| 341 | */ |
||
| 342 | private function dispatch($eventName, Event $event = null) |
||
| 346 | |||
| 347 | /** |
||
| 348 | * @param string $uri |
||
| 349 | */ |
||
| 350 | private function setSeed($uri) |
||
| 355 | } |
||
| 356 |
This check looks for @param annotations where the type inferred by our type inference engine differs from the declared type. It makes a suggestion as to what type it considers more descriptive.
Most often this is a case of a parameter that can be null in addition to its declared types.