Complex classes like Spider often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes or suffixes. You can also have a look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use Spider, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
26 | class Spider |
||
27 | { |
||
28 | /** @var RequestHandlerInterface */ |
||
29 | private $requestHandler; |
||
30 | |||
31 | /** @var PersistenceHandlerInterface */ |
||
32 | private $persistenceHandler; |
||
33 | |||
34 | /** @var QueueManagerInterface */ |
||
35 | private $queueManager; |
||
36 | |||
37 | /** @var EventDispatcherInterface */ |
||
38 | private $dispatcher; |
||
39 | |||
40 | /** @var DiscovererSet */ |
||
41 | private $discovererSet; |
||
42 | |||
43 | /** @var PostFetchFilterInterface[] */ |
||
44 | private $postFetchFilters = array(); |
||
45 | |||
46 | /** @var FilterableUri The URI of the site to spider */ |
||
47 | private $seed = array(); |
||
48 | |||
49 | /** @var string the unique id of this spider instance */ |
||
50 | private $spiderId; |
||
51 | |||
52 | /** @var int the maximum number of downloaded resources. 0 means no limit */ |
||
53 | public $downloadLimit = 0; |
||
54 | |||
55 | /** |
||
56 | * @param string $seed the URI to start crawling |
||
57 | * @param string $spiderId |
||
|
|||
58 | */ |
||
59 | public function __construct($seed, $spiderId = null) |
||
79 | |||
80 | /** |
||
81 | * Starts crawling the URI provided on instantiation |
||
82 | * |
||
83 | * @return array |
||
84 | */ |
||
85 | public function crawl() |
||
92 | |||
93 | /** |
||
94 | * @param PostFetchFilterInterface $filter |
||
95 | */ |
||
96 | public function addPostFetchFilter(PostFetchFilterInterface $filter) |
||
100 | |||
101 | /** |
||
102 | * @param RequestHandlerInterface $requestHandler |
||
103 | */ |
||
104 | public function setRequestHandler(RequestHandlerInterface $requestHandler) |
||
108 | |||
109 | /** |
||
110 | * @return RequestHandlerInterface |
||
111 | */ |
||
112 | public function getRequestHandler() |
||
120 | |||
121 | /** |
||
122 | * @param DiscovererSet $discovererSet |
||
123 | */ |
||
124 | public function setDiscovererSet(DiscovererSet $discovererSet) |
||
128 | |||
129 | /** |
||
130 | * @return DiscovererSet |
||
131 | */ |
||
132 | public function getDiscovererSet() |
||
140 | |||
141 | /** |
||
142 | * @param QueueManagerInterface $queueManager |
||
143 | */ |
||
144 | public function setQueueManager(QueueManagerInterface $queueManager) |
||
148 | |||
149 | /** |
||
150 | * @return QueueManagerInterface |
||
151 | */ |
||
152 | public function getQueueManager() |
||
160 | |||
161 | /** |
||
162 | * @param PersistenceHandlerInterface $persistenceHandler |
||
163 | */ |
||
164 | public function setPersistenceHandler(PersistenceHandlerInterface $persistenceHandler) |
||
168 | |||
169 | /** |
||
170 | * @return PersistenceHandlerInterface |
||
171 | */ |
||
172 | public function getPersistenceHandler() |
||
180 | |||
181 | /** |
||
182 | * @param EventDispatcherInterface $eventDispatcher |
||
183 | * @return $this |
||
184 | */ |
||
185 | public function setDispatcher(EventDispatcherInterface $eventDispatcher) |
||
191 | |||
192 | /** |
||
193 | * @return EventDispatcherInterface |
||
194 | */ |
||
195 | public function getDispatcher() |
||
202 | |||
203 | public function handleSignal($signal) |
||
213 | |||
214 | /** |
||
215 | * @param Resource $resource |
||
216 | * @return bool |
||
217 | */ |
||
218 | private function matchesPostfetchFilter(Resource $resource) |
||
231 | |||
232 | private function isDownLoadLimitExceeded() |
||
239 | /** |
||
240 | * Function that crawls each provided URI |
||
241 | * It applies all processors and listeners set on the Spider |
||
242 | * |
||
243 | * This is either a depth-first algorithm as explained here: |
||
244 | * https://en.wikipedia.org/wiki/Depth-first_search#Example |
||
245 | * Note that because we don't do it recursively, but iteratively, |
||
246 | * results will be in a different order from the example, because |
||
247 | * we always take the right-most child first, whereas a recursive |
||
248 | * variant would always take the left-most child first |
||
249 | * |
||
250 | * or |
||
251 | * |
||
252 | * a breadth first algorithm |
||
253 | * |
||
254 | * @return void |
||
255 | */ |
||
256 | private function doCrawl() |
||
288 | |||
289 | /** |
||
290 | * @param UriInterface $uri |
||
291 | * @return Resource |
||
292 | */ |
||
293 | protected function fetchResource(UriInterface $uri) |
||
318 | |||
319 | /** |
||
320 | * A shortcut for EventDispatcher::dispatch() |
||
321 | * |
||
322 | * @param string $eventName |
||
323 | * @param Event $event |
||
324 | */ |
||
325 | private function dispatch($eventName, Event $event = null) |
||
329 | |||
330 | /** |
||
331 | * @param string $uri |
||
332 | */ |
||
333 | private function setSeed($uri) |
||
338 | } |
||
339 |
This check looks for
@param
annotations where the type inferred by our type inference engine differs from the declared type. It makes a suggestion as to what type it considers more descriptive.
Most often this is a case of a parameter that can be null in addition to its declared types.