Complex classes like Spider often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also have a look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use Spider and, based on these observations, to apply Extract Interface as well.
1 | <?php |
||
27 | class Spider |
||
28 | { |
||
29 | /** @var RequestHandler */ |
||
30 | private $requestHandler; |
||
31 | |||
32 | /** @var PersistenceHandler */ |
||
33 | private $persistenceHandler; |
||
34 | |||
35 | /** @var QueueManager */ |
||
36 | private $queueManager; |
||
37 | |||
38 | /** @var EventDispatcherInterface */ |
||
39 | private $dispatcher; |
||
40 | |||
41 | /** @var DiscovererSet */ |
||
42 | private $discovererSet; |
||
43 | |||
44 | /** @var PostFetchFilter[] */ |
||
45 | private $postFetchFilters = array(); |
||
46 | |||
47 | /** @var FilterableUri The URI of the site to spider */ |
||
48 | private $seed = array(); |
||
49 | |||
50 | /** @var string the unique id of this spider instance */ |
||
51 | private $spiderId; |
||
52 | |||
53 | /** @var array the list of already visited URIs with the depth they were discovered on as value */ |
||
54 | private $alreadySeenUris = array(); |
||
55 | |||
56 | /** @var int the maximum number of downloaded resources. 0 means no limit */ |
||
57 | public $downloadLimit = 0; |
||
58 | |||
59 | /** |
||
60 | * @param string $seed the URI to start crawling |
||
61 | * @param string $spiderId |
||
62 | */ |
||
63 | public function __construct($seed, $spiderId = null) |
||
83 | |||
84 | /** |
||
85 | * Starts crawling the URI provided on instantiation |
||
86 | * |
||
87 | * @return array |
||
88 | */ |
||
89 | public function crawl() |
||
96 | |||
97 | /** |
||
98 | * @param PostFetchFilter $filter |
||
99 | */ |
||
100 | public function addPostFetchFilter(PostFetchFilter $filter) |
||
104 | |||
105 | /** |
||
106 | * @param RequestHandler $requestHandler |
||
107 | */ |
||
108 | public function setRequestHandler(RequestHandler $requestHandler) |
||
112 | |||
113 | /** |
||
114 | * @return RequestHandler |
||
115 | */ |
||
116 | public function getRequestHandler() |
||
124 | |||
125 | /** |
||
126 | * @param DiscovererSet $discovererSet |
||
127 | */ |
||
128 | public function setDiscovererSet(DiscovererSet $discovererSet) |
||
132 | |||
133 | /** |
||
134 | * @return DiscovererSet |
||
135 | */ |
||
136 | public function getDiscovererSet() |
||
144 | |||
145 | /** |
||
146 | * @param QueueManager $queueManager |
||
147 | */ |
||
148 | public function setQueueManager(QueueManager $queueManager) |
||
152 | |||
153 | /** |
||
154 | * @return QueueManager |
||
155 | */ |
||
156 | public function getQueueManager() |
||
164 | |||
165 | /** |
||
166 | * @param PersistenceHandler $persistenceHandler |
||
167 | */ |
||
168 | public function setPersistenceHandler(PersistenceHandler $persistenceHandler) |
||
172 | |||
173 | /** |
||
174 | * @return PersistenceHandler |
||
175 | */ |
||
176 | public function getPersistenceHandler() |
||
184 | |||
185 | /** |
||
186 | * @param EventDispatcherInterface $eventDispatcher |
||
187 | * @return $this |
||
188 | */ |
||
189 | public function setDispatcher(EventDispatcherInterface $eventDispatcher) |
||
195 | |||
196 | /** |
||
197 | * @return EventDispatcherInterface |
||
198 | */ |
||
199 | public function getDispatcher() |
||
206 | |||
207 | public function handleSignal($signal) |
||
217 | |||
218 | /** |
||
219 | * @param Resource $resource |
||
220 | * @return bool |
||
221 | */ |
||
222 | private function matchesPostfetchFilter(Resource $resource) |
||
235 | |||
236 | private function isDownLoadLimitExceeded() |
||
243 | /** |
||
244 | * Function that crawls each provided URI |
||
245 | * It applies all processors and listeners set on the Spider |
||
246 | * |
||
247 | * This is either a depth-first algorithm, as explained here: |
||
248 | * https://en.wikipedia.org/wiki/Depth-first_search#Example |
||
249 | * Note that because we don't do it recursively, but iteratively, |
||
250 | * results will be in a different order from the example, because |
||
251 | * we always take the right-most child first, whereas a recursive |
||
252 | * variant would always take the left-most child first |
||
253 | * |
||
254 | * or |
||
255 | * |
||
256 | * a breadth-first algorithm |
||
257 | * |
||
258 | * @return void |
||
259 | */ |
||
260 | private function doCrawl() |
||
306 | |||
307 | /** |
||
308 | * @param UriInterface $uri |
||
309 | * @return bool|Resource |
||
310 | */ |
||
311 | protected function fetchResource(UriInterface $uri) |
||
337 | |||
338 | /** |
||
339 | * A shortcut for EventDispatcher::dispatch() |
||
340 | * |
||
341 | * @param string $eventName |
||
342 | * @param Event $event |
||
343 | */ |
||
344 | private function dispatch($eventName, Event $event = null) |
||
348 | |||
349 | /** |
||
350 | * @param string $uri |
||
351 | */ |
||
352 | private function setSeed($uri) |
||
357 | } |
||
358 |