Complex classes like SitemapParser often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes or suffixes. You can also have a look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use SitemapParser and, based on these observations, to apply Extract Interface as well.
| 1 | <?php |
||
| 17 | class SitemapParser |
||
| 18 | { |
||
| 19 | /** |
||
| 20 | * User-Agent to send with every HTTP(S) request; getContent() uses it as the Guzzle `User-Agent` header when the config does not set one |
||
| 21 | * @var string |
||
| 22 | */ |
||
| 23 | protected $userAgent; |
||
| 24 | |||
| 25 | /** |
||
| 26 | * Configuration options; the `guzzle` entry is passed as request options by getContent(), and `strict` set to false lets parseString() accept plain text |
||
| 27 | * @var array |
||
| 28 | */ |
||
| 29 | protected $config = []; |
||
| 30 | |||
| 31 | /** |
||
| 32 | * Sitemaps discovered; getQueue() reads the array keys as sitemap URLs, and clean() empties the array at the start of every parse() |
||
| 33 | * @var array |
||
| 34 | */ |
||
| 35 | protected $sitemaps = []; |
||
| 36 | |||
| 37 | /** |
||
| 38 | * URLs discovered; clean() empties the array at the start of every parse() |
||
| 39 | * @var array |
||
| 40 | */ |
||
| 41 | protected $urls = []; |
||
| 42 | |||
| 43 | /** |
||
| 44 | * Sitemap URLs discovered but not yet parsed; rebuilt (deduplicated, minus the history) on every getQueue() call |
||
| 45 | * @var array |
||
| 46 | */ |
||
| 47 | protected $queue = []; |
||
| 48 | |||
| 49 | /** |
||
| 50 | * Parsed URLs history; parse() appends the current URL once its content is available, and getQueue() excludes these URLs |
||
| 51 | * @var array |
||
| 52 | */ |
||
| 53 | protected $history = []; |
||
| 54 | |||
| 55 | /** |
||
| 56 | * Current URL being parsed; assigned by parse() and read by getContent() |
||
| 57 | * @var null|string |
||
| 58 | */ |
||
| 59 | protected $currentURL; |
||
||
| 60 | |||
/**
 * Create a parser after verifying the runtime requirements.
 *
 * Confirms that the `simplexml` and `mbstring` extensions are loaded, switches
 * mbstring to the "uni" language with UTF-8 as internal encoding, and then
 * stores the supplied User-Agent and configuration on the instance.
 *
 * @param string $userAgent User-Agent to send with every HTTP(S) request
 * @param array $config Configuration options
 * @throws SitemapParserException If a required extension is missing or UTF-8 cannot be selected
 */
public function __construct($userAgent = 'SitemapParser', $config = [])
{
    // Checked in this order so the first missing extension is the one reported.
    foreach (['simplexml', 'mbstring'] as $requiredExtension) {
        if (extension_loaded($requiredExtension)) {
            continue;
        }
        throw new SitemapParserException(
            sprintf('The extension `%s` must be installed and loaded for this library', $requiredExtension)
        );
    }

    mb_language("uni");
    $utf8Selected = mb_internal_encoding('UTF-8');
    if (!$utf8Selected) {
        throw new SitemapParserException('Unable to set internal character encoding to UTF-8');
    }

    $this->config = $config;
    $this->userAgent = $userAgent;
}
||
| 83 | |||
/**
 * Parse a URL and then every sitemap discovered from it, until the queue is empty.
 *
 * The URL is queued first; each loop iteration parses the first pending entry
 * and folds the results of that single parse into the accumulated results.
 *
 * @param string $url Starting point of the crawl
 * @return void
 * @throws SitemapParserException Propagated from parse()
 */
public function parseRecursive($url)
{
    $this->addToQueue([$url]);
    while (count($todo = $this->getQueue()) > 0) {
        // parse() empties both result sets before it runs, so remember what
        // has been collected so far and merge it back in afterwards.
        $sitemaps = $this->sitemaps;
        $urls = $this->urls;
        $this->parse($todo[0]);
        // array_merge(), not array_merge_recursive(): the sitemap result set is
        // keyed by URL (getQueue() reads its keys as URLs), and the recursive
        // variant would fold a URL seen twice into nested arrays
        // (e.g. 'loc' => [url, url]) instead of keeping one entry per key.
        $this->sitemaps = array_merge($sitemaps, $this->sitemaps);
        $this->urls = array_merge($urls, $this->urls);
    }
}
||
| 102 | |||
/**
 * Append every URL of the given array to the parser queue.
 *
 * No de-duplication happens here; getQueue() removes duplicates and
 * already-parsed URLs when the queue is read.
 *
 * @param array $urlArray URLs to enqueue
 */
public function addToQueue($urlArray)
{
    foreach ($urlArray as $pendingUrl) {
        array_push($this->queue, $pendingUrl);
    }
}
||
| 114 | |||
/**
 * Sitemap URLs discovered but not yet parsed.
 *
 * Rebuilds the queue from the currently queued URLs plus the keys of the
 * discovered sitemaps, drops duplicates and anything already in the history,
 * stores the re-indexed result and returns it.
 *
 * @return array
 */
public function getQueue()
{
    $candidates = array_merge($this->queue, array_keys($this->sitemaps));
    $unparsed = array_diff(array_unique($candidates), $this->history);
    // Re-index so callers can rely on element 0 being the next URL.
    $this->queue = array_values($unparsed);
    return $this->queue;
}
||
| 125 | |||
/**
 * Parse a single URL.
 *
 * Clears the results of any previous parse, obtains the body (from
 * $urlContent when it is a string, otherwise by downloading), records the URL
 * in the history and then dispatches on the content: robots.txt, XML with
 * `sitemap` / `url` elements, or a plain-text fallback.
 *
 * @param string $url URL to parse
 * @param string|null $urlContent URL body content (skip download)
 * @return void
 * @throws SitemapParserException
 */
public function parse($url, $urlContent = null)
{
    $this->clean();
    $this->currentURL = $url;
    if (is_string($urlContent)) {
        $body = $urlContent;
    } else {
        $body = $this->getContent();
    }
    // Only reached when the body is available; a failed download throws above.
    $this->history[] = $this->currentURL;

    if (parse_url($this->currentURL, PHP_URL_PATH) === '/robots.txt') {
        $this->parseRobotstxt($body);
        return;
    }

    // A gzip stream starts with the magic bytes 1f 8b followed by 08 (deflate).
    $looksGzipped = mb_strpos($body, "\x1f\x8b\x08", 0, "US-ASCII") === 0;
    if ($looksGzipped) {
        // NOTE(review): gzdecode() yields false on corrupt data; that value is passed on unchanged.
        $body = gzdecode($body);
    }

    $xml = $this->generateXMLObject($body);
    if (!($xml instanceof SimpleXMLElement)) {
        // Not usable XML: fall back to the plain-text parser.
        $this->parseString($body);
        return;
    }

    foreach (['sitemap', 'url'] as $tag) {
        if (isset($xml->{$tag})) {
            $this->parseJson($tag, $xml->{$tag});
        }
    }
}
||
| 160 | |||
/**
 * Reset the per-parse result sets.
 *
 * Empties both the discovered sitemaps and the discovered URLs; the queue and
 * the history are left untouched.
 *
 * @return void
 */
protected function clean()
{
    $this->urls = [];
    $this->sitemaps = [];
}
||
| 171 | |||
/**
 * Request the body content of the current URL.
 *
 * Validates the URL with filter_var(), makes sure a `User-Agent` header is
 * present in the Guzzle options (the configured one wins, otherwise the
 * parser's own User-Agent is stored into the config), and performs a GET
 * request.
 *
 * @return string Raw body content
 * @throws SitemapParserException If the URL is invalid or the transfer fails
 */
protected function getContent()
{
    if (!filter_var($this->currentURL, FILTER_VALIDATE_URL)) {
        throw new SitemapParserException('Passed URL not valid according to filter_var function');
    }
    try {
        if (!isset($this->config['guzzle']['headers']['User-Agent'])) {
            $this->config['guzzle']['headers']['User-Agent'] = $this->userAgent;
        }
        $client = new GuzzleHttp\Client();
        $res = $client->request('GET', $this->currentURL, $this->config['guzzle']);
        // getBody() hands back a stream object, not a string; cast it so the
        // documented string return type actually holds for callers.
        return (string) $res->getBody();
    } catch (GuzzleHttp\Exception\TransferException $e) {
        throw new SitemapParserException($e->getMessage());
    }
}
||
| 194 | |||
| 195 | /** |
||
| 196 | * Search for sitemaps in the robots.txt content |
||
| 197 | * |
||
| 198 | * @param string $robotstxt |
||
| 199 | * @return void |
||
| 200 | */ |
||
| 201 | protected function parseRobotstxt($robotstxt) |
||
| 211 | |||
| 212 | /** |
||
| 213 | * Validate URL arrays and add them to their corresponding arrays |
||
| 214 | * |
||
| 215 | * @param string $type sitemap|url |
||
| 216 | * @param array $array Tag array |
||
| 217 | * @return bool |
||
| 218 | */ |
||
| 219 | protected function addArray($type, $array) |
||
| 233 | |||
| 234 | /** |
||
| 235 | * Generate the \SimpleXMLElement object if the XML is valid |
||
| 236 | * |
||
| 237 | * @param string $xml |
||
| 238 | * @return \SimpleXMLElement|false |
||
| 239 | */ |
||
| 240 | protected function generateXMLObject($xml) |
||
| 249 | |||
/**
 * Parse plain text as a whitespace-separated list of URLs.
 *
 * Only active when the `strict` configuration option is explicitly set to
 * false. Every whitespace-delimited token that passes FILTER_VALIDATE_URL is
 * recorded as a sitemap when isSitemapURL() accepts it, otherwise as a URL.
 *
 * @param string $string Text to scan
 * @return bool False when strict mode rejects plain text, true otherwise
 */
protected function parseString($string)
{
    $plainTextAllowed = isset($this->config['strict']) && $this->config['strict'] === false;
    if (!$plainTextAllowed) {
        // Strings are not part of any sitemap standard
        return false;
    }

    // Collect every run of non-whitespace characters in one pass.
    if (!preg_match_all('/(\S+)/', $string, $found)) {
        return true;
    }

    foreach ($found[0] as $token) {
        if (filter_var($token, FILTER_VALIDATE_URL) === false) {
            continue;
        }
        $type = $this->isSitemapURL($token) ? 'sitemap' : 'url';
        $this->addArray($type, ['loc' => $token]);
    }
    return true;
}
||
| 275 | |||
| 276 | /** |
||
| 277 | * Check if the URL may contain an Sitemap |
||
| 278 | * |
||
| 279 | * @param string $url |
||
| 280 | * @return bool |
||
| 281 | */ |
||
| 282 | protected function isSitemapURL($url) |
||
| 290 | |||
| 291 | /** |
||
| 292 | * Parse Json object |
||
| 293 | * |
||
| 294 | * @param string $type Sitemap or URL |
||
| 295 | * @param \SimpleXMLElement $json object |
||
| 296 | * @return void |
||
| 297 | */ |
||
| 298 | protected function parseJson($type, $json) |
||
| 304 | |||
| 305 | /** |
||
| 306 | * Sitemaps discovered |
||
| 307 | * |
||
| 308 | * @return array |
||
| 309 | */ |
||
| 310 | public function getSitemaps() |
||
| 314 | |||
| 315 | /** |
||
| 316 | * URLs discovered |
||
| 317 | * |
||
| 318 | * @return array |
||
| 319 | */ |
||
| 320 | public function getURLs() |
||
| 324 | } |
||
| 325 |