Completed
Branch master (ef0e4a)
by George
06:37 queued 03:38
created

SiteChecker::checkAsset()   D

Complexity

Conditions 9
Paths 26

Size

Total Lines 45
Code Lines 24

Duplication

Lines 0
Ratio 0 %

Importance

Changes 7
Bugs 2 Features 2
Metric Value
c 7
b 2
f 2
dl 0
loc 45
rs 4.9091
cc 9
eloc 24
nc 26
nop 1
1
<?php
2
3
namespace SiteChecker;
4
5
use GuzzleHttp\Client;
6
use GuzzleHttp\Cookie\CookieJar;
7
use GuzzleHttp\Cookie\SetCookie;
8
use GuzzleHttp\Exception\RequestException;
9
use GuzzleHttp\RequestOptions;
10
use Psr\Http\Message\ResponseInterface;
11
use Symfony\Component\DomCrawler\Crawler;
12
13
/**
14
 * Class SiteChecker
15
 * @package SiteChecker
16
 */
17
class SiteChecker
18
{
19
20
    /**
     * Scratch list that check() empties at the start of every run.
     *
     * @var array
     */
    protected $messages = [];

    /**
     * Assets that have been requested so far, in the order they were checked.
     *
     * @var Asset[]
     */
    protected $checkedAssets = [];

    /**
     * Asset for the URL handed to check(); its scheme, host and port are the
     * reference for resolving relative links and for the external-host test.
     *
     * @var Asset
     */
    protected $basePage;

    /**
     * Observer that is told about every asset before and after it is checked
     * and that receives the final result list.
     *
     * @var SiteCheckObserver
     */
    protected $observer;

    /**
     * HTTP client used to fetch each asset.
     *
     * @var Client
     */
    protected $client;

    /**
     * Crawl settings: cookies, included/excluded URLs, external-link and
     * whitespace switches.
     *
     * @var Config
     */
    protected $config;
49
50
51
    /**
     * Expose the configuration object this checker is currently using.
     *
     * @return Config
     */
    public function getConfig()
    {
        return $this->config;
    }
58
59
    /**
     * Replace the configuration object used by subsequent checks.
     *
     * @param Config $config
     */
    public function setConfig($config)
    {
        $this->config = $config;
    }
66
67
    /**
     * Build a checker around an HTTP client and an optional observer.
     *
     * When no observer is supplied a DummyObserver is used instead, and the
     * checker always starts with a fresh Config instance.
     *
     * @param \GuzzleHttp\Client $client
     * @param \SiteChecker\SiteCheckObserver|null $observer
     */
    public function __construct(
        Client $client,
        SiteCheckObserver $observer = null
    ) {
        if (!$observer) {
            $observer = new DummyObserver();
        }

        $this->client = $client;
        $this->observer = $observer;
        $this->config = new Config();
    }
80
81
82
    /**
     * Factory: build a checker with a pre-configured Guzzle client.
     *
     * The client follows redirects and keeps cookies.
     *
     * @param \SiteChecker\SiteCheckObserver $observer
     * @return static
     */
    public static function create(SiteCheckObserver $observer)
    {
        $options = [];
        $options[RequestOptions::ALLOW_REDIRECTS] = true;
        $options[RequestOptions::COOKIES] = true;
        // NOTE(review): certificate verification is switched off here;
        // confirm this is intentional before relying on it outside testing.
        $options[RequestOptions::VERIFY] = false;

        return new static(new Client($options), $observer);
    }
96
97
    /**
     * Check the site for broken assets.
     *
     * The base page is checked first, then every URL listed in the config's
     * includedUrls that has not been checked yet. Finally the observer is
     * handed the full list of checked assets.
     *
     * @param string|Asset $baseUrl URL of the start page, or an Asset for it
     */
    public function check($baseUrl)
    {
        $startPage = $baseUrl instanceof Asset ? $baseUrl : new Asset($baseUrl);

        $this->messages = [];
        $this->basePage = $startPage;

        $this->checkAsset($startPage);

        foreach ($this->config->includedUrls as $includedUrl) {
            $included = new Asset($includedUrl);
            $this->normalizeUrl($included);

            if (in_array($included->getUrl(), $this->checkedAssets)) {
                continue;
            }

            $this->checkAsset($included);
        }

        $this->observer->receiveResults($this->checkedAssets);
    }
120
121
122
    /**
     * Request a single asset, record its status and, when it is an internal
     * HTML page, crawl the assets it references.
     *
     * Nothing happens when the asset is filtered out by shouldBeChecked() or
     * when the observer's pageToCheck() returns a falsy value.
     *
     * @param Asset $asset
     */
    protected function checkAsset(Asset $asset)
    {
        if (!$this->shouldBeChecked($asset)) {
            return;
        }

        if (!$this->observer->pageToCheck($asset)) {
            return;
        }

        // Bind every configured cookie to the base page's host.
        $cookies = $this->config->getCookies();
        foreach ($cookies as $key => $cookie) {
            $cookie['Domain'] = $this->basePage->host;
            $cookies[$key] = new SetCookie($cookie);
        }

        $jar = new CookieJar(false, $cookies);

        try {
            $response = $this->client->request(
                'GET',
                $asset->getURL(),
                ['cookies' => $jar]
            );
        } catch (RequestException $exception) {
            // Fall back to 500; overwritten below when the exception carries
            // a real response.
            $response = $exception->getResponse();
            $asset->setResponseCode('500');
        }

        if ($response) {
            $asset->setResponseCode($response->getStatusCode());
        }

        $this->observer->pageChecked($asset, $response);
        $this->checkedAssets[] = $asset;

        if (!$response) {
            return;
        }

        if (!$this->isExternal($asset) && $this->isHtmlPage($response)) {
            // Cast instead of getContents(): the observer has already been
            // given the response and may have read the body; the string cast
            // reads the stream from the beginning, getContents() only returns
            // what is left after the current position.
            $this->checkAllAssets((string) $response->getBody(), $asset);
        }
    }
170
171
    /**
     * Tell whether the asset lives on a different host than the base page.
     *
     * @param Asset $asset
     * @return bool true when the hosts differ
     */
    protected function isExternal(Asset $asset)
    {
        $baseHost = $this->basePage->host;

        return $baseHost !== $asset->host;
    }
179
180
    /**
     * Check every HTTP asset referenced by the given html.
     *
     * Each extracted asset is normalized against the base page and then
     * checked, unless shouldBeChecked() rejects it.
     *
     * @param string $html
     * @param Asset $parentAsset page the html was fetched from
     */
    protected function checkAllAssets($html, $parentAsset)
    {
        /** @var Asset $candidate */
        foreach ($this->getAllAssets($html, $parentAsset) as $candidate) {
            if (!$candidate->isHttp()) {
                continue;
            }

            $this->normalizeUrl($candidate);

            if (!$this->shouldBeChecked($candidate)) {
                continue;
            }

            $this->checkAsset($candidate);
        }
    }
201
202
    /**
     * Collect all assets referenced by the given html.
     *
     * Anchors come first, followed by images, scripts and stylesheets, each
     * tagged with a type label that is passed on to the created Asset.
     *
     * @param string $html
     * @param Asset $parentPage page the html was fetched from
     * @return Asset[]
     */
    protected function getAllAssets($html, $parentPage)
    {
        // Each entry: XPath selector, attribute holding the URL, type label.
        $assetTypes = [
            ['//a', 'href', 'page'],
            ['//img', 'src', 'image'],
            ['//script', 'src', 'js file'],
            // Stylesheets used to be labelled 'image' (copy-paste slip).
            ['//link[@rel="stylesheet"]', 'href', 'css file'],
        ];

        $assets = [];

        foreach ($assetTypes as $assetType) {
            list($selector, $urlAttribute, $type) = $assetType;

            $assets = array_merge(
                $assets,
                $this->createAssetsFromDOMElements(
                    $html,
                    $selector,
                    $urlAttribute,
                    $type,
                    $parentPage
                )
            );
        }

        return $assets;
    }
252
253
    /**
     * Build an Asset for every element matched by the selector that carries a
     * non-empty URL attribute.
     *
     * When the config's ignoreWhiteSpaces switch is on, the attribute value is
     * trimmed before the emptiness test, so whitespace-only values are skipped
     * rather than turned into assets with an empty URL.
     *
     * @param string $html
     * @param string $selector XPath expression selecting the elements
     * @param string $urlAttribute name of the attribute holding the URL
     * @param string $type type label passed to each Asset
     * @param Asset $parentPage page the html was fetched from
     * @return Asset[]
     */
    protected function createAssetsFromDOMElements(
        $html,
        $selector,
        $urlAttribute,
        $type,
        $parentPage
    ) {
        $assets = [];

        $crawler = new Crawler($html);

        /** @var \DOMElement $element */
        foreach ($crawler->filterXpath($selector) as $element) {
            $urlValue = $element->getAttribute($urlAttribute);

            if ($this->config->ignoreWhiteSpaces) {
                $urlValue = trim($urlValue);
            }

            // Compare against '' explicitly: empty() would also discard the
            // valid relative URL "0".
            if ($urlValue === '') {
                continue;
            }

            $assets[] = new Asset(
                $urlValue,
                $parentPage,
                $element->ownerDocument->saveHTML($element),
                $type
            );
        }

        return $assets;
    }
292
293
    /**
     * Tell whether the response declares an HTML content type.
     *
     * Every content-type header value is searched case-insensitively for
     * "text/html".
     *
     * @param \Psr\Http\Message\ResponseInterface $response
     * @return bool
     */
    protected function isHtmlPage(ResponseInterface $response)
    {
        $contentTypes = $response->getHeader('content-type');

        foreach ($contentTypes as $contentType) {
            if (stripos($contentType, 'text/html') !== false) {
                return true;
            }
        }

        return false;
    }
306
307
    /**
     * Decide whether an asset still has to be requested.
     *
     * An asset is skipped when its URL equals an excluded entry, when it
     * matches an excluded entry used as a case-insensitive regular
     * expression, when it is external and external checks are disabled, or
     * when it has already been checked.
     *
     * @param Asset $asset
     * @return bool
     */
    protected function shouldBeChecked(Asset $asset)
    {
        $url = $asset->getURL();
        $excludedUrls = $this->config->excludedUrls;

        if (in_array($url, $excludedUrls)) {
            return false;
        }

        foreach ($excludedUrls as $excludedUrl) {
            // The entry is wrapped in "/" delimiters, so any slash that is not
            // already escaped must be escaped first; otherwise an entry such
            // as "http://host/path" ends the pattern early, preg_match() warns
            // and the exclusion never applies.
            $escaped = preg_replace('~(?<!\\\\)/~', '\\\\/', $excludedUrl);

            if (preg_match('/' . $escaped . '/i', $url)) {
                return false;
            }
        }

        if (!$this->config->checkExternal && $this->isExternal($asset)) {
            return false;
        }

        return !$this->isAlreadyChecked($asset);
    }
326
327
    /**
     * Tell whether the asset's URL is already among the checked assets.
     *
     * NOTE(review): the URL string is compared loosely against the Asset
     * objects stored in $checkedAssets; presumably Asset is convertible to
     * its URL string - confirm against the Asset class.
     *
     * @param \SiteChecker\Asset $asset
     * @return bool
     */
    protected function isAlreadyChecked(Asset $asset)
    {
        $url = $asset->getURL();

        return in_array($url, $this->checkedAssets);
    }
335
336
337
    /**
     * Normalize the given url against the base page.
     *
     * Relative assets take the base page's scheme, host and port;
     * protocol-independent assets take its scheme. The fragment is removed
     * last.
     *
     * @param \SiteChecker\Asset $asset
     * @return mixed whatever Asset::removeFragment() returns
     */
    protected function normalizeUrl(Asset $asset)
    {
        $base = $this->basePage;

        if ($asset->isRelative()) {
            $asset->setScheme($base->scheme)
                ->setHost($base->host)
                ->setPort($base->port);
        }

        if ($asset->isProtocolIndependent()) {
            $asset->setScheme($base->scheme);
        }

        return $asset->removeFragment();
    }
357
358
    /**
     * Return every asset that has been checked so far.
     *
     * @return \SiteChecker\Asset[]
     */
    public function getResults()
    {
        return $this->checkedAssets;
    }
365
366
}
367