SiteChecker::checkAllAssets()   A
last analyzed

Complexity

Conditions 4
Paths 4

Size

Total Lines 15
Code Lines 7

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 1
Metric Value
c 1
b 0
f 1
dl 0
loc 15
rs 9.2
cc 4
eloc 7
nc 4
nop 2
1
<?php
2
3
namespace SiteChecker;
4
5
use GuzzleHttp\Client;
6
use GuzzleHttp\Cookie\CookieJar;
7
use GuzzleHttp\Cookie\SetCookie;
8
use GuzzleHttp\Exception\RequestException;
9
use GuzzleHttp\RequestOptions;
10
use Psr\Http\Message\ResponseInterface;
11
use Symfony\Component\DomCrawler\Crawler;
12
13
/**
 * Crawls a site starting from a base page and records the HTTP status of
 * every page and asset (links, images, scripts, stylesheets) it discovers.
 *
 * Progress and results are reported through a SiteCheckObserverInterface.
 *
 * @package SiteChecker
 */
class SiteChecker
{
    /**
     * NOTE(review): cleared by check() but never read inside this class.
     *
     * @var array
     */
    protected $messages = [];

    /**
     * Assets that have already been requested, in request order.
     *
     * @var Asset[]
     */
    protected $checkedAssets = [];

    /**
     * The page the current check started from; its scheme/host/port are used
     * to resolve relative URLs and to decide whether an asset is external.
     *
     * @var Asset
     */
    protected $basePage;

    /**
     * @var SiteCheckObserverInterface
     */
    protected $observer;

    /**
     * @var Client
     */
    protected $client;

    /**
     * @var Config
     */
    protected $config;

    /**
     * @return Config
     */
    public function getConfig()
    {
        return $this->config;
    }

    /**
     * @param Config $config
     */
    public function setConfig($config)
    {
        $this->config = $config;
    }

    /**
     * SiteChecker constructor.
     *
     * @param \GuzzleHttp\Client $client HTTP client used for every request.
     * @param \SiteChecker\SiteCheckObserverInterface|null $observer
     *        Receives progress callbacks; a DummyObserver is used when omitted.
     */
    public function __construct(
        Client $client,
        SiteCheckObserverInterface $observer = null
    ) {
        $this->client = $client;
        $this->observer = $observer ?: new DummyObserver();
        $this->config = new Config();
    }

    /**
     * Build a checker with a default HTTP client.
     *
     * The client follows redirects and keeps cookies. Note that TLS
     * certificate verification is switched off (RequestOptions::VERIFY is
     * false), so sites with invalid certificates are still requested.
     *
     * @param \SiteChecker\SiteCheckObserverInterface|null $observer
     * @return static
     */
    public static function create(SiteCheckObserverInterface $observer = null)
    {
        $client = new Client([
            RequestOptions::ALLOW_REDIRECTS => true,
            RequestOptions::COOKIES => true,
            RequestOptions::VERIFY => false,
        ]);

        return new static($client, $observer);
    }

    /**
     * Check the site for broken assets.
     *
     * Checks the base page first, then every configured included URL that has
     * not been checked yet, and finally hands all checked assets to the
     * observer.
     *
     * NOTE(review): $checkedAssets is not cleared here, so results accumulate
     * across repeated calls on the same instance — confirm this is intended.
     *
     * @param string|Asset $baseUrl Start page, as a URL string or an Asset.
     */
    public function check($baseUrl)
    {
        $basePage = $baseUrl instanceof Asset ? $baseUrl : new Asset($baseUrl);

        $this->messages = [];
        $this->basePage = $basePage;

        $this->checkAsset($basePage);

        foreach ($this->config->includedUrls as $includedUrl) {
            $asset = new Asset($includedUrl);
            $this->normalizeUrl($asset);
            if (!$this->isAlreadyChecked($asset)) {
                $this->checkAsset($asset);
            }
        }

        $this->observer->receiveResults($this->checkedAssets);
    }

    /**
     * Request a single asset, record its response code and, for internal
     * HTML pages, continue crawling the assets found in its body.
     *
     * @param Asset $asset
     */
    protected function checkAsset(Asset $asset)
    {
        if (!$this->shouldBeChecked($asset) || !$this->observer->pageToCheck($asset)) {
            return;
        }

        $jar = $this->createCookieJar();

        try {
            $response = $this->client->request('GET', $asset->getUrl(), [
                RequestOptions::COOKIES => $jar,
            ]);
        } catch (RequestException $exception) {
            // A failed request may still carry a response (e.g. 4xx/5xx);
            // when it does, the real status code below replaces CODE_ERROR.
            $response = $exception->getResponse();
            $asset->setResponseCode(Asset::CODE_ERROR);
        }

        if ($response !== null) {
            $asset->setResponseCode($response->getStatusCode());
        }

        $this->observer->pageChecked($asset, $response);

        $this->checkedAssets[] = $asset;

        if ($response === null) {
            return;
        }

        if (!$this->isExternal($asset) && $this->isHtmlPage($response)) {
            // Cast instead of getContents(): the cast reads the stream from
            // the beginning, so the HTML is complete even if the observer
            // already consumed the body in pageChecked().
            $this->checkAllAssets((string) $response->getBody(), $asset);
        }
    }

    /**
     * Build a fresh cookie jar holding the configured cookies, each bound to
     * the base page's host.
     *
     * A new jar is created per request so cookies set by one response are not
     * carried over to the next request through this jar.
     *
     * @return CookieJar
     */
    private function createCookieJar()
    {
        $cookies = [];

        foreach ($this->config->getCookies() as $key => $cookie) {
            $cookie['Domain'] = $this->basePage->host;
            $cookies[$key] = new SetCookie($cookie);
        }

        return new CookieJar(false, $cookies);
    }

    /**
     * Tell whether the asset lives on a different host than the base page.
     *
     * @param Asset $asset
     * @return bool
     */
    protected function isExternal(Asset $asset)
    {
        return $this->basePage->host !== $asset->host;
    }

    /**
     * Check every HTTP asset referenced by the given html.
     *
     * @param string $html
     * @param Asset $parentAsset Page the html was fetched from.
     */
    protected function checkAllAssets($html, $parentAsset)
    {
        /** @var Asset $asset */
        foreach ($this->getAllAssets($html, $parentAsset) as $asset) {
            if (!$asset->isHttp()) {
                continue;
            }

            $this->normalizeUrl($asset);
            if ($this->shouldBeChecked($asset)) {
                $this->checkAsset($asset);
            }
        }
    }

    /**
     * Collect all links, images, scripts and stylesheets in the given html.
     *
     * NOTE(review): the 'checkImages' / 'checkJS' / 'checkCSS' keys look like
     * names of configuration switches, but nothing here reads them — every
     * asset type is always collected.
     *
     * @param string $html
     * @param Asset $parentPage Page the html was fetched from.
     * @return Asset[]
     */
    protected function getAllAssets($html, $parentPage)
    {
        // Each entry: [xpath selector, attribute holding the URL, type label].
        $assetTypes = [
            'links' => ['//a', 'href', 'page'],
            'checkImages' => ['//img', 'src', 'image'],
            'checkJS' => ['//script', 'src', 'js file'],
            'checkCSS' => ['//link[@rel="stylesheet"]', 'href', 'css file'],
        ];

        $assets = [];

        foreach ($assetTypes as $definition) {
            list($selector, $urlAttribute, $type) = $definition;

            $assets = array_merge(
                $assets,
                $this->createAssetsFromDOMElements(
                    $html,
                    $selector,
                    $urlAttribute,
                    $type,
                    $parentPage
                )
            );
        }

        return $assets;
    }

    /**
     * Create an Asset for every element matching the selector that carries a
     * non-empty URL attribute.
     *
     * @param string $html
     * @param string $selector XPath expression selecting the elements.
     * @param string $urlAttribute Attribute that holds the URL (href, src).
     * @param string $type Label stored on the created assets.
     * @param Asset $parentPage Page the html was fetched from.
     * @return Asset[]
     */
    protected function createAssetsFromDOMElements(
        $html,
        $selector,
        $urlAttribute,
        $type,
        $parentPage
    ) {
        $assets = [];

        $crawler = new Crawler($html);

        /** @var \DOMElement $element */
        foreach ($crawler->filterXPath($selector) as $element) {
            $urlValue = $element->getAttribute($urlAttribute);
            if ($this->config->ignoreWhiteSpaces) {
                $urlValue = trim($urlValue);
            }

            // Explicit comparison instead of empty(): "0" is a valid relative
            // URL, while a value that is empty after trimming is no URL at all.
            if ($urlValue === '') {
                continue;
            }

            $assets[] = new Asset(
                $urlValue,
                $parentPage,
                $element->ownerDocument->saveHTML($element),
                $type
            );
        }

        return $assets;
    }

    /**
     * Tell whether any content-type header of the response mentions text/html.
     *
     * @param \Psr\Http\Message\ResponseInterface $response
     * @return bool
     */
    protected function isHtmlPage(ResponseInterface $response)
    {
        foreach ($response->getHeader('content-type') as $header) {
            if (stripos($header, 'text/html') !== false) {
                return true;
            }
        }

        return false;
    }

    /**
     * Decide whether the asset still has to be requested.
     *
     * An asset is skipped when its URL equals or matches (case-insensitive
     * regular expression) one of the configured excluded URLs, when it is
     * external and external checks are disabled, or when it was already
     * checked.
     *
     * @param Asset $asset
     * @return bool
     */
    protected function shouldBeChecked(Asset $asset)
    {
        $url = $asset->getUrl();
        $excludedUrls = $this->config->excludedUrls;

        if (in_array($url, $excludedUrls)) {
            return false;
        }

        foreach ($excludedUrls as $excludedUrl) {
            // Escape slashes that are not escaped yet; otherwise a pattern
            // such as "example.com/admin" would end the "/" delimiter early
            // and preg_match() would fail with "Unknown modifier".
            $escaped = preg_replace('~(?<!\\\\)/~', '\\\\/', $excludedUrl);
            if (preg_match('/' . $escaped . '/i', $url) === 1) {
                return false;
            }
        }

        if (!$this->config->checkExternal && $this->isExternal($asset)) {
            return false;
        }

        return !$this->isAlreadyChecked($asset);
    }

    /**
     * Tell whether the asset's URL is among the already checked assets.
     *
     * The comparison is deliberately non-strict: $checkedAssets holds Asset
     * objects while the needle is the URL — presumably this relies on Asset
     * being convertible to its URL string; verify against the Asset class.
     *
     * @param \SiteChecker\Asset $asset
     * @return bool
     */
    protected function isAlreadyChecked(Asset $asset)
    {
        return in_array($asset->getUrl(), $this->checkedAssets);
    }

    /**
     * Normalize the given url.
     *
     * Relative URLs receive the base page's scheme, host and port;
     * protocol-independent URLs receive the base page's scheme. The fragment
     * is removed in every case.
     *
     * @param \SiteChecker\Asset $asset
     * @return Asset Result of Asset::removeFragment() — presumably the same
     *               asset instance; TODO confirm.
     */
    protected function normalizeUrl(Asset $asset)
    {
        if ($asset->isRelative()) {
            $asset->setScheme($this->basePage->scheme)
                ->setHost($this->basePage->host)
                ->setPort($this->basePage->port);
        }

        if ($asset->isProtocolIndependent()) {
            $asset->setScheme($this->basePage->scheme);
        }

        return $asset->removeFragment();
    }

    /**
     * @return \SiteChecker\Asset[] All assets checked so far.
     */
    public function getResults()
    {
        return $this->checkedAssets;
    }
}
364