Completed
Pull Request — master (#27)
by Yoshiaki
02:52
created

Checker.php (1 issue)

Upgrade to new PHP Analysis Engine

These results are based on our legacy PHP analysis; consider migrating to our new PHP analysis engine instead. Learn more

1
<?php
2
3
namespace Error;
4
5
require_once (__DIR__ . '/vendor/autoload.php');
6
7
/**
 * Dead-link checker.
 *
 * Optionally fetches a start page, extracts the links it contains and then
 * requests every link, classifying each one as "white" (looks fine) or
 * "black" (hard HTTP error, suspiciously small body, or a body containing a
 * known error-page phrase, i.e. a "soft 404").
 *
 * @author bootjp
 */
class Checker
{
    /**
     * HTTP client used for every request.
     *
     * @var \GuzzleHttp\Client
     */
    protected $client;

    /**
     * Bodies of this many bytes or fewer are treated as error pages.
     *
     * @var int
     */
    protected $contentsSize = 500;

    /**
     * Whether the body is additionally scanned for known error-page phrases.
     *
     * @var bool
     */
    protected $doubleCheck = true;

    /**
     * Whether the fetched link list is flattened right after fetching.
     *
     * @var bool
     */
    protected $recursion = false;

    /**
     * Links found on the page that were not checked (fragments, javascript:, ...).
     *
     * @var array
     */
    protected $garbage = [];

    /**
     * Whether the start URL is fetched and scanned for links before checking.
     *
     * @var bool
     */
    protected $isContentsFetch = true;

    /**
     * Build the HTTP client and apply the optional settings.
     *
     * Recognised keys of $args: 'contentSize' (int), 'doubleCheck' (bool),
     * 'isContentsFetch' (bool), 'recursion' (bool) and 'auth'
     * ("username:password" for HTTP authentication).
     *
     * @param array $args
     */
    public function __construct(array $args)
    {
        $this->client = new \GuzzleHttp\Client([
            'defaults' => [
                // HTTP error statuses are inspected by this class, not thrown.
                'exceptions' => false,
                'headers' => [
                    'User-Agent' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) ' .
                        'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36',
                ],
            ],
        ]);

        if (array_key_exists('contentSize', $args)) {
            $this->contentsSize = (int) $args['contentSize'];
        }

        if (array_key_exists('doubleCheck', $args)) {
            $this->doubleCheck = (bool) $args['doubleCheck'];
        }

        if (array_key_exists('isContentsFetch', $args)) {
            $this->isContentsFetch = (bool) $args['isContentsFetch'];
        }

        if (array_key_exists('recursion', $args)) {
            $this->recursion = (bool) $args['recursion'];
        }

        if (array_key_exists('auth', $args)) {
            // array_pad() keeps list() safe when the value contains no ':'
            // (the password then defaults to an empty string).
            list($username, $password) = array_pad(explode(':', (string) $args['auth'], 2), 2, '');
            $this->client->setDefaultOption('auth', [$username, $password]);
        }
    }

    /**
     * Run the check and return the classified links.
     *
     * The returned array has three keys: 'white' and 'black' (each a list of
     * ['url' => ..., 'status' => ...] entries keyed by the link's position)
     * and 'UnknownLinks' (links that were found but not checked).
     *
     * @param  mixed $url start URL (string, array of URLs, or stringable object) [required]
     * @return array
     * @throws \ErrorException
     * @throws \ReflectionException when $url is null
     */
    public function start($url)
    {
        // Validate before any request is made with the value.
        if (is_null($url)) {
            throw new \ReflectionException('Start URL is not null.');
        }

        $result = [
            'white' => [],
            'black' => [],
        ];
        $urlList = [];

        if ($this->isContentsFetch) {
            echo 'Contents fetching..';
            $url = $this->fetchByContents($url);

            if ($this->recursion) {
                $url = $this->urlFilter($url);
            }
        }

        if (is_array($url)) {
            $urlList = $this->urlFilter($url);
        } elseif (is_string($url)) {
            $urlList[] = $url;
        } elseif (is_object($url)) {
            $urlList[] = (string) $url;
        }

        echo "\n";
        echo 'Cheking..';

        foreach ($urlList as $key => $target) {
            $failure = $this->findFailure($target);

            if ($failure === null) {
                $result['white'][$key]['url'] = $target;
                $result['white'][$key]['status'] = 'OK';
            } else {
                $result['black'][$key]['url'] = $target;
                $result['black'][$key]['status'] = $failure;
            }

            // Throttle: at most two requests per second against the target.
            usleep(500000);
            echo '.';
        }
        $result['UnknownLinks'] = $this->garbage;

        return $result;
    }

    /**
     * Request one URL and report why it is considered broken, if it is.
     *
     * A request that throws is reported as a failure of that URL; it is
     * never judged on the basis of another URL's response.
     *
     * @param  string $url
     * @return string|null failure description, or null when the URL passed every check
     */
    private function findFailure($url)
    {
        try {
            $metaData = $this->client->get($url);
        } catch (\Exception $e) {
            echo "\n {$url}\t {$e->getMessage()}";

            return 'NG : ' . $e->getMessage();
        }

        $hardCheck = $this->hardCheckByHeader($metaData);
        if (!$hardCheck['result']) {
            return $hardCheck['status'];
        }

        $softCheck = $this->softCheckByContents($metaData);
        if (!$softCheck['result']) {
            return $softCheck['status'];
        }

        return null;
    }

    /**
     * Fetch a page and collect the links found in its anchor tags.
     *
     * Absolute http(s) links are kept as they are; other links are appended
     * to $baseUrl. Fragment-only and javascript: links, and anything that
     * does not form an http(s) URL, are recorded in $this->garbage instead.
     *
     * @param  mixed $baseUrl
     * @return array URL list; the 'baseUrl' key holds the start URL itself
     * @throws \ErrorException
     */
    private function fetchByContents($baseUrl)
    {
        $urlList = ['baseUrl' => (string) $baseUrl];
        $matches = [];
        // Stays empty when the request fails, so the scan below finds nothing.
        $contents = '';

        try {
            $contents = (string) $this->client->get($baseUrl)->getBody();
        } catch (\Exception $e) {
            echo "\n {$baseUrl}\t {$e->getMessage()}";
        }

        // The closing quote must be the same character as the opening one
        // (\1), so URLs containing '|' or the other quote kind stay intact.
        preg_match_all('{<a.+?href=(["\'])(?<url>.+?)\1.*?>}is', $contents, $matches);

        if (!array_key_exists('url', $matches)) {
            throw new \ErrorException('Not match contents on url.');
        }

        $httpUrl = '{https?://[\w/:%#\$&\?\(\)~\.=\+\-]+}i';

        foreach ($matches['url'] as $url) {
            if (preg_match($httpUrl, $url)) {
                $urlList[] = $url;
            } elseif (!preg_match($httpUrl, $baseUrl . $url)) {
                $this->garbage[] = $url;
            } elseif (preg_match("{(^#[A-Z0-9].+?$)}i", $url)) {
                // In-page fragment: nothing to request.
                $this->garbage[] = $url;
            } elseif (preg_match('{^\s*javascript:}i', $url)) {
                // Only the javascript: scheme, not paths that merely contain the word.
                $this->garbage[] = $url;
            } else {
                $urlList[] = $baseUrl . $url;
            }

            echo '.';
        }

        return array_unique($urlList);
    }

    /**
     * Hard error check based on the status code and headers.
     *
     * @param  \GuzzleHttp\Message\Response $metaData
     * @return array ['result' => bool] plus 'status' (string) on failure
     */
    private function hardCheckByHeader(\GuzzleHttp\Message\Response $metaData)
    {
        $headers = array_change_key_case($metaData->getHeaders());
        // The client may report the code as a numeric string; normalise it so
        // the strict comparisons below behave the same either way.
        $statusCode = (int) $metaData->getStatusCode();

        $isErrorPageCode = [
            '40x' => [401, 403, 404],
            '50x' => [500, 502, 503],
            '30x' => [301, 302, 308],
        ];

        foreach ($isErrorPageCode as $errorType => $statuses) {
            if (in_array($statusCode, $statuses, true)) {
                return [
                    'result' => false,
                    'status' => "NG : status code {$errorType}",
                ];
            }
        }

        if ($statusCode === 200 || $statusCode === 304) {
            return [
                'result' => true,
            ];
        }

        if (array_key_exists('content-length', $headers)) {
            $length = $headers['content-length'];
            if (is_array($length)) {
                $length = reset($length);
            }

            if ((int) $length < $this->contentsSize) {
                return [
                    'result' => false,
                    'status' => 'NG : contentsSize',
                ];
            }
        }

        return [
            'result' => true,
        ];
    }

    /**
     * Soft-404 check based on the body size and, when enabled, its wording.
     *
     * @param  \GuzzleHttp\Message\Response $metaData
     * @return array ['result' => bool] plus 'status' (string) on failure
     */
    public function softCheckByContents(\GuzzleHttp\Message\Response $metaData)
    {
        $body = $metaData->getBody();
        $size = ($body === null) ? 0 : $body->getSize();
        if ($size === null) {
            // Size not reported by the stream: measure the body itself.
            $size = strlen((string) $body);
        }

        if ($size <= $this->contentsSize) {
            return [
                'result' => false,
                'status' => 'NG : contentsSize',
            ];
        }

        if ($this->doubleCheck) {
            $wordCheck = $this->softCheckByContentsWords($metaData);
            if (!$wordCheck['result']) {
                return [
                    'result' => $wordCheck['result'],
                    'status' => $wordCheck['status'],
                ];
            }
        }

        return [
            'result' => true,
        ];
    }

    /**
     * Soft-404 check: look for known error-page phrases in the body.
     *
     * @param  \GuzzleHttp\Message\Response $metaData
     * @return array ['result' => bool] plus 'status' (string) on failure
     */
    private function softCheckByContentsWords(\GuzzleHttp\Message\Response $metaData)
    {
        // Read the body once, up front, so every phrase is matched against
        // the same full text rather than re-reading the stream per phrase.
        $contents = (string) $metaData->getBody();

        foreach (self::getSoftErrorWords() as $word) {
            if (mb_stripos($contents, $word) !== false) {
                return [
                    'result' => false,
                    'status' => 'NG WORD : ' . $word,
                ];
            }
        }

        return [
            'result' => true,
        ];
    }

    /**
     * Load the error-page phrases, one per line, from ErrorPageWords.txt.
     *
     * @return array list of non-empty, trimmed phrases (empty when the file cannot be read)
     */
    private static function getSoftErrorWords()
    {
        $lines = file(__DIR__ . '/ErrorPageWords.txt', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
        if ($lines === false) {
            return [];
        }

        // trim() drops a stray "\r" from CRLF files; blank entries are
        // removed because an empty needle must never count as a match.
        return array_values(array_filter(array_map('trim', $lines), 'strlen'));
    }

    /**
     * Flatten a (possibly nested) array of URLs into a unique, re-indexed list.
     *
     * @param  array $urlList
     * @return array
     */
    private function urlFilter(array $urlList)
    {
        $result = [];
        array_walk_recursive($urlList, function ($v) use (&$result) {
            $result[] = $v;
        });

        return array_values(array_unique($result));
    }
}
291