Completed
Push — master ( b77d55...ba877c )
by Yoshiaki
22s
created

Checker.php (11 issues)

Upgrade to new PHP Analysis Engine

These results are based on our legacy PHP analysis; consider migrating to our new PHP analysis engine instead. Learn more

1
<?php
2
3
namespace Error;
4
5
require_once (__DIR__ . '/vendor/autoload.php');
6
7
/**
8
 * Description of Checker Main
9
 *
10
 * @author bootjp
11
 */
12
class Checker
13
{
14
    protected $client;
15
16
    protected $contentsSize = 500;
17
18
    protected $doubleCheck = true;
19
20
    protected $recursion = false;
21
22
    protected $garbage = [];
23
24
    protected $isContentsFetch = true;
25
26
27
    /**
     * Build the HTTP client and apply the optional settings found in $args.
     *
     * Recognised keys (all optional):
     *  - 'contentSize'     : cast to int and stored in $contentsSize
     *  - 'doubleCheck'     : cast to bool and stored in $doubleCheck
     *  - 'isContentsFetch' : cast to bool and stored in $isContentsFetch
     *  - 'recursion'       : cast to bool and stored in $recursion
     *  - 'auth'            : "username:password" string, split on the first ":"
     *                        and installed as the client's default 'auth' option
     *
     * @param array $args
     */
    public function __construct(array $args)
    {
        // 'exceptions' => false: presumably keeps Guzzle from throwing on
        // 4xx/5xx responses so the status code can be inspected instead.
        $this->client = new \GuzzleHttp\Client([
            'defaults' => [
                'exceptions' => false,
                'headers' => [
                    'User-Agent' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) ' .
                        'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36'
                ]
            ]
        ]);

        if (array_key_exists('contentSize', $args)) {
            $this->contentsSize = (int) $args['contentSize'];
        }

        if (array_key_exists('doubleCheck', $args)) {
            $this->doubleCheck = (bool) $args['doubleCheck'];
        }

        if (array_key_exists('isContentsFetch', $args)) {
            $this->isContentsFetch = (bool) $args['isContentsFetch'];
        }

        if (array_key_exists('recursion', $args)) {
            $this->recursion = (bool) $args['recursion'];
        }

        if (array_key_exists('auth', $args)) {
            // Pad to two elements so a value without ":" yields an empty
            // password instead of reading an undefined offset.
            list($username, $password) = array_pad(explode(':', $args['auth'], 2), 2, '');
            $this->client->setDefaultOption('auth', [$username, $password]);
        }
    }
65
66
    /**
     * Check a URL (or the links found on it) and classify every target.
     *
     * When content fetching is enabled, the given URL is downloaded first and
     * the list returned by fetchByContents() becomes the set of URLs to check;
     * otherwise $url itself (string, stringable object or array of URLs) is
     * checked directly. Each target is requested once and passed through the
     * header check and the contents check.
     *
     * @param  mixed $url [require]
     * @return array 'white' => targets that passed, 'black' => targets that
     *               failed (with a 'status' reason), 'UnknownLinks' => hrefs
     *               collected in $this->garbage
     * @throws \ErrorException propagated from fetchByContents()
     * @throws \ReflectionException when $url is null
     */
    public function start($url)
    {
        $urlList = [];
        $result = [
            'white' => [],
            'black' => [],
        ];

        // Validate before any network access; previously a null URL was
        // fetched first and this guard could never fire in fetch mode.
        if (is_null($url)) {
            throw new \ReflectionException('Start URL must not be null.');
        }

        if ($this->isContentsFetch) {
            echo 'Contents fetching..';
            $url = $this->fetchByContents($url);

            if ($this->recursion) {
                $url = $this->urlFilter($url);
            }
        }

        if (is_array($url)) {
            $urlList = $this->urlFilter($url);
        } elseif (is_string($url)) {
            $urlList[] = $url;
        } elseif (is_object($url)) {
            $urlList[] = (string) $url;
        }

        echo "\n";
        echo 'Cheking..';

        foreach ($urlList as $key => $target) {
            $metaData = null;

            try {
                $metaData = $this->client->get($target);
            } catch (\Exception $e) {
                echo "\n {$target}\t {$e->getMessage()}";
                // No response to inspect: report the target as failed rather
                // than passing an undefined/stale response to the checks.
                $result['black'][$key]['url'] = $target;
                $result['black'][$key]['status'] = 'NG : ' . $e->getMessage();
            }

            if ($metaData !== null) {
                $hardCheck = (array) $this->hardCheckByHeader($metaData);
                $softCheck = (array) $this->softCheckByContents($metaData);

                if ($hardCheck['result'] && $softCheck['result']) {
                    $result['white'][$key]['url'] = $target;
                    $result['white'][$key]['status'] = 'OK';
                } else {
                    $result['black'][$key]['url'] = $target;
                    // Prefer the header-level reason; fall back to the
                    // contents-level reason when the header check passed.
                    $result['black'][$key]['status'] = array_key_exists('status', $hardCheck)
                        ? $hardCheck['status']
                        : $softCheck['status'];
                }
            }

            // Throttle: wait half a second between requests.
            usleep(500000);
            echo '.';
        }

        $result['UnknownLinks'] = $this->garbage;

        return $result;
    }
125
126
    /**
     * Download a page and collect the link targets of its <a href="..."> tags.
     *
     * Absolute http(s) links are kept as-is. Other hrefs are appended to
     * $baseUrl (plain concatenation, no relative-URL resolution) unless they
     * look like a fragment ("#...") or contain "javascript", in which case
     * they are stored in $this->garbage. The base URL itself is returned under
     * the 'baseUrl' key.
     *
     * @param  mixed $baseUrl
     * @return array URlList (duplicates removed, original keys preserved)
     * @throws \ErrorException when the match result has no 'url' group
     */
    private function fetchByContents($baseUrl)
    {
        $absoluteUrl = '{https?://[\w/:%#\$&\?\(\)~\.=\+\-]+}i';

        $urlList = [];
        $matches = [];
        // Default to an empty document so a failed request simply yields no
        // links instead of reading an undefined variable below.
        $contents = '';
        $urlList['baseUrl'] = (string) $baseUrl;

        try {
            $contents = $this->client->get($baseUrl)->getBody()->getContents();
        } catch (\Exception $e) {
            echo "\n {$baseUrl}\t {$e->getMessage()}";
        }

        // The quote class is just " or ' (the original class also accepted
        // a literal "|" by accident).
        preg_match_all('{<a.+?href=["\'](?<url>.+?)["\'].*?>}is', $contents, $matches);

        if (!array_key_exists('url', $matches)) {
            throw new \ErrorException('Not match contents on url.');
        }

        foreach ($matches['url'] as $url) {
            if (preg_match($absoluteUrl, $url)) {
                $urlList[] = $url;
            } elseif (!preg_match($absoluteUrl, $baseUrl . $url)) {
                $this->garbage[] = $url;
            } elseif (preg_match('{(^#[A-Z0-9].+?$)}i', $url) || preg_match('#javascript.*#i', $url)) {
                // In-page anchors and javascript: pseudo links are not checkable.
                $this->garbage[] = $url;
            } else {
                $urlList[] = $baseUrl . $url;
            }

            // Progress marker only; no request is made here, so the former
            // half-second sleep per link was pure delay and has been dropped.
            echo '.';
        }

        return array_unique($urlList);
    }
171
172
    /**
     * Error check by HTTP status code.
     *
     * Returns a failing result for 401/403/404/500/502/503 and for 301/302/308;
     * every other status code passes.
     *
     * The previous implementation wrapped the whole comparison in is_int(),
     * i.e. is_int(<bool>), which is always false, so no status code was ever
     * rejected. The trailing 200/304 and content-length branches all returned
     * a passing result as well and are folded into the final return.
     *
     * @param \GuzzleHttp\Message\Response $metaData
     * @return array 'result' => bool, plus 'status' => string on failure
     */
    private function hardCheckByHeader(\GuzzleHttp\Message\Response $metaData)
    {
        // Normalise to int: the status code may be reported as a numeric
        // string, which would defeat strict comparison.
        $statusCode = (int) $metaData->getStatusCode();

        if (in_array($statusCode, [401, 403, 404, 500, 502, 503], true)) {
            return [
                'result' => false,
                'status' => 'NG : status code 40X or 50X'
            ];
        }

        if (in_array($statusCode, [301, 302, 308], true)) {
            return [
                'result' => false,
                'status' => 'NG : status code 30X'
            ];
        }

        return [
            'result' => true
        ];
    }
219
220
    /**
     * Soft-404 detection based on the response body.
     *
     * A body whose reported size is not larger than $this->contentsSize fails
     * immediately. Otherwise, when double checking is enabled, the verdict of
     * softCheckByContentsWords() is used.
     *
     * @param \GuzzleHttp\Message\Response $metaData
     * @return array 'result' => bool, plus 'status' => string on failure
     */
    public function softCheckByContents(\GuzzleHttp\Message\Response $metaData)
    {
        $passed = ['result' => true];

        $bodySize = $metaData->getBody()->getSize();
        if ($bodySize <= $this->contentsSize) {
            return [
                'result' => false,
                'status' => 'NG : contentsSize'
            ];
        }

        // Size alone is enough when the word-based second pass is disabled.
        if (!$this->doubleCheck) {
            return $passed;
        }

        $wordCheck = $this->softCheckByContentsWords($metaData);
        if ($wordCheck['result']) {
            return $passed;
        }

        return [
            'result' => $wordCheck['result'],
            'status' => $wordCheck['status']
        ];
    }
248
249
    /**
     * Soft-404 check by words: fails when the body contains any of the
     * phrases returned by getSoftErrorWords() (case-insensitive match).
     *
     * @param \GuzzleHttp\Message\Response $metaData
     * @return array Result: 'result' => bool, plus 'status' naming the matched
     *               word on failure
     */
    private function softCheckByContentsWords(\GuzzleHttp\Message\Response $metaData)
    {
        // Read the body once, outside the loop. getContents() returns only the
        // remaining part of the stream, so calling it per word left every word
        // after the first matching against an empty string. Casting the stream
        // to string is expected to return the full body from the beginning
        // (NOTE(review): confirm against the Guzzle stream version in use).
        $contents = (string) $metaData->getBody();

        foreach (self::getSoftErrorWords() as $word) {
            if (mb_stripos($contents, $word) !== false) {
                return [
                    'result' => false,
                    'status' => 'NG WORD : ' . $word
                ];
            }
        }

        return [
            'result' => true
        ];
    }
270
271
    /**
     * Return the soft-404 phrases, one per non-empty line of
     * ErrorPageWords.txt located next to this file.
     *
     * @return array list of phrases; empty when the file cannot be read
     */
    private static function getSoftErrorWords()
    {
        $words = file(__DIR__ . '/ErrorPageWords.txt', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);

        // file() returns false on failure; honour the documented array return
        // so callers can iterate the result unconditionally.
        return $words === false ? [] : $words;
    }
280
281
    /**
     * Flatten a (possibly nested) array of URLs into a single list.
     *
     * Leaf values are collected in traversal order, duplicates are removed
     * and the result is re-indexed from zero.
     *
     * @param array $urlList
     * @return array flat, de-duplicated URL list
     */
    private function urlFilter(array $urlList)
    {
        $flattened = [];

        // Visit every leaf value and append it to the flat list.
        $collectLeaf = function ($leaf) use (&$flattened) {
            $flattened[] = $leaf;
        };
        array_walk_recursive($urlList, $collectLeaf);

        $distinct = array_unique($flattened);

        return array_values($distinct);
    }
295
}
296