Completed
Pull Request — master (#27)
by Yoshiaki
02:52
created

Checker.php (11 issues)

Upgrade to new PHP Analysis Engine

These results are based on our legacy PHP analysis, consider migrating to our new PHP analysis engine instead. Learn more

1
<?php
2
3
namespace Error;
4
5
require_once (__DIR__ . '/vendor/autoload.php');
6
7
/**
 * Link checker.
 *
 * Optionally collects the links found on a start page, requests every URL
 * and sorts the URLs into "white" (looks healthy) and "black" (looks broken)
 * using the HTTP status code, the body size and a list of error-page words.
 *
 * @author bootjp
 */
class Checker
{
    /**
     * HTTP status codes reported as "NG : status code 40X or 50X".
     *
     * @var int[]
     */
    private static $errorStatusCodes = [401, 403, 404, 500, 502, 503];

    /**
     * HTTP status codes reported as "NG : status code 30X".
     *
     * @var int[]
     */
    private static $redirectStatusCodes = [301, 302, 308];

    /**
     * HTTP client used for every request.
     *
     * @var \GuzzleHttp\Client
     */
    protected $client;

    /**
     * Bodies of this size or smaller are reported as soft errors.
     *
     * @var int
     */
    protected $contentsSize = 500;

    /**
     * Whether bodies are additionally scanned for known error-page words.
     *
     * @var bool
     */
    protected $doubleCheck = true;

    /**
     * When true, start() passes the fetched link list through urlFilter()
     * right after fetching it.
     *
     * @var bool
     */
    protected $recursion = false;

    /**
     * Links that were found but not checked (fragments, javascript: links,
     * anything that does not form an http(s) URL). Returned by start() under
     * the 'UnknownLinks' key.
     *
     * @var array
     */
    protected $garbage = [];

    /**
     * Whether start() treats its argument as a page whose links are fetched
     * and checked, rather than as the URL(s) to check directly.
     *
     * @var bool
     */
    protected $isContentsFetch = true;

    /**
     * Builds the HTTP client and applies the optional settings.
     *
     * Recognised keys of $args:
     *  - 'contentSize'     (int)    overrides $contentsSize
     *  - 'doubleCheck'     (bool)   overrides $doubleCheck
     *  - 'isContentsFetch' (bool)   overrides $isContentsFetch
     *  - 'recursion'       (bool)   overrides $recursion
     *  - 'auth'            (string) "user:password" sent with every request
     *
     * @param array $args
     */
    public function __construct(array $args)
    {
        $this->client = new \GuzzleHttp\Client([
            'defaults' => [
                // HTTP error statuses are inspected by this class instead of being thrown.
                'exceptions' => false,
                'headers' => [
                    'User-Agent' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) ' .
                        'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36',
                ],
            ],
        ]);

        if (array_key_exists('contentSize', $args)) {
            $this->contentsSize = (int) $args['contentSize'];
        }

        foreach (['doubleCheck', 'isContentsFetch', 'recursion'] as $flag) {
            if (array_key_exists($flag, $args)) {
                $this->{$flag} = (bool) $args[$flag];
            }
        }

        if (array_key_exists('auth', $args)) {
            // Pad so that a value without ':' yields an empty password
            // instead of an undefined-offset notice.
            list($username, $password) = array_pad(explode(':', (string) $args['auth'], 2), 2, '');
            $this->client->setDefaultOption('auth', [$username, $password]);
        }
    }

    /**
     * Checks one or more URLs and reports the outcome.
     *
     * When content fetching is enabled, $url is treated as a page: its links
     * are extracted and checked together with the page itself. Otherwise
     * $url (string, object castable to string, or possibly nested array of
     * URLs) is checked directly.
     *
     * @param  mixed $url [require]
     * @return array ['white' => [...], 'black' => [...], 'UnknownLinks' => [...]];
     *               white/black entries are ['url' => string, 'status' => string]
     * @throws \ErrorException
     * @throws \ReflectionException when $url is null
     */
    public function start($url)
    {
        // Validate first: once the contents are fetched $url is always an array,
        // so a later null check could never fire.
        if (is_null($url)) {
            throw new \ReflectionException('Start URL is not null.');
        }

        $result = ['white' => [], 'black' => []];
        $urlList = [];

        if ($this->isContentsFetch) {
            echo 'Contents fetching..';
            $url = $this->fetchByContents($url);

            if ($this->recursion) {
                $url = $this->urlFilter($url);
            }
        }

        if (is_array($url)) {
            $urlList = $this->urlFilter($url);
        } elseif (is_string($url)) {
            $urlList[] = $url;
        } elseif (is_object($url)) {
            $urlList[] = (string) $url;
        }

        echo "\n";
        echo 'Cheking..';

        foreach ($urlList as $key => $target) {
            $metaData = null;
            $failure = null; // stays null while the URL passes every check

            try {
                $metaData = $this->client->get($target);
            } catch (\Exception $e) {
                // A failed request is a broken link; never fall through with a
                // missing response or the response of the previous URL.
                echo "\n {$target}\t {$e->getMessage()}";
                $failure = 'NG : ' . $e->getMessage();
            }

            if ($metaData !== null) {
                $hardCheck = $this->hardCheckByHeader($metaData);
                if (!$hardCheck['result']) {
                    $failure = $hardCheck['status'];
                } else {
                    $softCheck = $this->softCheckByContents($metaData);
                    if (!$softCheck['result']) {
                        $failure = $softCheck['status'];
                    }
                }
            }

            if ($failure === null) {
                $result['white'][$key] = ['url' => $target, 'status' => 'OK'];
            } else {
                $result['black'][$key] = ['url' => $target, 'status' => $failure];
            }

            // Pause between requests.
            usleep(500000);
            echo '.';
        }
        $result['UnknownLinks'] = $this->garbage;

        return $result;
    }

    /**
     * Fetches a page and collects the targets of its anchor tags.
     *
     * Absolute http(s) links are kept as they are; other links are appended
     * to $baseUrl (plain concatenation, no relative-path resolution).
     * Fragment-only links, javascript: links and anything that does not form
     * an http(s) URL are stored in $this->garbage instead.
     *
     * @param  mixed $baseUrl
     * @return array unique URL list; the page itself is stored under the 'baseUrl' key
     * @throws \ErrorException
     */
    private function fetchByContents($baseUrl)
    {
        $httpPattern = '{https?://[\w/:%#\$&\?\(\)~\.=\+\-]+}i';
        $urlList = ['baseUrl' => (string) $baseUrl];
        $matches = [];
        $contents = ''; // stays empty when the request fails, so no links are found

        try {
            // The string cast reads the whole body and tolerates a missing one.
            $contents = (string) $this->client->get($baseUrl)->getBody();
        } catch (\Exception $e) {
            echo "\n {$baseUrl}\t {$e->getMessage()}";
        }

        // The quote classes accept only " and ' so that a '|' inside the
        // href value does not end the match early.
        preg_match_all('{<a.+?href=["\'](?<url>.+?)["\'].*?>}is', $contents, $matches);

        if (!array_key_exists('url', $matches)) {
            throw new \ErrorException('Not match contents on url.');
        }

        foreach ($matches['url'] as $url) {
            if (preg_match($httpPattern, $url)) {
                $urlList[] = $url;
            } elseif (!preg_match($httpPattern, $baseUrl . $url)) {
                $this->garbage[] = $url;
            } elseif (preg_match('{(^#[A-Z0-9].+?$)}i', $url) || preg_match('#javascript.*#i', $url)) {
                // In-page anchors and script pseudo-links cannot be requested.
                $this->garbage[] = $url;
            } else {
                $urlList[] = $baseUrl . $url;
            }

            echo '.';
        }

        return array_unique($urlList);
    }

    /**
     * Error check by HTTP status code.
     *
     * 401, 403, 404, 500, 502 and 503 as well as 301, 302 and 308 are
     * reported as failures; every other status passes.
     *
     * @param \GuzzleHttp\Message\Response $metaData
     * @return array ['result' => bool] plus 'status' => string on failure
     */
    private function hardCheckByHeader(\GuzzleHttp\Message\Response $metaData)
    {
        // Normalise to int so the strict comparison works whether the client
        // reports the code as a string or as an integer.
        $statusCode = (int) $metaData->getStatusCode();

        if (in_array($statusCode, self::$errorStatusCodes, true)) {
            return [
                'result' => false,
                'status' => 'NG : status code 40X or 50X'
            ];
        }

        if (in_array($statusCode, self::$redirectStatusCodes, true)) {
            return [
                'result' => false,
                'status' => 'NG : status code 30X'
            ];
        }

        return [
            'result' => true
        ];
    }

    /**
     * Soft404 check by contents length, optionally followed by the
     * error-word check when $doubleCheck is enabled.
     *
     * @param \GuzzleHttp\Message\Response $metaData
     * @return array ['result' => bool] plus 'status' => string on failure
     */
    public function softCheckByContents(\GuzzleHttp\Message\Response $metaData)
    {
        $body = $metaData->getBody();
        // A missing body counts as size 0 instead of failing on a null call.
        $size = $body ? $body->getSize() : 0;

        if ($size <= $this->contentsSize) {
            return [
                'result' => false,
                'status' => 'NG : contentsSize'
            ];
        }

        if ($this->doubleCheck) {
            $wordCheck = $this->softCheckByContentsWords($metaData);
            if (!$wordCheck['result']) {
                return [
                    'result' => $wordCheck['result'],
                    'status' => $wordCheck['status']
                ];
            }
        }

        return [
            'result' => true
        ];
    }

    /**
     * Soft404 error check by words: fails when the body contains one of the
     * configured error-page words (case-insensitive).
     *
     * @param \GuzzleHttp\Message\Response $metaData
     * @return array ['result' => bool] plus 'status' => string on failure
     */
    private function softCheckByContentsWords(\GuzzleHttp\Message\Response $metaData)
    {
        // Read the body once, before the loop, so every word is matched
        // against the complete contents.
        $contents = (string) $metaData->getBody();

        foreach (self::getSoftErrorWords() as $word) {
            if (mb_stripos($contents, $word) !== false) {
                return [
                    'result' => false,
                    'status' => 'NG WORD : ' . $word
                ];
            }
        }

        return [
            'result' => true
        ];
    }

    /**
     * Returns the words that mark a soft-404 page, one per non-empty line of
     * ErrorPageWords.txt next to this file.
     *
     * @return array empty when the word list cannot be read
     */
    private static function getSoftErrorWords()
    {
        $words = file(__DIR__ . '/ErrorPageWords.txt', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);

        // file() returns false on failure; callers iterate over the result.
        return $words === false ? [] : $words;
    }

    /**
     * Flattens a possibly nested array into a single-level list of unique
     * values, re-indexed from 0.
     *
     * @param array $urlList
     * @return array
     */
    private function urlFilter(array $urlList)
    {
        $flat = [];
        array_walk_recursive($urlList, function ($value) use (&$flat) {
            $flat[] = $value;
        });

        return array_values(array_unique($flat));
    }
}
296