Completed
Pull Request — master (#31)
by Yoshiaki
01:45
created

Checker.php (1 issue)

Severity

Upgrade to new PHP Analysis Engine

These results are based on our legacy PHP analysis; consider migrating to our new PHP analysis engine instead. Learn more

1
<?php
2
3
namespace Error;
4
5
require_once (__DIR__ . '/vendor/autoload.php');
6
7
/**
8
 * Description of Checker Main
9
 *
10
 * @author bootjp
11
 */
12
class Checker
13
{
14
    protected $client;
15
16
    protected $contentsSize = 500;
17
18
    protected $doubleCheck = true;
19
20
    protected $recursion = false;
21
22
    protected $garbage = [];
23
24
    protected $isContentsFetch = true;
25
26
27
    /**
     * Build the HTTP client and apply the optional settings found in $args.
     *
     * Recognised keys (all optional; anything else is ignored):
     *  - 'contentSize'     : cast to int, stored as the size threshold used by the checks
     *  - 'doubleCheck'     : cast to bool, toggles the word-based soft-404 check
     *  - 'isContentsFetch' : cast to bool, makes start() harvest links from the page first
     *  - 'recursion'       : cast to bool, makes start() flatten the harvested link list
     *  - 'auth'            : "username:password" credentials used as the client's default auth
     *
     * @param array $args
     */
    public function __construct(array $args)
    {
        $this->client = new \GuzzleHttp\Client([
            'defaults' => [
                // Error statuses are evaluated by the header check rather than thrown
                // (per Guzzle's "exceptions" request option).
                'exceptions' => false,
                'headers' => [
                    'User-Agent' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) ' .
                        'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36'
                ]
            ]
        ]);

        if (array_key_exists('contentSize', $args)) {
            $this->contentsSize = (int) $args['contentSize'];
        }

        if (array_key_exists('doubleCheck', $args)) {
            $this->doubleCheck = (bool) $args['doubleCheck'];
        }

        if (array_key_exists('isContentsFetch', $args)) {
            $this->isContentsFetch = (bool) $args['isContentsFetch'];
        }

        if (array_key_exists('recursion', $args)) {
            $this->recursion = (bool) $args['recursion'];
        }

        if (array_key_exists('auth', $args)) {
            // Split on the first ':' only, so the password itself may contain colons.
            // array_pad() guarantees two elements: a value without ':' gives an empty
            // password instead of an "undefined offset" notice and a null password.
            list($username, $password) = array_pad(explode(':', (string) $args['auth'], 2), 2, '');
            $this->client->setDefaultOption('auth', [$username, $password]);
        }
    }
65
66
    /**
     * Check a URL (or a list of URLs) and sort the results into passing and failing.
     *
     * When content fetching is enabled, $url is first replaced by the list of links
     * harvested from that page (flattened through urlFilter() when recursion is on).
     * Each URL is then requested and run through the header check and the contents
     * check; it is "white" only when both pass.
     *
     * @param  mixed $url [require] a URL string, an array of URLs, or an object castable to string
     * @return array [
     *     'white'        => [key => ['url' => ..., 'status' => 'OK']],
     *     'black'        => [key => ['url' => ..., 'status' => 'NG ...']],
     *     'UnknownLinks' => links that were collected but not checked
     * ]
     * @throws \ErrorException propagated from fetchByContents()
     * @throws \ReflectionException when no start URL is available
     */
    public function start($url)
    {
        $urlList = [];
        $result = [];
        $result['white'] = [];
        $result['black'] = [];

        if ($this->isContentsFetch) {
            echo 'Contents fetching..';
            $url = $this->fetchByContents($url);

            if ($this->recursion) {
                $url = $this->urlFilter($url);
            }
        }

        if (is_null($url)) {
            // The previous message ("Start URL is not null.") stated the opposite of
            // the condition being reported; the exception type is kept for callers.
            throw new \ReflectionException('Start URL must not be null.');
        } else if (is_array($url)) {
            $urlList = $this->urlFilter($url);
        } else if (is_string($url)) {
            $urlList[] = $url;
        } else if (is_object($url)) {
            $urlList[] = (string) $url;
        }

        echo "\n";
        echo 'Cheking..';

        foreach ($urlList as $key => $url) {
            try {
                $metaData = $this->client->get($url);
            } catch (\Exception $e) {
                echo "\n {$url}\t {$e->getMessage()}";
                // Without a response there is nothing to inspect. Previously execution
                // fell through with $metaData undefined (first URL) or still holding the
                // previous URL's response; record the failure and move on instead.
                $result['black'][$key]['url'] = $url;
                $result['black'][$key]['status'] = 'NG : ' . $e->getMessage();
                continue;
            }

            $hardCheck = (array) $this->hardCheckByHeader($metaData);
            $softCheck = (array) $this->softCheckByContents($metaData);

            if ($hardCheck['result'] && $softCheck['result']) {
                $result['white'][$key]['url'] = $url;
                $result['white'][$key]['status'] = 'OK';
            } else {
                $result['black'][$key]['url'] = $url;
                // A failing check always carries a 'status'; the header check's reason wins.
                $result['black'][$key]['status'] = array_key_exists('status', $hardCheck)
                    ? $hardCheck['status']
                    : $softCheck['status'];
            }

            // Pause half a second between requests.
            usleep(500000);
            echo '.';
        }

        $result['UnknownLinks'] = $this->garbage;

        return $result;
    }
126
127
    /**
     * Fetch a page and collect the targets of its anchor tags.
     *
     * Absolute http(s) links are kept as they are. Other links are appended to
     * $baseUrl, except in-page fragments and non-HTTP schemes, which are stored in
     * $this->garbage (reported by start() as 'UnknownLinks').
     *
     * NOTE(review): relative links are resolved by plain concatenation with $baseUrl,
     * which is only correct when $baseUrl ends at the right path boundary — confirm
     * against callers before relying on it for root-relative ("/x") links.
     *
     * @param  mixed $baseUrl page to read links from
     * @return array de-duplicated URL list; the page itself is stored under the 'baseUrl' key
     * @throws \ErrorException when the link pattern produced no 'url' group at all
     */
    private function fetchByContents($baseUrl)
    {
        $urlList = [];
        $matches = [];
        $urlList['baseUrl'] = (string) $baseUrl;

        // Start from an empty document so a failed request simply yields no links
        // instead of reading an undefined variable below.
        $contents = '';
        try {
            $contents = $this->client->get($baseUrl)->getBody()->getContents();
        } catch (\Exception $e) {
            echo "\n {$baseUrl}\t {$e->getMessage()}";
        }

        // The closing quote must be the same character as the opening one (\1).
        // The former classes ["|'] also accepted a literal '|', which cut URLs
        // containing '|' short and let mismatched quotes pair up.
        preg_match_all('{<a.+?href=(["\'])(?<url>.+?)\1.*?>}is', $contents, $matches);

        // Named groups are present even when nothing matched, so this presumably
        // only trips when matching itself failed.
        if (!array_key_exists('url', $matches)) {
            throw new \ErrorException('Not match contents on url.');
        }

        foreach ($matches['url'] as $url) {
            if (preg_match('{https?://[\w/:%#\$&\?\(\)~\.=\+\-]+}i', $url)) {
                $urlList[] = $url;
            } else if (preg_match('{https?:\/\/[\w/:%#\$&\?\(\)~\.=\+\-]+}i', $baseUrl . $url)) {
                if (preg_match("{(^#[A-Z0-9].+?$)}i", $url)) {
                    // In-page fragment: nothing to request.
                    $this->garbage[] = $url;
                } else if (preg_match('{^\s*(?:javascript|mailto|tel):}i', $url)) {
                    // Non-HTTP scheme. Anchored to the start so ordinary paths that
                    // merely contain the word "javascript" are still checked.
                    $this->garbage[] = $url;
                } else {
                    $urlList[] = $baseUrl . $url;
                }
            } else {
                $this->garbage[] = $url;
            }

            // Progress marker only. No request is made per link here, so the
            // half-second pause that used to sit in this loop has been removed.
            echo '.';
        }

        return array_unique($urlList);
    }
172
173
    /**
     * Judge a response by its status code and Content-Length header.
     *
     * Order of evaluation:
     *  1. a status code in one of the listed error groups fails with that group's label;
     *  2. 200 and 304 pass;
     *  3. any other status fails when a Content-Length header is present and smaller
     *     than the configured contents size;
     *  4. everything else passes.
     *
     * @param \GuzzleHttp\Message\Response $metaData
     * @return array ['result' => bool] plus a 'status' reason string when the result is false
     */
    private function hardCheckByHeader(\GuzzleHttp\Message\Response $metaData)
    {
        // Lower-case the header names so the lookup below is case-insensitive.
        $headers = array_change_key_case($metaData->getHeaders());
        $statusCode = (int) $metaData->getStatusCode();

        $isErrorPageCode = [
            '40x' => [401, 403, 404],
            '50x' => [500, 502, 503],
            '30x' => [301, 302, 308]
        ];

        foreach ($isErrorPageCode as $errorType => $statuses) {
            if (in_array($statusCode, $statuses, true)) {
                return [
                    'result' => false,
                    'status' => "NG : status code {$errorType}"
                ];
            }
        }

        // This used to be `200 && 304`, which can never hold for a single status
        // code, so the early success was unreachable.
        if ($statusCode === 200 || $statusCode === 304) {
            return [
                'result' => true
            ];
        }

        // Compare numerically; isset() also covers a header with no value entries.
        if (isset($headers['content-length'][0])
            && (int) $headers['content-length'][0] < $this->contentsSize
        ) {
            return [
                'result' => false,
                'status' => 'NG : contentsSize'
            ];
        }

        return [
            'result' => true
        ];
    }
215
216
    /**
     * Soft-404 detection based on the response body.
     *
     * A body whose size is not larger than the configured contents size fails
     * immediately. Otherwise, when double checking is enabled, the verdict of the
     * word-based check is used.
     *
     * @param \GuzzleHttp\Message\Response $metaData
     * @return array ['result' => bool] plus a 'status' reason string when the result is false
     */
    public function softCheckByContents(\GuzzleHttp\Message\Response $metaData)
    {
        $bodySize = $metaData->getBody()->getSize();

        if ($bodySize <= $this->contentsSize) {
            return ['result' => false, 'status' => 'NG : contentsSize'];
        }

        // Size is fine and no second opinion was requested.
        if (!$this->doubleCheck) {
            return ['result' => true];
        }

        $wordVerdict = $this->softCheckByContentsWords($metaData);

        if ($wordVerdict['result']) {
            return ['result' => true];
        }

        return [
            'result' => $wordVerdict['result'],
            'status' => $wordVerdict['status']
        ];
    }
244
245
    /**
     * Soft-404 detection by looking for known error-page words in the body.
     *
     * The search is case-insensitive and stops at the first word found.
     *
     * @param \GuzzleHttp\Message\Response $metaData
     * @return array Result ['result' => true], or ['result' => false, 'status' => 'NG WORD : <word>']
     */
    private function softCheckByContentsWords(\GuzzleHttp\Message\Response $metaData)
    {
        $words = self::getSoftErrorWords();

        // No usable word list means there is nothing to match against.
        if (!is_array($words)) {
            return [
                'result' => true
            ];
        }

        // Read the body once, up front. Calling getContents() inside the loop drained
        // the stream on the first word, so every later word was compared with an empty
        // string and could never match. Casting the stream to string returns the whole
        // body regardless of the current read position (Guzzle stream __toString()).
        $contents = (string) $metaData->getBody();

        foreach ($words as $word) {
            if (mb_stripos($contents, $word) !== false) {
                return [
                    'result' => false,
                    'status' => 'NG WORD : ' . $word
                ];
            }
        }

        return [
            'result' => true
        ];
    }
266
267
    /**
     * Return the words that mark a page as a soft 404.
     *
     * The words are read from ErrorPageWords.txt next to this file, one per line,
     * with line endings stripped and empty lines skipped. The list is read once per
     * process and cached; a missing or unreadable file yields an empty list.
     *
     * @return array list of words (possibly empty)
     */
    private static function getSoftErrorWords()
    {
        static $words = null;

        if ($words === null) {
            $path = __DIR__ . '/ErrorPageWords.txt';

            // file() returns false (and emits a warning) when the file cannot be read;
            // callers iterate over the result, so normalise that case to an empty list.
            $lines = is_readable($path)
                ? file($path, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES)
                : false;

            $words = ($lines === false) ? [] : $lines;
        }

        return $words;
    }
276
277
    /**
     * Flatten a (possibly nested) array of URLs into a single de-duplicated list.
     *
     * Leaves are collected depth-first in their original order, keys are discarded,
     * duplicates are removed and the result is re-indexed from zero.
     *
     * @param array $urlList
     * @return array flat, unique URL list
     */
    private function urlFilter(array $urlList)
    {
        // Recursive closure: descend into nested arrays, collect everything else.
        $flatten = function (array $items) use (&$flatten) {
            $leaves = [];
            foreach ($items as $item) {
                if (is_array($item)) {
                    foreach ($flatten($item) as $leaf) {
                        $leaves[] = $leaf;
                    }
                } else {
                    $leaves[] = $item;
                }
            }

            return $leaves;
        };

        $unique = array_unique($flatten($urlList));

        return array_values($unique);
    }
291
}
292