Completed
Pull Request — master (#29)
by Yoshiaki
02:19
created

Checker.php (1 issue)

Upgrade to new PHP Analysis Engine

These results are based on our legacy PHP analysis. Consider migrating to our new PHP analysis engine instead. Learn more

1
<?php
2
3
require_once (__DIR__ . '/vendor/autoload.php');
4
5
/**
 * Link checker: optionally collects the links found on a page, requests each
 * URL and sorts it into a "white" (looks fine) or "black" (looks broken) list.
 *
 * A URL is considered broken when its status code is in a known error list,
 * when the response is smaller than the configured size threshold, or when
 * the body contains one of the phrases listed in ErrorPageWords.txt.
 *
 * NOTE(review): static analysis (PSR-1) recommends placing this class in a
 * vendor namespace to avoid collisions. It is left in the global namespace
 * here so that existing callers using `new Checker(...)` keep working.
 *
 * @author bootjp
 */
class Checker
{
    /** @var \GuzzleHttp\Client HTTP client used for every request. */
    protected $client;

    /** @var int Size threshold; smaller responses are reported as "NG : contentsSize". */
    protected $contentsSize = 500;

    /** @var bool Whether the body is additionally scanned for soft-404 phrases. */
    protected $doubleCheck = true;

    /** @var bool Whether the fetched link list is flattened right after fetching. */
    protected $recursion = false;

    /** @var array Links that could not be turned into a checkable URL. */
    protected $garbage = [];

    /** @var bool Whether start() first downloads the page and checks the links found in it. */
    protected $isContentsFetch = true;

    /**
     * Builds the HTTP client and applies the optional settings.
     *
     * Recognised keys of $args (all optional):
     *  - 'contentSize'     (int)    size threshold, stored in $contentsSize
     *  - 'doubleCheck'     (bool)   enable the soft-404 phrase scan
     *  - 'isContentsFetch' (bool)   fetch the start page and check its links
     *  - 'recursion'       (bool)   flatten the fetched list immediately
     *  - 'auth'            (string) "user:password" used as default auth option
     *
     * @param array $args
     */
    public function __construct(array $args)
    {
        $this->client = new \GuzzleHttp\Client([
            'defaults' => [
                // Error statuses are inspected by hardCheckByHeader() instead of being thrown.
                'exceptions' => false,
                'headers' => [
                    'User-Agent' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) ' .
                        'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36'
                ]
            ]
        ]);

        if (array_key_exists('contentSize', $args)) {
            $this->contentsSize = (int) $args['contentSize'];
        }

        if (array_key_exists('doubleCheck', $args)) {
            $this->doubleCheck = (bool) $args['doubleCheck'];
        }

        if (array_key_exists('isContentsFetch', $args)) {
            $this->isContentsFetch = (bool) $args['isContentsFetch'];
        }

        if (array_key_exists('recursion', $args)) {
            $this->recursion = (bool) $args['recursion'];
        }

        if (array_key_exists('auth', $args)) {
            // Pad so that a value without ':' yields an empty password
            // instead of reading a missing list() offset.
            list($username, $password) = array_pad(explode(':', (string) $args['auth'], 2), 2, '');
            $this->client->setDefaultOption('auth', [$username, $password]);
        }
    }

    /**
     * Entry point: checks the given URL(s) and returns the classification.
     *
     * When $isContentsFetch is enabled, $url is treated as a page whose links
     * are collected and checked; otherwise $url itself (string, object with
     * string conversion, or a possibly nested array of URLs) is checked.
     *
     * @param  mixed $url [require]
     * @return array ['white' => [...], 'black' => [...], 'UnknownLinks' => [...]];
     *               white/black entries are ['url' => ..., 'status' => ...]
     * @throws \ErrorException      when the link extraction produced no usable match data
     * @throws \ReflectionException when $url is null
     */
    public function start($url)
    {
        // Validate before the value is used for fetching, not afterwards.
        if (is_null($url)) {
            throw new \ReflectionException('Start URL must not be null.');
        }

        if ($this->isContentsFetch) {
            echo 'Contents fetching..';
            $url = $this->fetchByContents($url);

            if ($this->recursion) {
                $url = $this->urlFilter($url);
            }
        }

        $urlList = [];
        if (is_array($url)) {
            $urlList = $this->urlFilter($url);
        } elseif (is_string($url)) {
            $urlList[] = $url;
        } elseif (is_object($url)) {
            $urlList[] = (string) $url;
        }

        echo "\n";
        echo 'Cheking..';

        $result = [
            'white' => [],
            'black' => []
        ];

        foreach ($urlList as $key => $target) {
            $verdict = $this->checkUrl($target);

            if ($verdict['result']) {
                $result['white'][$key] = ['url' => $target, 'status' => 'OK'];
            } else {
                $result['black'][$key] = ['url' => $target, 'status' => $verdict['status']];
            }

            // Pause between requests and print a progress dot.
            usleep(500000);
            echo '.';
        }

        $result['UnknownLinks'] = $this->garbage;

        return $result;
    }

    /**
     * Requests one URL and runs the header check followed by the content check.
     *
     * A request that throws is reported as a failure of that URL; previously
     * the checks ran on an undefined (or the previous URL's) response.
     *
     * @param  string $url
     * @return array ['result' => bool] plus 'status' when result is false
     */
    private function checkUrl($url)
    {
        try {
            $response = $this->client->get($url);
        } catch (\Exception $e) {
            echo "\n {$url}\t {$e->getMessage()}";

            return [
                'result' => false,
                'status' => 'NG : ' . $e->getMessage()
            ];
        }

        $hardCheck = $this->hardCheckByHeader($response);
        if (!$hardCheck['result']) {
            // The header verdict takes precedence over the content verdict.
            return $hardCheck;
        }

        return $this->softCheckByContents($response);
    }

    /**
     * Fetch Page Contents Links.
     *
     * Downloads $baseUrl, extracts the href of every anchor tag and returns
     * the checkable URLs. Relative links are prefixed with $baseUrl; fragment
     * links, javascript: links and anything that does not form an http(s) URL
     * are collected in $garbage instead.
     *
     * @param  mixed $baseUrl
     * @return array URL list; the key 'baseUrl' holds the start URL itself
     * @throws \ErrorException when the extraction yields no 'url' match group
     */
    private function fetchByContents($baseUrl)
    {
        $urlList = ['baseUrl' => (string) $baseUrl];
        $matches = [];

        try {
            // The string cast also copes with a response that has no body.
            $contents = (string) $this->client->get($baseUrl)->getBody();
        } catch (\Exception $e) {
            echo "\n {$baseUrl}\t {$e->getMessage()}";
            // Best effort: continue with an empty page so only the base URL is returned.
            $contents = '';
        }

        // Quote classes are ["'] only; the former "|" inside them cut hrefs at a pipe.
        preg_match_all('{<a.+?href=["\'](?<url>.+?)["\'].*?>}is', $contents, $matches);

        if (!array_key_exists('url', $matches)) {
            throw new \ErrorException('Not match contents on url.');
        }

        $httpUrl = '{https?://[\w/:%#\$&\?\(\)~\.=\+\-]+}i';

        foreach ($matches['url'] as $url) {
            if (preg_match($httpUrl, $url)) {
                $urlList[] = $url;
            } elseif (preg_match($httpUrl, $baseUrl . $url)) {
                $isFragment = preg_match('{(^#[A-Z0-9].+?$)}i', $url);
                $isScript = preg_match('#javascript.*#i', $url);

                if ($isFragment || $isScript) {
                    $this->garbage[] = $url;
                } else {
                    $urlList[] = $baseUrl . $url;
                }
            } else {
                $this->garbage[] = $url;
            }

            // NOTE(review): no request is made in this loop, so this delay only
            // paces the progress dots; kept to preserve the existing behavior.
            usleep(500000);
            echo '.';
        }

        return array_unique($urlList);
    }

    /**
     * Error check by header.
     *
     * Fails for the listed 30x/40x/50x status codes, passes for 200 and 304,
     * and for any other status fails when a Content-Length header is present
     * and smaller than $contentsSize.
     *
     * @param \GuzzleHttp\Message\Response $metaData
     * @return array ['result' => bool] plus 'status' when result is false
     */
    private function hardCheckByHeader(\GuzzleHttp\Message\Response $metaData)
    {
        $headers = array_change_key_case($metaData->getHeaders());
        $statusCode = (int) $metaData->getStatusCode();

        $isErrorPageCode = [
            '40x' => [401, 403, 404],
            '50x' => [500, 502, 503],
            '30x' => [301, 302, 308]
        ];

        foreach ($isErrorPageCode as $errorType => $statuses) {
            if (in_array($statusCode, $statuses, true)) {
                return [
                    'result' => false,
                    'status' => "NG : status code {$errorType}"
                ];
            }
        }

        // Was "&&", which can never be true for a single status code.
        if ($statusCode === 200 || $statusCode === 304) {
            return [
                'result' => true
            ];
        }

        if (array_key_exists('content-length', $headers)
            && isset($headers['content-length'][0])
            && (int) $headers['content-length'][0] < $this->contentsSize
        ) {
            return [
                'result' => false,
                'status' => 'NG : contentsSize'
            ];
        }

        return [
            'result' => true
        ];
    }

    /**
     * Soft404 check by contents length, optionally followed by the phrase scan.
     *
     * @param \GuzzleHttp\Message\Response $metaData
     * @return array ['result' => bool] plus 'status' when result is false
     */
    public function softCheckByContents(\GuzzleHttp\Message\Response $metaData)
    {
        $body = $metaData->getBody();
        // NOTE(review): a missing body is treated as size 0 - confirm against the Guzzle version in use.
        $size = is_null($body) ? 0 : $body->getSize();

        if ($size <= $this->contentsSize) {
            return [
                'result' => false,
                'status' => 'NG : contentsSize'
            ];
        }

        if ($this->doubleCheck) {
            $wordCheck = $this->softCheckByContentsWords($metaData);
            if (!$wordCheck['result']) {
                return [
                    'result' => $wordCheck['result'],
                    'status' => $wordCheck['status']
                ];
            }
        }

        return [
            'result' => true
        ];
    }

    /**
     * Soft404 Error check by words.
     *
     * Looks for each configured phrase in the response body, case-insensitively.
     *
     * @param \GuzzleHttp\Message\Response $metaData
     * @return array Result: ['result' => bool] plus 'status' naming the phrase found
     */
    private function softCheckByContentsWords(\GuzzleHttp\Message\Response $metaData)
    {
        // Read the body once; reading the stream inside the loop left every
        // phrase after the first one comparing against an exhausted stream.
        $contents = (string) $metaData->getBody();

        foreach (self::getSoftErrorWords() as $word) {
            if (mb_stripos($contents, $word) !== false) {
                return [
                    'result' => false,
                    'status' => 'NG WORD : ' . $word
                ];
            }
        }

        return [
            'result' => true
        ];
    }

    /**
     * Return soft404 Page on Words.
     *
     * Reads ErrorPageWords.txt next to this file, one phrase per line, with
     * empty lines skipped.
     *
     * @return array list of phrases; empty when the file cannot be read
     */
    private static function getSoftErrorWords()
    {
        $words = file(__DIR__ . '/ErrorPageWords.txt', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);

        return $words === false ? [] : $words;
    }

    /**
     * Converts a (possibly multidimensional) array into a flat, re-indexed
     * list of its unique leaf values.
     *
     * @param array $urlList
     * @return array URL list
     */
    private function urlFilter(array $urlList)
    {
        $flat = [];
        array_walk_recursive($urlList, function ($value) use (&$flat) {
            $flat[] = $value;
        });

        return array_values(array_unique($flat));
    }
}
290