Completed
Pull Request — master (#29)
by Yoshiaki
02:19
created

Checker.php (3 issues)

Upgrade to new PHP Analysis Engine

These results are based on our legacy PHP analysis; consider migrating to our new PHP analysis engine instead. Learn more

1
<?php
0 ignored issues
show
Coding Style Compatibility introduced by
For compatibility and reusability of your code, PSR1 recommends that a file should introduce either new symbols (like classes, functions, etc.) or have side-effects (like outputting something, or including other files), but not both at the same time. The first symbol is defined on line 10 and the first side effect is on line 3.

The PSR-1: Basic Coding Standard recommends that a file should either introduce new symbols, that is classes, functions, constants or similar, or have side effects. Side effects are anything that executes logic, like for example printing output, changing ini settings or writing to a file.

The idea behind this recommendation is that merely auto-loading a class should not change the state of an application. It also promotes a cleaner style of programming and makes your code less prone to errors, because the logic is not spread out all over the place.

To learn more about the PSR-1, please see the PHP-FIG site on the PSR-1.

Loading history...
2
3
require_once (__DIR__ . '/vendor/autoload.php');
4
5
/**
6
 * Description of Checker Main
7
 *
8
 * @author bootjp
9
 */
10
class Checker
0 ignored issues
show
Coding Style Compatibility introduced by
PSR1 recommends that each class must be in a namespace of at least one level to avoid collisions.

You can fix this by adding a namespace to your class:

namespace YourVendor;

class YourClass { }

When choosing a vendor namespace, try to pick something that is not too generic to avoid conflicts with other libraries.

Loading history...
11
{
12
    /** @var \GuzzleHttp\Client HTTP client used for every request; created in the constructor. */
    protected $client;
13
14
    /**
     * @var int Size threshold. hardCheckByHeader() reports a Content-Length below it and
     *          softCheckByContents() reports a body size at or below it as 'NG : contentsSize'.
     *          Overridden by $args['contentSize'] in the constructor. Presumably bytes - confirm.
     */
    protected $contentsSize = 500;
15
16
    /** @var bool When true, softCheckByContents() also runs the error-word scan (softCheckByContentsWords()). */
    protected $doubleCheck = true;
17
18
    /** @var bool When true (and contents fetching is on), start() flattens the fetched link list with urlFilter(). */
    protected $recursion = false;
19
20
    /** @var array Links that fetchByContents() rejected; returned by start() under 'UnknownLinks'. */
    protected $garbage = [];
21
22
    /** @var bool When true, start() first fetches the given page and checks the links found on it. */
    protected $isContentsFetch = true;
23
24
25
    /**
     * Build the HTTP client and apply the caller's options.
     *
     * Recognised keys in $args (all optional; a missing key keeps the property default):
     *  - 'contentSize'     cast to int,  stored in $contentsSize
     *  - 'doubleCheck'     cast to bool, stored in $doubleCheck
     *  - 'isContentsFetch' cast to bool, stored in $isContentsFetch
     *  - 'recursion'       cast to bool, stored in $recursion
     *  - 'auth'            "username:password" string, installed as the client's default 'auth' option
     *
     * @param array $args
     */
    public function __construct(array $args)
    {
        $this->client = new \GuzzleHttp\Client([
            'defaults' => [
                // Guzzle request option: return 4xx/5xx responses instead of throwing,
                // so the status code can be inspected by the checks.
                'exceptions' => false,
                'headers' => [
                    'User-Agent' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) ' .
                        'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36'
                ]
            ]
        ]);

        if (array_key_exists('contentSize', $args)) {
            $this->contentsSize = (int) $args['contentSize'];
        }

        if (array_key_exists('doubleCheck', $args)) {
            $this->doubleCheck = (bool) $args['doubleCheck'];
        }

        if (array_key_exists('isContentsFetch', $args)) {
            $this->isContentsFetch = (bool) $args['isContentsFetch'];
        }

        if (array_key_exists('recursion', $args)) {
            $this->recursion = (bool) $args['recursion'];
        }

        if (array_key_exists('auth', $args)) {
            // Split on the first ':' only, so the password may itself contain colons.
            // Pad to two elements so a bare "username" gives an empty password
            // instead of an undefined-offset notice and a null password.
            $credentials = array_pad(explode(':', (string) $args['auth'], 2), 2, '');
            $this->client->setDefaultOption('auth', [$credentials[0], $credentials[1]]);
        }
    }
63
64
    /**
     * Check one or more URLs and sort them into passing and failing lists.
     *
     * When $isContentsFetch is enabled, $url is fetched first and replaced by the
     * list of links found on that page (see fetchByContents()). Every resulting URL
     * is then requested and run through hardCheckByHeader() and softCheckByContents().
     * Progress is echoed to standard output and the loop pauses 0.5 s between requests.
     *
     * @param  mixed $url start URL: a string, a (possibly nested) array of URLs,
     *                    or an object that can be cast to string
     * @return array [
     *                 'white'        => entries ['url' => ..., 'status' => 'OK'],
     *                 'black'        => entries ['url' => ..., 'status' => failure reason],
     *                 'UnknownLinks' => links discarded while collecting
     *               ]; 'white'/'black' entries are keyed by the URL's position in the checked list
     * @throws \ErrorException       propagated from fetchByContents()
     * @throws \ReflectionException  when the URL to check is null
     */
    public function start($url)
    {
        $urlList = [];
        $result = [
            'white' => [],
            'black' => [],
        ];

        if ((bool) $this->isContentsFetch) {
            echo 'Contents fetching..';
            $url = $this->fetchByContents($url);

            if ((bool) $this->recursion) {
                $url = $this->urlFilter($url);
            }
        }

        if (is_null($url)) {
            throw new \ReflectionException('Start URL must not be null.');
        } elseif (is_array($url)) {
            $urlList = $this->urlFilter($url);
        } elseif (is_string($url)) {
            $urlList[] = $url;
        } elseif (is_object($url)) {
            $urlList[] = (string) $url;
        }

        echo "\n";
        echo 'Cheking..';

        foreach ($urlList as $key => $url) {
            $metaData = null;
            $failure = null;

            try {
                $metaData = $this->client->get($url);
            } catch (\Exception $e) {
                echo "\n {$url}\t {$e->getMessage()}";
                $failure = 'NG : ' . $e->getMessage();
            }

            if ($metaData === null) {
                // The request itself failed, so there is no response to inspect.
                // Record the URL as failing rather than running the checks on an
                // undefined variable or on the previous URL's response.
                $result['black'][$key]['url'] = $url;
                $result['black'][$key]['status'] = $failure;
            } else {
                $hardCheck = (array) $this->hardCheckByHeader($metaData);
                $softCheck = (array) $this->softCheckByContents($metaData);

                if ($hardCheck['result'] && $softCheck['result']) {
                    $result['white'][$key]['url'] = $url;
                    $result['white'][$key]['status'] = 'OK';
                } else {
                    // Prefer the header-level reason; fall back to the content-level one.
                    $result['black'][$key]['url'] = $url;
                    $result['black'][$key]['status'] = array_key_exists('status', $hardCheck)
                        ? $hardCheck['status']
                        : $softCheck['status'];
                }
            }

            // Throttle: wait half a second between requests.
            usleep(500000);
            echo '.';
        }

        $result['UnknownLinks'] = $this->garbage;

        return $result;
    }
124
125
    /**
     * Fetch a page and collect the link targets of its anchor tags.
     *
     * Absolute http(s) links are kept as they are. Fragment-only links and
     * javascript:/mailto:/tel: links are recorded in $this->garbage. Any other
     * link is treated as relative and appended to the base URL.
     *
     * NOTE: relative links are resolved by plain string concatenation
     * ($baseUrl . $href); root-relative and protocol-relative hrefs are not
     * normalised.
     *
     * If the request fails, the error is echoed and the page is treated as empty,
     * so only the base URL is returned.
     *
     * @param  mixed $baseUrl page to scan (string or object castable to string)
     * @return array unique URLs; the base URL is stored under the 'baseUrl' key,
     *               discovered links follow with integer keys
     * @throws \ErrorException when the link pattern produced no 'url' capture
     *                         (for example because the regex call itself failed)
     */
    private function fetchByContents($baseUrl)
    {
        $base = (string) $baseUrl;
        $urlList = ['baseUrl' => $base];
        $matches = [];
        // Default to an empty page so a failed request cannot leave $contents undefined.
        $contents = '';

        try {
            $contents = $this->client->get($baseUrl)->getBody()->getContents();
        } catch (\Exception $e) {
            echo "\n {$baseUrl}\t {$e->getMessage()}";
        }

        // Match href values inside a single <a ...> tag. The closing quote must be the
        // same character as the opening one (back-reference \1), so a '|' is no longer
        // accepted as a delimiter and values like "it's" are captured whole.
        preg_match_all('{<a\s[^>]*?href\s*=\s*(["\'])(?<url>.+?)\1[^>]*>}is', $contents, $matches);

        if (!array_key_exists('url', $matches)) {
            throw new \ErrorException('Not match contents on url.');
        }

        foreach ($matches['url'] as $url) {
            $url = trim($url);

            if (preg_match('{^https?://[\w/:%#\$&\?\(\)~\.=\+\-]+}i', $url)) {
                // Absolute link. Anchored so that a relative link which merely
                // contains "http://" in its query string is not mistaken for one.
                $urlList[] = $url;
            } elseif ($url === '' || preg_match('{^(#|(javascript|mailto|tel):)}i', $url)) {
                // Same-page anchors and non-HTTP schemes cannot be fetched.
                $this->garbage[] = $url;
            } elseif (preg_match('{https?:\/\/[\w/:%#\$&\?\(\)~\.=\+\-]+}i', $base . $url)) {
                $urlList[] = $base . $url;
            } else {
                $this->garbage[] = $url;
            }

            // Progress marker only; no request is made here, so no delay is needed.
            echo '.';
        }

        return array_unique($urlList);
    }
170
171
    /**
     * Judge a response by its status code and Content-Length header.
     *
     * Called by start() for every fetched response.
     *
     *  - A status listed in the error table fails with "NG : status code <group>".
     *  - 200 and 304 pass immediately.
     *  - Any other status fails with "NG : contentsSize" when a Content-Length
     *    header is present and smaller than $this->contentsSize; otherwise it passes.
     *
     * @param \GuzzleHttp\Message\Response $metaData
     * @return array ['result' => bool] plus a 'status' message when the result is false
     */
    private function hardCheckByHeader(\GuzzleHttp\Message\Response $metaData)
    {
        // Lower-case the header names so the lookup below is case-insensitive.
        $headers = array_change_key_case($metaData->getHeaders());
        $statusCode = (int) $metaData->getStatusCode();

        $errorStatuses = [
            '40x' => [401, 403, 404],
            '50x' => [500, 502, 503],
            '30x' => [301, 302, 308]
        ];

        foreach ($errorStatuses as $errorType => $statuses) {
            if (in_array($statusCode, $statuses, true)) {
                return [
                    'result' => false,
                    'status' => "NG : status code {$errorType}"
                ];
            }
        }

        // Either status is acceptable. The previous "&&" could never be true
        // because a single status code cannot equal both values.
        if ($statusCode === 200 || $statusCode === 304) {
            return [
                'result' => true
            ];
        }

        if (isset($headers['content-length'][0])
            && (int) $headers['content-length'][0] < $this->contentsSize
        ) {
            return [
                'result' => false,
                'status' => 'NG : contentsSize'
            ];
        }

        return [
            'result' => true
        ];
    }
213
214
    /**
     * Soft-404 detection based on the response body.
     *
     * Fails with "NG : contentsSize" when the body size is at or below
     * $this->contentsSize. When $this->doubleCheck is enabled, the body is also
     * scanned by softCheckByContentsWords() and that failure is passed through.
     *
     * @param \GuzzleHttp\Message\Response $metaData
     * @return array ['result' => bool] plus a 'status' message when the result is false
     */
    public function softCheckByContents(\GuzzleHttp\Message\Response $metaData)
    {
        $passed = ['result' => true];
        $bodySize = $metaData->getBody()->getSize();

        if ($bodySize <= $this->contentsSize) {
            return [
                'result' => false,
                'status' => 'NG : contentsSize'
            ];
        }

        // Word scan is optional; without it the size test alone decides.
        if (!$this->doubleCheck) {
            return $passed;
        }

        $wordCheck = $this->softCheckByContentsWords($metaData);

        if ($wordCheck['result']) {
            return $passed;
        }

        return [
            'result' => $wordCheck['result'],
            'status' => $wordCheck['status']
        ];
    }
242
243
    /**
     * Soft-404 detection by searching the body for known error-page phrases.
     *
     * The phrases come from getSoftErrorWords(); matching is case-insensitive
     * (mb_stripos). The first phrase found fails the check.
     *
     * @param \GuzzleHttp\Message\Response $metaData
     * @return array ['result' => true], or
     *               ['result' => false, 'status' => 'NG WORD : <phrase>'] on the first hit
     */
    private function softCheckByContentsWords(\GuzzleHttp\Message\Response $metaData)
    {
        // Read the body once, from the start. getContents() returns only what is left
        // after the current stream position, so calling it for every phrase meant all
        // phrases after the first were compared against an empty string.
        $contents = (string) $metaData->getBody();

        foreach (self::getSoftErrorWords() as $word) {
            if (mb_stripos($contents, $word) !== false) {
                return [
                    'result' => false,
                    'status' => 'NG WORD : ' . $word
                ];
            }
        }

        return [
            'result' => true
        ];
    }
264
265
    /**
     * Load the phrases that mark a soft-404 page.
     *
     * Reads ErrorPageWords.txt from this file's directory, one phrase per line,
     * with line endings stripped and empty lines skipped.
     *
     * @return array list of phrases; empty when the file cannot be read
     */
    private static function getSoftErrorWords()
    {
        $words = file(__DIR__ . '/ErrorPageWords.txt', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);

        // file() returns false on failure (after emitting its own warning); hand the
        // caller an empty list so its foreach does not receive a non-array.
        return $words === false ? [] : $words;
    }
274
275
    /**
     * Flatten a (possibly nested) array of URLs into a single de-duplicated list.
     *
     * Nested arrays are walked depth-first in their original order; keys are
     * discarded and the result is re-indexed from 0.
     *
     * @param array $urlList
     * @return array flat list of unique values
     */
    private function urlFilter(array $urlList)
    {
        $flat = [];

        $collect = function (array $items) use (&$collect, &$flat) {
            foreach ($items as $item) {
                if (is_array($item)) {
                    $collect($item);
                } else {
                    $flat[] = $item;
                }
            }
        };
        $collect($urlList);

        $unique = array_unique($flat);

        return array_values($unique);
    }
289
}
290