1
|
|
|
<?php |
2
|
|
|
/** |
3
|
|
|
* Main Class |
4
|
|
|
* |
5
|
|
|
* PHP version 5.4 |
6
|
|
|
* |
7
|
|
|
* @category GLICER |
8
|
|
|
* @package GlLinkChecker |
9
|
|
|
* @author Emmanuel ROECKER |
10
|
|
|
* @author Rym BOUCHAGOUR |
11
|
|
|
* @copyright 2015 GLICER |
12
|
|
|
* @license MIT |
13
|
|
|
* @link http://dev.glicer.com/ |
14
|
|
|
* |
15
|
|
|
* Created : 10/03/15 |
16
|
|
|
* File : GlLinkChecker.php |
17
|
|
|
* |
18
|
|
|
*/ |
19
|
|
|
namespace GlLinkChecker; |
20
|
|
|
|
21
|
|
|
use GlHtml\GlHtml; |
22
|
|
|
use GuzzleHttp\Client; |
23
|
|
|
use Symfony\Component\Finder\Finder; |
24
|
|
|
use Symfony\Component\Finder\SplFileInfo; |
25
|
|
|
|
26
|
|
|
/**
 * Class GlLinkChecker
 *
 * Checks links through an HTTP client: links found in html and json files,
 * urls listed in robots.txt and its sitemaps, and urls expected to answer
 * with a 403 or 404 status.
 *
 * @package GlLinkChecker
 */
class GlLinkChecker
{
    /**
     * HTTP client used for every request made by this class.
     *
     * @var \GuzzleHttp\Client
     */
    private $client;

    /**
     * Value handed to GlLinkCheckerError::isInternal() for every checked link.
     * It is null when the constructor is called without a second argument.
     *
     * @var array|null $internalurls
     */
    private $internalurls;

    /**
     * Build the HTTP client and remember the internal urls.
     *
     * @param string|null $rooturl      passed to the client as its 'base_url' option
     * @param array|null  $internalurls stored as is and later passed to GlLinkCheckerError::isInternal()
     */
    public function __construct($rooturl = null, array $internalurls = null)
    {
        $this->client = new Client([
            'base_url' => $rooturl,
            'defaults' => [
                'headers' => [
                    'User-Agent'      => 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:36.0) Gecko/20100101 Firefox/36.0',
                    'Accept'          => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                    'Accept-Language' => 'fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3',
                    'Accept-Encoding' => 'gzip, deflate'
                ]
            ]
        ]);
        // NOTE(review): this sets the client's 'verify' option to false for all
        // requests, which presumably disables certificate verification - confirm
        // that this is intended.
        $this->client->setDefaultOption('verify', false);
        $this->internalurls = $internalurls;
    }

    /**
     * Request a url and tell whether the response status is one of the accepted codes.
     *
     * The request is sent with 'exceptions' => false, like every status check
     * in this class. in_array() is used in its loose mode on purpose, mirroring
     * a `==` comparison: the type returned by getStatusCode() (integer or
     * numeric string) is not established in this file.
     *
     * @param string $url      url to request
     * @param array  $accepted accepted status codes
     *
     * @return bool
     */
    private function hasStatus($url, array $accepted)
    {
        $response = $this->client->get($url, ['exceptions' => false]);

        return in_array($response->getStatusCode(), $accepted);
    }

    /**
     * Collect every valid url found in a (possibly nested) array.
     *
     * Only string values starting with http, https, ftp or ftps and accepted
     * by FILTER_VALIDATE_URL are kept. Nested arrays are searched recursively,
     * any other value is ignored.
     *
     * @param array $obj   decoded data to search
     * @param array $links found links, indexed by the link itself (filled by reference)
     */
    private function searchInArray($obj, array &$links)
    {
        foreach ($obj as $elem) {
            if (is_array($elem)) {
                $this->searchInArray($elem, $links);
                continue;
            }

            if (!is_string($elem)) {
                continue;
            }

            if (preg_match("/^(http|https|ftp|ftps).*$/", $elem) && filter_var($elem, FILTER_VALIDATE_URL)) {
                // the link is its own key, so a link found twice is stored once
                $links[$elem] = $elem;
            }
        }
    }

    /**
     * Get all links found in a json document.
     *
     * @param string $json
     *
     * @return array links indexed by themselves, empty when the json does not decode to an array
     */
    private function getJsonLinks($json)
    {
        $links = [];

        $obj = json_decode($json, true);
        // json_decode() returns null for invalid json and a scalar for scalar
        // json: neither can be iterated, so there is nothing to search.
        if (is_array($obj)) {
            $this->searchInArray($obj, $links);
        }

        return $links;
    }

    /**
     * Check every <loc> url of a sitemap.
     *
     * @param string $sitemap sitemap content
     *
     * @return array urls sorted in 'ok' (status 200) and 'error' (any other status)
     * @throws \Exception
     */
    private function checkSitemap($sitemap)
    {
        $xml     = new GlHtml($sitemap);
        $listloc = $xml->get("loc");
        $result  = [];
        foreach ($listloc as $loc) {
            $url = $loc->getText();
            $result[$this->hasStatus($url, [200]) ? 'ok' : 'error'][] = $url;
        }

        return $result;
    }

    /**
     * Check that urls answer with the expected 404 or 403 status.
     *
     * @param array $urlerrors     urls expected to answer with a 404 status
     * @param array $urlforbiddens urls expected to answer with a 403 status
     *
     * @return array under the keys "404" and "403", urls sorted in 'ok'
     *               (expected status received) and 'error' (any other status)
     */
    public function checkErrors(array $urlerrors, array $urlforbiddens)
    {
        $result = [];

        foreach ($urlerrors as $urlerror) {
            $result["404"][$this->hasStatus($urlerror, [404]) ? 'ok' : 'error'][] = $urlerror;
        }

        foreach ($urlforbiddens as $urlforbidden) {
            $result["403"][$this->hasStatus($urlforbidden, [403]) ? 'ok' : 'error'][] = $urlforbidden;
        }

        return $result;
    }

    /**
     * Check links in robots.txt and sitemap.
     *
     * Each "Sitemap:" line is downloaded and, when it answers with a 200
     * status, its urls are checked. Each "Disallow:" line is requested and is
     * accepted when it answers with a 200 or 403 status.
     *
     * @return array results under the keys 'sitemap' and 'disallow', each sorted in 'ok' and 'error'
     * @throws \Exception when robots.txt does not answer with a 200 status
     */
    public function checkRobotsSitemap()
    {
        // NOTE(review): unlike the other requests of this class, this one does
        // not pass 'exceptions' => false. Depending on the client's defaults an
        // HTTP error may therefore surface as a client exception before the
        // status check below is reached - confirm which behavior is intended.
        $response = $this->client->get("/robots.txt");
        if ($response->getStatusCode() != 200) {
            throw new \Exception("Cannot find robots.txt");
        }

        $lines  = explode("\n", $response->getBody()->getContents());
        $result = [];
        foreach ($lines as $line) {
            // both patterns are anchored at the start of the line, so a line
            // matches at most one of them
            if (preg_match('/^\s*Sitemap:(.*)/i', $line, $match)) {
                $urlsitemap = trim($match[1]);
                $response   = $this->client->get($urlsitemap, ['exceptions' => false]);
                if ($response->getStatusCode() != 200) {
                    $result['sitemap']['error'][] = $urlsitemap;
                } else {
                    $result['sitemap']['ok'][$urlsitemap] = $this->checkSitemap($response->getBody()->getContents());
                }
            } elseif (preg_match('/^\s*Disallow:(.*)/i', $line, $match)) {
                $urldisallow = trim($match[1]);
                $result['disallow'][$this->hasStatus($urldisallow, [200, 403]) ? 'ok' : 'error'][] = $urldisallow;
            }
        }

        return $result;
    }

    /**
     * Check links in html and json files.
     *
     * Links are first collected per file, then grouped per link so that each
     * distinct link is checked only once.
     *
     * @param Finder   $files      html and json files to scan
     * @param callable $checkstart called once before checking, with the number of distinct links
     * @param callable $checking   called before each check, with the link and the list of files containing it
     * @param callable $checkend   called once after all links have been checked, without argument
     *
     * @throws \Exception when a file is neither an html nor a json file
     * @return GlLinkCheckerError[]
     */
    public function checkFiles(Finder $files, callable $checkstart, callable $checking, callable $checkend)
    {
        $linksByFile = [];
        /**
         * @var SplFileInfo $file
         */
        foreach ($files as $file) {
            $inner     = file_get_contents($file->getRealPath());
            $keyname   = $file->getRelativePathname();
            $extension = $file->getExtension();
            if ($extension === 'html') {
                $html                  = new GlHtml($inner);
                $linksByFile[$keyname] = $html->getLinks();
            } elseif ($extension === 'json') {
                $linksByFile[$keyname] = $this->getJsonLinks($inner);
            } else {
                throw new \Exception("Extension unknown : " . $keyname);
            }
        }

        // reverse $linksByFile : link => list of files containing this link
        $links = [];
        foreach ($linksByFile as $filename => $filelinks) {
            foreach ($filelinks as $filelink) {
                $links[$filelink][] = $filename;
            }
        }

        $checkstart(count($links));
        $result = [];
        foreach ($links as $link => $filenames) {
            $checking($link, $filenames);

            $gllink = new GlLinkCheckerError($this->client, $link, $filenames);

            // the format checks are only applied to internal links
            if ($gllink->isInternal($this->internalurls)) {
                $gllink->check(['lowercase', 'endslash', 'absolute']);
            }

            $gllink->check(['exist']);
            $result[] = $gllink;
        }
        $checkend();

        return $result;
    }
}
261
|
|
|
|
Our type inference engine has found an assignment of a scalar value (like a string, an integer or null) to a property which is an array.
Either this assignment is in error or the assigned type should be added to the documentation/type hint for that property.
To type hint that a parameter can be either an array or null, you can set a type hint of array and a default value of null. The PHP interpreter will then accept both an array or null for that parameter.
The function can be called with either null or an array for the parameter $needle but will only accept an array as $haystack.