These results are based on our legacy PHP analysis; consider migrating to our new PHP analysis engine instead. Learn more
1 | <?php |
||
2 | /** |
||
3 | * Main Class |
||
4 | * |
||
5 | * PHP version 5.4 |
||
6 | * |
||
7 | * @category GLICER |
||
8 | * @package GlLinkChecker |
||
9 | * @author Emmanuel ROECKER |
||
10 | * @author Rym BOUCHAGOUR |
||
11 | * @copyright 2015 GLICER |
||
12 | * @license MIT |
||
13 | * @link http://dev.glicer.com/ |
||
14 | * |
||
15 | * Created : 10/03/15 |
||
16 | * File : GlLinkChecker.php |
||
17 | * |
||
18 | */ |
||
19 | namespace GlLinkChecker; |
||
20 | |||
21 | use GlHtml\GlHtml; |
||
22 | use GuzzleHttp\Client; |
||
23 | use Symfony\Component\Finder\Finder; |
||
24 | use Symfony\Component\Finder\SplFileInfo; |
||
25 | |||
/**
 * Class GlLinkChecker
 *
 * Checks links found in html/json files, in robots.txt and in sitemaps by
 * requesting them through a Guzzle HTTP client.
 *
 * @package GlLinkChecker
 */
class GlLinkChecker
{
    /**
     * @var \GuzzleHttp\Client HTTP client used for every request of this class
     */
    private $client;

    /**
     * @var array|null $internalurls value handed to GlLinkCheckerError::isInternal()
     */
    private $internalurls;

    /**
     * Create the HTTP client (browser-like default headers) and store the
     * list of internal urls.
     *
     * @param string|null $rooturl      passed to the client as 'base_url'
     * @param array|null  $internalurls passed later to GlLinkCheckerError::isInternal()
     */
    public function __construct($rooturl = null, array $internalurls = null)
    {
        $this->client = new Client([
            'base_url' => $rooturl,
            'defaults' => [
                'headers' => [
                    'User-Agent'      => 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:36.0) Gecko/20100101 Firefox/36.0',
                    'Accept'          => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                    'Accept-Language' => 'fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3',
                    'Accept-Encoding' => 'gzip, deflate'
                ]
            ]
        ]);
        // NOTE(review): this turns off TLS certificate verification for every
        // request. Kept because callers may rely on it to check hosts with
        // self-signed certificates, but it should be a deliberate choice.
        $this->client->setDefaultOption('verify', false);
        $this->internalurls = $internalurls;
    }

    /**
     * Collect every http/https/ftp/ftps url found in a text.
     *
     * Nothing is returned: valid urls are added to $links, indexed by
     * themselves so that duplicates collapse.
     *
     * @param string $text  text to scan
     * @param array  $links filled by reference (url => url)
     *
     * @return void
     */
    private function getLinks($text, &$links)
    {
        $regexUrl = '/[">\s]+((http|https|ftp|ftps)\:\/\/(.*?))["<\s]+/';
        $urls     = null;
        if (preg_match_all($regexUrl, $text, $urls) > 0) {
            // group 1 is the complete url, without the surrounding delimiters
            foreach ($urls[1] as $url) {
                if (filter_var($url, FILTER_VALIDATE_URL)) {
                    $links[$url] = $url;
                }
            }
        }
    }

    /**
     * Recursively collect the urls stored as string values of an array.
     *
     * Only strings starting with an http, https, ftp or ftps scheme followed
     * by "://" and accepted by FILTER_VALIDATE_URL are kept. Values that are
     * neither strings nor arrays are ignored.
     *
     * @param mixed $obj   decoded data; anything but an array is ignored
     * @param array $links filled by reference (url => url)
     *
     * @return void
     */
    private function searchInArray($obj, array &$links)
    {
        if (!is_array($obj)) {
            // a scalar or null cannot be iterated
            return;
        }

        foreach ($obj as $elem) {
            if (is_array($elem)) {
                $this->searchInArray($elem, $links);
            } elseif (is_string($elem)
                && preg_match('/^(?:https?|ftps?):\/\//', $elem)
                && filter_var($elem, FILTER_VALIDATE_URL)
            ) {
                $links[$elem] = $elem;
            }
        }
    }

    /**
     * Get all links in a json.
     *
     * @param string $json json document
     *
     * @return array urls indexed by themselves; empty when the json cannot be
     *               decoded or does not decode to an array
     */
    private function getJsonLinks($json)
    {
        $links = [];
        $this->searchInArray(json_decode($json, true), $links);

        return $links;
    }

    /**
     * Check links in a sitemap: every "loc" element is requested.
     *
     * @param string $sitemap sitemap content
     *
     * @return array urls grouped under 'ok' (status 200) and 'error' (any
     *               other status); a group is absent when it has no url
     * @throws \Exception
     */
    private function checkSitemap($sitemap)
    {
        $xml    = new GlHtml($sitemap);
        $result = [];
        foreach ($xml->get("loc") as $loc) {
            $url      = $loc->getText();
            $response = $this->client->get($url, ['exceptions' => false]);
            $key      = ((int) $response->getStatusCode() === 200) ? 'ok' : 'error';

            $result[$key][] = $url;
        }

        return $result;
    }

    /**
     * Request every url and sort it by whether the expected status came back.
     *
     * @param array $urls           urls to request
     * @param int   $expectedStatus HTTP status each url must answer with
     *
     * @return array urls grouped under 'ok' and 'error'; a group is absent
     *               when it has no url
     */
    private function checkExpectedStatus(array $urls, $expectedStatus)
    {
        $result = [];
        foreach ($urls as $url) {
            $response = $this->client->get($url, ['exceptions' => false]);
            $key      = ((int) $response->getStatusCode() === $expectedStatus) ? 'ok' : 'error';

            $result[$key][] = $url;
        }

        return $result;
    }

    /**
     * Check 403 and 404 errors: urls that must answer 404 and urls that must
     * answer 403.
     *
     * @param array $urlerrors     urls expected to answer 404
     * @param array $urlforbiddens urls expected to answer 403
     *
     * @return array results indexed by "404" and "403", each holding the urls
     *               under 'ok' / 'error'; an index is absent when its list of
     *               urls is empty
     */
    public function checkErrors(array $urlerrors, array $urlforbiddens)
    {
        $result = [];

        $notfound = $this->checkExpectedStatus($urlerrors, 404);
        if ($notfound !== []) {
            $result["404"] = $notfound;
        }

        $forbidden = $this->checkExpectedStatus($urlforbiddens, 403);
        if ($forbidden !== []) {
            $result["403"] = $forbidden;
        }

        return $result;
    }

    /**
     * Check links in robots.txt and sitemap.
     *
     * Each "Sitemap:" url must answer 200 and is then checked with
     * checkSitemap(); each "Disallow:" url must answer 200 or 403.
     * Directives with an empty value are skipped.
     *
     * @return array results indexed by 'sitemap' and 'disallow'
     * @throws \Exception when robots.txt does not answer 200
     */
    public function checkRobotsSitemap()
    {
        // 'exceptions' => false: without it the client throws on 4xx/5xx and
        // the status test below is never reached for a missing robots.txt
        $response = $this->client->get("/robots.txt", ['exceptions' => false]);
        if ((int) $response->getStatusCode() !== 200) {
            throw new \Exception("Cannot find robots.txt");
        }

        $result = [];
        // the string cast also tolerates a response without body
        foreach (explode("\n", (string) $response->getBody()) as $line) {
            if (preg_match('/^\s*Sitemap:(.*)/i', $line, $match)) {
                $urlsitemap = trim($match[1]);
                if ($urlsitemap === '') {
                    continue;
                }

                $response = $this->client->get($urlsitemap, ['exceptions' => false]);
                if ((int) $response->getStatusCode() !== 200) {
                    $result['sitemap']['error'][] = $urlsitemap;
                } else {
                    $result['sitemap']['ok'][$urlsitemap] = $this->checkSitemap((string) $response->getBody());
                }
            } elseif (preg_match('/^\s*Disallow:(.*)/i', $line, $match)) {
                $urldisallow = trim($match[1]);
                if ($urldisallow === '') {
                    // "Disallow:" with no value disallows nothing
                    continue;
                }

                $response = $this->client->get($urldisallow, ['exceptions' => false]);
                $status   = (int) $response->getStatusCode();
                $key      = (($status === 200) || ($status === 403)) ? 'ok' : 'error';

                $result['disallow'][$key][] = $urldisallow;
            }
        }

        return $result;
    }

    /**
     * Check links in html and json files.
     *
     * @param Finder   $files      files to scan; only html and json extensions are accepted
     * @param callable $checkstart called once with the number of distinct links
     * @param callable $checking   called before each check with the link and
     *                             the list of files that contain it
     * @param callable $checkend   called once when every link has been checked
     *
     * @throws \Exception when a file has an unknown extension or cannot be read
     * @return GlLinkCheckerError[]
     */
    public function checkFiles(Finder $files, callable $checkstart, callable $checking, callable $checkend)
    {
        $linksByFile = [];
        /**
         * @var SplFileInfo $file
         */
        foreach ($files as $file) {
            $keyname   = $file->getRelativePathname();
            $extension = $file->getExtension();
            if (($extension !== 'html') && ($extension !== 'json')) {
                throw new \Exception("Extension unknown : " . $keyname);
            }

            $realpath = $file->getRealPath();
            $inner    = ($realpath === false) ? false : file_get_contents($realpath);
            if ($inner === false) {
                throw new \Exception("Cannot read file : " . $keyname);
            }

            if ($extension === 'html') {
                $html                  = new GlHtml($inner);
                $linksByFile[$keyname] = $html->getLinks();
            } else {
                $linksByFile[$keyname] = $this->getJsonLinks($inner);
            }
        }

        // reverse $linksByFile : link => list of files containing it
        $links = [];
        foreach ($linksByFile as $filename => $filelinks) {
            foreach ($filelinks as $filelink) {
                $links[$filelink][] = $filename;
            }
        }

        $checkstart(count($links));
        $result = [];
        foreach ($links as $link => $filenames) {
            $checking($link, $filenames);

            $gllink = new GlLinkCheckerError($this->client, $link, $filenames);

            // the format checks are only run on links reported as internal
            if ($gllink->isInternal($this->internalurls)) {
                $gllink->check(['lowercase', 'endslash', 'absolute']);
            }

            $gllink->check(['exist']);
            $result[] = $gllink;
        }
        $checkend();

        return $result;
    }
}