These results are based on our legacy PHP analysis; consider migrating to our new PHP analysis engine instead. Learn more
1 | <?php |
||
2 | /** |
||
3 | * Main Class |
||
4 | * |
||
5 | * PHP version 5.4 |
||
6 | * |
||
7 | * @category GLICER |
||
8 | * @package GlLinkChecker |
||
9 | * @author Emmanuel ROECKER |
||
10 | * @author Rym BOUCHAGOUR |
||
11 | * @copyright 2015 GLICER |
||
12 | * @license MIT |
||
13 | * @link http://dev.glicer.com/ |
||
14 | * |
||
15 | * Created : 10/03/15 |
||
16 | * File : GlLinkChecker.php |
||
17 | * |
||
18 | */ |
||
19 | namespace GlLinkChecker; |
||
20 | |||
21 | use GlHtml\GlHtml; |
||
22 | use GuzzleHttp\Client; |
||
23 | use Symfony\Component\Finder\Finder; |
||
24 | use Symfony\Component\Finder\SplFileInfo; |
||
25 | |||
/**
 * Class GlLinkChecker
 *
 * Checks links found in html/json files, in robots.txt and in sitemaps
 * by requesting them through a Guzzle HTTP client.
 *
 * @package GlLinkChecker
 */
class GlLinkChecker
{
    /**
     * @var \GuzzleHttp\Client
     */
    private $client;

    /**
     * @var array|null $internalurls
     */
    private $internalurls;

    /**
     * build the http client used for every check
     *
     * @param string|null $rooturl      given to the client as 'base_uri'
     * @param array|null  $internalurls given to GlLinkCheckerError::isInternal() in checkFiles()
     */
    public function __construct($rooturl = null, array $internalurls = null)
    {
        // NOTE(review): 'verify' => false turns off TLS certificate verification
        // for every request made by this client.
        // NOTE(review): 'base_uri' looks like a Guzzle 6+ option while 'defaults'
        // (and the 'exceptions' request option used below) look like Guzzle 5
        // options - confirm against the installed Guzzle version that the headers
        // and the "no exception on http error" behavior are really applied.
        $this->client = new Client([
            'base_uri' => $rooturl,
            'verify'   => false,
            'defaults' => [
                'headers' => [
                    'User-Agent'      => 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:36.0) Gecko/20100101 Firefox/36.0',
                    'Accept'          => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                    'Accept-Language' => 'fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3',
                    'Accept-Encoding' => 'gzip, deflate'
                ]
            ]
        ]);
        $this->internalurls = $internalurls;
    }

    /**
     * get all links in an object
     *
     * Walks $obj recursively; every string that starts with http, https, ftp
     * or ftps and passes FILTER_VALIDATE_URL is stored in $links, keyed by
     * itself so that duplicates collapse.
     *
     * @param array $obj   decoded json (nested arrays)
     * @param array $links collected links, filled by reference
     */
    private function searchInArray($obj, array &$links)
    {
        foreach ($obj as $elem) {
            if (is_array($elem)) {
                $this->searchInArray($elem, $links);
                continue;
            }

            if (!is_string($elem)) {
                continue;
            }

            if (preg_match("/^(http|https|ftp|ftps).*$/", $elem) && filter_var($elem, FILTER_VALIDATE_URL)) {
                $links[$elem] = $elem;
            }
        }
    }

    /**
     * get all links in a json
     *
     * @param string $json
     *
     * @return array links keyed by themselves, empty when $json is not a json object/array
     */
    private function getJsonLinks($json)
    {
        $obj   = json_decode($json, true);
        $links = [];

        // json_decode gives null for invalid json and a scalar for a scalar
        // document: neither can be iterated, so there is nothing to collect
        if (is_array($obj)) {
            $this->searchInArray($obj, $links);
        }

        return $links;
    }

    /**
     * check links in a sitemap
     *
     * Requests the text of every "loc" element and sorts it into the
     * 'ok' list (status 200) or the 'error' list (any other status).
     *
     * @param string $sitemap sitemap content
     *
     * @return array ['ok' => string[], 'error' => string[]], a key is absent when its list is empty
     * @throws \Exception
     */
    private function checkSitemap($sitemap)
    {
        $xml     = new GlHtml($sitemap);
        $listloc = $xml->get("loc");
        $result  = [];
        foreach ($listloc as $loc) {
            $url      = $loc->getText();
            $response = $this->client->get($url, ['exceptions' => false]);
            // loose comparison kept on purpose: the type returned by
            // getStatusCode() may depend on the Guzzle version - TODO confirm
            if ($response->getStatusCode() != 200) {
                $result['error'][] = $url;
            } else {
                $result['ok'][] = $url;
            }
        }

        return $result;
    }

    /**
     * check http error status code
     *
     * Requests every url and records it under $result[$statuscode]['ok'] when
     * the response has exactly the expected status code, under
     * $result[$statuscode]['error'] otherwise.
     *
     * @param array $result     filled by reference
     * @param array $urls       urls to request
     * @param int   $statuscode expected status code
     */
    private function checkStatus(array &$result, array $urls, $statuscode)
    {
        foreach ($urls as $url) {
            $response = $this->client->get($url, ['exceptions' => false]);
            // loose comparison kept on purpose (see checkSitemap)
            if ($response->getStatusCode() != $statuscode) {
                $result[$statuscode]["error"][] = $url;
            } else {
                $result[$statuscode]["ok"][] = $url;
            }
        }
    }

    /**
     * check 403 and 404 errors
     *
     * @param array $urlerrors     urls expected to answer 404
     * @param array $urlforbiddens urls expected to answer 403
     *
     * @return array [404 => ['ok' => [], 'error' => []], 403 => ['ok' => [], 'error' => []]],
     *               a key is absent when its list is empty
     */
    public function checkErrors(array $urlerrors, array $urlforbiddens)
    {
        $result = [];

        $this->checkStatus($result, $urlerrors, 404);
        $this->checkStatus($result, $urlforbiddens, 403);

        return $result;
    }

    /**
     * check links in robots.txt and sitemap
     *
     * Reads /robots.txt, then:
     *  - every "Sitemap:" url is requested; when it answers 200 its content is
     *    checked with checkSitemap(), otherwise it is listed as an error
     *  - every "Disallow:" url is requested; 200 and 403 are both accepted
     *
     * Directives with an empty value (a bare "Disallow:" for instance) name no
     * url and are skipped.
     *
     * @return array ['sitemap' => ['ok' => [url => array], 'error' => []],
     *                'disallow' => ['ok' => [], 'error' => []]],
     *               a key is absent when its list is empty
     * @throws \Exception
     */
    public function checkRobotsSitemap()
    {
        // NOTE(review): this is the only request without 'exceptions' => false,
        // so an http error may surface as a client exception before the status
        // check below is reached - confirm this is intended.
        $response = $this->client->get("/robots.txt");
        if ($response->getStatusCode() != 200) {
            throw new \Exception("Cannot find robots.txt");
        }

        $robotstxt = explode("\n", $response->getBody()->getContents());
        $result    = [];
        foreach ($robotstxt as $line) {
            if (preg_match('/^\s*Sitemap:(.*)/i', $line, $match)) {
                $urlsitemap = trim($match[1]);
                if ($urlsitemap !== '') {
                    $response = $this->client->get($urlsitemap, ['exceptions' => false]);
                    if ($response->getStatusCode() != 200) {
                        $result['sitemap']['error'][] = $urlsitemap;
                    } else {
                        $result['sitemap']['ok'][$urlsitemap] = $this->checkSitemap(
                            $response->getBody()->getContents()
                        );
                    }
                }
            }

            if (preg_match('/^\s*Disallow:(.*)/i', $line, $match)) {
                $urldisallow = trim($match[1]);
                if ($urldisallow !== '') {
                    $response   = $this->client->get($urldisallow, ['exceptions' => false]);
                    $statuscode = $response->getStatusCode();
                    // a disallowed url may legitimately be forbidden
                    if (($statuscode != 200) && ($statuscode != 403)) {
                        $result['disallow']['error'][] = $urldisallow;
                    } else {
                        $result['disallow']['ok'][] = $urldisallow;
                    }
                }
            }
        }

        return $result;
    }

    /**
     * check links in html and json files
     *
     * Collects the links of every file, groups the files by link so that each
     * distinct link is checked once, then checks every link.
     *
     * @param Finder   $files      html and json files to scan
     * @param callable $checkstart called once with the number of distinct links
     * @param callable $checking   called before each check with the link and the files using it
     * @param callable $checkend   called once after the last check, without argument
     *
     * @throws \Exception if a file cannot be read or is neither html nor json
     * @return GlLinkCheckerError[]
     */
    public function checkFiles(Finder $files, callable $checkstart, callable $checking, callable $checkend)
    {
        $linksByFile = [];
        /**
         * @var SplFileInfo $file
         */
        foreach ($files as $file) {
            $keyname = $file->getRelativePathname();
            $inner   = file_get_contents($file->getRealPath());
            if ($inner === false) {
                // do not hand "false" to the parsers as if it were content
                throw new \Exception("Cannot read file : " . $keyname);
            }

            $extension = $file->getExtension();
            if ($extension === 'html') {
                $html                  = new GlHtml($inner);
                $linksByFile[$keyname] = $html->getLinks();
            } elseif ($extension === 'json') {
                $linksByFile[$keyname] = $this->getJsonLinks($inner);
            } else {
                throw new \Exception("Extension unknown : " . $keyname);
            }
        }

        //reverse $linksByFile : link => list of files using it
        $links = [];
        foreach ($linksByFile as $filename => $filelinks) {
            foreach ($filelinks as $filelink) {
                $links[$filelink][] = $filename;
            }
        }

        $checkstart(count($links));
        $result = [];
        // $filenames and not $files : the Finder parameter must not be overwritten
        foreach ($links as $link => $filenames) {
            $checking($link, $filenames);

            $gllink = new GlLinkCheckerError($this->client, $link, $filenames);

            // the form checks only apply to internal links
            if ($gllink->isInternal($this->internalurls)) {
                $gllink->check(['lowercase', 'endslash', 'absolute']);
            }

            $gllink->check(['exist']);
            $result[] = $gllink;
        }
        $checkend();

        return $result;
    }
}
||
264 |
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.