1 | <?php |
||
2 | |||
3 | namespace PiedWeb\CMSBundle\Extension\PageScanner; |
||
4 | |||
5 | use Doctrine\ORM\EntityManagerInterface; |
||
6 | use PiedWeb\CMSBundle\Entity\PageInterface; |
||
7 | use PiedWeb\CMSBundle\Service\App; |
||
8 | use PiedWeb\CMSBundle\Utils\GenerateLivePathForTrait; |
||
9 | use PiedWeb\CMSBundle\Utils\KernelTrait; |
||
10 | use PiedWeb\UrlHarvester\Harvest; |
||
11 | use Symfony\Component\HttpFoundation\Request; |
||
12 | use Symfony\Component\HttpKernel\KernelInterface; |
||
13 | use Symfony\Component\Routing\RouterInterface; |
||
14 | use Twig\Environment as Twig_Environment; |
||
15 | |||
/**
 * Scans the rendered HTML of a page and reports the linked documents
 * (links, images, ...) that cannot be found.
 */
class PageScannerService
{
    use GenerateLivePathForTrait;
    use KernelTrait;

    /**
     * App built in scan() from the scanned page's host and the configured apps.
     *
     * @var App
     */
    protected $app;

    /**
     * @var EntityManagerInterface
     */
    protected $em;

    /**
     * HTML of the page currently scanned ('' or null when nothing could be rendered).
     *
     * @var string|null
     */
    protected $pageHtml;

    /**
     * @var Twig_Environment
     */
    protected $twig;

    /**
     * Page currently scanned; attached to every recorded error.
     *
     * @var PageInterface|null
     */
    protected $currentPage;

    /**
     * Directory in which linked files are looked up by uriExist().
     *
     * @var string
     */
    protected $webDir;

    /**
     * Request of the last successful remote check, handed to the next Harvest::fromUrl() call.
     * NOTE(review): presumably used to reuse the connection; confirm against UrlHarvester.
     */
    protected $previousRequest;

    /**
     * Apps configuration handed to App in scan().
     *
     * @var array
     */
    protected $apps;

    /**
     * Number of linked documents visited by checkLinkedDocs() since the service was created.
     *
     * @var int
     */
    protected $linksCheckedCounter = 0;

    /**
     * Errors of the current scan, each one shaped ['message' => ..., 'page' => ...].
     *
     * @var array
     */
    protected $errors = [];

    /**
     * Cache of uriExist() results, indexed by slug.
     *
     * @var array<string, bool>
     */
    protected $everChecked = [];

    public static $appKernel;

    public function __construct(
        Twig_Environment $twig,
        EntityManagerInterface $em,
        string $webDir,
        array $apps,
        RouterInterface $router,
        KernelInterface $kernel
    ) {
        $this->twig = $twig;
        // NOTE(review): $router is not declared in this class; presumably it is declared
        // by GenerateLivePathForTrait. Confirm.
        $this->router = $router;
        $this->em = $em;
        $this->webDir = $webDir;
        $this->apps = $apps;

        // NOTE(review): loadKernel() is not defined in this class; presumably provided by KernelTrait.
        static::loadKernel($kernel);
    }

    /**
     * Forgets the errors recorded by a previous scan.
     */
    protected function resetErrors()
    {
        $this->errors = [];
    }

    /**
     * Renders the page and checks every document it links to.
     *
     * @return true|array true when the page is a redirection or when no error was recorded,
     *                    otherwise the list of errors (see addError() for their shape)
     */
    public function scan(PageInterface $page)
    {
        $this->app = new App($page->getHost(), $this->apps);
        $this->currentPage = $page;
        $this->resetErrors();
        $this->pageHtml = '';

        if (false !== $page->getRedirection()) {
            // TODO: check that $page->getRedirection() answers with a 20X.

            return true; // or status code
        }

        $liveUri = $this->generateLivePathFor($page);
        $this->pageHtml = $this->getHtml($liveUri);

        // Collect every linked document (href="", data-rot="", data-img="", src="", data-bg)
        // and check each of them.
        if ($this->pageHtml) {
            $this->checkLinkedDocs($this->getLinkedDocs());
        }

        return empty($this->errors) ? true : $this->errors;
    }

    /**
     * Renders the given URI through the application kernel.
     *
     * @param string $liveUri
     *
     * @return string|null the response content, or null when the response is a redirection
     *                     or has a status code other than 200 (an error is recorded in that case)
     */
    protected function getHtml($liveUri)
    {
        $request = Request::create($liveUri);
        $response = static::$appKernel->handle($request);

        if ($response->isRedirect()) {
            // TODO: check the redirection target ($response->headers->get('location')).
            return null;
        }

        if (200 != $response->getStatusCode()) {
            // Record the failure and give control back to scan() so that the error is
            // reported to the caller instead of terminating the whole process.
            $this->addError('error on generating the page ('.$response->getStatusCode().')');

            return null;
        }

        return $response->getContent();
    }

    /**
     * Records an error for the page currently scanned.
     *
     * @param string $message
     */
    protected function addError($message)
    {
        $this->errors[] = [
            'message' => $message,
            'page' => $this->currentPage,
        ];
    }

    /**
     * Turns a string, or a (possibly nested) array of strings, into a regex fragment
     * safe to use between "/" delimiters.
     *
     * @param string|array $var
     *
     * @return string the quoted string, or a "(a|b|c)" alternation for an array
     */
    protected static function prepareForRegex($var)
    {
        if (\is_string($var)) {
            return preg_quote($var, '/');
        }

        // Array callable instead of the 'static::method' string form, which is deprecated since PHP 8.2.
        $var = array_map([static::class, 'prepareForRegex'], $var);

        return '('.implode('|', $var).')';
    }

    /**
     * Tells whether the string looks like an absolute http(s) URL or a path.
     *
     * @return int|false the preg_match() result (1 on match, 0 otherwise)
     */
    protected static function isWebLink(string $url)
    {
        return preg_match('@^((?:(http:|https:)//([\w\d-]+\.)+[\w\d-]+){0,1}(/?[\w~,;\-\./?%&+#=]*))$@', $url);
    }

    /**
     * Extracts from the current page HTML the URIs found in the href, data-rot, src,
     * data-img and data-bg attributes.
     *
     * Fragments (#...) are dropped, the "https://<main host>" prefix is removed and
     * data-rot values are decoded with str_rot13().
     *
     * @return array unique URIs (keys are not re-indexed)
     */
    protected function getLinkedDocs(): array
    {
        $urlInAttributes = ' '.self::prepareForRegex(['href', 'data-rot', 'src', 'data-img', 'data-bg']);
        $regex = '/'.$urlInAttributes.'=((["\'])([^\3]+)\3|([^\s>]+)[\s>])/iU';

        // Groups: 1 = attribute name, 4 = quoted value, 5 = unquoted value.
        if (! preg_match_all($regex, (string) $this->pageHtml, $matches)) {
            return [];
        }

        $linkedDocs = [];
        $matchesCount = \count($matches[0]);
        for ($k = 0; $k < $matchesCount; ++$k) {
            // preg_match_all() fills the groups which did not take part in a match with '',
            // so isset() cannot tell a quoted value from an unquoted one.
            $uri = '' !== $matches[4][$k] ? $matches[4][$k] : $matches[5][$k];
            // The regex is case-insensitive, so the attribute name is compared lower-cased.
            $uri = 'data-rot' === strtolower($matches[1][$k]) ? str_rot13($uri) : $uri;
            // Drop the fragment; unlike strtok(), this yields '' for a pure "#anchor" link.
            $uri = explode('#', $uri, 2)[0];
            $uri = $this->removeBase($uri);
            if ('' !== $uri && self::isWebLink($uri)) {
                $linkedDocs[] = $uri;
            }
        }

        return array_unique($linkedDocs);
    }

    /**
     * Strips the leading "https://<main host>" from the URL, if present.
     *
     * @param string $url
     *
     * @return string
     */
    protected function removeBase($url)
    {
        $base = 'https://'.$this->app->getMainHost();

        if (0 === strpos($url, $base)) {
            // Cast: before PHP 8, substr() returns false when the URL is exactly the base.
            return (string) substr($url, \strlen($base));
        }

        return $url;
    }

    /**
     * @return int number of linked documents visited so far
     */
    public function getLinksCheckedCounter()
    {
        return $this->linksCheckedCounter;
    }

    /**
     * Records an error for each linked document which cannot be found.
     *
     * URIs starting with "/" are checked locally (uriExist()), URIs starting with "http"
     * are requested (urlExist()); anything else is counted but not checked.
     */
    protected function checkLinkedDocs(array $linkedDocs)
    {
        foreach ($linkedDocs as $uri) {
            ++$this->linksCheckedCounter;
            if (! \is_string($uri)) {
                continue;
            }

            $missingLocally = '/' == $uri[0] && ! $this->uriExist($uri);
            if ($missingLocally || (0 === strpos($uri, 'http') && ! $this->urlExist($uri))) {
                $this->addError('<code>'.$uri.'</code> introuvable');
            }
        }
    }

    /**
     * Requests the URL and tells whether it answers with a 200.
     *
     * This is really slow on big websites.
     *
     * @param string $uri
     *
     * @return bool
     */
    protected function urlExist($uri)
    {
        $harvest = Harvest::fromUrl(
            $uri,
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.107 Safari/537.36',
            'en,en-US;q=0.5',
            $this->previousRequest
        );

        // An int result is treated as a failed request.
        if (\is_int($harvest) || 200 !== $harvest->getResponse()->getStatusCode()) {
            return false;
        }

        $this->previousRequest = $harvest->getResponse()->getRequest();

        return true;
    }

    /**
     * Tells whether a local URI matches a page, a file under the web directory or the feed.
     *
     * Results are cached per slug in $everChecked.
     *
     * @param string $uri
     *
     * @return bool
     */
    protected function uriExist($uri)
    {
        $slug = ltrim($uri, '/');

        if (isset($this->everChecked[$slug])) {
            return $this->everChecked[$slug];
        }

        // Media are not looked up in the database: checking that the file exists is enough.
        $checkDatabase = 0 !== strpos($slug, 'media/');
        $page = null;
        if ($checkDatabase) {
            // TODO: add a domain check (currentPage domain).
            $page = $this->em->getRepository(\get_class($this->currentPage))
                ->findOneBy(['slug' => '' === $slug ? 'homepage' : $slug]);
        }

        $this->everChecked[$slug] = null !== $page
            || file_exists($this->webDir.'/'.$slug)
            || 'feed.xml' === $slug;

        return $this->everChecked[$slug];
    }
}
||
226 |
In general, usage of exit should be done with care and only when running in a scripting context like a CLI script.