1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace PiedWeb\CMSBundle\Extension\PageScanner; |
4
|
|
|
|
5
|
|
|
use Doctrine\ORM\EntityManagerInterface; |
6
|
|
|
use PiedWeb\CMSBundle\Entity\PageInterface; |
7
|
|
|
use PiedWeb\CMSBundle\Service\App; |
8
|
|
|
use PiedWeb\CMSBundle\Utils\GenerateLivePathForTrait; |
9
|
|
|
use PiedWeb\CMSBundle\Utils\KernelTrait; |
10
|
|
|
use PiedWeb\UrlHarvester\Harvest; |
11
|
|
|
use Symfony\Component\HttpFoundation\Request; |
12
|
|
|
use Symfony\Component\HttpKernel\KernelInterface; |
13
|
|
|
use Symfony\Component\Routing\RouterInterface; |
14
|
|
|
use Twig\Environment as Twig_Environment; |
15
|
|
|
|
16
|
|
|
/**
 * Permits to find errors in the images or links of a page.
 *
 * The page is rendered through the application kernel, every URL found in a
 * `href`, `data-rot`, `src`, `data-img` or `data-bg` attribute is extracted,
 * then local paths are looked up in the database / web directory and absolute
 * http(s) URLs are requested remotely.
 *
 * NOTE(review): `$this->router` is assigned in the constructor but is not
 * declared in this class; presumably it is declared or consumed by
 * GenerateLivePathForTrait - TODO confirm.
 */
class PageScannerService
{
    use GenerateLivePathForTrait;
    use KernelTrait;

    /**
     * @var App
     */
    protected $app;

    /**
     * @var EntityManagerInterface
     */
    protected $em;

    /**
     * Html of the page being scanned; empty (or null) when nothing could be rendered.
     *
     * @var string|null
     */
    protected $pageHtml;

    /**
     * @var Twig_Environment
     */
    protected $twig;

    /**
     * @var PageInterface
     */
    protected $currentPage;

    /**
     * Directory used to resolve local file paths (see uriExist()).
     *
     * @var string
     */
    protected $webDir;

    /**
     * Request of the last successful remote check, handed back to Harvest::fromUrl().
     *
     * @var mixed
     */
    protected $previousRequest;

    /**
     * Apps configuration forwarded as is to the App constructor.
     *
     * @var array
     */
    protected $apps;

    /**
     * @var int
     */
    protected $linksCheckedCounter = 0;

    /**
     * Errors of the current scan, each one shaped as ['message' => ..., 'page' => ...].
     *
     * @var array
     */
    protected $errors = [];

    /**
     * Cache of the local lookups: slug => bool (exists or not).
     *
     * @var array
     */
    protected $everChecked = [];

    /**
     * Kernel used to render the pages.
     * NOTE(review): presumably initialised by KernelTrait::loadKernel() - TODO confirm.
     */
    public static $appKernel;

    /**
     * @param Twig_Environment       $twig
     * @param EntityManagerInterface $em
     * @param string                 $webDir directory in which local files are looked up
     * @param array                  $apps   apps configuration given to App
     * @param RouterInterface        $router
     * @param KernelInterface        $kernel kernel handed to loadKernel()
     */
    public function __construct(
        Twig_Environment $twig,
        EntityManagerInterface $em,
        string $webDir,
        array $apps,
        RouterInterface $router,
        KernelInterface $kernel
    ) {
        $this->twig = $twig;
        $this->router = $router;
        $this->em = $em;
        $this->webDir = $webDir;
        $this->apps = $apps;

        static::loadKernel($kernel);
    }

    /**
     * Forgets the errors collected by a previous scan.
     */
    protected function resetErrors()
    {
        $this->errors = [];
    }

    /**
     * Scans a page and reports the linked documents which can't be found.
     *
     * @return true|array true when no error was found (or when the page is a
     *                    redirection, which is not checked yet), else the list
     *                    of errors (see addError() for their shape)
     */
    public function scan(PageInterface $page)
    {
        $this->app = new App($page->getHost(), $this->apps);
        $this->currentPage = $page;
        $this->resetErrors();
        $this->pageHtml = '';

        if (false !== $page->getRedirection()) {
            // todo: check $page->getRedirection() returns a 20X

            return true; // or status code
        }

        $liveUri = $this->generateLivePathFor($page);
        $this->pageHtml = $this->getHtml($liveUri);

        // Collect every link (href="", data-rot="", data-img="", src="", data-bg) and check it.
        if ($this->pageHtml) {
            $this->checkLinkedDocs($this->getLinkedDocs());
        }

        return empty($this->errors) ? true : $this->errors;
    }

    /**
     * Renders the given uri through the application kernel.
     *
     * @param string $liveUri
     *
     * @return string|null the html, or null when the response is a redirection
     *                     or has a status code other than 200 (in that last
     *                     case an error is recorded)
     */
    protected function getHtml($liveUri)
    {
        $request = Request::create($liveUri);
        $response = static::$appKernel->handle($request);

        if ($response->isRedirect()) {
            // todo check redirection (location header)
            return null;
        }

        if (200 != $response->getStatusCode()) {
            $this->addError('error on generating the page ('.$response->getStatusCode().')');

            // Never terminate the process from a service: returning lets scan()
            // hand the recorded error back to its caller.
            return null;
        }

        return $response->getContent();
    }

    /**
     * Records an error for the page currently scanned.
     *
     * @param string $message
     */
    protected function addError($message)
    {
        $this->errors[] = [
            'message' => $message,
            'page' => $this->currentPage,
        ];
    }

    /**
     * Escapes a string for a `/`-delimited regex, or turns an array of strings
     * (possibly nested) into an escaped alternation group: `(a|b|c)`.
     *
     * @param string|array $var
     *
     * @return string
     */
    protected static function prepareForRegex($var)
    {
        if (\is_string($var)) {
            return preg_quote($var, '/');
        }

        // Array callable instead of the 'static::method' string, which is deprecated since PHP 8.2.
        $var = array_map([static::class, 'prepareForRegex'], $var);

        return '('.implode('|', $var).')';
    }

    /**
     * Tells if the url looks like a path or an absolute http(s) url.
     *
     * @return int|false result of preg_match(): 1 on match, 0 otherwise, false on error
     */
    protected static function isWebLink(string $url)
    {
        return preg_match('@^((?:(http:|https:)//([\w\d-]+\.)+[\w\d-]+){0,1}(/?[\w~,;\-\./?%&+#=]*))$@', $url);
    }

    /**
     * Extracts from the current html the unique urls found in the watched attributes.
     *
     * Fragments are dropped, `data-rot` values are rot13-decoded and the
     * `https://<main host>` prefix is removed (see removeBase()).
     *
     * @return array the urls, keyed as array_unique() leaves them (keys may be non-contiguous)
     */
    protected function getLinkedDocs(): array
    {
        $urlInAttributes = ' '.self::prepareForRegex(['href', 'data-rot', 'src', 'data-img', 'data-bg']);
        // Groups: 1 = attribute name, 3 = quote, 4 = quoted value, 5 = unquoted value.
        // Inside the character class `\3` is not a back-reference; the quoted value
        // stops at the closing quote thanks to the ungreedy (U) modifier.
        $regex = '/'.$urlInAttributes.'=((["\'])([^\3]+)\3|([^\s>]+)[\s>])/iU';

        // 0 (no match) and false (pcre error) both mean there is nothing to check.
        if (! preg_match_all($regex, (string) $this->pageHtml, $matches)) {
            return [];
        }

        $linkedDocs = [];
        $matchesCount = \count($matches[0]);
        for ($k = 0; $k < $matchesCount; ++$k) {
            // preg_match_all() pads the groups which did not take part in the match
            // with '', so isset() can't be used to pick the quoted or unquoted value.
            $uri = '' !== $matches[4][$k] ? $matches[4][$k] : $matches[5][$k];
            // The attribute is matched case-insensitively, so compare it the same way.
            $uri = 'data-rot' === strtolower($matches[1][$k]) ? str_rot13($uri) : $uri;
            // Drop the fragment. strtok() is avoided: it returns false on '' or '#'
            // and skips a leading '#', turning '#/foo' into the path '/foo'.
            $uri = explode('#', $uri, 2)[0];
            $uri = $this->removeBase($uri);
            if ('' !== $uri && self::isWebLink($uri)) {
                $linkedDocs[] = $uri;
            }
        }

        return array_unique($linkedDocs);
    }

    /**
     * Removes the leading `https://<main host>` from the url, if any.
     *
     * @param string $url
     *
     * @return string
     */
    protected function removeBase($url)
    {
        $base = 'https://'.$this->app->getMainHost();

        if (0 === strpos($url, $base)) {
            return substr($url, \strlen($base));
        }

        return $url;
    }

    /**
     * @return int number of links seen by checkLinkedDocs() since the service was created
     */
    public function getLinksCheckedCounter()
    {
        return $this->linksCheckedCounter;
    }

    /**
     * Records an error for each linked document which can't be found.
     *
     * Only uris starting with `/` (local lookup) or `http` (remote request)
     * are verified; every entry is counted, even the ignored ones.
     */
    protected function checkLinkedDocs(array $linkedDocs)
    {
        foreach ($linkedDocs as $uri) {
            ++$this->linksCheckedCounter;

            if (! \is_string($uri)) {
                continue;
            }

            // strpos() rather than $uri[0]: the latter raises a warning on an empty string.
            $missingLocally = 0 === strpos($uri, '/') && ! $this->uriExist($uri);

            if ($missingLocally || (0 === strpos($uri, 'http') && ! $this->urlExist($uri))) {
                $this->addError('<code>'.$uri.'</code> introuvable');
            }
        }
    }

    /**
     * Requests a remote url; this is really slow on big website.
     *
     * @param string $uri
     *
     * @return bool true only when the harvest succeeded with a 200 status code
     */
    protected function urlExist($uri)
    {
        $harvest = Harvest::fromUrl(
            $uri,
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.107 Safari/537.36',
            'en,en-US;q=0.5',
            $this->previousRequest
        );

        // An int is returned instead of a Harvest object when the request failed.
        if (\is_int($harvest) || 200 !== $harvest->getResponse()->getStatusCode()) {
            return false;
        }

        $this->previousRequest = $harvest->getResponse()->getRequest();

        return true;
    }

    /**
     * Tells if a local uri targets a known page, an existing file or the feed.
     *
     * The answer is cached per slug for the lifetime of the service.
     *
     * @param string $uri
     *
     * @return bool
     */
    protected function uriExist($uri)
    {
        $slug = ltrim($uri, '/');

        if (isset($this->everChecked[$slug])) {
            return $this->everChecked[$slug];
        }

        // We avoid checking the media in the database: file_exists is enough.
        $page = null;
        if (0 !== strpos($slug, 'media/')) {
            // todo add domain check (currentPage domain)
            $page = $this->em->getRepository(\get_class($this->currentPage))
                ->findOneBy(['slug' => '' === $slug ? 'homepage' : $slug]);
        }

        $this->everChecked[$slug] = null !== $page
            || file_exists($this->webDir.'/'.$slug)
            || 'feed.xml' === $slug;

        return $this->everChecked[$slug];
    }
}
226
|
|
|
|
In general, usage of exit should be done with care and only when running in a scripting context like a CLI script.