PageScannerService   A
last analyzed

Complexity

Total Complexity 39

Size/Duplication

Total Lines 205
Duplicated Lines 0 %

Importance

Changes 2
Bugs 1 Features 0
Metric Value
eloc 95
c 2
b 1
f 0
dl 0
loc 205
rs 9.28
wmc 39

13 Methods

Rating   Name   Duplication   Size   Complexity  
A scan() 0 23 4
A getLinkedDocs() 0 19 6
A removeBase() 0 7 2
A prepareForRegex() 0 9 2
B checkLinkedDocs() 0 10 7
A getLinksCheckedCounter() 0 3 1
A isWebLink() 0 3 1
A resetErrors() 0 3 1
A __construct() 0 15 1
A addError() 0 5 1
B uriExist() 0 19 7
A urlExist() 0 16 3
A getHtml() 0 17 3
1
<?php
2
3
namespace PiedWeb\CMSBundle\Extension\PageScanner;
4
5
use Doctrine\ORM\EntityManagerInterface;
6
use PiedWeb\CMSBundle\Entity\PageInterface;
7
use PiedWeb\CMSBundle\Service\App;
8
use PiedWeb\CMSBundle\Utils\GenerateLivePathForTrait;
9
use PiedWeb\CMSBundle\Utils\KernelTrait;
10
use PiedWeb\UrlHarvester\Harvest;
11
use Symfony\Component\HttpFoundation\Request;
12
use Symfony\Component\HttpKernel\KernelInterface;
13
use Symfony\Component\Routing\RouterInterface;
14
use Twig\Environment as Twig_Environment;
15
16
/**
 * Scans the rendered HTML of a page and reports the linked documents
 * (links, images, backgrounds...) that cannot be found.
 */
class PageScannerService
{
    use GenerateLivePathForTrait;
    use KernelTrait;

    /**
     * App built in scan() from the host of the page being scanned.
     *
     * @var App
     */
    protected $app;

    /**
     * Entity manager used to look pages up by slug.
     *
     * @var EntityManagerInterface
     */
    protected $em;

    /**
     * HTML of the page being scanned ('' or null when nothing could be rendered).
     *
     * @var string|null
     */
    protected $pageHtml;

    /**
     * Injected Twig environment (not used by the code of this class itself).
     *
     * @var Twig_Environment
     */
    protected $twig;

    /**
     * Page currently being scanned; attached to every reported error.
     *
     * @var PageInterface
     */
    protected $currentPage;

    /**
     * Directory against which local URIs are resolved when checking that a file exists.
     *
     * @var string
     */
    protected $webDir;

    /**
     * Request of the last successful external check, handed back to the next Harvest call.
     */
    protected $previousRequest;

    /**
     * Raw apps configuration forwarded to the App constructor.
     *
     * @var array
     */
    protected $apps;

    /**
     * Number of linked documents processed since this service was created.
     *
     * @var int
     */
    protected $linksCheckedCounter = 0;

    /**
     * Errors collected by the current scan; each entry has a 'message' and a 'page' key.
     *
     * @var array
     */
    protected $errors = [];

    /**
     * Cache of the local lookups already done: slug => exists.
     *
     * @var array<string, bool>
     */
    protected $everChecked = [];

    /**
     * Kernel used to render pages through internal sub-requests.
     * NOTE(review): presumably populated by KernelTrait::loadKernel() - confirm.
     */
    public static $appKernel;

    /**
     * @param Twig_Environment       $twig
     * @param EntityManagerInterface $em
     * @param string                 $webDir directory in which local files are looked up
     * @param array                  $apps   apps configuration forwarded to App
     * @param RouterInterface        $router
     * @param KernelInterface        $kernel kernel handed to loadKernel()
     */
    public function __construct(
        Twig_Environment $twig,
        EntityManagerInterface $em,
        string $webDir,
        array $apps,
        RouterInterface $router,
        KernelInterface $kernel
    ) {
        $this->twig = $twig;
        // NOTE(review): $router is not declared in this class; presumably declared by
        // GenerateLivePathForTrait - confirm.
        $this->router = $router;
        $this->em = $em;
        $this->webDir = $webDir;
        $this->apps = $apps;

        static::loadKernel($kernel);
    }

    /**
     * Forgets the errors collected by a previous scan.
     */
    protected function resetErrors()
    {
        $this->errors = [];
    }

    /**
     * Renders the page and checks every document it links to.
     *
     * @return true|array true when nothing is wrong, else the list of errors
     *                    (each one an array with a 'message' and a 'page' key)
     */
    public function scan(PageInterface $page)
    {
        $this->app = new App($page->getHost(), $this->apps);
        $this->currentPage = $page;
        $this->resetErrors();
        $this->pageHtml = '';

        if (false !== $page->getRedirection()) {
            // TODO: check that $page->getRedirection() answers with a 20X status

            return true; // or status code
        }

        $liveUri = $this->generateLivePathFor($page);
        $this->pageHtml = $this->getHtml($liveUri);

        // Collect every link (href, data-rot, data-img, src, data-bg) and check each of them.
        if ($this->pageHtml) {
            $this->checkLinkedDocs($this->getLinkedDocs());
        }

        return empty($this->errors) ? true : $this->errors;
    }

    /**
     * Renders a local URI through the application kernel.
     *
     * A response that is neither a redirection nor a 200 is recorded as an error.
     * The method then simply returns, so that scan() can hand the error back to
     * its caller instead of terminating the whole process.
     *
     * @param string $liveUri
     *
     * @return string|null the response body, or null on redirection or error
     */
    protected function getHtml($liveUri)
    {
        $request = Request::create($liveUri);
        $response = static::$appKernel->handle($request);

        if ($response->isRedirect()) {
            // TODO: check the redirection target ($response->headers->get('location'))
            return null;
        }

        if (200 !== $response->getStatusCode()) {
            $this->addError('error on generating the page ('.$response->getStatusCode().')');

            return null;
        }

        return $response->getContent();
    }

    /**
     * Records an error for the page currently being scanned.
     *
     * @param string $message
     */
    protected function addError($message)
    {
        $this->errors[] = [
            'message' => $message,
            'page' => $this->currentPage,
        ];
    }

    /**
     * Escapes a string for use inside a '/'-delimited regex, or turns an array
     * (possibly nested) into a group of escaped alternatives: (a|b|c).
     *
     * @param string|array $var
     *
     * @return string
     */
    protected static function prepareForRegex($var)
    {
        if (\is_string($var)) {
            return preg_quote($var, '/');
        }

        // A closure is used rather than the 'static::method' string callable,
        // which is deprecated since PHP 8.2.
        $alternatives = array_map(static function ($item) {
            return static::prepareForRegex($item);
        }, $var);

        return '('.implode('|', $alternatives).')';
    }

    /**
     * Tells whether the string looks like an absolute http(s) URL or a local path.
     *
     * @return int|false the result of preg_match(): 1 on match, 0 otherwise, false on error
     */
    protected static function isWebLink(string $url)
    {
        return preg_match('@^((?:(http:|https:)//([\w\d-]+\.)+[\w\d-]+){0,1}(/?[\w~,;\-\./?%&+#=]*))$@', $url);
    }

    /**
     * Extracts from the page HTML the distinct documents referenced by the
     * href, data-rot, src, data-img and data-bg attributes.
     *
     * Fragments are dropped, the main host is stripped from absolute URLs and
     * anything that is empty or does not look like a web link is ignored.
     *
     * @return array the URIs found (keys are not re-indexed)
     */
    protected function getLinkedDocs(): array
    {
        $urlInAttributes = ' '.self::prepareForRegex(['href', 'data-rot', 'src', 'data-img', 'data-bg']);
        $regex = '/'.$urlInAttributes.'=((["\'])([^\3]+)\3|([^\s>]+)[\s>])/iU';

        // Nothing matched (0) or the regex engine failed (false): no linked document.
        if (! preg_match_all($regex, $this->pageHtml, $matches)) {
            return [];
        }

        $linkedDocs = [];
        foreach (array_keys($matches[0]) as $k) {
            // Group 4 holds a quoted value, group 5 an unquoted one. preg_match_all() fills
            // the groups that did not take part in the match with '', hence the '' test.
            $uri = '' !== $matches[4][$k] ? $matches[4][$k] : ($matches[5][$k] ?? '');

            // The regex is case-insensitive, so the attribute name is compared in lower case.
            if ('data-rot' === strtolower($matches[1][$k])) {
                $uri = str_rot13($uri);
            }

            // Drop the fragment: a link made only of a fragment ends up empty and is skipped.
            $uri = explode('#', $uri, 2)[0];
            $uri = $this->removeBase($uri);

            if ('' !== $uri && self::isWebLink($uri)) {
                $linkedDocs[] = $uri;
            }
        }

        return array_unique($linkedDocs);
    }

    /**
     * Turns an absolute https URL on the main host of the current app into a local URI.
     *
     * @param string $url
     *
     * @return string the URL without its 'https://<main host>' prefix, or untouched
     */
    protected function removeBase($url)
    {
        $base = 'https://'.$this->app->getMainHost();

        if (0 === strpos($url, $base)) {
            return substr($url, \strlen($base));
        }

        return $url;
    }

    /**
     * @return int number of linked documents processed so far
     */
    public function getLinksCheckedCounter()
    {
        return $this->linksCheckedCounter;
    }

    /**
     * Checks every linked document and records an error for each missing one.
     *
     * Local URIs (starting with '/') go through uriExist(), URLs starting with
     * 'http' through urlExist(); anything else is counted but not checked.
     */
    protected function checkLinkedDocs(array $linkedDocs)
    {
        foreach ($linkedDocs as $uri) {
            ++$this->linksCheckedCounter;

            // Reading $uri[0] requires a non-empty string.
            if (! \is_string($uri) || '' === $uri) {
                continue;
            }

            if ('/' === $uri[0]) {
                $exists = $this->uriExist($uri);
            } elseif (0 === strpos($uri, 'http')) {
                $exists = $this->urlExist($uri);
            } else {
                continue;
            }

            if (! $exists) {
                $this->addError('<code>'.$uri.'</code> introuvable');
            }
        }
    }

    /**
     * Checks that an external URL answers with a 200 status.
     *
     * This performs a real HTTP request and is really slow on big websites.
     *
     * @param string $uri
     *
     * @return bool
     */
    protected function urlExist($uri)
    {
        $harvest = Harvest::fromUrl(
            $uri,
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.107 Safari/537.36',
            'en,en-US;q=0.5',
            $this->previousRequest
        );

        // An integer returned in place of a Harvest object is treated as a failed request.
        if (\is_int($harvest) || 200 !== $harvest->getResponse()->getStatusCode()) {
            return false;
        }

        $this->previousRequest = $harvest->getResponse()->getRequest();

        return true;
    }

    /**
     * Checks that a local URI matches a known page, an existing file or the feed.
     *
     * The query string is ignored for the lookup and results are cached per slug.
     *
     * @param string $uri local URI starting with '/'
     *
     * @return bool
     */
    protected function uriExist($uri)
    {
        // A query string is neither part of a page slug nor of a file name
        // (e.g. /media/img.jpg?v=2), so it is left out of the lookup.
        $slug = ltrim(explode('?', $uri, 2)[0], '/');

        if (isset($this->everChecked[$slug])) {
            return $this->everChecked[$slug];
        }

        // Media are not looked up in the database: checking that the file exists is enough.
        $checkDatabase = 0 !== strpos($slug, 'media/');

        $page = null;
        if ($checkDatabase) {
            // todo add domain check (currentPage domain)
            $page = $this->em->getRepository(\get_class($this->currentPage))
                ->findOneBy(['slug' => '' === $slug ? 'homepage' : $slug]);
        }

        $this->everChecked[$slug] = null !== $page
            || file_exists($this->webDir.'/'.$slug)
            || 'feed.xml' === $slug;

        return $this->everChecked[$slug];
    }
}
226