Passed
Push — master ( a8de00...373459 )
by Dev
04:36
created

Harvest::indexable()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
eloc 1
c 0
b 0
f 0
dl 0
loc 3
ccs 2
cts 2
cp 1
rs 10
cc 1
nc 1
nop 1
crap 1
1
<?php
2
3
namespace PiedWeb\UrlHarvester;
4
5
use PiedWeb\Curl\Request as CurlRequest;
6
use PiedWeb\Curl\Response;
7
use PiedWeb\TextAnalyzer\Analyzer as TextAnalyzer;
8
use Spatie\Robots\RobotsHeaders;
0 ignored issues
show
Bug introduced by
This use statement conflicts with another class in this namespace, PiedWeb\UrlHarvester\RobotsHeaders. Consider defining an alias.

Let's assume that you have a directory layout like this:

.
|-- OtherDir
|   |-- Bar.php
|   `-- Foo.php
`-- SomeDir
    `-- Foo.php

and let's assume the following content of Bar.php:

// Bar.php
namespace OtherDir;

use SomeDir\Foo; // This now conflicts with the class OtherDir\Foo

If both files OtherDir/Foo.php and SomeDir/Foo.php are loaded in the same runtime, you will see a PHP error such as the following:

PHP Fatal error:  Cannot use SomeDir\Foo as Foo because the name is already in use in OtherDir/Foo.php

However, as OtherDir/Foo.php does not necessarily have to be loaded and the error is only triggered if it is loaded before OtherDir/Bar.php, this problem might go unnoticed for a while. In order to prevent this error from surfacing, you must import the namespace with a different alias:

// Bar.php
namespace OtherDir;

use SomeDir\Foo as SomeDirFoo; // There is no conflict anymore.
Loading history...
9
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
10
11
class Harvest
12
{
13
    use HarvestLinksTrait;
14
    use RobotsTxtTrait;
15
16
    const DEFAULT_USER_AGENT = 'SeoPocketCrawler - Open Source Bot for SEO Metrics';
17
18
    /**
19
     * @var Response
20
     */
21
    protected $response;
22
23
    /**
24
     * @var \Symfony\Component\DomCrawler\Crawler
25
     */
26
    protected $dom;
27
28
    /** @var string */
29
    protected $baseUrl;
30
31
    /** @var bool */
32
    protected $follow;
33
34
    /** @var \PiedWeb\TextAnalyzer\Analysis */
35
    private $textAnalysis;
36
37
    /** @var Url */
38
    protected $urlRequested;
39
40
    /** @var Url */
41
    protected $url;
42
43
    /**
     * Request a URL and wrap the result in a Harvest.
     *
     * The URL is normalized with Link::normalizeUrl() and then handed to
     * Request::makeFromRequest() together with the previous request (if any).
     *
     * @param string           $url             URL to request
     * @param string           $userAgent       user agent forwarded to the request
     * @param string           $language        language string forwarded to the request
     * @param CurlRequest|null $previousRequest previous request forwarded to Request::makeFromRequest()
     *
     * @return self|int a Harvest when a Response was obtained, otherwise the value
     *                  returned by Request::makeFromRequest() as-is
     */
    public static function fromUrl(
        string $url,
        string $userAgent = self::DEFAULT_USER_AGENT,
        string $language = 'en,en-US;q=0.5',
        ?CurlRequest $previousRequest = null
    ) {
        $normalizedUrl = Link::normalizeUrl($url); // add trailing slash for domain
        $result = Request::makeFromRequest($previousRequest, $normalizedUrl, $userAgent, $language);

        return $result instanceof Response ? new self($result) : $result;
    }
62
63 24
    /**
     * Build a Harvest around an already obtained response.
     *
     * Keeps the response and wraps its effective URL and its requested URL in Url objects.
     */
    public function __construct(Response $response)
    {
        $this->response = $response;

        $this->url = new Url($response->getEffectiveUrl());
        $this->urlRequested = new Url($response->getUrl());
    }
70
71 6
    /**
     * Return the URL that was requested (built from the response's getUrl()).
     */
    public function urlRequested(): Url
    {
        return $this->urlRequested;
    }
75
76
    /**
     * Return the response URL (curl effective URL).
     *
     * TODO: check whether urlRequested can differ from url (depends on curl).
     */
    public function url(): Url
    {
        return $this->url;
    }
84
85 15
    /**
     * Return the response URL; returns the same property as url().
     */
    public function getUrl(): Url
    {
        return $this->url;
    }
89
90 18
    /**
     * Return the response this Harvest was built from.
     */
    public function getResponse(): Response
    {
        return $this->response;
    }
94
95 30
    /**
     * Return the DOM crawler for the response content, creating it on first use.
     *
     * @return DomCrawler
     */
    public function getDom()
    {
        if (null === $this->dom) {
            $this->dom = new DomCrawler($this->response->getContent());
        }

        return $this->dom;
    }
101
102 21
    /**
     * Filter the DOM with a selector.
     *
     * @param string   $selector selector passed to the crawler's filter()
     * @param int|null $i        when not null, narrow the result to the match at this position
     */
    private function find($selector, $i = null): DomCrawler
    {
        $matches = $this->getDom()->filter($selector);

        if (null === $i) {
            return $matches;
        }

        return $matches->eq($i);
    }
106
107
    /**
     * Shortcut for find($selector, 0): keep only the first match.
     */
    private function findOne($selector): DomCrawler
    {
        $firstMatch = $this->find($selector, 0);

        return $firstMatch;
    }
114
115
    /**
     * Return the cleaned text of the first element matching a selector.
     * E.g.: getTag('title').
     *
     * @return string|null null when nothing matches the selector
     */
    public function getTag($selector)
    {
        $first = $this->findOne($selector);

        if ($first->count() > 0) {
            return Helper::clean($first->text());
        }

        return null;
    }
127
128 6
    /**
     * Return the cleaned text of an element expected to appear only once.
     *
     * @return string|null null when nothing matches; a warning string made of the
     *                     match count and the selector when there are several matches;
     *                     otherwise the cleaned text of the single match
     */
    public function getUniqueTag($selector = 'title')
    {
        $matches = $this->find($selector);
        $total = $matches->count();

        if (0 === $total) {
            return null;
        }

        if ($total > 1) {
            return $total.' `'.$selector.'` /!\ ';
        }

        return Helper::clean($matches->eq(0)->text());
    }
142
143
    /**
     * Return the content attribute of the first meta tag with the given name.
     *
     * @param string $name value of the meta tag's name attribute
     *
     * @return string|null null when no such meta tag exists, an empty string when the
     *                     tag has no content attribute, otherwise the cleaned content
     */
    public function getMeta(string $name): ?string
    {
        // Quote the attribute value: an unquoted value must be a plain CSS identifier,
        // so a name such as `twitter:card` would otherwise make the selector invalid.
        $meta = $this->findOne('meta[name="'.$name.'"]');

        if (0 === $meta->count()) {
            return null;
        }

        $content = $meta->attr('content');

        return null !== $content ? Helper::clean($content) : '';
    }
155
156
    /**
     * Return the href attribute of the first link rel=canonical tag.
     *
     * @return string|null null when there is no canonical link, an empty string when
     *                     the tag has no href attribute
     */
    public function getCanonical(): ?string
    {
        $link = $this->findOne('link[rel=canonical]');

        if ($link->count() > 0) {
            return $link->attr('href') ?? '';
        }

        return null;
    }
165
166
    /**
     * Tell whether the canonical link agrees with the requested URL.
     *
     * @return bool true when the page has no canonical tag, or when its href is
     *              (loosely) equal to the requested URL
     */
    public function isCanonicalCorrect(): bool
    {
        $canonical = $this->getCanonical();

        if (null === $canonical) {
            return true;
        }

        return $this->urlRequested()->get() == $canonical;
    }
175
176 6
    /**
     * Return the text analysis of the page, keeping a non-null result for later calls.
     *
     * @return \PiedWeb\TextAnalyzer\Analysis|null null when the DOM holds no node
     */
    public function getTextAnalysis()
    {
        if (null === $this->textAnalysis) {
            $dom = $this->getDom();

            $this->textAnalysis = $dom->count() > 0
                ? TextAnalyzer::get(
                    $dom->text(),
                    true,   // only sentences
                    1,      // no expression, just words
                    0      // keep trail
                )
                : null;
        }

        return $this->textAnalysis;
    }
189
190 3
    /**
     * Return the expressions reported by the text analysis (getExpressions(10)).
     *
     * getTextAnalysis() returns null when the DOM holds no node; in that case an empty
     * array is returned instead of calling a method on null.
     * NOTE(review): an empty array assumes getExpressions() yields an array — confirm.
     *
     * @return mixed the value of Analysis::getExpressions(10), or [] without analysis
     */
    public function getKws()
    {
        $analysis = $this->getTextAnalysis();

        if (null === $analysis) {
            return [];
        }

        return $analysis->getExpressions(10);
    }
194
195 3
    /**
     * Return the text/code ratio of the page as a rounded percentage.
     *
     * Compares the byte length of the DOM text with the byte length of the cleaned
     * response content; yields 0 when the cleaned content is empty.
     */
    public function getRatioTxtCode(): int
    {
        $textLength = strlen($this->getDom()->text());
        $htmlLength = strlen(Helper::clean($this->response->getContent()));

        if ($htmlLength > 0) {
            return (int) round($textLength / $htmlLength * 100);
        }

        return (int) 0;
    }
202
203
    /**
     * Return the breadcrumb extracted by ExtractBreadcrumb::get().
     *
     * Without a separator the extracted value is returned untouched. With a separator
     * and an array breadcrumb, the getCleanName() of every item is joined into a string.
     *
     * The former native `?array` return type was removed: the joined form is a string,
     * so the separator branch could only end in a TypeError.
     *
     * @param string|null $separator glue used to join the item names, or null to get the raw value
     *
     * @return array|string|null
     */
    public function getBreadCrumb(?string $separator = null)
    {
        $breadcrumb = ExtractBreadcrumb::get($this);

        if (null === $separator || !is_array($breadcrumb)) {
            return $breadcrumb;
        }

        $names = array_map(static function ($item) {
            return $item->getCleanName();
        }, $breadcrumb);

        return implode($separator, $names);
    }
219
220
    /**
     * Return the redirection target announced by the Location header.
     *
     * Header names are compared case-insensitively. The header value is only used when
     * ExtractLinks::isWebLink() accepts it, and is resolved against the response URL.
     *
     * @return ?string absolute url, or null when there is no usable Location header
     */
    public function getRedirection(): ?string
    {
        $headers = array_change_key_case($this->response->getHeaders() ?: []);
        $location = $headers['location'] ?? null;

        if (null === $location || !ExtractLinks::isWebLink($location)) {
            return null;
        }

        return $this->url()->resolve($location);
    }
233
234
    /**
     * Return the redirection as a Link built by Link::createRedirection(),
     * or null when the response does not redirect.
     */
    public function getRedirectionLink(): ?Link
    {
        $target = $this->getRedirection();

        return null === $target ? null : Link::createRedirection($target, $this);
    }
244
245
    /**
     * Tell whether the response redirects to the https version of the requested URL.
     *
     * The requested URL with a leading `http:` replaced by `https:` must be (loosely)
     * equal to the redirection target.
     */
    public function isRedirectToHttps(): bool
    {
        $target = $this->getRedirection();

        if (null === $target) {
            return false;
        }

        $httpsVariant = preg_replace('#^http:#', 'https:', $this->urlRequested()->get(), 1);

        return $httpsVariant == $target;
    }
251
252
    /**
     * Return the href of the base tag when it exists and is a valid URL, else the current URL.
     *
     * The result is computed once and kept in $this->baseUrl.
     */
    public function getBaseUrl(): string
    {
        if (!isset($this->baseUrl)) {
            $base = $this->findOne('base');

            // findOne() always returns a crawler, and a crawler exposes attributes through
            // attr(), not as properties: the former isset($base->href) test was never true.
            $href = $base->count() > 0 ? $base->attr('href') : null;

            if (null !== $href && false !== filter_var($href, FILTER_VALIDATE_URL)) {
                $this->baseUrl = $href;
            } else {
                $this->baseUrl = $this->url()->get();
            }
        }

        return $this->baseUrl;
    }
268
269
    /**
     * Evaluate the indexability of this page through Indexable::indexable().
     *
     * @param string $userAgent user agent forwarded to Indexable::indexable()
     *
     * @return int corresponds to a const from Indexable
     */
    public function indexable(string $userAgent = 'googlebot'): int
    {
        $status = Indexable::indexable($this, $userAgent);

        return $status;
    }
276
277 3
    /**
     * Tell whether indexable() reports Indexable::INDEXABLE for the given user agent.
     */
    public function isIndexable(string $userAgent = 'googlebot'): bool
    {
        $status = $this->indexable($userAgent);

        return Indexable::INDEXABLE === $status;
    }
281
282 6
    /**
     * Tell whether the googlebot and robots meta tags allow following links.
     *
     * @return bool false as soon as one of the two meta contents contains `nofollow`
     */
    protected function metaAuthorizeToFollow()
    {
        foreach (['googlebot', 'robots'] as $metaName) {
            // Compare against false explicitly: strpos() returns 0 when the content starts
            // with `nofollow`, which a truthiness test mistakes for "not found".
            // The cast covers getMeta() returning null when the tag is absent.
            if (false !== strpos((string) $this->getMeta($metaName), 'nofollow')) {
                return false;
            }
        }

        return true;
    }
286
287 9
    /**
     * Tell whether links of this page may be followed.
     *
     * Both the robots headers of the response and the page's meta tags must allow it.
     * The answer is computed on the first call and kept in $this->follow.
     *
     * @return bool
     */
    public function mayFollow()
    {
        if (null !== $this->follow) {
            return $this->follow;
        }

        $headerRules = new RobotsHeaders($this->response->getHeaders());
        $this->follow = $headerRules->mayFollow() && $this->metaAuthorizeToFollow();

        return $this->follow;
    }
296
}
297