Passed
Push — master ( 81959d...92084b )
by Dev
14:57
created

Harvest::url()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
eloc 1
c 0
b 0
f 0
dl 0
loc 3
ccs 2
cts 2
cp 1
rs 10
cc 1
nc 1
nop 0
crap 1
1
<?php
2
3
namespace PiedWeb\UrlHarvester;
4
5
use PiedWeb\Curl\Request as CurlRequest;
6
use PiedWeb\Curl\Response;
7
use PiedWeb\TextAnalyzer\Analyzer as TextAnalyzer;
8
use Spatie\Robots\RobotsHeaders;
9
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
10
11
/**
 * Wraps an HTTP response and exposes helpers to extract SEO-related data from it
 * (tags, metas, canonical, redirection, base url, follow/index directives...).
 */
class Harvest
{
    use HarvestLinksTrait;
    use RobotsTxtTrait;

    const DEFAULT_USER_AGENT = 'SeoPocketCrawler - Open Source Bot for SEO Metrics';

    /**
     * @var Response
     */
    protected $response;

    /**
     * Lazily built by getDom().
     *
     * @var \Symfony\Component\DomCrawler\Crawler|null
     */
    protected $dom;

    /**
     * Lazily resolved by getBaseUrl().
     *
     * @var string|null
     */
    protected $baseUrl;

    /**
     * Lazily resolved by mayFollow(); null until the first call.
     *
     * @var bool|null
     */
    protected $follow;

    /**
     * Cached by getTextAnalysis(); stays null when the DOM is empty.
     *
     * @var \PiedWeb\TextAnalyzer\Analysis|null
     */
    private $textAnalysis;

    /** @var Url built from the response's getUrl() */
    protected $urlRequested;

    /** @var Url built from the response's getEffectiveUrl() */
    protected $url;

    /**
     * Normalize $url, request it and wrap the result in a Harvest.
     *
     * @param string           $url             url to request
     * @param string           $userAgent       forwarded to Request::makeFromRequest()
     * @param string           $language        forwarded to Request::makeFromRequest()
     * @param CurlRequest|null $previousRequest forwarded to Request::makeFromRequest()
     *
     * @return self|int a Harvest when the request produced a Response, otherwise the raw
     *                  value returned by Request::makeFromRequest()
     */
    public static function fromUrl(
        string $url,
        string $userAgent = self::DEFAULT_USER_AGENT,
        string $language = 'en,en-US;q=0.5',
        ?CurlRequest $previousRequest = null
    ) {
        $url = Link::normalizeUrl($url); // add trailing slash for domain

        $response = Request::makeFromRequest($previousRequest, $url, $userAgent, $language);

        return $response instanceof Response ? new self($response) : $response;
    }

    public function __construct(Response $response)
    {
        $this->response = $response;

        $this->url = new Url($this->response->getEffectiveUrl());
        $this->urlRequested = new Url($this->response->getUrl());
    }

    /**
     * Return the url as it was requested (the response's getUrl()).
     */
    public function urlRequested(): Url
    {
        return $this->urlRequested;
    }

    /**
     * Return url response (curl effective url)
     * // todo : check if urlRequested can be different than url (depends on curl wrench).
     */
    public function url(): Url
    {
        return $this->url;
    }

    /**
     * Same value as url().
     */
    public function getUrl(): Url
    {
        return $this->url;
    }

    public function getResponse(): Response
    {
        return $this->response;
    }

    /**
     * Return the DOM crawler for the response content, building it on first access.
     *
     * @return DomCrawler
     */
    public function getDom()
    {
        if (null === $this->dom) {
            $this->dom = new DomCrawler($this->response->getContent());
        }

        return $this->dom;
    }

    /**
     * Filter the DOM with a selector.
     *
     * @param string   $selector selector handed to DomCrawler::filter()
     * @param int|null $i        when not null, only the node at this position is kept
     */
    private function find($selector, $i = null): DomCrawler
    {
        $nodes = $this->getDom()->filter($selector);

        return null !== $i ? $nodes->eq($i) : $nodes;
    }

    /**
     * Alias for find($selector, 0).
     */
    private function findOne($selector): DomCrawler
    {
        return $this->find($selector, 0);
    }

    /**
     * Return content inside a selector.
     * Eg.: getTag('title').
     *
     * @return string|null cleaned text of the first matching node, null when nothing matches
     */
    public function getTag($selector)
    {
        $found = $this->findOne($selector);

        return $found->count() > 0 ? Helper::clean($found->text()) : null;
    }

    /**
     * Return the cleaned text of a node expected to appear exactly once.
     *
     * @return string|null null when nothing matches; a warning string made of the number of
     *                     matches and the selector when there is more than one match
     */
    public function getUniqueTag($selector = 'title')
    {
        $found = $this->find($selector);
        $count = $found->count();

        if (0 === $count) {
            return null;
        }

        if ($count > 1) {
            return $count.' `'.$selector.'` /!\ ';
        }

        return Helper::clean($found->eq(0)->text());
    }

    /**
     * Return content inside a meta.
     *
     * @param string $name value of the meta `name` attribute to look for
     *
     * @return string|null from content attribute: null when the meta is absent, an empty
     *                     string when the meta has no content attribute
     */
    public function getMeta(string $name): ?string
    {
        // Quote (and escape) the attribute value: an unquoted value must be a plain
        // identifier, so names such as `twitter:card` would not be a valid selector.
        $meta = $this->findOne('meta[name="'.addcslashes($name, '"\\').'"]');

        if (0 === $meta->count()) {
            return null;
        }

        $content = $meta->attr('content');

        return null !== $content ? Helper::clean($content) : '';
    }

    /**
     * Return the href attribute of the `link rel=canonical` tag.
     *
     * @return string|null null when there is no canonical tag, an empty string when the tag
     *                     has no href attribute
     */
    public function getCanonical(): ?string
    {
        $canonical = $this->findOne('link[rel=canonical]');

        if (0 === $canonical->count()) {
            return null;
        }

        $href = $canonical->attr('href');

        return null !== $href ? $href : '';
    }

    /**
     * @return bool true if the canonical equals the requested url or if there is no canonical tag
     */
    public function isCanonicalCorrect(): bool
    {
        $canonical = $this->getCanonical();

        return null === $canonical ? true : $this->urlRequested()->get() == $canonical;
    }

    /**
     * Return the text analysis of the DOM text, computed with TextAnalyzer::get().
     *
     * A non-null result is cached; when the DOM is empty the result is null.
     *
     * @return \PiedWeb\TextAnalyzer\Analysis|null
     */
    public function getTextAnalysis()
    {
        if (null !== $this->textAnalysis) {
            return $this->textAnalysis;
        }

        return $this->textAnalysis = $this->getDom()->count() > 0 ? TextAnalyzer::get(
            $this->getDom()->text(),
            true,   // only sentences
            1,      // no expression, just words
            0      // keep trail
        ) : null;
    }

    /**
     * Return the result of getExpressions(10) on the text analysis.
     *
     * @return mixed|null null when no text analysis is available (empty DOM)
     */
    public function getKws()
    {
        $analysis = $this->getTextAnalysis();

        // getTextAnalysis() returns null for an empty DOM: avoid calling a method on null.
        return null !== $analysis ? $analysis->getExpressions(10) : null;
    }

    /**
     * Return the rounded percentage of the DOM text length over the cleaned
     * response content length (0 when the content is empty).
     */
    public function getRatioTxtCode(): int
    {
        $textLength = strlen($this->getDom()->text());
        $htmlLength = strlen(Helper::clean($this->response->getContent()));

        return (int) ($htmlLength > 0 ? round($textLength / $htmlLength * 100) : 0);
    }

    /**
     * Return an array of object with two elements Link and anchor.
     *
     * No native return type is declared because the result is a string when a
     * separator is given.
     *
     * @param string|null $separator when not null and a breadcrumb array was extracted, the
     *                               items' clean names are joined with it
     *
     * @return array|string|null the value from ExtractBreadcrumb::get(), or the joined
     *                           string when $separator is used
     */
    public function getBreadCrumb(?string $separator = null)
    {
        $breadcrumb = ExtractBreadcrumb::get($this);

        if (null !== $separator && is_array($breadcrumb)) {
            $names = array_map(function ($item) {
                return $item->getCleanName();
            }, $breadcrumb);

            return implode($separator, $names);
        }

        return $breadcrumb;
    }

    /**
     * Return the redirection target taken from the `location` response header.
     *
     * @return ?string absolute url, null when there is no `location` header or when
     *                 ExtractLinks::isWebLink() rejects it
     */
    public function getRedirection(): ?string
    {
        $headers = $this->response->getHeaders();
        // Header names are compared lower-cased; a falsy header set is treated as empty.
        $headers = array_change_key_case($headers ?: []);

        if (isset($headers['location']) && ExtractLinks::isWebLink($headers['location'])) {
            return $this->url()->resolve($headers['location']);
        }

        return null;
    }

    /**
     * Return the redirection as a Link built with Link::createRedirection(), or null
     * when getRedirection() finds none.
     */
    public function getRedirectionLink(): ?Link
    {
        $redirection = $this->getRedirection();

        return null !== $redirection ? Link::createRedirection($redirection, $this) : null;
    }

    /**
     * Tell whether the redirection target is the requested url with its leading
     * `http:` replaced by `https:`.
     */
    public function isRedirectToHttps(): bool
    {
        $redirUrl = $this->getRedirection();

        if (null === $redirUrl) {
            return false;
        }

        return preg_replace('#^http:#', 'https:', $this->urlRequested()->get(), 1) == $redirUrl;
    }

    /**
     * Return the value in base tag if exist, else, current Url.
     *
     * The base tag is used only when its href attribute passes FILTER_VALIDATE_URL.
     */
    public function getBaseUrl(): string
    {
        if (!isset($this->baseUrl)) {
            $base = $this->findOne('base');
            // The href must be read through attr(), guarded by count() so attr() is
            // never called on an empty node list.
            $href = $base->count() > 0 ? $base->attr('href') : null;

            if (null !== $href && false !== filter_var($href, FILTER_VALIDATE_URL)) {
                $this->baseUrl = $href;
            } else {
                $this->baseUrl = $this->url()->get();
            }
        }

        return $this->baseUrl;
    }

    /**
     * @return int correspond to a const from Indexable
     */
    public function isIndexable(string $userAgent = 'googlebot'): int
    {
        return Indexable::isIndexable($this, $userAgent);
    }

    /**
     * Tell whether neither the `googlebot` nor the `robots` meta contains `nofollow`.
     *
     * @return bool
     */
    protected function metaAuthorizeToFollow()
    {
        foreach (['googlebot', 'robots'] as $name) {
            // strpos() returns 0 when the content starts with `nofollow`, so compare
            // against false explicitly; the cast covers an absent meta (null).
            if (false !== strpos((string) $this->getMeta($name), 'nofollow')) {
                return false;
            }
        }

        return true;
    }

    /**
     * Tell whether both the response headers (through RobotsHeaders) and the page
     * metas allow links to be followed. The result is computed once and cached.
     *
     * @return bool
     */
    public function mayFollow()
    {
        if (null === $this->follow) {
            // Same falsy-headers guard as getRedirection().
            $robotsHeaders = new RobotsHeaders($this->response->getHeaders() ?: []);
            $this->follow = $robotsHeaders->mayFollow() && $this->metaAuthorizeToFollow();
        }

        return $this->follow;
    }
}
292