Passed
Push — master ( 510d2a...6579bd )
by Dev
02:04
created

Harvest::getWordCount()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
eloc 1
c 0
b 0
f 0
dl 0
loc 3
ccs 2
cts 2
cp 1
rs 10
cc 1
nc 1
nop 0
crap 1
1
<?php
2
3
namespace PiedWeb\UrlHarvester;
4
5
use PiedWeb\Curl\Request as CurlRequest;
6
use PiedWeb\Curl\Response;
7
use PiedWeb\TextAnalyzer\Analyzer as TextAnalyzer;
8
//use Spatie\Robots\RobotsHeaders;
9
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
10
11
class Harvest
12
{
13
    use HarvestLinksTrait;
14
    use RobotsTxtTrait;
15
16
    const DEFAULT_USER_AGENT = 'SeoPocketCrawler - Open Source Bot for SEO Metrics';
17
18
    /**
19
     * @var Response
20
     */
21
    protected $response;
22
23
    /**
24
     * @var \Symfony\Component\DomCrawler\Crawler
25
     */
26
    protected $dom;
27
28
    /** @var string */
29
    protected $baseUrl;
30
31
    /** @var bool */
32
    protected $follow;
33
34
    /** @var \PiedWeb\TextAnalyzer\Analysis */
35
    private $textAnalysis;
36
37
    /** @var Url */
38
    protected $urlRequested;
39
40
    /** @var Url */
41
    protected $url;
42
43
    /**
     * Request an url and wrap the response in a Harvest.
     *
     * @param string           $url             url to request; passed through Link::normalizeUrl() first
     * @param string           $userAgent       user agent handed to Request::makeFromRequest()
     * @param string           $language        language value handed to Request::makeFromRequest()
     * @param CurlRequest|null $previousRequest optional earlier request handed to Request::makeFromRequest()
     *
     * @return self|int a Harvest when the request yields a Response, otherwise the value
     *                  returned by Request::makeFromRequest() as is (presumably an error code)
     */
    public static function fromUrl(
        string $url,
        string $userAgent = self::DEFAULT_USER_AGENT,
        string $language = 'en,en-US;q=0.5',
        ?CurlRequest $previousRequest = null
    ) {
        $result = Request::makeFromRequest(
            $previousRequest,
            Link::normalizeUrl($url), // add trailing slash for domain
            $userAgent,
            $language
        );

        if (!$result instanceof Response) {
            return $result;
        }

        return new self($result);
    }
62
63 27
    /**
     * Store the response and build the Url objects for the requested url
     * and for the effective url reported by the response.
     */
    public function __construct(Response $response)
    {
        $this->response = $response;

        $this->urlRequested = new Url($response->getUrl());
        $this->url = new Url($response->getEffectiveUrl());
    }
70
71 6
    /**
     * Return the url as it was requested (built from Response::getUrl()).
     */
    public function urlRequested(): Url
    {
        return $this->urlRequested;
    }
75
76
    /**
     * Return the url of the response (curl effective url).
     *
     * TODO: check whether urlRequested can differ from url (depends on curl behaviour).
     */
    public function url(): Url
    {
        return $this->url;
    }
84
85 15
    /**
     * Return the url of the response; returns the same property as url().
     */
    public function getUrl(): Url
    {
        return $this->url;
    }
89
90 18
    /**
     * Return the response this Harvest was built from.
     */
    public function getResponse(): Response
    {
        return $this->response;
    }
94
95 33
    /**
     * Return the DomCrawler built from the response content.
     *
     * The crawler is created on first call and reused afterwards.
     *
     * @return DomCrawler
     */
    public function getDom()
    {
        if (null === $this->dom) {
            $this->dom = new DomCrawler($this->response->getContent());
        }

        return $this->dom;
    }
101
102 21
    /**
     * Filter the DOM with a CSS selector.
     *
     * @param string   $selector CSS selector
     * @param int|null $i        when given, reduce the result to the node at this position
     */
    private function find($selector, $i = null): DomCrawler
    {
        $matches = $this->getDom()->filter($selector);

        if (null === $i) {
            return $matches;
        }

        return $matches->eq($i);
    }
106
107
    /**
     * Return the first node matching a CSS selector (same result as find($selector, 0)).
     *
     * @param string $selector CSS selector
     */
    private function findOne($selector): DomCrawler
    {
        return $this->getDom()->filter($selector)->eq(0);
    }
114
115
    /**
     * Return the cleaned text of the first node matching a selector.
     * Eg.: getTag('title').
     *
     * @param string $selector CSS selector
     *
     * @return string|null null when nothing matches the selector
     */
    public function getTag($selector)
    {
        $first = $this->findOne($selector);

        if (0 === $first->count()) {
            return null;
        }

        return Helper::clean($first->text());
    }
127
128 6
    /**
     * Return the cleaned text of a tag expected to appear only once.
     *
     * @param string $selector CSS selector
     *
     * @return string|null null when nothing matches; a warning string (count and
     *                     selector) when more than one node matches; otherwise
     *                     the cleaned text of the single match
     */
    public function getUniqueTag($selector = 'title')
    {
        $matches = $this->find($selector);
        $total = $matches->count();

        if (0 === $total) {
            return null;
        }

        if ($total > 1) {
            return $total.' `'.$selector.'` /!\ ';
        }

        return Helper::clean($matches->eq(0)->text());
    }
142
143
    /**
     * Return the cleaned content attribute of the first meta tag with the given name.
     *
     * The name is written as a quoted (and escaped) attribute value in the CSS
     * selector, so names that are not plain CSS identifiers (eg. `twitter:card`)
     * still produce a valid selector.
     *
     * @param string $name value of the meta name attribute
     *
     * @return string|null null when no such meta exists, an empty string when the
     *                     meta has no content attribute, else the cleaned content
     */
    public function getMeta(string $name): ?string
    {
        $meta = $this->findOne('meta[name="'.addcslashes($name, '"\\').'"]');

        if (0 === $meta->count()) {
            return null;
        }

        $content = $meta->attr('content');

        return null !== $content ? Helper::clean($content) : '';
    }
155
156
    /**
     * Return the href attribute of the link rel=canonical tag.
     *
     * @return string|null null when there is no canonical tag, an empty string
     *                     when the tag has no href attribute
     */
    public function getCanonical(): ?string
    {
        $canonical = $this->findOne('link[rel=canonical]');

        if (0 === $canonical->count()) {
            return null;
        }

        return $canonical->attr('href') ?? '';
    }
165
166
    /**
     * Tell whether the canonical tag agrees with the requested url.
     *
     * @return bool true when the canonical equals the requested url or when
     *              there is no canonical tag
     */
    public function isCanonicalCorrect(): bool
    {
        $canonical = $this->getCanonical();

        if (null === $canonical) {
            return true;
        }

        return $this->urlRequested()->get() == $canonical;
    }
175
176 6
    /**
     * Return the text analysis of the document text, computing it on first use.
     *
     * @return \PiedWeb\TextAnalyzer\Analysis|null null when the DOM has no node
     */
    public function getTextAnalysis()
    {
        if (null === $this->textAnalysis && $this->getDom()->count() > 0) {
            $this->textAnalysis = TextAnalyzer::get(
                $this->getDom()->text(),
                true, // only sentences
                1,    // no expression, just words
                0     // keep trail
            );
        }

        return $this->textAnalysis;
    }
189
190 3
    /**
     * Count the words found in the document text.
     *
     * The DOM is checked for nodes first (the same guard getTextAnalysis() uses),
     * because Crawler::text() throws on an empty node list instead of returning null.
     *
     * @return int number of words as counted by str_word_count(), 0 when the DOM has no node
     */
    public function getWordCount()
    {
        $dom = $this->getDom();

        return $dom->count() > 0 ? str_word_count($dom->text()) : 0;
    }
194
195 3
    /**
     * Return the expressions reported by the text analysis (asks for 10).
     *
     * getTextAnalysis() returns null when the DOM has no node; an empty array is
     * returned in that case instead of calling a method on null.
     *
     * @return array presumably the value type of Analysis::getExpressions() (to confirm)
     */
    public function getKws()
    {
        $analysis = $this->getTextAnalysis();

        return null !== $analysis ? $analysis->getExpressions(10) : [];
    }
199
200 3
    /**
     * Return the text/code ratio of the document as a rounded percentage.
     *
     * The ratio is the byte length of the DOM text divided by the byte length
     * of the cleaned response content.
     *
     * @return int 0 when the cleaned content is empty or the DOM has no node
     */
    public function getRatioTxtCode(): int
    {
        $htmlLength = strlen(Helper::clean($this->response->getContent()));
        $dom = $this->getDom();

        // Crawler::text() throws on an empty node list, so the DOM is checked before it is read.
        if (0 === $htmlLength || 0 === $dom->count()) {
            return 0;
        }

        $textLength = strlen($dom->text());

        return (int) round($textLength / $htmlLength * 100);
    }
207
208
    /**
     * Return the breadcrumb extracted from the page.
     *
     * Without a separator, the value of ExtractBreadcrumb::get() is returned as is.
     * With a separator, the clean name of every item is joined into one string.
     *
     * No return type is declared: the joined form is a string, which a `?array`
     * return type would reject with a TypeError.
     *
     * @param string|null $separator glue used to join the item names
     *
     * @return array|string|null
     */
    public function getBreadCrumb(?string $separator = null)
    {
        $breadcrumb = ExtractBreadcrumb::get($this);

        if (null === $separator || !is_array($breadcrumb)) {
            return $breadcrumb;
        }

        $names = array_map(function ($item) {
            return $item->getCleanName();
        }, $breadcrumb);

        return implode($separator, $names);
    }
224
225
    /**
     * Return the redirection target announced by the response headers.
     *
     * Header names are compared case-insensitively. The location value is
     * resolved against the response url.
     *
     * @return ?string absolute url, null when there is no location header or
     *                 when ExtractLinks::isWebLink() rejects its value
     */
    public function getRedirection(): ?string
    {
        $headers = array_change_key_case($this->response->getHeaders() ?: []);
        $location = $headers['location'] ?? null;

        if (null === $location || !ExtractLinks::isWebLink($location)) {
            return null;
        }

        return $this->url()->resolve($location);
    }
238
239
    /**
     * Return the redirection as a Link created by Link::createRedirection().
     *
     * @return Link|null null when the response does not redirect
     */
    public function getRedirectionLink(): ?Link
    {
        $target = $this->getRedirection();

        return null === $target ? null : Link::createRedirection($target, $this);
    }
249
250
    /**
     * Tell whether the response redirects to the https version of the requested url.
     *
     * @return bool true when the redirection equals the requested url with its
     *              leading `http:` replaced by `https:`
     */
    public function isRedirectToHttps(): bool
    {
        $target = $this->getRedirection();

        if (null === $target) {
            return false;
        }

        $httpsVariant = preg_replace('#^http:#', 'https:', $this->urlRequested()->get(), 1);

        return $httpsVariant == $target;
    }
256
257
    /**
     * Return the href of the base tag when it holds a valid url, else the current url.
     *
     * The result is computed once and cached in $baseUrl.
     */
    public function getBaseUrl(): string
    {
        if (!isset($this->baseUrl)) {
            $base = $this->findOne('base');
            // The href must be read through attr(): findOne() returns a crawler,
            // which does not expose node attributes as properties.
            $href = $base->count() > 0 ? $base->attr('href') : null;

            if (null !== $href && false !== filter_var($href, FILTER_VALIDATE_URL)) {
                $this->baseUrl = $href;
            } else {
                $this->baseUrl = $this->url()->get();
            }
        }

        return $this->baseUrl;
    }
273
274
    /**
     * Return the indexability status computed by Indexable::indexable().
     *
     * @param string $userAgent user agent the status is computed for
     *
     * @return int one of the constants of Indexable
     */
    public function indexable(string $userAgent = 'googlebot'): int
    {
        return Indexable::indexable($this, $userAgent);
    }
281
282 3
    /**
     * Tell whether the page is indexable for the given user agent.
     *
     * @return bool true when indexable() returns Indexable::INDEXABLE
     */
    public function isIndexable(string $userAgent = 'googlebot'): bool
    {
        $status = $this->indexable($userAgent);

        return $status === Indexable::INDEXABLE;
    }
286
287 6
    /**
     * Tell whether the googlebot and robots meta tags allow following links.
     *
     * The search result is compared against false because a match at position 0
     * (content starting with `nofollow`) is a match too. A missing meta (null) is
     * treated as an empty string, and the directive is matched case-insensitively.
     *
     * @return bool false when one of the two metas contains `nofollow`
     */
    protected function metaAuthorizeToFollow()
    {
        foreach (['googlebot', 'robots'] as $name) {
            if (false !== stripos((string) $this->getMeta($name), 'nofollow')) {
                return false;
            }
        }

        return true;
    }
291
292 9
    /**
     * Tell whether links may be followed, according to the response headers
     * (via RobotsHeaders) and to the meta tags.
     *
     * The answer is computed on first call and cached in $follow.
     *
     * @return bool
     */
    public function mayFollow()
    {
        if (null !== $this->follow) {
            return $this->follow;
        }

        $robotsHeaders = new RobotsHeaders((array) $this->response->getHeaders());

        return $this->follow = $robotsHeaders->mayFollow() && $this->metaAuthorizeToFollow();
    }
301
}
302