1 | <?php |
||
2 | |||
3 | namespace PiedWeb\UrlHarvester; |
||
4 | |||
5 | use PiedWeb\Curl\Request as CurlRequest; |
||
6 | use PiedWeb\Curl\Response; |
||
7 | use PiedWeb\TextAnalyzer\Analysis; |
||
8 | use PiedWeb\TextAnalyzer\Analyzer as TextAnalyzer; |
||
9 | use Spatie\Robots\RobotsHeaders; |
||
10 | use Symfony\Component\DomCrawler\Crawler as DomCrawler; |
||
11 | |||
/**
 * Wraps an HTTP response and exposes helpers to read SEO-related data from it:
 * tags, meta, canonical, redirection, base url, text statistics and the
 * follow/index directives.
 *
 * The DOM, the text analysis, the base url and the follow flag are computed on
 * first access and then kept on the instance.
 */
class Harvest
{
    use HarvestLinksTrait;
    use RobotsTxtTrait;

    public const DEFAULT_USER_AGENT = 'SeoPocketCrawler - Open Source Bot for SEO Metrics';

    /** Response this harvest was built from. */
    protected Response $response;

    /** Crawler built from the response content; created by getDom(). */
    protected DomCrawler $dom;

    /** Cached result of getBaseUrl(). */
    protected string $baseUrl;

    /** Cached result of mayFollow(). */
    protected bool $follow;

    /** Cached result of getTextAnalysis(); stays uninitialized while the DOM is empty. */
    private Analysis $textAnalysis;

    /** Url built from Response::getUrl(). */
    protected Url $urlRequested;

    /** Url built from Response::getEffectiveUrl(). */
    protected Url $url;

    /**
     * Request `$url` and wrap the response in a Harvest.
     *
     * @param string           $url             url to request; it is normalized first
     *                                          (a trailing slash is added for a bare domain)
     * @param string           $userAgent       forwarded to Request::makeFromRequest()
     * @param string           $language        forwarded to Request::makeFromRequest()
     * @param CurlRequest|null $previousRequest forwarded to Request::makeFromRequest()
     *
     * @return self|int a Harvest when the request produced a Response, otherwise the
     *                  raw value returned by Request::makeFromRequest()
     *                  (NOTE(review): presumably a curl error code — confirm)
     */
    public static function fromUrl(
        string $url,
        string $userAgent = self::DEFAULT_USER_AGENT,
        string $language = 'en,en-US;q=0.5',
        ?CurlRequest $previousRequest = null
    ) {
        $url = Link::normalizeUrl($url); // add trailing slash for domain

        $response = Request::makeFromRequest($previousRequest, $url, $userAgent, $language);

        if ($response instanceof Response) {
            return new self($response);
        }

        return $response;
    }

    /**
     * Store the response and derive both the effective url and the requested url from it.
     */
    public function __construct(Response $response)
    {
        $this->response = $response;

        $this->url = new Url($this->response->getEffectiveUrl());
        $this->urlRequested = new Url($this->response->getUrl());
    }

    /**
     * Return the url as reported by Response::getUrl().
     */
    public function urlRequested(): Url
    {
        return $this->urlRequested;
    }

    /**
     * Return url response (curl effective url)
     * // todo : check if urlRequested can be different than url (depends on curl wrench).
     */
    public function url(): Url
    {
        return $this->url;
    }

    /**
     * Same value as url().
     */
    public function getUrl(): Url
    {
        return $this->url;
    }

    /**
     * Return the response given to the constructor.
     */
    public function getResponse(): Response
    {
        return $this->response;
    }

    /**
     * Return the crawler for the response content, building it on first call.
     *
     * @return DomCrawler
     *
     * @psalm-suppress RedundantPropertyInitializationCheck
     */
    public function getDom()
    {
        if (! isset($this->dom)) {
            $this->dom = new DomCrawler($this->response->getContent());
        }

        return $this->dom;
    }

    /**
     * Filter the DOM with a CSS selector.
     *
     * @param string   $selector CSS selector given to DomCrawler::filter()
     * @param int|null $i        when not null, keep only the node at this position
     */
    private function find($selector, $i = null): DomCrawler
    {
        $nodes = $this->getDom()->filter($selector);

        return null !== $i ? $nodes->eq($i) : $nodes;
    }

    /**
     * Alias for find($selector, 0).
     */
    private function findOne($selector): DomCrawler
    {
        return $this->find($selector, 0);
    }

    /**
     * Return content inside a selector.
     * Eg.: getTag('title').
     *
     * @param string $selector CSS selector
     *
     * @return ?string cleaned text of the first matching node, null when nothing matches
     */
    public function getTag($selector)
    {
        $found = $this->findOne($selector);

        return $found->count() > 0 ? Helper::clean($found->text()) : null;
    }

    /**
     * Return the content of a tag expected to appear only once.
     *
     * @param string $selector CSS selector
     *
     * @return ?string null when nothing matches, a warning string
     *                 (`<count> `<selector>` /!\ `) when several nodes match,
     *                 otherwise the cleaned text of the single match
     */
    public function getUniqueTag($selector = 'title')
    {
        $found = $this->find($selector);
        $count = $found->count();

        if (0 === $count) {
            return null;
        }

        if ($count > 1) {
            return $count.' `'.$selector.'` /!\ ';
        }

        return Helper::clean($found->eq(0)->text());
    }

    /**
     * Return content inside a meta.
     *
     * @param string $name value of the meta `name` attribute
     *
     * @return string|null from content attribute: null when there is no such meta,
     *                     '' when the meta has no content attribute
     */
    public function getMeta(string $name): ?string
    {
        // Quote (and escape) the value so names which are not plain CSS identifiers,
        // e.g. `twitter:card`, still produce a valid attribute selector.
        $meta = $this->findOne('meta[name="'.addcslashes($name, '"\\').'"]');

        if (0 === $meta->count()) {
            return null;
        }

        $content = $meta->attr('content');

        return null !== $content ? Helper::clean($content) : '';
    }

    /**
     * Return the href attribute of the link rel=canonical tag.
     *
     * @return ?string null when there is no canonical tag, '' when the tag has no href
     */
    public function getCanonical(): ?string
    {
        $canonical = $this->findOne('link[rel=canonical]');

        if (0 === $canonical->count()) {
            return null;
        }

        return $canonical->attr('href') ?? '';
    }

    /**
     * Tell whether the canonical tag agrees with the requested url.
     *
     * @param string|null $urlRequested url to compare with; defaults to urlRequested()
     *
     * @return bool true when there is no canonical tag, when the canonical equals the
     *              requested url, or when checkCanonicalException() accepts the pair
     */
    public function isCanonicalCorrect(?string $urlRequested = null): bool
    {
        $canonical = $this->getCanonical();

        if (null === $canonical) {
            return true;
        }

        $urlRequested = $urlRequested ?? $this->urlRequested()->get();

        if ($urlRequested === $canonical) {
            return true;
        }

        return $this->checkCanonicalException($urlRequested, $canonical);
    }

    /**
     * Accept a canonical which differs from the requested url only by a trailing slash,
     * when the requested url has neither path nor query string.
     *
     * The pattern captures the shortest prefix of the url ending right before a `/`,
     * a `?` or the end of the string, ignoring the `/` and `:` of the scheme separator.
     */
    private function checkCanonicalException(string $urlRequested, string $canonical): bool
    {
        // preg_match() returns 0 when nothing matches, and $match is then empty:
        // only go on when it really matched.
        if (1 !== preg_match('/^.+?[^\/:](?=[?\/]|$)/', $urlRequested, $match)) {
            return false;
        }

        $root = $match[0];

        // The requested url must be nothing more than that prefix. Trailing slashes are
        // stripped because fromUrl() adds one to bare domains (stripping leading ones,
        // as done previously, could never make the two sides equal).
        if ($root !== rtrim($urlRequested, '/')) {
            return false;
        }

        return $root === $canonical || $root.'/' === $canonical;
    }

    /**
     * Return the text analysis of the DOM text, computing it on first call.
     *
     * @return Analysis|null null when the DOM contains no node; in that case nothing is
     *                       cached (the property is not nullable)
     *
     * @psalm-suppress RedundantPropertyInitializationCheck
     */
    public function getTextAnalysis()
    {
        if (isset($this->textAnalysis)) {
            return $this->textAnalysis;
        }

        $dom = $this->getDom();

        if (0 === $dom->count()) {
            return null;
        }

        return $this->textAnalysis = TextAnalyzer::get(
            $dom->text(),
            true, // only sentences
            1, // no expression, just words
            0 // keep trail
        );
    }

    /**
     * Count the words (as defined by str_word_count()) in the DOM text.
     */
    public function getWordCount(): int
    {
        return (int) str_word_count($this->getDom()->text('') ?? '');
    }

    /**
     * Return the result of Analysis::getExpressions(10) for the page text.
     *
     * @return mixed an empty array when no analysis is available (empty DOM)
     *               (NOTE(review): assumes getExpressions() returns an array — confirm)
     */
    public function getKws()
    {
        $analysis = $this->getTextAnalysis();

        return null !== $analysis ? $analysis->getExpressions(10) : [];
    }

    /**
     * Return the size of the DOM text as a rounded percentage of the size of the
     * cleaned response content (both measured in bytes), 0 when the content is empty.
     */
    public function getRatioTxtCode(): int
    {
        $textLength = \strlen($this->getDom()->text(''));
        $htmlLength = \strlen(Helper::clean($this->response->getContent()));

        return (int) ($htmlLength > 0 ? round($textLength / $htmlLength * 100) : 0);
    }

    /**
     * Return the breadcrumb extracted by ExtractBreadcrumb::get().
     *
     * @param string|null $separator when given and the breadcrumb is an array, the
     *                               getCleanName() of each item is joined with it
     *
     * @return mixed the raw value from ExtractBreadcrumb::get() (according to the
     *               original documentation, an array of objects holding a Link and an
     *               anchor), or the joined string described above
     */
    public function getBreadCrumb(?string $separator = null)
    {
        $breadcrumb = ExtractBreadcrumb::get($this);

        if (null === $separator || ! \is_array($breadcrumb)) {
            return $breadcrumb;
        }

        $names = array_map(static function ($item) {
            return $item->getCleanName();
        }, $breadcrumb);

        return implode($separator, $names);
    }

    /**
     * Return the target of the `Location` header, resolved against the current url.
     *
     * @return ?string absolute url, null when there is no `Location` header or when
     *                 ExtractLinks::isWebLink() rejects its value
     */
    public function getRedirection(): ?string
    {
        // Header names are matched case-insensitively.
        $headers = array_change_key_case($this->response->getHeaders() ?: []);

        if (isset($headers['location']) && ExtractLinks::isWebLink($headers['location'])) {
            return $this->url()->resolve($headers['location']);
        }

        return null;
    }

    /**
     * Return the redirection as a Link, null when getRedirection() returns null.
     */
    public function getRedirectionLink(): ?Link
    {
        $redirection = $this->getRedirection();

        return null !== $redirection ? Link::createRedirection($redirection, $this) : null;
    }

    /**
     * Tell whether the redirection target is the requested url with its `http:` scheme
     * replaced by `https:`.
     */
    public function isRedirectToHttps(): bool
    {
        $redirUrl = $this->getRedirection();

        if (null === $redirUrl) {
            return false;
        }

        return preg_replace('#^http:#', 'https:', $this->urlRequested()->get(), 1) === $redirUrl;
    }

    /**
     * Return the value in base tag if exist, else, current Url.
     *
     * The value is read through DomCrawler::getBaseHref() and used only when it passes
     * FILTER_VALIDATE_URL.
     *
     * @psalm-suppress RedundantPropertyInitializationCheck
     */
    public function getBaseUrl(): string
    {
        if (! isset($this->baseUrl)) {
            $baseHref = $this->findOne('base')->getBaseHref();

            $this->baseUrl = $baseHref && filter_var($baseHref, \FILTER_VALIDATE_URL)
                ? $baseHref
                : $this->url()->get();
        }

        return (string) $this->baseUrl;
    }

    /**
     * @return int correspond to a const from Indexable
     */
    public function indexable(string $userAgent = 'googlebot'): int
    {
        return Indexable::indexable($this, $userAgent);
    }

    /**
     * Tell whether indexable() returns Indexable::INDEXABLE for this user agent.
     */
    public function isIndexable(string $userAgent = 'googlebot'): bool
    {
        return Indexable::INDEXABLE === $this->indexable($userAgent);
    }

    /**
     * Tell whether neither the `googlebot` nor the `robots` meta contains `nofollow`.
     *
     * The check is case-insensitive and compares the offset with false, so a content
     * attribute which starts with `nofollow` (offset 0) is detected too.
     *
     * @return bool false as soon as one of the two metas contains `nofollow`
     */
    protected function metaAuthorizeToFollow()
    {
        foreach (['googlebot', 'robots'] as $name) {
            $content = $this->getMeta($name);

            if (null !== $content && false !== stripos($content, 'nofollow')) {
                return false;
            }
        }

        return true;
    }

    /**
     * Tell whether links may be followed: both the response headers (checked by
     * RobotsHeaders::mayFollow()) and the page metas must allow it.
     * The result is computed once and cached.
     *
     * @return bool
     *
     * @psalm-suppress RedundantPropertyInitializationCheck
     */
    public function mayFollow()
    {
        if (! isset($this->follow)) {
            $robotsHeaders = new RobotsHeaders((array) $this->response->getHeaders());
            $this->follow = $robotsHeaders->mayFollow() && $this->metaAuthorizeToFollow();
        }

        return $this->follow;
    }
}
||
318 |