PiedWeb /
UrlHarvester
| 1 | <?php |
||
| 2 | |||
| 3 | namespace PiedWeb\UrlHarvester; |
||
| 4 | |||
| 5 | use PiedWeb\Curl\Request as CurlRequest; |
||
| 6 | use PiedWeb\Curl\Response; |
||
| 7 | use PiedWeb\TextAnalyzer\Analysis; |
||
| 8 | use PiedWeb\TextAnalyzer\Analyzer as TextAnalyzer; |
||
| 9 | use Spatie\Robots\RobotsHeaders; |
||
| 10 | use Symfony\Component\DomCrawler\Crawler as DomCrawler; |
||
| 11 | |||
| 12 | class Harvest |
||
| 13 | { |
||
| 14 | use HarvestLinksTrait; |
||
| 15 | use RobotsTxtTrait; |
||
| 16 | |||
/** User agent used by fromUrl() when the caller does not pass one. */
public const DEFAULT_USER_AGENT = 'SeoPocketCrawler - Open Source Bot for SEO Metrics';

/** Response this harvest was built from (set in the constructor). */
protected Response $response;

/** DOM crawler over the response content; created on first getDom() call. */
protected DomCrawler $dom;

/** Base url cached by getBaseUrl(). */
protected string $baseUrl;

/** Follow decision cached by mayFollow(). */
protected bool $follow;

/** Text analysis cached by getTextAnalysis(). */
private Analysis $textAnalysis;

/** Url built from the response's getUrl() value. */
protected Url $urlRequested;

/** Url built from the response's getEffectiveUrl() value. */
protected Url $url;
||
| 32 | |||
/**
 * Request a url and wrap the result in a Harvest.
 *
 * @param string           $url             address to fetch; passed through Link::normalizeUrl() first
 * @param string           $userAgent       user agent handed to the request
 * @param string           $language        language value handed to the request
 * @param CurlRequest|null $previousRequest earlier request forwarded to Request::makeFromRequest()
 *
 * @return self|int a Harvest when the request yields a Response, otherwise the
 *                  value returned by Request::makeFromRequest(), unchanged
 */
public static function fromUrl(
    string $url,
    string $userAgent = self::DEFAULT_USER_AGENT,
    string $language = 'en,en-US;q=0.5',
    ?CurlRequest $previousRequest = null
) {
    $result = Request::makeFromRequest(
        $previousRequest,
        Link::normalizeUrl($url), // add trailing slash for domain
        $userAgent,
        $language
    );

    if (! $result instanceof Response) {
        return $result;
    }

    return new self($result);
}
||
| 52 | 18 | ||
/**
 * Keep the response and derive both url objects from it.
 *
 * @param Response $response response to harvest
 */
public function __construct(Response $response)
{
    $this->response = $response;

    // Effective url as reported by the response.
    $this->url = new Url($response->getEffectiveUrl());
    // Url as reported by the response's getUrl().
    $this->urlRequested = new Url($response->getUrl());
}
||
| 60 | |||
/**
 * Return the Url built in the constructor from the response's getUrl() value.
 */
public function urlRequested(): Url
{
    return $this->urlRequested;
}
||
| 65 | 27 | ||
/**
 * Return the response url (curl effective url).
 *
 * todo: check whether urlRequested can differ from url (depends on the curl wrapper).
 */
public function url(): Url
{
    return $this->url;
}
|
| 74 | |||
/**
 * Return the same Url object as url().
 */
public function getUrl(): Url
{
    return $this->url;
}
||
| 79 | |||
/**
 * Return the response given to the constructor.
 */
public function getResponse(): Response
{
    return $this->response;
}
||
| 84 | |||
/**
 * Return the DOM crawler for the response content, creating it on first use.
 *
 * @psalm-suppress RedundantPropertyInitializationCheck
 */
public function getDom()
{
    // The typed property stays uninitialized until the first call.
    if (! isset($this->dom)) {
        $this->dom = new DomCrawler($this->response->getContent());
    }

    return $this->dom;
}
||
| 92 | 18 | ||
/**
 * Filter the DOM with a selector.
 *
 * @param mixed $selector selector passed to the crawler's filter()
 * @param mixed $i        when not null, position of the single match to keep
 */
private function find($selector, $i = null): DomCrawler
{
    $matches = $this->getDom()->filter($selector);

    if (null === $i) {
        return $matches;
    }

    return $matches->eq($i);
}
||
| 97 | 33 | ||
/**
 * Return the first match for a selector (same result as find($selector, 0)).
 *
 * @param mixed $selector selector passed to the crawler's filter()
 */
private function findOne($selector): DomCrawler
{
    return $this->getDom()->filter($selector)->eq(0);
}
|
| 105 | |||
/**
 * Return the cleaned text of the first element matching a selector.
 * Eg.: getTag('title').
 *
 * @param mixed $selector selector of the element to read
 *
 * @return ?string null when nothing matches
 */
public function getTag($selector)
{
    $node = $this->findOne($selector);

    if (0 === $node->count()) {
        return null;
    }

    return Helper::clean($node->text());
}
||
| 118 | |||
/**
 * Return the cleaned text of an element expected to appear exactly once.
 *
 * @param mixed $selector selector of the element to read
 *
 * @return ?string null when nothing matches; a warning string holding the
 *                 match count and the selector when several elements match
 */
public function getUniqueTag($selector = 'title')
{
    $found = $this->find($selector);
    $total = $found->count();

    if (0 === $total) {
        return null;
    }

    if ($total > 1) {
        return sprintf('%d `%s` /!\ ', $total, $selector);
    }

    return Helper::clean($found->eq(0)->text());
}
|
| 133 | 3 | ||
/**
 * Return the content attribute of the first meta tag with the given name.
 *
 * The name is written into the selector as a quoted, escaped string, so
 * names that are not plain identifiers (e.g. `twitter:card`) still give a
 * valid selector.
 *
 * @param string $name value of the meta tag's name attribute
 *
 * @return string|null cleaned content attribute, '' when the tag has no
 *                     content attribute, null when no such meta tag exists
 */
public function getMeta(string $name): ?string
{
    // Callers may have quoted the name themselves to work around the former
    // unquoted selector; strip one pair of matching quotes so they still match.
    if (1 === preg_match('/^(["\'])(.*)\1$/s', $name, $quoted)) {
        $name = $quoted[2];
    }

    $escapedName = strtr($name, ['\\' => '\\\\', '"' => '\\"']);
    $meta = $this->findOne('meta[name="'.$escapedName.'"]');

    if (0 === $meta->count()) {
        return null;
    }

    $content = $meta->attr('content');

    return null !== $content ? Helper::clean($content) : '';
}
||
| 146 | |||
/**
 * Return the href attribute of the first link rel=canonical tag.
 *
 * @return ?string '' when the tag has no href attribute, null when there is no such tag
 */
public function getCanonical(): ?string
{
    $link = $this->findOne('link[rel=canonical]');

    if (0 === $link->count()) {
        return null;
    }

    return $link->attr('href') ?? '';
}
||
| 156 | |||
/**
 * Tell whether the canonical tag agrees with the requested url.
 *
 * @param string|null $urlRequested url to compare with; defaults to urlRequested()
 *
 * @return bool true when there is no canonical tag, when it equals the
 *              requested url, or when checkCanonicalException() accepts the pair
 */
public function isCanonicalCorrect(?string $urlRequested = null): bool
{
    $canonical = $this->getCanonical();

    if (null === $canonical) {
        return true;
    }

    if (null === $urlRequested) {
        $urlRequested = $this->urlRequested()->get();
    }

    return $urlRequested == $canonical
        || $this->checkCanonicalException($urlRequested, $canonical);
}
||
| 176 | 6 | ||
/**
 * Accept a canonical that differs from the requested url only by the
 * trailing slash, when the requested url is the leading part matched by
 * the pattern below and nothing else (no further path, no query).
 *
 * @param string $urlRequested url that was requested
 * @param string $canonical    href of the canonical tag
 *
 * @return bool true when the pair is accepted despite not being equal
 */
private function checkCanonicalException(string $urlRequested, string $canonical): bool
{
    // preg_match() returns 0 (not false) when nothing matches; requiring 1
    // avoids reading an undefined $match[0].
    if (1 !== preg_match('/^.+?[^\/:](?=[?\/]|$)/', $urlRequested, $match)) {
        return false;
    }

    $root = $match[0];

    // Compare after dropping trailing slashes: the requested url usually ends
    // with "/", and trimming on the left never changes a url.
    if ($root !== rtrim($urlRequested, '/')) {
        return false;
    }

    return $root === $canonical || $root.'/' === $canonical;
}
|
| 187 | 3 | ||
/**
 * Return the text analysis of the document, computed once and then cached.
 *
 * @return Analysis|null null when the DOM has no node; that result is not
 *                       cached because the backing property cannot hold null
 *
 * @psalm-suppress RedundantPropertyInitializationCheck
 */
public function getTextAnalysis()
{
    if (isset($this->textAnalysis)) {
        return $this->textAnalysis;
    }

    $dom = $this->getDom();

    // Assigning null to the typed Analysis property would throw a TypeError,
    // so return early instead of storing it.
    if (0 === $dom->count()) {
        return null;
    }

    return $this->textAnalysis = TextAnalyzer::get(
        $dom->text(),
        true, // only sentences
        1, // no expression, just words
        0 // keep trail
    );
}
||
| 202 | 3 | ||
/**
 * Count the words of the document text with str_word_count().
 */
public function getWordCount(): int
{
    // '' is the default returned when the DOM has no node.
    $text = $this->getDom()->text('') ?? '';

    return (int) str_word_count($text);
}
||
| 207 | |||
/**
 * Return the first 10 expressions of the text analysis.
 *
 * @return mixed whatever Analysis::getExpressions(10) returns, or an empty
 *               array when no analysis is available
 */
public function getKws()
{
    $analysis = $this->getTextAnalysis();

    // getTextAnalysis() is written to return null for an empty document.
    // NOTE(review): [] assumes getExpressions() returns an array - confirm.
    if (null === $analysis) {
        return [];
    }

    return $analysis->getExpressions(10);
}
|
| 212 | |||
/**
 * Return the text / code ratio of the document as a rounded percentage.
 *
 * Both lengths are byte lengths; the result is 0 when the cleaned content is empty.
 */
public function getRatioTxtCode(): int
{
    $textLength = \strlen($this->getDom()->text(''));
    $htmlLength = \strlen(Helper::clean($this->response->getContent()));

    if ($htmlLength <= 0) {
        return 0;
    }

    return (int) round($textLength / $htmlLength * 100);
}
||
| 220 | |||
/**
 * Return the breadcrumb extracted by ExtractBreadcrumb::get().
 *
 * @param string|null $separator when given and the breadcrumb is an array,
 *                               the items' clean names are joined with it
 *
 * @return mixed the joined string in that case, otherwise the extracted value unchanged
 */
public function getBreadCrumb(?string $separator = null)
{
    $breadcrumb = ExtractBreadcrumb::get($this);

    if (null === $separator || ! \is_array($breadcrumb)) {
        return $breadcrumb;
    }

    $names = array_map(static fn ($item) => $item->getCleanName(), $breadcrumb);

    return implode($separator, $names);
}
||
| 237 | |||
/**
 * Return the redirection target announced by the Location header.
 *
 * @return ?string the header value resolved against url(); null when the
 *                 header is missing or ExtractLinks::isWebLink() rejects it
 */
public function getRedirection(): ?string
{
    // Lower-case the header names so the lookup ignores their case.
    $headers = array_change_key_case($this->response->getHeaders() ?: []);
    $location = $headers['location'] ?? null;

    if (null === $location || ! ExtractLinks::isWebLink($location)) {
        return null;
    }

    return $this->url()->resolve($location);
}
||
| 251 | |||
/**
 * Return the redirection as a Link built by Link::createRedirection().
 *
 * @return ?Link null when getRedirection() returns null
 */
public function getRedirectionLink(): ?Link
{
    $target = $this->getRedirection();

    return null === $target ? null : Link::createRedirection($target, $this);
}
||
| 262 | 3 | ||
/**
 * Tell whether the redirection target is the requested url with its
 * leading "http:" replaced by "https:".
 */
public function isRedirectToHttps(): bool
{
    $target = $this->getRedirection();

    if (null === $target) {
        return false;
    }

    $httpsVariant = preg_replace('#^http:#', 'https:', $this->urlRequested()->get(), 1);

    return $httpsVariant == $target;
}
||
| 269 | |||
/**
 * Return the crawler's base href when it is a valid url, otherwise the
 * current url. The value is computed once and then cached.
 *
 * @psalm-suppress RedundantPropertyInitializationCheck
 */
public function getBaseUrl(): string
{
    if (isset($this->baseUrl)) {
        return (string) $this->baseUrl;
    }

    $href = $this->findOne('base')->getBaseHref();

    $this->baseUrl = $href && filter_var($href, \FILTER_VALIDATE_URL)
        ? $href
        : $this->url()->get();

    return (string) $this->baseUrl;
}
|
| 288 | |||
/**
 * Ask Indexable::indexable() for the indexability status of this harvest.
 *
 * @param string $userAgent user agent passed on to Indexable::indexable()
 *
 * @return int one of the constants declared on Indexable
 */
public function indexable(string $userAgent = 'googlebot'): int
{
    return Indexable::indexable($this, $userAgent);
}
|
| 296 | 6 | ||
/**
 * Tell whether indexable() returns Indexable::INDEXABLE for the user agent.
 *
 * @param string $userAgent user agent passed on to indexable()
 */
public function isIndexable(string $userAgent = 'googlebot'): bool
{
    $status = $this->indexable($userAgent);

    return $status === Indexable::INDEXABLE;
}
||
| 301 | |||
/**
 * Tell whether the googlebot and robots meta tags allow following links.
 *
 * @return bool false when either tag's content contains "nofollow" (any
 *              case, any position), true otherwise
 */
protected function metaAuthorizeToFollow()
{
    foreach (['googlebot', 'robots'] as $metaName) {
        $directives = $this->getMeta($metaName);

        // stripos() returns 0 when the content starts with "nofollow", so
        // compare with false instead of relying on truthiness; a missing
        // meta tag (null) is skipped rather than passed to stripos().
        if (null !== $directives && false !== stripos($directives, 'nofollow')) {
            return false;
        }
    }

    return true;
}
||
| 306 | |||
/**
 * Tell whether links may be followed: both the robots headers and the meta
 * tags must allow it. The result is computed once and then cached.
 *
 * @psalm-suppress RedundantPropertyInitializationCheck
 */
public function mayFollow()
{
    if (isset($this->follow)) {
        return $this->follow;
    }

    $robotsHeaders = new RobotsHeaders((array) $this->response->getHeaders());

    return $this->follow = $robotsHeaders->mayFollow() && $this->metaAuthorizeToFollow();
}
||
| 317 | } |
||
| 318 |