PiedWeb /
UrlHarvester
| 1 | <?php |
||
| 2 | |||
| 3 | namespace PiedWeb\UrlHarvester; |
||
| 4 | |||
| 5 | use Spatie\Robots\RobotsHeaders; |
||
| 6 | |||
/**
 * Evaluates whether a harvested page may be indexed by a given crawler.
 *
 * The individual checks (robots.txt, X-Robots-Tag headers, robots meta tags)
 * are exposed as instance methods; {@see Indexable::indexable()} chains them
 * together with the canonical and HTTP status checks and returns one of the
 * result codes declared below.
 */
class Indexable
{
    // Result codes. To resolve a code back to its constant name, see
    // https://stackoverflow.com/questions/1880148/how-to-get-name-of-the-constant

    public const INDEXABLE = 0;

    public const NOT_INDEXABLE_ROBOTS = 1;

    public const NOT_INDEXABLE_HEADER = 2;

    public const NOT_INDEXABLE_META = 3;

    public const NOT_INDEXABLE_CANONICAL = 4;

    public const NOT_INDEXABLE_4XX = 5;

    public const NOT_INDEXABLE_5XX = 6;

    // The three codes below (7, 9, 10) are never returned by indexable() in
    // this class; presumably they are assigned by callers elsewhere.
    public const NOT_INDEXABLE_NETWORK_ERROR = 7;

    public const NOT_INDEXABLE_3XX = 8;

    public const NOT_INDEXABLE_NOT_HTML = 9;

    public const NOT_INDEXABLE_TOO_BIG = 10;

    /** @var Harvest */
    protected $harvest;

    /** @var string User-agent name the checks are evaluated for. */
    protected $isIndexableFor;

    /**
     * @param Harvest $harvest        the harvested page to inspect
     * @param string  $isIndexableFor crawler user-agent the rules are evaluated for
     */
    public function __construct(Harvest $harvest, string $isIndexableFor = 'googlebot')
    {
        $this->harvest = $harvest;
        $this->isIndexableFor = $isIndexableFor;
    }

    /**
     * Tells whether robots.txt lets the configured user-agent fetch the page URL.
     *
     * An empty-string robots.txt (as returned by the harvest) is treated as
     * "everything allowed".
     *
     * @return bool
     */
    public function robotsTxtAllows()
    {
        $url = $this->harvest->getResponse()->getUrl();
        $robotsTxt = $this->harvest->getRobotsTxt();

        if ('' === $robotsTxt) {
            return true;
        }

        return $robotsTxt->allows($url, $this->isIndexableFor);
    }

    /**
     * Tells whether the robots meta tags let the page be indexed.
     *
     * Both the crawler-specific meta tag (named after the configured
     * user-agent) and the generic `robots` meta tag are inspected.
     *
     * @return bool false as soon as one of the two tags forbids indexing
     */
    public function metaAllows()
    {
        $specific = $this->harvest->getMeta($this->isIndexableFor);
        $generic = $this->harvest->getMeta('robots');

        return ! (self::forbidsIndexing($specific) || self::forbidsIndexing($generic));
    }

    /**
     * Tells whether the X-Robots-Tag response headers let the page be indexed.
     *
     * @return bool
     */
    public function headersAllow()
    {
        $rawHeaders = $this->harvest->getResponse()->getHeaders(false);

        // Assumes getHeaders(false) yields the raw header block as one string
        // (TODO confirm); any other value is handled as "no headers" instead
        // of being handed to a string function.
        $lines = \is_string($rawHeaders) ? preg_split('/\r\n|\r|\n/', $rawHeaders) : false;

        // Header lines are split on any line-ending style rather than on
        // PHP_EOL, whose value depends on the platform PHP runs on and not on
        // what the remote server sent.
        if (false === $lines) {
            $lines = [];
        }

        return RobotsHeaders::create($lines)->mayIndex($this->isIndexableFor);
    }

    /**
     * Runs every indexability check in order and returns the first failure.
     *
     * Order: robots.txt, X-Robots-Tag headers, robots meta, canonical,
     * then HTTP status class (4XX, 5XX, 3XX).
     *
     * @param Harvest $harvest        the harvested page to inspect
     * @param string  $isIndexableFor crawler user-agent the rules are evaluated for
     *
     * @return int one of the class constants; self::INDEXABLE when nothing blocks
     */
    public static function indexable(Harvest $harvest, string $isIndexableFor = 'googlebot'): int
    {
        $self = new self($harvest, $isIndexableFor);

        // robots.txt
        if (! $self->robotsTxtAllows()) {
            return self::NOT_INDEXABLE_ROBOTS;
        }

        // X-Robots-Tag headers
        if (! $self->headersAllow()) {
            return self::NOT_INDEXABLE_HEADER;
        }

        // robots meta tags
        if (! $self->metaAllows()) {
            return self::NOT_INDEXABLE_META;
        }

        // canonical
        if (! $harvest->isCanonicalCorrect()) {
            return self::NOT_INDEXABLE_CANONICAL;
        }

        $statusCode = $harvest->getResponse()->getStatusCode();

        // status 4XX
        if ($statusCode > 399 && $statusCode < 500) {
            return self::NOT_INDEXABLE_4XX;
        }

        // status 5XX
        if ($statusCode > 499 && $statusCode < 600) {
            return self::NOT_INDEXABLE_5XX;
        }

        // status 3XX
        if ($statusCode > 299 && $statusCode < 400) {
            return self::NOT_INDEXABLE_3XX;
        }

        return self::INDEXABLE;
    }

    /**
     * Tells whether a robots meta `content` value forbids indexing.
     *
     * `noindex` is matched anywhere in the value (case-insensitive). `none`,
     * which is shorthand for "noindex, nofollow", is matched only as a whole
     * comma-separated directive so that unrelated words containing "none" do
     * not trigger it.
     *
     * @param mixed $content meta content; non-string values (e.g. null for a
     *                       missing tag) are treated as "no restriction"
     */
    private static function forbidsIndexing($content): bool
    {
        if (! \is_string($content) || '' === $content) {
            return false;
        }

        $content = strtolower($content);

        if (false !== strpos($content, 'noindex')) {
            return true;
        }

        $directives = array_map('trim', explode(',', $content));

        return \in_array('none', $directives, true);
    }
}
||
| 109 |