1 | <?php |
||
2 | |||
3 | namespace PiedWeb\UrlHarvester; |
||
4 | |||
5 | use Spatie\Robots\RobotsHeaders; |
||
6 | |||
/**
 * Decides whether a harvested URL may be indexed by a given crawler and,
 * when it may not, reports the first reason found as one of the class
 * constants below.
 */
class Indexable
{
    // https://stackoverflow.com/questions/1880148/how-to-get-name-of-the-constant
    public const INDEXABLE = 0;

    public const NOT_INDEXABLE_ROBOTS = 1;

    public const NOT_INDEXABLE_HEADER = 2;

    public const NOT_INDEXABLE_META = 3;

    public const NOT_INDEXABLE_CANONICAL = 4;

    public const NOT_INDEXABLE_4XX = 5;

    public const NOT_INDEXABLE_5XX = 6;

    // The three codes below are never returned by indexable() in this class;
    // presumably they are assigned by callers that detect those situations
    // themselves (to be confirmed against the rest of the project).
    public const NOT_INDEXABLE_NETWORK_ERROR = 7;

    public const NOT_INDEXABLE_TOO_BIG = 10;

    public const NOT_INDEXABLE_3XX = 8;

    public const NOT_INDEXABLE_NOT_HTML = 9;

    /** @var Harvest */
    protected $harvest;

    /** @var string User-agent name the checks are evaluated for. */
    protected $isIndexableFor;

    /**
     * @param Harvest $harvest        The harvested page to inspect.
     * @param string  $isIndexableFor User-agent name used for robots.txt,
     *                                meta and header lookups.
     */
    public function __construct(Harvest $harvest, string $isIndexableFor = 'googlebot')
    {
        $this->harvest = $harvest;
        $this->isIndexableFor = $isIndexableFor;
    }

    /**
     * Checks the harvest's robots.txt for the response URL.
     *
     * An empty-string robots.txt is treated as "everything allowed";
     * otherwise the decision is delegated to the robots.txt object.
     *
     * @return bool
     */
    public function robotsTxtAllows()
    {
        $url = $this->harvest->getResponse()->getUrl();
        $robotsTxt = $this->harvest->getRobotsTxt();

        return '' === $robotsTxt ? true : $robotsTxt->allows($url, $this->isIndexableFor);
    }

    /**
     * Checks the meta tag named after the user-agent and the generic
     * "robots" meta tag.
     *
     * @return bool False when either value contains "noindex"
     *              (case-insensitive), true otherwise.
     */
    public function metaAllows()
    {
        $meta = $this->harvest->getMeta($this->isIndexableFor);
        $generic = $this->harvest->getMeta('robots');

        return ! (false !== stripos($meta, 'noindex') || false !== stripos($generic, 'noindex'));
    }

    /**
     * Checks the response headers (X-Robots-Tag handling is delegated to
     * RobotsHeaders) for the configured user-agent.
     *
     * @return bool
     */
    public function headersAllow()
    {
        // NOTE(review): getHeaders(false) is assumed to return the raw header
        // block as a single string - confirm against the response class.
        $rawHeaders = $this->harvest->getResponse()->getHeaders(false);

        // HTTP header lines are CRLF-terminated whatever the host OS is, so
        // splitting on PHP_EOL is platform-dependent: it leaves a trailing
        // "\r" on every line where PHP_EOL is "\n", and fails to split
        // LF-only input where PHP_EOL is "\r\n". Accept any line ending.
        $headers = preg_split('/\r\n|\r|\n/', $rawHeaders) ?: [];

        return RobotsHeaders::create($headers)->mayIndex($this->isIndexableFor);
    }

    /**
     * Runs every check in order and returns the first failure reason.
     *
     * Order: robots.txt, response headers, meta tags, canonical, then the
     * HTTP status class (4XX, 5XX, 3XX).
     *
     * @param Harvest $harvest        The harvested page to inspect.
     * @param string  $isIndexableFor User-agent name to evaluate for.
     *
     * @return int self::INDEXABLE or one of the self::NOT_INDEXABLE_* codes.
     */
    public static function indexable(Harvest $harvest, string $isIndexableFor = 'googlebot'): int
    {
        $self = new self($harvest, $isIndexableFor);

        // robots
        if (! $self->robotsTxtAllows()) {
            return self::NOT_INDEXABLE_ROBOTS;
        }

        if (! $self->headersAllow()) {
            return self::NOT_INDEXABLE_HEADER;
        }

        if (! $self->metaAllows()) {
            return self::NOT_INDEXABLE_META;
        }

        // canonical
        if (! $harvest->isCanonicalCorrect()) {
            return self::NOT_INDEXABLE_CANONICAL;
        }

        $statusCode = $harvest->getResponse()->getStatusCode();

        // status 4XX
        if ($statusCode < 500 && $statusCode > 399) {
            return self::NOT_INDEXABLE_4XX;
        }

        // status 5XX
        if ($statusCode < 600 && $statusCode > 499) {
            return self::NOT_INDEXABLE_5XX;
        }

        // status 3XX
        if ($statusCode < 400 && $statusCode > 299) {
            return self::NOT_INDEXABLE_3XX;
        }

        return self::INDEXABLE;
    }
}
||
109 |