Issues (8)

src/Indexable.php (1 issue)

Labels
Severity
1
<?php
2
3
namespace PiedWeb\UrlHarvester;
4
5
use Spatie\Robots\RobotsHeaders;
6
7
/**
 * Decides whether a harvested URL may be indexed by a given crawler.
 *
 * The verdict is one of the integer constants declared below:
 * INDEXABLE (0) when no check fails, otherwise a NOT_INDEXABLE_* code
 * naming the first check that failed.
 */
class Indexable
{
    // https://stackoverflow.com/questions/1880148/how-to-get-name-of-the-constant

    /** No check performed by indexable() failed. */
    public const INDEXABLE = 0;

    /** robots.txt disallows the URL for the targeted user agent. */
    public const NOT_INDEXABLE_ROBOTS = 1;

    /** The response headers forbid indexing for the targeted user agent. */
    public const NOT_INDEXABLE_HEADER = 2;

    /** A meta tag (bot-specific or generic "robots") contains "noindex". */
    public const NOT_INDEXABLE_META = 3;

    /** Harvest::isCanonicalCorrect() reported an incorrect canonical. */
    public const NOT_INDEXABLE_CANONICAL = 4;

    /** The HTTP status code is in the 400-499 range. */
    public const NOT_INDEXABLE_4XX = 5;

    /** The HTTP status code is in the 500-599 range. */
    public const NOT_INDEXABLE_5XX = 6;

    /** Never returned by indexable() in this class; presumably set by callers on network failure - verify. */
    public const NOT_INDEXABLE_NETWORK_ERROR = 7;

    /** Never returned by indexable() in this class; presumably set by callers for oversized responses - verify. */
    public const NOT_INDEXABLE_TOO_BIG = 10;

    /** The HTTP status code is in the 300-399 range. */
    public const NOT_INDEXABLE_3XX = 8;

    /** Never returned by indexable() in this class; presumably set by callers for non-HTML responses - verify. */
    public const NOT_INDEXABLE_NOT_HTML = 9;

    /** @var Harvest */
    protected $harvest;

    /** @var string User agent name the checks are evaluated for. */
    protected $isIndexableFor;

    /**
     * @param Harvest $harvest        Harvested page to inspect
     * @param string  $isIndexableFor User agent name used for the robots.txt, header and meta checks
     */
    public function __construct(Harvest $harvest, string $isIndexableFor = 'googlebot')
    {
        $this->harvest = $harvest;
        $this->isIndexableFor = $isIndexableFor;
    }

    /**
     * Tells whether robots.txt lets the configured user agent fetch the harvested URL.
     *
     * @return bool true when the harvest exposes an empty string as robots.txt,
     *              otherwise the result of the robots.txt object's allows() call
     */
    public function robotsTxtAllows()
    {
        $url = $this->harvest->getResponse()->getUrl();
        $robotsTxt = $this->harvest->getRobotsTxt();

        // An empty string stands for "no robots.txt rules": nothing can be disallowed.
        if ('' === $robotsTxt) {
            return true;
        }

        return $robotsTxt->allows($url, $this->isIndexableFor);
    }

    /**
     * Tells whether the page's meta tags leave it indexable.
     *
     * Both the meta tag named after the configured user agent and the generic
     * "robots" meta tag are searched, case-insensitively, for "noindex".
     *
     * @return bool false as soon as one of the two meta values contains "noindex"
     */
    public function metaAllows()
    {
        // NOTE(review): getMeta() presumably yields null when the tag is absent - confirm.
        // Coalescing keeps stripos() from receiving null; it is a no-op for string values.
        $meta = $this->harvest->getMeta($this->isIndexableFor) ?? '';
        $generic = $this->harvest->getMeta('robots') ?? '';

        return false === stripos($meta, 'noindex') && false === stripos($generic, 'noindex');
    }

    /**
     * Tells whether the response headers leave the page indexable for the configured user agent.
     *
     * The raw header string is split on PHP_EOL and handed to RobotsHeaders.
     *
     * @return bool result of RobotsHeaders::mayIndex() for the configured user agent
     */
    public function headersAllow()
    {
        $rawHeaders = $this->harvest->getResponse()->getHeaders(false);

        // getHeaders(false) may return null; explode() only accepts a string,
        // so a missing header block is treated as an empty header string.
        $headers = explode(\PHP_EOL, $rawHeaders ?? '');

        return RobotsHeaders::create($headers)->mayIndex($this->isIndexableFor);
    }

    /**
     * Runs every indexability check in order and reports the first failure.
     *
     * Order of evaluation: robots.txt, response headers, meta tags, canonical,
     * then the HTTP status code (4XX, 5XX, 3XX).
     *
     * @param Harvest $harvest        Harvested page to inspect
     * @param string  $isIndexableFor User agent name the checks are evaluated for
     *
     * @return int self::INDEXABLE when nothing blocks indexing, otherwise the
     *             NOT_INDEXABLE_* constant of the first failing check
     */
    public static function indexable(Harvest $harvest, string $isIndexableFor = 'googlebot'): int
    {
        $self = new self($harvest, $isIndexableFor);

        // robots
        if (! $self->robotsTxtAllows()) {
            return self::NOT_INDEXABLE_ROBOTS;
        }

        if (! $self->headersAllow()) {
            return self::NOT_INDEXABLE_HEADER;
        }

        if (! $self->metaAllows()) {
            return self::NOT_INDEXABLE_META;
        }

        // canonical
        if (! $harvest->isCanonicalCorrect()) {
            return self::NOT_INDEXABLE_CANONICAL;
        }

        $statusCode = $harvest->getResponse()->getStatusCode();

        // status 4XX
        if ($statusCode < 500 && $statusCode > 399) {
            return self::NOT_INDEXABLE_4XX;
        }

        // status 5XX
        if ($statusCode < 600 && $statusCode > 499) {
            return self::NOT_INDEXABLE_5XX;
        }

        // status 3XX
        if ($statusCode < 400 && $statusCode > 299) {
            return self::NOT_INDEXABLE_3XX;
        }

        return self::INDEXABLE;
    }
}
109