Completed
Push — master ( 1d1c50...dc3dad )
by Jan-Petter
01:55
created

Parser::add()   C

Complexity

Conditions 7
Paths 15

Size

Total Lines 23
Code Lines 18

Duplication

Lines 0
Ratio 0 %

Importance

Changes 4
Bugs 0 Features 0
Metric Value
c 4
b 0
f 0
dl 0
loc 23
rs 6.7272
cc 7
eloc 18
nc 15
nop 1
1
<?php
2
namespace vipnytt\RobotsTxtParser;
3
4
use vipnytt\RobotsTxtParser\Directives\CleanParam;
5
use vipnytt\RobotsTxtParser\Directives\Host;
6
use vipnytt\RobotsTxtParser\Directives\Sitemap;
7
use vipnytt\RobotsTxtParser\Directives\UserAgent;
8
9
/**
 * Robots.txt parser.
 *
 * Splits raw robots.txt content into lines and routes each line to the
 * handler object of the matching top-level directive (Clean-param, Host,
 * Sitemap, User-agent). Lines that carry no top-level directive are forwarded
 * to the User-agent handler.
 *
 * NOTE(review): the DIRECTIVE_*, ENCODING, BYTE_LIMIT and MAX_LENGTH_RULE
 * constants are not declared in this class; presumably they come from
 * RobotsTxtInterface - confirm.
 */
class Parser implements RobotsTxtInterface
{
    use ObjectTools;

    /**
     * Directives that are handled directly by this class
     */
    const TOP_LEVEL_DIRECTIVES = [
        self::DIRECTIVE_CLEAN_PARAM,
        self::DIRECTIVE_HOST,
        self::DIRECTIVE_SITEMAP,
        self::DIRECTIVE_USER_AGENT,
    ];

    /**
     * Content to parse, already cut to the byte limit (if any)
     */
    protected $raw;

    /**
     * Directive of the line most recently passed to add(), or null if that
     * line carried no top-level directive
     */
    protected $previousDirective;

    /**
     * Values of the current run of consecutive User-agent lines
     */
    protected $userAgentValues;

    /**
     * Directive handlers, created in the constructor
     */
    protected $cleanParam;
    protected $host;
    protected $sitemap;
    protected $userAgent;

    /**
     * Constructor
     *
     * Note that this changes the process-wide mbstring internal encoding.
     *
     * @param string $content - file content
     * @param string $encoding - character encoding
     * @param int|null $byteLimit - maximum of bytes to parse, anything but an integer disables the limit
     * @throws Exceptions\ParserException
     */
    public function __construct($content, $encoding = self::ENCODING, $byteLimit = self::BYTE_LIMIT)
    {
        try {
            $encodingIsSet = mb_internal_encoding($encoding);
        } catch (\ValueError $e) {
            // PHP 8+ throws instead of returning false for an unknown
            // encoding; fold it into the documented exception below.
            $encodingIsSet = false;
        }
        if (!$encodingIsSet) {
            throw new Exceptions\ParserException('Unable to set internal character encoding to `' . $encoding . '`');
        }

        $this->cleanParam = new CleanParam();
        $this->host = new Host();
        $this->sitemap = new Sitemap();
        $this->userAgent = new UserAgent();

        // mb_strcut() cuts by bytes without splitting a multi-byte character
        $this->raw = is_int($byteLimit) ? mb_strcut($content, 0, $byteLimit, $encoding) : $content;
        $this->parseTxt();
    }

    /**
     * Parse robots.txt
     *
     * Every non-empty line is limited in length, stripped of its comment and
     * passed to add(). Lines that are empty once the comment is removed are
     * skipped, so a comment-only line never reaches add() and cannot split a
     * run of consecutive User-agent lines.
     *
     * @return void
     */
    private function parseTxt()
    {
        $lines = array_filter(array_map('trim', mb_split('\r\n|\n|\r', $this->raw)));
        // Parse each line individually
        foreach ($lines as $line) {
            // Limit rule length
            $line = mb_substr($line, 0, self::MAX_LENGTH_RULE);
            // Remove comments, and the whitespace that preceded an inline comment
            $line = trim(mb_split('#', $line, 2)[0]);
            if ($line === '') {
                // Nothing but a comment on this line
                continue;
            }
            // Parse line
            $this->add($line);
        }
    }

    /**
     * Add a single line
     *
     * Top-level directives are routed to their own handler. Consecutive
     * User-agent lines are collected and handed to the User-agent handler as
     * one set; any other line in between starts a new set. Lines without a
     * top-level directive are forwarded unchanged to the User-agent handler.
     *
     * @param string $line - line to add
     * @return mixed - whatever the selected handler returns
     */
    public function add($line)
    {
        $previousDirective = $this->previousDirective;
        $pair = $this->generateRulePair($line, self::TOP_LEVEL_DIRECTIVES);
        // isset() yields null without a notice when the line did not resolve
        // to a directive/value pair
        $directive = isset($pair['directive']) ? $pair['directive'] : null;
        if ($directive === self::DIRECTIVE_USER_AGENT) {
            if ($previousDirective !== self::DIRECTIVE_USER_AGENT) {
                // First User-agent line of a new run
                $this->userAgentValues = [];
            }
            $this->userAgentValues[] = $pair['value'];
        }
        $this->previousDirective = $directive;
        switch ($directive) {
            case self::DIRECTIVE_CLEAN_PARAM:
                return $this->cleanParam->add($pair['value']);
            case self::DIRECTIVE_HOST:
                return $this->host->add($pair['value']);
            case self::DIRECTIVE_SITEMAP:
                return $this->sitemap->add($pair['value']);
            case self::DIRECTIVE_USER_AGENT:
                return $this->userAgent->set($this->userAgentValues);
        }
        return $this->userAgent->add($line);
    }

    /**
     * Export the parsed rules of every directive handler
     *
     * The handler exports are combined with the array union operator, so for
     * a key present in more than one export the first one (in the order
     * Clean-param, Host, Sitemap, User-agent) is kept.
     *
     * NOTE(review): assumes every handler's export() returns an array - confirm.
     *
     * @return array
     */
    public function export()
    {
        return $this->cleanParam->export()
        + $this->host->export()
        + $this->sitemap->export()
        + $this->userAgent->export();
    }

    /**
     * Check if URL is allowed to crawl
     *
     * @param  string $url - url to check
     * @return bool
     */
    public function isAllowed($url)
    {
        return $this->userAgent->check($url, self::DIRECTIVE_ALLOW);
    }

    /**
     * Check if URL is disallowed to crawl
     *
     * @param  string $url - url to check
     * @return bool
     */
    public function isDisallowed($url)
    {
        return $this->userAgent->check($url, self::DIRECTIVE_DISALLOW);
    }

    /**
     * Get sitemaps
     *
     * @return array
     */
    public function getSitemaps()
    {
        return $this->sitemap->export();
    }

    /**
     * Get host
     *
     * NOTE(review): export() applies the array union operator to this same
     * value, which suggests an array rather than a string - confirm the
     * return type against the Host handler.
     *
     * @return string|null
     */
    public function getHost()
    {
        return $this->host->export();
    }

    /**
     * Get Clean-param
     *
     * @return array
     */
    public function getCleanParam()
    {
        return $this->cleanParam->export();
    }
}
156