Completed
Push — master ( fda467...a7ec5f )
by Jan-Petter
02:10
created

Parser::add()   B

Complexity

Conditions 9
Paths 9

Size

Total Lines 20
Code Lines 17

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
c 1
b 0
f 0
dl 0
loc 20
rs 7.756
cc 9
eloc 17
nc 9
nop 0
1
<?php
2
namespace vipnytt\RobotsTxtParser;
3
4
use vipnytt\RobotsTxtParser\Exceptions;
5
6
/**
7
 * Class Parser
8
 *
9
 * @package vipnytt\RobotsTxtParser
10
 */
11
class Parser implements RobotsTxtInterface
{
    use UrlToolbox;

    /**
     * Max length for each rule (non-integer values disable the limit)
     * @var integer|null
     */
    protected $maxRuleLength = self::MAX_LENGTH_RULE;

    /**
     * RAW robots.txt content
     * @var string
     */
    protected $raw = '';

    /**
     * Rule array, built up line by line while parsing
     * @var array
     */
    protected $rules = [];

    /**
     * User-Agents the User-Agent dependent directives are currently assigned to
     * @var array
     */
    private $userAgents = [self::USER_AGENT];

    /**
     * Current line (or the remaining part of it while inline directives are parsed)
     * @var string
     */
    private $line = '';

    /**
     * Directive of the most recently accepted line
     * @var string
     */
    private $previous;

    /**
     * Current Directive
     * @var string
     */
    private $directive;

    /**
     * Current Rule; a string as read from the line, or an array once it has been parsed
     * @var array|string
     */
    private $rule;

    /**
     * Constructor
     *
     * Note: the requested encoding is applied with mb_internal_encoding(), which changes
     * the mbstring internal encoding for the rest of the running script.
     *
     * @param string $content - file content
     * @param string $encoding - character encoding
     * @param integer|null $byteLimit - maximum of bytes to parse
     * @param integer|null $maxRuleLength - max length of each rule
     * @throws Exceptions\ParserException
     */
    public function __construct($content, $encoding = self::ENCODING, $byteLimit = self::BYTE_LIMIT, $maxRuleLength = self::MAX_LENGTH_RULE)
    {
        if (!mb_internal_encoding($encoding)) {
            throw new Exceptions\ParserException('Unable to set internal character encoding to `' . $encoding . '`');
        }
        $this->maxRuleLength = $maxRuleLength;
        // Only truncate when an integer limit is given; any other value means "no limit"
        $this->raw = is_int($byteLimit) ? mb_strcut($content, 0, $byteLimit, $encoding) : $content;
        $this->parseTxt();
    }

    /**
     * Parse robots.txt
     *
     * Splits the raw content into lines, strips comments and merges every
     * successfully parsed line into the rule array.
     *
     * @return void
     */
    private function parseTxt()
    {
        $split = mb_split('\r\n|\n|\r', $this->raw);
        if ($split === false) {
            // mb_split() reports failure with false; nothing can be parsed then
            return;
        }
        // array_filter() without a callback drops the empty lines
        $lines = array_filter(array_map('trim', $split));
        // Parse each line individually
        foreach ($lines as $line) {
            // Limit rule length (applied before comments are removed, so comments count too)
            if (is_int($this->maxRuleLength)) {
                $line = mb_substr($line, 0, $this->maxRuleLength);
            }
            // Remove comments
            $parts = mb_split('#', $line, 2);
            if ($parts === false) {
                continue;
            }
            $this->line = $parts[0];
            // Parse line; parseLine() generates the Directive:Rule pair itself
            if (($result = $this->parseLine()) === false) {
                continue;
            }
            // Add rule
            $this->previous = $this->directive;
            $this->rule = $result;
            $this->rules = array_merge_recursive($this->assignUserAgent(), $this->rules);
        }
    }

    /**
     * Generate Directive:Rule pair
     *
     * Splits the current line at the first colon. On success the lower-cased
     * directive and the trimmed rule are stored in $this->directive / $this->rule;
     * on failure both are left untouched.
     *
     * @return bool - true if the line holds a supported directive and a rule
     */
    private function generateRulePair()
    {
        // Split by directive and rule
        $parts = mb_split(':', $this->line, 2);
        if ($parts === false) {
            return false;
        }
        $pair = array_map('trim', $parts);
        // Check if the line contains a rule
        // NOTE: empty() also treats the literal rule "0" as missing
        if (
            empty($pair[1]) ||
            empty($pair[0]) ||
            !in_array(($pair[0] = mb_strtolower($pair[0])), $this->directives(), true)
        ) {
            // Line does not contain any supported directive
            return false;
        }
        $this->directive = $pair[0];
        $this->rule = $pair[1];
        return true;
    }

    /**
     * Directives and sub directives
     *
     * @param string|null $parent - directive to list the allowed sub directives for
     * @return array - the sub directives of $parent, or every top level directive if $parent is null
     */
    private function directives($parent = null)
    {
        $array = [
            self::DIRECTIVE_ALLOW => [
                self::DIRECTIVE_CLEAN_PARAM,
                self::DIRECTIVE_HOST,
            ],
            self::DIRECTIVE_CACHE_DELAY => [],
            self::DIRECTIVE_CLEAN_PARAM => [],
            self::DIRECTIVE_CRAWL_DELAY => [],
            self::DIRECTIVE_DISALLOW => [
                self::DIRECTIVE_CLEAN_PARAM,
                self::DIRECTIVE_HOST,
            ],
            self::DIRECTIVE_HOST => [],
            self::DIRECTIVE_SITEMAP => [],
            self::DIRECTIVE_USER_AGENT => [
                self::DIRECTIVE_ALLOW,
                self::DIRECTIVE_CACHE_DELAY,
                self::DIRECTIVE_CRAWL_DELAY,
                self::DIRECTIVE_DISALLOW,
            ],
        ];
        if ($parent !== null) {
            return isset($array[$parent]) ? $array[$parent] : [];
        }
        return array_keys($array);
    }

    /**
     * Parse line
     *
     * Parses the current line and, recursively, any inline directive found in
     * its rule (e.g. `Disallow: Clean-param: ...`).
     *
     * @param string|null $parent - parent directive when parsing an inline directive
     * @return array|false - the parsed rule, or false if the line is not usable
     */
    private function parseLine($parent = null)
    {
        if (
            ($this->generateRulePair()) === false ||
            !in_array($this->directive, $this->directives($parent), true)
        ) {
            return false;
        }
        // Cache directive/rule variables, the recursive call below overwrites the properties
        $directive = $this->directive;
        $rule = $this->rule;
        // Try to parse the rule itself as an inline directive
        $this->line = (string)$this->rule;
        if (($inline = $this->parseLine($this->directive)) !== false) {
            $rule = $inline;
        }
        // Restore the state of this nesting level before adding the value
        $this->directive = $directive;
        $this->rule = $rule;
        return $this->add();
    }

    /**
     * Add value to directive
     *
     * Dispatches to the handler belonging to the current directive.
     *
     * @return array|false
     */
    private function add()
    {
        switch ($this->directive) {
            case self::DIRECTIVE_ALLOW:
            case self::DIRECTIVE_DISALLOW:
                return $this->addDisAllow();
            case self::DIRECTIVE_CACHE_DELAY:
            case self::DIRECTIVE_CRAWL_DELAY:
                return $this->addFloat();
            case self::DIRECTIVE_CLEAN_PARAM:
                return $this->addCleanParam();
            case self::DIRECTIVE_HOST:
                return $this->addHost();
            case self::DIRECTIVE_SITEMAP:
                return $this->addSitemap();
            case self::DIRECTIVE_USER_AGENT:
                return $this->setUserAgent();
        }
        return false;
    }

    /**
     * Add an Allow or Disallow rule
     *
     * @return array
     */
    private function addDisAllow()
    {
        // If inline directive, pass the array
        if (is_array($this->rule)) {
            return [
                $this->directive => $this->rule
            ];
        }
        // Return an array of paths
        return [
            $this->directive => [
                'path' => [
                    $this->rule
                ]
            ]
        ];
    }

    /**
     * Add float value
     *
     * @return array|false - false unless the rule is a finite number greater than zero
     */
    private function addFloat()
    {
        $float = floatval($this->rule);
        // Zero (also the result for non-numeric input), negative and infinite delays are rejected
        if ($float <= 0 || !is_finite($float)) {
            return false;
        }
        return [
            $this->directive => $float,
        ];
    }

    /**
     * Add Clean-Param record
     *
     * @return array|false - path lists indexed by directive and parameter name
     */
    private function addCleanParam()
    {
        if (!is_string($this->rule)) {
            return false;
        }
        $result = [];
        $cleanParam = $this->explodeCleanParamRule($this->rule);
        foreach ($cleanParam['param'] as $param) {
            // Skip empty names, e.g. from `a&&b`
            if ($param === '') {
                continue;
            }
            $result[$this->directive][$param][] = $cleanParam['path'];
        }
        return empty($result) ? false : $result;
    }

    /**
     * Explode Clean-Param rule
     *
     * @param  string $rule - `param1&param2 [path]`
     * @return array - ['path' => string, 'param' => string[]]
     */
    private function explodeCleanParamRule($rule)
    {
        // split into parameter and path
        $parts = mb_split('\s+', $rule, 2);
        $array = is_array($parts) ? array_map('trim', $parts) : [$rule];
        $cleanParam = [
            'path' => '/*',
            'param' => [],
        ];
        // strip any invalid characters from path prefix
        // (the hyphen is escaped and placed last so it is a literal, not a `.`-`/` range)
        if (isset($array[1])) {
            $cleanParam['path'] = $this->urlEncode(mb_ereg_replace('[^A-Za-z0-9\.\/\*\_\-]', '', $array[1]));
        }
        $names = mb_split('&', $array[0]);
        if (is_array($names)) {
            $cleanParam['param'] = array_map('trim', $names);
        }
        return $cleanParam;
    }

    /**
     * Add Host
     *
     * Lower-cases and encodes the rule, then rebuilds it as `[scheme://]host[:port]`.
     *
     * @return array|false - false if the rule is not a string or fails validation
     */
    private function addHost()
    {
        if (!is_string($this->rule)) {
            return false;
        }
        $this->rule = $this->urlEncode(mb_strtolower($this->rule));
        $parsed = parse_url($this->rule);
        if ($parsed === false) {
            return false;
        }
        // A bare host name without scheme is reported by parse_url() as `path`
        if (isset($parsed['host'])) {
            $host = $parsed['host'];
        } elseif (isset($parsed['path'])) {
            $host = $parsed['path'];
        } else {
            // Neither host nor path, e.g. a rule holding only a query string
            return false;
        }
        if (
            !$this->urlValidateHost($host) ||
            (isset($parsed['scheme']) && !$this->urlValidateScheme($parsed['scheme']))
        ) {
            return false;
        }
        $scheme = isset($parsed['scheme']) ? $parsed['scheme'] . '://' : '';
        $port = isset($parsed['port']) ? ':' . $parsed['port'] : '';
        return [
            self::DIRECTIVE_HOST => [
                $scheme . $host . $port,
            ]
        ];
    }

    /**
     * Add Sitemap
     *
     * @return array|false - false if the rule is not a string or the encoded URL fails validation
     */
    private function addSitemap()
    {
        if (!is_string($this->rule)) {
            return false;
        }
        $url = $this->urlEncode($this->rule);
        if (!$this->urlValidate($url)) {
            return false;
        }
        return [
            self::DIRECTIVE_SITEMAP => [
                $url
            ]
        ];
    }

    /**
     * Set User-Agent(s)
     *
     * Consecutive User-Agent lines are collected into one group; a User-Agent
     * line following any other directive starts a new group.
     *
     * @return array|false - an empty array (nothing to merge), or false if the rule is not a plain string
     */
    private function setUserAgent()
    {
        // An inline directive (e.g. `User-agent: Disallow: /`) turns the rule into an
        // array, which can not be used as a User-Agent name / array key later on
        if (!is_string($this->rule)) {
            return false;
        }
        if ($this->previous === self::DIRECTIVE_USER_AGENT) {
            $this->userAgents[] = $this->rule;
        } else {
            $this->userAgents = [
                $this->rule
            ];
        }
        return [];
    }

    /**
     * Assign User-Agent dependent rules to the User-Agent arrays
     *
     * @return array - the current rule, wrapped per User-Agent if the directive is User-Agent dependent
     */
    private function assignUserAgent()
    {
        if (!in_array($this->directive, $this->directives(self::DIRECTIVE_USER_AGENT), true)) {
            return $this->rule;
        }
        $rule = [];
        foreach ($this->userAgents as $userAgent) {
            $rule[self::DIRECTIVE_USER_AGENT][$userAgent] = $this->rule;
        }
        return $rule;
    }

    /**
     * Get rules
     *
     * @return array - the parsed rule array
     */
    public function export()
    {
        return $this->rules;
    }
}
391