Completed
Push — master ( 659d48...d1d688 )
by Jan-Petter
03:23
created

TxtParser   C

Complexity

Total Complexity 63

Size/Duplication

Total Lines 476
Duplicated Lines 0 %

Coupling/Cohesion

Components 1
Dependencies 1

Importance

Changes 1
Bugs 0 Features 0
Metric Value
wmc 63
c 1
b 0
f 0
lcom 1
cbo 1
dl 0
loc 476
rs 5.8893

19 Methods

Rating   Name   Duplication   Size   Complexity  
A __construct() 0 12 2
A parseTxt() 0 21 4
A generateRulePair() 0 20 4
A directives() 0 23 3
A parseLine() 0 19 4
B add() 0 20 9
A setUserAgent() 0 13 2
A addFloat() 0 9 2
C addHost() 0 23 10
B UrlEncode() 0 25 1
A UrlValidateHost() 0 9 4
A UrlValidateScheme() 0 8 1
A addSitemap() 0 11 2
A UrlValidate() 0 9 4
A addCleanParam() 0 9 2
A explodeCleanParamRule() 0 16 3
A addDisAllow() 0 17 2
A assignUserAgent() 0 11 3
A getRules() 0 4 1

How to fix   Complexity   

Complex Class

Complex classes like TxtParser often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also have a look at the cohesion graph to spot any unconnected or weakly connected components.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use TxtParser, and based on these observations, apply Extract Interface, too.

1
<?php
2
namespace vipnytt\RobotsTxtParser;
3
4
use vipnytt\RobotsTxtParser\Exceptions\TxtParserException;
5
6
/**
7
 * Class TxtParser
8
 *
9
 * @package vipnytt\RobotsTxtParser
10
 */
11
class TxtParser
12
{
13
    /**
14
     * Default User-Agent
15
     */
16
    const USERAGENT_DEFAULT = '*';
17
18
    /**
19
     * Max rule length
20
     */
21
    const RULE_MAX_LENGTH = 500;
22
23
    /**
24
     * Directives
25
     */
26
    const DIRECTIVE_ALLOW = 'allow';
27
    const DIRECTIVE_CACHE_DELAY = 'cache-delay';
28
    const DIRECTIVE_CLEAN_PARAM = 'clean-param';
29
    const DIRECTIVE_CRAWL_DELAY = 'crawl-delay';
30
    const DIRECTIVE_DISALLOW = 'disallow';
31
    const DIRECTIVE_HOST = 'host';
32
    const DIRECTIVE_SITEMAP = 'sitemap';
33
    const DIRECTIVE_USERAGENT = 'user-agent';
34
35
    /**
36
     * User-Agent dependent directives
37
     */
38
    const USERAGENT_DEPENDENT_DIRECTIVES = [
39
        self::DIRECTIVE_ALLOW,
40
        self::DIRECTIVE_CACHE_DELAY,
41
        self::DIRECTIVE_CRAWL_DELAY,
42
        self::DIRECTIVE_DISALLOW,
43
    ];
44
45
    /**
46
     * RAW robots.txt content
47
     * @var string
48
     */
49
    private $raw = '';
50
51
    /**
52
     * Rule array
53
     * @var array
54
     */
55
    private $rules = [];
56
57
    /**
58
     * User-Agents
59
     * @var array
60
     */
61
    private $userAgents = [self::USERAGENT_DEFAULT];
62
63
    /**
64
     * Previous directive
65
     * @var string
66
     */
67
    private $previous;
68
69
    /**
70
     * Current Directive
71
     * @var string
72
     */
73
    private $directive;
74
75
    /**
76
     * Current Rule
77
     * @var string|array
78
     */
79
    private $rule;
80
81
    private $line = '';
82
83
    /**
84
     * Constructor
85
     *
86
     * @param string $content - file content
87
     * @throws TxtParserException
88
     */
89
    public function __construct($content)
90
    {
91
        mb_language("uni");
92
        if (!mb_internal_encoding('UTF-8')) {
93
            throw new TxtParserException('Unable to set internal character encoding to `UTF-8`');
94
        }
95
        mb_internal_encoding(mb_detect_encoding($content));
96
        mb_regex_encoding(mb_detect_encoding($content));
97
        $this->raw = $content;
98
99
        $this->parseTxt();
100
    }
101
102
    /**
     * Parse robots.txt
     *
     * Splits the raw content into trimmed, non-empty lines, truncates each line to
     * RULE_MAX_LENGTH, strips comments and hands the line to parseLine(). Every
     * successfully parsed line is merged into the rule array.
     *
     * @return void
     */
    private function parseTxt()
    {
        $lines = array_filter(array_map('trim', mb_split('\n', $this->raw)));
        // Parse each line individually
        foreach ($lines as $this->line) {
            // Limit rule length
            $this->line = mb_substr($this->line, 0, self::RULE_MAX_LENGTH);
            // Remove comments
            $this->line = mb_split('#', $this->line, 2)[0];
            // parseLine() generates the directive/rule pair itself. Generating it here as
            // well would replace the line with its rule part before parseLine() sees it,
            // so the directive would be lost and ordinary lines would never be parsed.
            if (($result = $this->parseLine()) === false) {
                continue;
            }
            $this->previous = $this->directive;
            // From here on the current rule holds the parsed array rather than the raw string
            $this->rule = $result;
            $this->rules = array_merge_recursive($this->assignUserAgent(), $this->rules);
        }
    }
128
129
    /**
     * Generate Directive:Rule pair
     *
     * Splits the current line at the first colon. On success the lower-cased directive
     * and the trimmed rule are stored, and the current line is replaced by the rule.
     *
     * @return bool False if the line has no rule or no supported directive
     */
    private function generateRulePair()
    {
        $parts = array_map('trim', mb_split(':', $this->line, 2));
        if (empty($parts[1])) {
            // Nothing usable after the colon, or no colon at all
            return false;
        }
        $name = mb_strtolower($parts[0]);
        $supported = !empty($name) && in_array($name, $this->directives());
        if (!$supported) {
            // Not one of the known directives
            return false;
        }
        $this->directive = $name;
        $this->rule = $parts[1];
        $this->line = $parts[1];
        return true;
    }
154
155
    /**
     * Directives
     *
     * Without a parent, lists every supported directive. With a parent, lists the
     * directives that may appear inline after that parent directive.
     *
     * @param string|null $parent
     * @return array
     */
    private function directives($parent = null)
    {
        // Directives accepted inline after Allow and Disallow
        $inline = [
            self::DIRECTIVE_CLEAN_PARAM,
            self::DIRECTIVE_HOST,
        ];
        $tree = [
            self::DIRECTIVE_ALLOW => $inline,
            self::DIRECTIVE_CACHE_DELAY => [],
            self::DIRECTIVE_CLEAN_PARAM => [],
            self::DIRECTIVE_CRAWL_DELAY => [],
            self::DIRECTIVE_DISALLOW => $inline,
            self::DIRECTIVE_HOST => [],
            self::DIRECTIVE_SITEMAP => [],
            self::DIRECTIVE_USERAGENT => [],
        ];
        if ($parent === null) {
            return array_keys($tree);
        }
        if (isset($tree[$parent])) {
            return $tree[$parent];
        }
        return [];
    }
184
185
186
    /**
     * Parse line
     *
     * Extracts the directive/rule pair from the current line, then recursively tries to
     * parse the rule itself as an inline directive (for example a Host directive placed
     * after a Disallow directive). When an inline directive is found, its parsed array
     * becomes the rule of the outer directive.
     *
     * @param string|null $parent Directive the current line is nested in, null at top level
     * @return array|false Parsed rule, or false if the line holds no directive allowed here
     */
    private function parseLine($parent = null)
    {
        if ($this->generateRulePair() === false) {
            return false;
        }
        if (!in_array($this->directive, $this->directives($parent))) {
            return false;
        }
        // The recursive call overwrites the directive/rule properties, so keep local copies
        $directive = $this->directive;
        $rule = $this->rule;
        $this->line = $this->rule;
        if (($inline = $this->parseLine($directive)) !== false) {
            // Replace the raw rule string with the parsed inline directive
            $rule = $inline;
        }
        $this->directive = $directive;
        $this->rule = $rule;
        return $this->add();
    }
211
212
    /**
     * Add value to directive
     *
     * Dispatches the current rule to the handler method of the current directive.
     *
     * @return array|false Handler result, or false for a directive without a handler
     */
    private function add()
    {
        // Directive => name of the method that handles it
        $handlers = [
            self::DIRECTIVE_USERAGENT => 'setUserAgent',
            self::DIRECTIVE_CACHE_DELAY => 'addFloat',
            self::DIRECTIVE_CRAWL_DELAY => 'addFloat',
            self::DIRECTIVE_HOST => 'addHost',
            self::DIRECTIVE_SITEMAP => 'addSitemap',
            self::DIRECTIVE_CLEAN_PARAM => 'addCleanParam',
            self::DIRECTIVE_ALLOW => 'addDisAllow',
            self::DIRECTIVE_DISALLOW => 'addDisAllow',
        ];
        if (!isset($handlers[$this->directive])) {
            return false;
        }
        $handler = $handlers[$this->directive];
        return $this->{$handler}();
    }
237
238
    /**
     * Set User-Agent(s)
     *
     * Appends the current rule to the active User-Agent list when the previously parsed
     * directive was also a User-Agent directive; otherwise starts a new list with it.
     *
     * @return array Always an empty array
     */
    private function setUserAgent()
    {
        if ($this->previous === self::DIRECTIVE_USERAGENT) {
            $this->userAgents[] = $this->rule;
        } else {
            $this->userAgents = [$this->rule];
        }
        return [];
    }
256
257
    /**
     * Add float value
     *
     * Converts the current rule to a float and stores it under the current directive.
     *
     * @return array|false False when the rule converts to zero
     */
    private function addFloat()
    {
        $value = floatval($this->rule);
        if (!$value) {
            return false;
        }
        return [$this->directive => $value];
    }
271
272
    /**
     * Add Host
     *
     * Accepts the current rule only if it consists of an optional supported scheme, a
     * valid host name and an optional port, and nothing else.
     *
     * @return array|false False when the rule is not a plain, valid host
     */
    private function addHost()
    {
        $parsed = parse_url($this->UrlEncode($this->rule));
        // NOTE(review): no `host` property is declared or assigned in this class, so the
        // isset() guard below is always false - confirm whether it was meant to reject
        // repeated Host directives.
        if (isset($this->host) || $parsed === false) {
            return false;
        }
        // A bare host name without a scheme is reported by parse_url() as the path.
        // Input such as `?foo` yields neither component and cannot be a host.
        if (isset($parsed['host'])) {
            $host = $parsed['host'];
        } elseif (isset($parsed['path'])) {
            $host = $parsed['path'];
        } else {
            return false;
        }
        if (!$this->UrlValidateHost($host)) {
            return false;
        }
        if (isset($parsed['scheme']) && !$this->UrlValidateScheme($parsed['scheme'])) {
            return false;
        }
        $scheme = isset($parsed['scheme']) ? $parsed['scheme'] . '://' : '';
        $port = isset($parsed['port']) ? ':' . $parsed['port'] : '';
        // Reject anything beyond scheme, host and port (paths, queries, credentials, ...)
        if ($this->rule !== $scheme . $host . $port) {
            return false;
        }
        return [
            self::DIRECTIVE_HOST => [
                $this->rule,
            ]
        ];
    }
300
301
    /**
     * URL encoder according to RFC 3986
     *
     * Percent-encodes the URL with rawurlencode() and then restores the reserved
     * characters listed below, so only the remaining characters stay encoded.
     * @link http://publicmind.in/blog/url-encoding/
     *
     * @param string $url
     * @return string
     */
    private function UrlEncode($url)
    {
        // Reserved character => pattern matching its percent-encoded form.
        // `%` comes last so that restored percent signs are not decoded a second time.
        $reserved = [
            ":" => '!%3A!ui',
            "/" => '!%2F!ui',
            "?" => '!%3F!ui',
            "#" => '!%23!ui',
            "[" => '!%5B!ui',
            "]" => '!%5D!ui',
            "@" => '!%40!ui',
            "!" => '!%21!ui',
            "$" => '!%24!ui',
            "&" => '!%26!ui',
            "'" => '!%27!ui',
            "(" => '!%28!ui',
            ")" => '!%29!ui',
            "*" => '!%2A!ui',
            "+" => '!%2B!ui',
            "," => '!%2C!ui',
            ";" => '!%3B!ui',
            "=" => '!%3D!ui',
            "%" => '!%25!ui'
        ];
        $patterns = array_values($reserved);
        $characters = array_keys($reserved);
        $encoded = rawurlencode($url);
        return preg_replace($patterns, $characters, $encoded);
    }
334
335
    /**
     * Validate host name
     *
     * Checks the allowed characters, the overall length, the length of each label, and
     * that the value is not an IP address.
     *
     * @link http://stackoverflow.com/questions/1755144/how-to-validate-domain-name-in-php
     *
     * @param  string $host
     * @return bool
     */
    private static function UrlValidateHost($host)
    {
        // Valid characters
        if (!preg_match("/^([a-z\d](-*[a-z\d])*)(\.([a-z\d](-*[a-z\d])*))*$/i", $host)) {
            return false;
        }
        // Overall length
        if (!preg_match("/^.{1,253}$/", $host)) {
            return false;
        }
        // Length of each label
        if (!preg_match("/^[^\.]{1,63}(\.[^\.]{1,63})*$/", $host)) {
            return false;
        }
        // Must not be an IP address
        return !filter_var($host, FILTER_VALIDATE_IP);
    }
352
353
    /**
     * Validate URL scheme
     *
     * @param  string $scheme
     * @return bool True if the scheme is one of http, https, ftp or sftp
     */
    private static function UrlValidateScheme($scheme)
    {
        $allowed = [
            'http', 'https',
            'ftp', 'sftp'
        ];
        return in_array($scheme, $allowed);
    }
367
368
    /**
     * Add Sitemap
     *
     * Encodes the current rule as a URL and stores it if it validates.
     *
     * @return array|false False when the encoded rule is not a valid URL
     */
    private function addSitemap()
    {
        $url = $this->UrlEncode($this->rule);
        if ($this->UrlValidate($url)) {
            return [self::DIRECTIVE_SITEMAP => [$url]];
        }
        return false;
    }
384
385
    /**
     * Validate URL
     *
     * A URL is valid when it passes FILTER_VALIDATE_URL and has both a valid host name
     * and a supported scheme.
     *
     * @param string $url
     * @return bool
     */
    public function UrlValidate($url)
    {
        if (!filter_var($url, FILTER_VALIDATE_URL)) {
            return false;
        }
        $parsed = parse_url($url);
        // FILTER_VALIDATE_URL also accepts URLs without a host (e.g. `mailto:`), so make
        // sure both components exist before reading them.
        if ($parsed === false || !isset($parsed['host'], $parsed['scheme'])) {
            return false;
        }
        return $this->UrlValidateHost($parsed['host'])
            && $this->UrlValidateScheme($parsed['scheme']);
    }
400
401
    /**
     * Add Clean-Param record
     *
     * Explodes the current rule and lists its parameters under the rule's path prefix.
     *
     * @return array
     */
    private function addCleanParam()
    {
        $exploded = $this->explodeCleanParamRule($this->rule);
        $names = array_values($exploded['param']);
        if (empty($names)) {
            return [];
        }
        return [
            $this->directive => [
                'path' => [
                    $exploded['path'] => [
                        'param' => $names,
                    ],
                ],
            ],
        ];
    }
415
416
    /**
     * Explode Clean-Param rule
     *
     * Splits a rule of the form `param[&param...] [path]` into its parameter names and
     * its path prefix. The path defaults to `/*` when the rule has none.
     *
     * @param  string $rule
     * @return array Array with the keys `path` (string) and `param` (list of names)
     */
    private function explodeCleanParamRule($rule)
    {
        // Collapse whitespace runs into a single space. mb_ereg_replace() patterns take
        // no delimiters or modifiers, so a PCRE-style `/.../` pattern would match the
        // slashes literally and never collapse anything.
        $rule = mb_ereg_replace('\s+', ' ', $rule);
        // split into parameter and path
        $array = mb_split(' ', $rule, 2);
        $cleanParam = [];
        // strip any invalid characters from path prefix; the hyphen is escaped so it is
        // kept as an allowed character instead of forming a range
        $cleanParam['path'] = isset($array[1])
            ? $this->UrlEncode(mb_ereg_replace('[^A-Za-z0-9.\-/*_]', '', $array[1]))
            : "/*";
        $cleanParam['param'] = array_map('trim', mb_split('&', $array[0]));
        return $cleanParam;
    }
438
439
    /**
     * Add an Allow or Disallow rule
     *
     * An array rule (a parsed inline directive) is stored as-is; any other rule is
     * stored as a single-element list of paths.
     *
     * @return array
     */
    private function addDisAllow()
    {
        $value = is_array($this->rule) ? $this->rule : ['path' => [$this->rule]];
        return [$this->directive => $value];
    }
461
462
    /**
     * Assign User-Agent dependent rules to the User-Agent arrays
     *
     * For a User-Agent dependent directive the current rule is nested under every
     * active User-Agent; for any other directive the current rule is returned unchanged.
     *
     * @return array
     */
    private function assignUserAgent()
    {
        if (!in_array($this->directive, self::USERAGENT_DEPENDENT_DIRECTIVES)) {
            return $this->rule;
        }
        $assigned = [];
        foreach ($this->userAgents as $agent) {
            $assigned[self::DIRECTIVE_USERAGENT][$agent] = $this->rule;
        }
        return $assigned;
    }
478
479
    /**
     * Get rules
     *
     * @return array The rule array built while parsing
     */
    public function getRules()
    {
        return $this->rules;
    }
486
}
487