Completed
Push — master ( cf0606...512c88 )
by Jan-Petter
04:57
created

Toolbox::checkPath()   D

Complexity

Conditions 9
Paths 25

Size

Total Lines 43
Code Lines 18

Duplication

Lines 0
Ratio 0 %

Importance

Changes 5
Bugs 2 Features 0
Metric Value
c 5
b 2
f 0
dl 0
loc 43
rs 4.909
cc 9
eloc 18
nc 25
nop 2
1
<?php
2
namespace vipnytt\RobotsTxtParser\Parser;
3
4
use vipnytt\RobotsTxtParser\Exceptions\ParserException;
5
6
/**
 * Trait Toolbox
 *
 * Shared helpers for the robots.txt directive parsers: path/rule matching,
 * splitting a line into a directive/value pair, directive validation and
 * parsing of the draft `rate` and `time` value formats.
 *
 * @package vipnytt\RobotsTxtParser\Parser
 */
trait Toolbox
{
    /**
     * Check whether a path is matched by at least one rule
     *
     * Each rule is a robots.txt path pattern:
     *  - `*` matches any sequence of characters (including none),
     *  - a trailing `$` anchors the rule to the end of the path,
     *  - every other character is matched literally,
     *  - a rule always has to match from the first character of the path.
     *
     * An empty rule matches every path.
     *
     * @link https://github.com/t1gor/Robots.txt-Parser-Class/issues/62
     *
     * @param string $path
     * @param array $paths Rules to test the path against
     * @return bool True if any rule matches, false otherwise
     */
    protected function checkPath($path, array $paths)
    {
        foreach ($paths as $rule) {
            // Only a `$` in the very last position is an end anchor; anywhere
            // else it is an ordinary character and gets quoted below.
            $anchored = substr($rule, -1) === '$';
            if ($anchored) {
                $rule = (string)substr($rule, 0, -1);
            }
            // Quote every regex meta character (and the `#` delimiter), then
            // turn the quoted wildcard back into its regex equivalent.
            $regex = str_replace('\*', '.*', preg_quote($rule, '#'));
            // `D` makes the end anchor match only at the very end of the
            // path, never in front of a trailing newline.
            $pattern = '#^' . $regex . ($anchored ? '$' : '') . '#D';
            if (preg_match($pattern, $path) === 1) {
                return true;
            }
        }
        return false;
    }

    /**
     * Generate directive/rule pair
     *
     * Splits the line at the first colon, trims both halves and lower-cases
     * the directive. The directive is compared case-insensitively against
     * the white list.
     *
     * Note: both halves are tested with `empty()`, so a line whose directive
     * or value is missing, blank, or the string `0` is rejected.
     *
     * @param string $line
     * @param array $whiteList Supported directive names
     * @return array|false ['directive' => string, 'value' => string], or false if the line is not supported
     */
    protected function generateRulePair($line, array $whiteList)
    {
        $supported = array_map('mb_strtolower', $whiteList);
        // Split at the first colon only, values (eg. URLs) may contain colons
        $pair = array_map('trim', explode(':', $line, 2));
        if (empty($pair[0]) || empty($pair[1])) {
            // Line does not contain both a directive and a value
            return false;
        }
        $directive = mb_strtolower($pair[0]);
        if (!in_array($directive, $supported, true)) {
            // Line does not contain any supported directive
            return false;
        }
        return [
            'directive' => $directive,
            'value' => $pair[1],
        ];
    }

    /**
     * Validate directive
     *
     * The directive has to be present in the list exactly as given (strict,
     * case-sensitive comparison). On success it is returned lower-cased.
     *
     * @param string $directive
     * @param array $directives Directives allowed in the current context
     * @return string The directive, lower-cased
     * @throws ParserException If the directive is not in the list
     */
    protected function validateDirective($directive, array $directives)
    {
        if (in_array($directive, $directives, true)) {
            return mb_strtolower($directive);
        }
        throw new ParserException('Directive is not allowed here');
    }

    /**
     * Parse rate as specified in the `Robot exclusion standard` version 2.0 draft
     * rate = numDocuments / timeUnit
     * @link http://www.conman.org/people/spc/robots2.html#format.directives.request-rate
     *
     * The time part is a number optionally followed by a unit; only the first
     * letter of the unit is considered: `m` (x60), `h` (x3600), `d` (x86400).
     * Any other or missing unit leaves the number as is. Non-digit characters
     * in the document count are ignored.
     *
     * @param string $string Eg. `1/5`, `100/24h`
     * @return int|float|false Documents per second, or false if the string does
     *                         not describe a rate greater than zero
     */
    protected function draftParseRate($string)
    {
        $parts = array_map('trim', explode('/', $string));
        if (count($parts) !== 2) {
            return false;
        }
        $multipliers = ['m' => 60, 'h' => 3600, 'd' => 86400];
        // Only ASCII letters are left after the filter, byte functions are safe
        $unit = strtolower(substr(preg_replace('/[^A-Za-z]/', '', $parts[1]), 0, 1));
        $multiplier = isset($multipliers[$unit]) ? $multipliers[$unit] : 1;
        $num = intval(preg_replace('/[^0-9]/', '', $parts[0]));
        $sec = intval(preg_replace('/[^0-9]/', '', $parts[1])) * $multiplier;
        if ($sec <= 0) {
            // No usable time span (eg. `1/m`), avoid the division by zero
            return false;
        }
        $rate = $num / $sec;
        return $rate > 0 ? $rate : false;
    }

    /**
     * Parse a time span given as two `HHMM` clock times separated by a hyphen
     *
     * Every non-digit character is stripped from both halves before parsing,
     * so `09:00 - 17:30` is read as `0900` and `1730`. A half is rejected if
     * it is not a valid 24-hour `HHMM` time (eg. `2500` or `900`).
     *
     * @param string $string Eg. `0900-1700`
     * @return array|false ['from' => 'HHMM', 'to' => 'HHMM'], or false if the string is not a valid time span
     */
    protected function draftParseTime($string)
    {
        $parts = preg_replace('/[^0-9]/', '', explode('-', $string));
        if (count($parts) !== 2) {
            return false;
        }
        // date_create_from_format() requires a DateTimeZone object, not a name
        $utc = new \DateTimeZone('UTC');
        $span = [];
        foreach (['from', 'to'] as $index => $key) {
            $time = date_create_from_format('Hi', $parts[$index], $utc);
            // Out-of-range values are silently rolled over by the date parser;
            // the round-trip comparison rejects them.
            if ($time === false || date_format($time, 'Hi') !== $parts[$index]) {
                return false;
            }
            $span[$key] = date_format($time, 'Hi');
        }
        return $span;
    }
}
155