Completed
Push — 2.0-dev ( 97b412...d10788 )
by Jan-Petter
02:43
created

DirectiveParserCommons::checkPaths()   A

Complexity

Conditions 4
Paths 5

Size

Total Lines 17
Code Lines 11

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
c 1
b 0
f 0
dl 0
loc 17
rs 9.2
cc 4
eloc 11
nc 5
nop 2
1
<?php
2
namespace vipnytt\RobotsTxtParser\Parser\Directives;
3
4
use DateTimeZone;
5
use vipnytt\RobotsTxtParser\Exceptions\ParserException;
6
7
/**
 * Trait DirectiveParserCommons
 *
 * Helpers shared by the directive parsers: path-rule matching, splitting a
 * line into a directive/value pair, parsing of the draft `Request-rate` and
 * `Visit-time` values, and directive validation.
 *
 * @package vipnytt\RobotsTxtParser\Parser\Directives
 */
trait DirectiveParserCommons
{
    /**
     * Check whether a path is covered by at least one of the given rules
     *
     * Each rule is translated into a regular expression fragment:
     *  - every character is taken literally (quoted with preg_quote),
     *  - `*` matches any sequence of characters,
     *  - a `$` as the very last character anchors the rule to the end of the path.
     * The fragment is then matched against the beginning of the path.
     *
     * @param string $path
     * @param string[] $paths
     * @return bool
     */
    private function checkPaths($path, array $paths)
    {
        foreach ($paths as $rule) {
            $rule = (string)$rule;
            // Only a trailing `$` is an end anchor; anywhere else it stays a literal character
            $anchored = substr($rule, -1) === '$';
            if ($anchored) {
                $rule = (string)substr($rule, 0, -1);
            }
            // Quote all regex meta characters (including the `#` delimiter used by
            // checkPathsCallback), then re-enable the wildcard
            $pattern = str_replace('\\*', '.*', preg_quote($rule, '#'));
            if ($anchored) {
                // \z = absolute end of subject (a plain `$` would also accept a trailing newline)
                $pattern .= '\\z';
            }
            if ($this->checkPathsCallback($pattern, $path)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Callback for checkPaths
     *
     * Matches a regular expression fragment, as generated by checkPaths(),
     * against the beginning of the path.
     *
     * Background on the problems of regex based rule matching:
     * @link https://github.com/t1gor/Robots.txt-Core-Class/issues/62
     * @link https://github.com/diggin/Diggin_RobotRules
     * @link https://github.com/hafriedlander/php-peg
     *
     * @param string $rule Regular expression fragment, without delimiters
     * @param string $path
     * @return bool
     */
    private function checkPathsCallback($rule, $path)
    {
        try {
            // preg_match returns 0 on no match and false on failure; both mean "no match" here
            return preg_match('#^' . $rule . '#', $path) === 1;
        } catch (\Exception $e) {
            // Only reachable if an error handler converts PCRE warnings into exceptions
            return false;
        }
    }

    /**
     * Generate directive/rule pair
     *
     * Splits the line at the first colon. The directive is compared
     * case-insensitively against the white list and returned lower-cased.
     *
     * @param string $line
     * @param string[] $whiteList
     * @return string[]|false False if the directive or value is missing, or the directive is not white listed
     */
    private function generateRulePair($line, array $whiteList)
    {
        $whiteList = array_map('mb_strtolower', $whiteList);
        // Split by directive and rule; the delimiter is plain ASCII, no regex needed
        $pair = array_map('trim', explode(':', $line, 2));
        // Explicit comparison against '' so that a value of "0" is not mistaken for "missing"
        if (count($pair) !== 2 || $pair[0] === '' || $pair[1] === '') {
            // Line does not contain both a directive and a value
            return false;
        }
        $directive = mb_strtolower($pair[0]);
        if (!in_array($directive, $whiteList, true)) {
            // Line does not contain any supported directive
            return false;
        }
        return [
            'directive' => $directive,
            'value' => $pair[1],
        ];
    }

    /**
     * Client rate as specified in the `Robot exclusion standard` version 2.0 draft
     *
     * Input format: numDocuments / timeSpec, where timeSpec is a number with an
     * optional unit whose first letter is `m`, `h` or `d` (anything else counts as seconds).
     * The returned value is the number of seconds per document.
     * @link http://www.conman.org/people/spc/robots2.html#format.directives.request-rate
     *
     * @param string $string
     * @return float|int|false False if the value is malformed or does not yield a positive rate
     */
    private function draftParseRate($string)
    {
        $parts = array_map('trim', explode('/', $string));
        if (count($parts) !== 2) {
            return false;
        }
        $multipliers = [
            'm' => 60,
            'h' => 3600,
            'd' => 86400,
        ];
        // First letter of the time spec selects the unit
        $unit = mb_substr(mb_strtolower(preg_replace('/[^A-Za-z]/', '', $parts[1])), 0, 1);
        $multiplier = isset($multipliers[$unit]) ? $multipliers[$unit] : 1;
        $num = floatval(preg_replace('/[^0-9]/', '', $parts[0]));
        if ($num <= 0) {
            // Missing or zero document count; also guards the division below
            return false;
        }
        $sec = floatval(preg_replace('/[^0-9.]/', '', $parts[1])) * $multiplier;
        $rate = $sec / $num;
        return $rate > 0 ? $rate : false;
    }

    /**
     * Client timestamp range as specified in the `Robot exclusion standard` version 2.0 draft
     *
     * Input format: two times separated by `-`; all non-digit characters are
     * ignored, and each time must then consist of exactly four digits forming
     * a valid 24-hour `Hi` time (0000-2359).
     * @link http://www.conman.org/people/spc/robots2.html#format.directives.visit-time
     *
     * @param string $string
     * @return string[]|false Array with the keys `from` and `to`, or false if the value is malformed
     */
    private function draftParseTime($string)
    {
        $array = preg_replace('/[^0-9]/', '', explode('-', $string));
        if (!is_array($array) || count($array) !== 2) {
            return false;
        }
        foreach ($array as $time) {
            // Strict validation; date parsing would silently roll over values like 2500 or 0760
            if (preg_match('/\A(?:[01][0-9]|2[0-3])[0-5][0-9]\z/', $time) !== 1) {
                return false;
            }
        }
        return [
            'from' => $array[0],
            'to' => $array[1],
        ];
    }

    /**
     * Validate directive
     *
     * The comparison is case-insensitive; the lower-cased directive is returned.
     *
     * @param string $directive
     * @param string[] $directives
     * @return string
     * @throws ParserException
     */
    private function validateDirective($directive, array $directives)
    {
        $directive = mb_strtolower($directive);
        if (!in_array($directive, array_map('mb_strtolower', $directives), true)) {
            throw new ParserException('Directive not supported by this class');
        }
        return $directive;
    }
}
182