Completed
Branch 2.0-dev (d250b8)
by Jan-Petter
03:02
created

DirectiveParserCommons::checkPath()   D

Complexity

Conditions 9
Paths 25

Size

Total Lines 43
Code Lines 18

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
c 1
b 0
f 0
dl 0
loc 43
rs 4.909
cc 9
eloc 18
nc 25
nop 2
1
<?php
2
namespace vipnytt\RobotsTxtParser\Parser\Directives;
3
4
use DateTimeZone;
5
use vipnytt\RobotsTxtParser\Exceptions\ParserException;
6
7
/**
 * Class DirectiveParserCommons
 *
 * Helper methods shared by the directive parsers: rule/path matching,
 * splitting a line into a directive/value pair, parsing of the
 * `Robot exclusion standard` 2.0 draft rate and time formats, and
 * directive validation.
 *
 * @package vipnytt\RobotsTxtParser\Directive
 */
trait DirectiveParserCommons
{
    /**
     * Check whether a path is matched by at least one rule
     *
     * Every rule is matched from the beginning of the path. Within a rule
     * only two characters are special:
     *  - `*` matches any sequence of characters (including none)
     *  - `$` as the very last character anchors the rule to the end of the path
     * Every other character, including regular expression meta characters,
     * is matched literally.
     *
     * Related discussion of regular expression based rule matching:
     * @link https://github.com/t1gor/Robots.txt-Core-Class/issues/62
     * @link https://github.com/diggin/Diggin_RobotRules
     * @link https://github.com/hafriedlander/php-peg
     *
     * @param string $path
     * @param string[] $paths
     * @return bool True if any rule matches the path, false otherwise
     */
    private function checkPath($path, array $paths)
    {
        $path = (string)$path;
        foreach ($paths as $rule) {
            $rule = (string)$rule;
            // Only a trailing `$` is an end anchor, anywhere else it is a literal character
            $anchored = substr($rule, -1) === '$';
            if ($anchored) {
                $rule = substr($rule, 0, -1);
            }
            // Quote every literal segment, so that `*` is the only wildcard left
            $segments = array_map(
                function ($segment) {
                    return preg_quote($segment, '#');
                },
                explode('*', $rule)
            );
            // D: `$` matches at the very end only, s: `.` also matches line breaks
            $pattern = '#^' . implode('.*', $segments) . ($anchored ? '$' : '') . '#Ds';
            // preg_match returns false on internal PCRE errors, which is treated as "no match"
            if (preg_match($pattern, $path) === 1) {
                return true;
            }
        }
        return false;
    }

    /**
     * Generate directive/rule pair
     *
     * Splits the line at the first colon. The directive is lower-cased and
     * compared case-insensitively against the white list. The value is
     * returned trimmed, but otherwise untouched (a value of `0` is valid).
     *
     * @param string $line
     * @param array $whiteList Supported directive names
     * @return array|false ['directive' => string, 'value' => string], or
     *                     false if the line has no supported directive or no value
     */
    private function generateRulePair($line, array $whiteList)
    {
        // Split by directive and rule
        $pair = array_map('trim', explode(':', (string)$line, 2));
        // Check if the line contains both a directive and a rule
        if (
            count($pair) !== 2 ||
            $pair[0] === '' ||
            $pair[1] === ''
        ) {
            return false;
        }
        $directive = mb_strtolower($pair[0]);
        if (!in_array($directive, array_map('mb_strtolower', $whiteList), true)) {
            // Line does not contain any supported directive
            return false;
        }
        return [
            'directive' => $directive,
            'value' => $pair[1],
        ];
    }

    /**
     * Client rate as specified in the `Robot exclusion standard` version 2.0 draft
     * rate = numDocuments / timeUnit
     * @link http://www.conman.org/people/spc/robots2.html#format.directives.request-rate
     *
     * The time unit is a number, optionally followed by a unit whose first
     * letter selects the multiplier: `m` (minutes), `h` (hours) or `d` (days).
     * Any other unit, or none at all, is treated as seconds.
     *
     * @param string $string
     * @return float|int|false Number of seconds per document, or false if the
     *                         string is malformed or does not yield a positive rate
     */
    private function draftParseRate($string)
    {
        $parts = array_map('trim', explode('/', (string)$string));
        if (count($parts) !== 2) {
            return false;
        }
        $num = floatval(preg_replace('/[^0-9]/', '', $parts[0]));
        if ($num <= 0) {
            // No documents, a rate cannot be calculated (avoids division by zero)
            return false;
        }
        $multiplier = 1;
        switch (mb_substr(mb_strtolower(preg_replace('/[^A-Za-z]/', '', $parts[1])), 0, 1)) {
            case 'm':
                $multiplier = 60;
                break;
            case 'h':
                $multiplier = 3600;
                break;
            case 'd':
                $multiplier = 86400;
                break;
        }
        $sec = floatval(preg_replace('/[^0-9.]/', '', $parts[1])) * $multiplier;
        $rate = $sec / $num;
        return $rate > 0 ? $rate : false;
    }

    /**
     * Client timestamp range as specified in the `Robot exclusion standard` version 2.0 draft
     * @link http://www.conman.org/people/spc/robots2.html#format.directives.visit-time
     *
     * Both sides of the range are reduced to their digits, and must then form
     * a valid 24-hour `HHMM` time (0000 through 2359). Out-of-range values are
     * rejected rather than rolled over to a different time.
     *
     * @param string $string Time range, eg. `0600-0845`
     * @return array|bool ['from' => 'HHMM', 'to' => 'HHMM'], or false if the range is invalid
     */
    private function draftParseTime($string)
    {
        $parts = explode('-', (string)$string);
        if (count($parts) !== 2) {
            return false;
        }
        $times = [];
        foreach ($parts as $part) {
            $digits = preg_replace('/[^0-9]/', '', $part);
            if (preg_match('/^(?:[01][0-9]|2[0-3])[0-5][0-9]$/D', $digits) !== 1) {
                return false;
            }
            $times[] = $digits;
        }
        return [
            'from' => $times[0],
            'to' => $times[1],
        ];
    }

    /**
     * Validate directive
     *
     * The directive has to be present in the list exactly as given (strict,
     * case-sensitive comparison). It is lower-cased only after it has been accepted.
     *
     * @param string $directive
     * @param array $directives Allowed directives
     * @return string The directive in lower case
     * @throws ParserException If the directive is not in the list
     */
    private function validateDirective($directive, array $directives)
    {
        if (!in_array($directive, $directives, true)) {
            throw new ParserException('Directive is not allowed here');
        }
        return mb_strtolower($directive);
    }
}
164