Completed
Branch 2.0-dev (131e57)
by Jan-Petter
02:08
created

Toolbox::checkPath()   D

Complexity

Conditions 9
Paths 25

Size

Total Lines 43
Code Lines 18

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
c 1
b 0
f 0
dl 0
loc 43
rs 4.909
cc 9
eloc 18
nc 25
nop 2
1
<?php
2
namespace vipnytt\RobotsTxtParser\Core;
3
4
use DateTimeZone;
5
use vipnytt\RobotsTxtParser\Exceptions\ParserException;
6
7
/**
 * Trait Toolbox
 *
 * Shared helpers for the robots.txt parser: path/rule matching, splitting a
 * line into a directive/value pair, directive validation, and parsing of the
 * `Request-rate` / `Visit-time` values from the robots.txt 2.0 draft.
 *
 * @package vipnytt\RobotsTxtParser\Core
 */
trait Toolbox
{
    /**
     * Check whether a path is matched by at least one rule
     *
     * Every rule is matched from the beginning of the path. Two characters
     * have a special meaning inside a rule, everything else is literal:
     *  - `*` matches any sequence of characters (including none)
     *  - `$` as the very last character anchors the rule to the end of the path
     *
     * An empty rule matches every path.
     *
     * Background on the earlier, hand-escaped implementation:
     * @link https://github.com/t1gor/Robots.txt-Core-Class/issues/62
     *
     * @param string $path
     * @param array $paths List of rule strings
     * @return bool True if any rule matches, false otherwise
     */
    protected function checkPath($path, array $paths)
    {
        foreach ($paths as $rule) {
            // Consecutive wildcards are equivalent to a single one; collapsing
            // them keeps the generated expression cheap to evaluate.
            $rule = (string)preg_replace('/\*+/', '*', (string)$rule);
            // Quote all regex meta characters (and the delimiter) so that the
            // rule is interpreted literally and the pattern is always valid.
            $pattern = preg_quote($rule, '#');
            // A trailing `$` (quoted to `\$` above) is the end-of-path anchor.
            $anchored = substr($pattern, -2) === '\\$';
            if ($anchored) {
                $pattern = substr($pattern, 0, -2);
            }
            // Turn the quoted wildcard back into "match anything".
            $pattern = str_replace('\\*', '.*', $pattern);
            $regex = '#^' . $pattern . ($anchored ? '\\z' : '') . '#';
            // preg_match() returns false on internal errors (e.g. backtrack
            // limit); that is treated the same as "rule does not match".
            if (preg_match($regex, (string)$path) === 1) {
                return true;
            }
        }
        return false;
    }

    /**
     * Generate directive/rule pair
     *
     * Splits the line at the first colon, trims both halves and lower-cases
     * the directive. The white list is compared case-insensitively.
     *
     * @param string $line
     * @param array $whiteList Directive names accepted for this line
     * @return array|false ['directive' => string, 'value' => string], or false
     *                     if the line has no colon, an empty directive, an
     *                     empty value, or a directive not in the white list
     */
    protected function generateRulePair($line, array $whiteList)
    {
        $whiteList = array_map('mb_strtolower', $whiteList);
        // Split by directive and rule (the colon is ASCII, so a byte-wise
        // split is safe for multi-byte input as well)
        $pair = array_map('trim', explode(':', (string)$line, 2));
        // Check if the line contains a rule. Compare against the empty string
        // explicitly: a value of "0" (e.g. `Crawl-delay: 0`) is valid.
        if (
            count($pair) !== 2 ||
            $pair[0] === '' ||
            $pair[1] === ''
        ) {
            return false;
        }
        $directive = mb_strtolower($pair[0]);
        if (!in_array($directive, $whiteList, true)) {
            // Line does not contain any supported directive
            return false;
        }
        return [
            'directive' => $directive,
            'value' => $pair[1],
        ];
    }

    /**
     * Validate directive
     *
     * The directive must be present in the list exactly as given (strict,
     * case-sensitive comparison); the lower-cased directive is returned.
     *
     * @param string $directive
     * @param array $directives Allowed directives
     * @return string Lower-cased directive
     * @throws ParserException If the directive is not in the list
     */
    protected function validateDirective($directive, array $directives)
    {
        if (!in_array($directive, $directives, true)) {
            throw new ParserException('Directive is not allowed here');
        }
        return mb_strtolower($directive);
    }

    /**
     * Client rate as specified in the `Robot exclusion standard` version 2.0 draft
     * rate = numDocuments / timeUnit
     * @link http://www.conman.org/people/spc/robots2.html#format.directives.request-rate
     *
     * The value is split at `/`. Digits of the first part form the number of
     * documents; the second part is a number optionally followed by a unit
     * whose first letter selects minutes (m), hours (h) or days (d), with
     * seconds as the default.
     *
     * @param string $string
     * @return float|false Seconds per document, or false if the value cannot
     *                     be parsed or does not yield a positive rate
     */
    protected function draftParseRate($string)
    {
        $parts = array_map('trim', explode('/', (string)$string));
        if (count($parts) !== 2) {
            return false;
        }
        $num = floatval(preg_replace('/[^0-9]/', '', $parts[0]));
        if ($num <= 0) {
            // No (or zero) documents given; also guards the division below
            return false;
        }
        $multiplier = 1;
        switch (strtolower(substr(preg_replace('/[^A-Za-z]/', '', $parts[1]), 0, 1))) {
            case 'm':
                $multiplier = 60;
                break;
            case 'h':
                $multiplier = 3600;
                break;
            case 'd':
                $multiplier = 86400;
                break;
        }
        $sec = floatval(preg_replace('/[^0-9.]/', '', $parts[1])) * $multiplier;
        $rate = $sec / $num;
        return $rate > 0 ? $rate : false;
    }

    /**
     * Client timestamp range as specified in the `Robot exclusion standard` version 2.0 draft
     * @link http://www.conman.org/people/spc/robots2.html#format.directives.visit-time
     *
     * The value is split at `-` and all non-digits are removed from both
     * halves; each half must then be a valid 24-hour `HHMM` time.
     *
     * @param string $string
     * @return array|bool ['from' => 'HHMM', 'to' => 'HHMM'], or false if the
     *                    value is not a pair of valid times
     */
    protected function draftParseTime($string)
    {
        $array = preg_replace('/[^0-9]/', '', explode('-', (string)$string));
        if (count($array) !== 2) {
            return false;
        }
        $utc = new DateTimeZone('UTC');
        $result = [];
        foreach (['from' => $array[0], 'to' => $array[1]] as $key => $value) {
            $time = date_create_from_format('Hi', $value, $utc);
            // date_create_from_format() rolls over out-of-range values (e.g.
            // "2500" becomes 01:00) instead of failing, so require that the
            // parsed time formats back to exactly what was given.
            if ($time === false || date_format($time, 'Hi') !== $value) {
                return false;
            }
            $result[$key] = $value;
        }
        return $result;
    }
}
163