Completed
Push — master ( bb4c66...673514 )
by Jan-Petter
03:55
created

XRobotsTagParser   A

Complexity

Total Complexity 24

Size/Duplication

Total Lines 222
Duplicated Lines 0 %

Coupling/Cohesion

Components 1
Dependencies 5

Importance

Changes 27
Bugs 6 Features 1
Metric Value
wmc 24
c 27
b 6
f 1
lcom 1
cbo 5
dl 0
loc 222
rs 10

9 Methods

Rating   Name   Duplication   Size   Complexity  
A __construct() 0 7 2
A parse() 0 13 4
B detectDirectives() 0 16 5
A addRule() 0 14 3
A cleanup() 0 5 1
A matchUserAgent() 0 7 2
A getRules() 0 18 4
A export() 0 4 1
A getDirectiveMeaning() 0 8 2
1
<?php
2
namespace vipnytt;
3
4
use vipnytt\XRobotsTagParser\Directives;
5
use vipnytt\XRobotsTagParser\Exceptions\XRobotsTagParserException;
6
use vipnytt\XRobotsTagParser\Rebuild;
7
8
/**
9
 * Class XRobotsTagParser
10
 * X-Robots-Tag HTTP header parser
11
 *
12
 * @package vipnytt
13
 *
14
 * @author VIP nytt ([email protected])
15
 * @author Jan-Petter Gundersen ([email protected])
16
 *
17
 * Project:
18
 * @link https://github.com/VIPnytt/X-Robots-Tag-parser
19
 * @license https://opensource.org/licenses/MIT MIT license
20
 *
21
 * Specification:
22
 * @link https://developers.google.com/webmasters/control-crawl-index/docs/robots_meta_tag#using-the-x-robots-tag-http-header
23
 */
24
class XRobotsTagParser
{
    /**
     * HTTP header prefix
     */
    const HEADER_RULE_IDENTIFIER = 'X-Robots-Tag';

    /**
     * Directives
     */
    const DIRECTIVE_ALL = 'all';
    const DIRECTIVE_NONE = 'none';
    const DIRECTIVE_NO_ARCHIVE = 'noarchive';
    const DIRECTIVE_NO_FOLLOW = 'nofollow';
    const DIRECTIVE_NO_IMAGE_INDEX = 'noimageindex';
    const DIRECTIVE_NO_INDEX = 'noindex';
    const DIRECTIVE_NO_ODP = 'noodp';
    const DIRECTIVE_NO_SNIPPET = 'nosnippet';
    const DIRECTIVE_NO_TRANSLATE = 'notranslate';
    const DIRECTIVE_UNAVAILABLE_AFTER = 'unavailable_after';

    /**
     * Supported directives, mapped to a human readable description.
     *
     * The keys double as the whitelist of directive names the parser accepts.
     */
    const DIRECTIVES = [
        self::DIRECTIVE_ALL => 'There are no restrictions for indexing or serving. Note: this directive is the default value and has no effect if explicitly listed.',
        self::DIRECTIVE_NO_ARCHIVE => 'Do not show a `Cached` link in search results.',
        self::DIRECTIVE_NO_FOLLOW => 'Do not follow the links on this page.',
        self::DIRECTIVE_NO_IMAGE_INDEX => 'Do not index images on this page.',
        self::DIRECTIVE_NO_INDEX => 'Do not show this page in search results and do not show a `Cached` link in search results.',
        self::DIRECTIVE_NONE => 'Equivalent to `noindex` and `nofollow`.',
        self::DIRECTIVE_NO_ODP => 'Do not use metadata from the `Open Directory project` (http://dmoz.org/) for titles or snippets shown for this page.',
        self::DIRECTIVE_NO_SNIPPET => 'Do not show a snippet in the search results for this page.',
        self::DIRECTIVE_NO_TRANSLATE => 'Do not offer translation of this page in search results.',
        self::DIRECTIVE_UNAVAILABLE_AFTER => 'Do not show this page in search results after the specified date/time.',
    ];

    /**
     * User-Agent string
     *
     * @var string
     */
    protected $userAgent = '';

    /**
     * User-Agent for rule selection
     *
     * @var string
     */
    protected $userAgentMatch = '';

    /**
     * Current rule
     *
     * @var string
     */
    protected $currentRule = '';

    /**
     * User-Agent for the current rule
     *
     * Defaults to an empty string (the key used for rules without a
     * User-Agent prefix) so it is never used as a null array offset.
     *
     * @var string
     */
    protected $currentUserAgent = '';

    /**
     * Rule array, indexed by User-Agent and then by directive name
     *
     * @var array
     */
    protected $rules = [];

    /**
     * Constructor
     *
     * @param string $userAgent User-Agent the rules are selected for
     * @param array|null $headers Raw HTTP header lines; parsed immediately when given
     */
    public function __construct($userAgent = '', $headers = null)
    {
        $this->userAgent = $userAgent;
        if ($headers !== null) {
            $this->parse($headers);
        }
    }

    /**
     * Parse HTTP headers
     *
     * Every entry is expected to be one raw header line (`Name: value`).
     * Lines that are not strings, have no colon, or whose name is not
     * `X-Robots-Tag` (case-insensitive) are ignored. Once all lines are
     * processed, the best matching User-Agent is re-evaluated.
     *
     * @param array $headers
     * @return void
     */
    public function parse(array $headers)
    {
        // Loop invariant: the lowercased header name to compare against.
        $identifier = mb_strtolower(self::HEADER_RULE_IDENTIFIER);
        foreach ($headers as $header) {
            if (!is_string($header)) {
                // Not a raw header line
                continue;
            }
            // The delimiter is plain ASCII, so explode() is sufficient.
            $parts = array_map('trim', explode(':', mb_strtolower($header), 2));
            if (count($parts) < 2 || $parts[0] !== $identifier) {
                // Header is not a rule
                continue;
            }
            $this->currentRule = $parts[1];
            $this->detectDirectives();
        }
        $this->matchUserAgent();
    }

    /**
     * Detect directives in rule
     *
     * Splits the current rule on commas. When the first segment has the form
     * `name: value` and `name` is not a known directive, `name` is taken as
     * the User-Agent the rule applies to. Every segment whose name (the part
     * before an optional colon) is a known directive is then registered.
     *
     * @return void
     */
    protected function detectDirectives()
    {
        try {
            $directives = array_map('trim', explode(',', $this->currentRule));
            $pair = array_map('trim', explode(':', $directives[0], 2));
            if (count($pair) === 2 && !array_key_exists($pair[0], self::DIRECTIVES)) {
                $this->currentUserAgent = $pair[0];
                $directives[0] = $pair[1];
            }
            foreach ($directives as $rule) {
                $directive = trim(explode(':', $rule, 2)[0]);
                if (array_key_exists($directive, self::DIRECTIVES)) {
                    $this->addRule($directive);
                }
            }
        } finally {
            // Always reset the per-rule state, even when addRule() throws, so a
            // failed rule cannot leak its User-Agent into the next one.
            $this->cleanup();
        }
    }

    /**
     * Add rule
     *
     * Builds the directive object for the current rule and stores its
     * directive/value pair under the current User-Agent.
     *
     * @param string $directive
     * @return void
     * @throws XRobotsTagParserException
     */
    protected function addRule($directive)
    {
        if (!isset($this->rules[$this->currentUserAgent])) {
            $this->rules[$this->currentUserAgent] = [];
        }
        switch ($directive) {
            case self::DIRECTIVE_UNAVAILABLE_AFTER:
                $object = new Directives\UnavailableAfter($directive, $this->currentRule);
                break;
            default:
                $object = new Directives\Basic($directive, $this->currentRule);
        }
        $this->rules[$this->currentUserAgent] = array_merge($this->rules[$this->currentUserAgent], [$object->getDirective() => $object->getValue()]);
    }

    /**
     * Cleanup before next rule is read
     *
     * @return void
     */
    protected function cleanup()
    {
        $this->currentRule = '';
        $this->currentUserAgent = '';
    }

    /**
     * Find the most rule-matching User-Agent
     *
     * Stores the match in $userAgentMatch; an empty string is stored when
     * the User-Agent parser reports no match (false).
     *
     * @return string
     */
    protected function matchUserAgent()
    {
        $userAgentParser = new UserAgentParser($this->userAgent);
        $match = $userAgentParser->match(array_keys($this->rules));
        $this->userAgentMatch = ($match !== false) ? $match : '';
        return $this->userAgentMatch;
    }

    /**
     * Return all applicable rules
     *
     * Rules without a User-Agent prefix are applied first; rules of the
     * matching User-Agent are merged on top and win on conflicting keys.
     *
     * @param bool $raw When true, skip the Rebuild post-processing step
     * @return array
     */
    public function getRules($raw = false)
    {
        $rules = [];
        // Default UserAgent
        if (isset($this->rules[''])) {
            $rules = array_merge($rules, $this->rules['']);
        }
        // Matching UserAgent
        if (isset($this->rules[$this->userAgentMatch])) {
            $rules = array_merge($rules, $this->rules[$this->userAgentMatch]);
        }
        if (!$raw) {
            $rebuild = new Rebuild($rules);
            $rules = $rebuild->getResult();
        }
        // Result
        return $rules;
    }

    /**
     * Export all rules for all UserAgents
     *
     * @return array
     */
    public function export()
    {
        return $this->rules;
    }

    /**
     * Get the meaning of a directive
     *
     * The lookup is case-insensitive.
     *
     * @param string $directive
     * @return string
     * @throws XRobotsTagParserException When the directive is unknown
     */
    public function getDirectiveMeaning($directive)
    {
        $directive = mb_strtolower($directive);
        if (!array_key_exists($directive, self::DIRECTIVES)) {
            throw new XRobotsTagParserException('Unknown directive');
        }
        return self::DIRECTIVES[$directive];
    }
}
246