Completed
Push — master ( 1084b1...9b56e4 )
by Jan-Petter
02:27
created

XRobotsTagParser::selectHeaderSource()   A

Complexity

Conditions 3
Paths 2

Size

Total Lines 8
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 1 Features 0
Metric Value
c 1
b 1
f 0
dl 0
loc 8
rs 9.4285
cc 3
eloc 4
nc 2
nop 0
1
<?php
2
namespace vipnytt;
3
4
/**
5
 * X-Robots-Tag HTTP header parser class
6
 *
7
 * @author VIP nytt ([email protected])
8
 * @author Jan-Petter Gundersen ([email protected])
9
 *
10
 * Project:
11
 * @link https://github.com/VIPnytt/X-Robots-Tag-parser
12
 * @license https://opensource.org/licenses/MIT MIT license
13
 *
14
 * Specification:
15
 * @link https://developers.google.com/webmasters/control-crawl-index/docs/robots_meta_tag#using-the-x-robots-tag-http-header
16
 */
17
18
use GuzzleHttp;
19
use vipnytt\XRobotsTagParser\Exceptions\XRobotsTagParserException;
20
use vipnytt\XRobotsTagParser\Rebuild;
21
use vipnytt\XRobotsTagParser\UserAgentParser;
22
23
class XRobotsTagParser
24
{
25
    const HEADER_RULE_IDENTIFIER = 'X-Robots-Tag';
26
    const USERAGENT_DEFAULT = '';
27
28
    const DIRECTIVE_ALL = 'all';
29
    const DIRECTIVE_NONE = 'none';
30
    const DIRECTIVE_NO_ARCHIVE = 'noarchive';
31
    const DIRECTIVE_NO_FOLLOW = 'nofollow';
32
    const DIRECTIVE_NO_IMAGE_INDEX = 'noimageindex';
33
    const DIRECTIVE_NO_INDEX = 'noindex';
34
    const DIRECTIVE_NO_ODP = 'noodp';
35
    const DIRECTIVE_NO_SNIPPET = 'nosnippet';
36
    const DIRECTIVE_NO_TRANSLATE = 'notranslate';
37
    const DIRECTIVE_UNAVAILABLE_AFTER = 'unavailable_after';
38
39
    protected $url = '';
40
    protected $userAgent = self::USERAGENT_DEFAULT;
41
    protected $userAgentMatch = self::USERAGENT_DEFAULT;
42
    protected $config = [];
43
44
    protected $headers = [];
45
    protected $currentRule = '';
46
    protected $currentUserAgent = self::USERAGENT_DEFAULT;
47
48
    protected $rules = [];
49
50
    /**
     * Build a parser for a single URL and parse its X-Robots-Tag rules right away.
     *
     * @param string $url       URL to inspect; must pass FILTER_VALIDATE_URL
     * @param string $userAgent User-Agent sent with the HTTP request and used for rule matching
     * @param array $config     Optional settings; this class reads the keys 'headers' and 'guzzle'
     * @throws XRobotsTagParserException If the URL is invalid, or if fetching the headers fails
     */
    public function __construct($url, $userAgent = self::USERAGENT_DEFAULT, array $config = [])
    {
        $this->url = $url;
        if (filter_var($this->url, FILTER_VALIDATE_URL) === false) {
            throw new XRobotsTagParserException('Invalid URL provided');
        }

        // Remember the request User-Agent and the optional configuration
        $this->userAgent = $userAgent;
        $this->config = $config;

        // Populate $this->rules from the selected header source
        $this->parse();

        // Pick the parsed user-agent group to apply; the default user-agent is
        // handed to the matcher as its second argument (presumably the fallback)
        $matcher = new UserAgentParser($this->userAgent);
        $parsedUserAgents = array_keys($this->rules);
        $this->userAgentMatch = $matcher->match($parsedUserAgents, self::USERAGENT_DEFAULT);
    }
74
75
    /**
     * Read the selected headers and extract every X-Robots-Tag rule.
     *
     * Each header line is lower-cased and split once on ':' into name and value.
     * Lines whose name is not the X-Robots-Tag identifier are ignored.
     *
     * @return void
     */
    protected function parse()
    {
        $identifier = mb_strtolower(self::HEADER_RULE_IDENTIFIER);
        $this->headers = $this->selectHeaderSource();
        foreach ($this->headers as $headerLine) {
            $segments = array_map('trim', explode(':', mb_strtolower($headerLine), 2));
            $isRule = count($segments) >= 2 && $segments[0] === $identifier;
            if ($isRule) {
                $this->currentRule = $segments[1];
                $this->detectDirectives();
            }
        }
    }
93
94
    /**
     * Choose where the HTTP headers come from.
     *
     * Headers supplied as an array in $config['headers'] take precedence;
     * otherwise they are requested from the URL.
     *
     * @return array
     */
    protected function selectHeaderSource()
    {
        $provided = isset($this->config['headers']) ? $this->config['headers'] : null;

        // Anything other than an array counts as "no headers provided"
        return is_array($provided) ? $provided : $this->getHeaders();
    }
107
108
    /**
     * Request the X-Robots-Tag HTTP headers from the URL
     *
     * Sends a HEAD request and rebuilds one "X-Robots-Tag: <value>" line per
     * header value, so that parse() can treat them like raw header lines.
     *
     * @return array Raw HTTP header lines (X-Robots-Tag only)
     * @throws XRobotsTagParserException If the HTTP transfer fails
     */
    protected function getHeaders()
    {
        try {
            // Fall back to the parser's User-Agent unless the caller configured one for Guzzle
            if (!isset($this->config['guzzle']['headers']['User-Agent'])) {
                $this->config['guzzle']['headers']['User-Agent'] = $this->userAgent;
            }
            $client = new GuzzleHttp\Client();
            $res = $client->head($this->url, $this->config['guzzle']);
            $headers = [];
            // getHeader() returns only the values of the named header (string[] per PSR-7),
            // not a name => values map, so the header name has to be prefixed here.
            foreach ($res->getHeader(self::HEADER_RULE_IDENTIFIER) as $value) {
                $headers[] = self::HEADER_RULE_IDENTIFIER . ': ' . $value . "\r\n";
            }
            return $headers;
        } catch (GuzzleHttp\Exception\TransferException $e) {
            throw new XRobotsTagParserException($e->getMessage());
        }
    }
131
132
    /**
     * Split the current rule into directives and register the known ones.
     *
     * If the first comma-separated element has the form "name: rest" and "name"
     * is not a known directive, "name" is taken as the user-agent the rule is for.
     *
     * @return void
     */
    protected function detectDirectives()
    {
        $known = $this->directiveClasses();
        $directives = array_map('trim', explode(',', $this->currentRule));

        // Optional user-agent prefix on the first element
        $pair = array_map('trim', explode(':', $directives[0], 2));
        if (count($pair) == 2 && !isset($known[$pair[0]])) {
            $this->currentUserAgent = $pair[0];
            $directives[0] = $pair[1];
        }

        foreach ($directives as $element) {
            // Only the part before an optional ':' names the directive
            $name = trim(explode(':', $element, 2)[0]);
            if (isset($known[$name])) {
                $this->addRule($name);
            }
        }
        $this->cleanup();
    }
153
154
    /**
     * Map every supported directive to the short name of its handler class.
     *
     * The keys are the directive strings as they appear in a (lower-cased) rule;
     * the values are appended to the "directives" namespace when a handler is created.
     *
     * @return array
     */
    protected function directiveClasses()
    {
        $classes = [
            self::DIRECTIVE_ALL => 'All',
            self::DIRECTIVE_NO_ARCHIVE => 'NoArchive',
            self::DIRECTIVE_NO_FOLLOW => 'NoFollow',
            self::DIRECTIVE_NO_IMAGE_INDEX => 'NoImageIndex',
            self::DIRECTIVE_NO_INDEX => 'NoIndex',
            self::DIRECTIVE_NONE => 'None',
            self::DIRECTIVE_NO_ODP => 'NoODP',
            self::DIRECTIVE_NO_SNIPPET => 'NoSnippet',
            self::DIRECTIVE_NO_TRANSLATE => 'NoTranslate',
            self::DIRECTIVE_UNAVAILABLE_AFTER => 'UnavailableAfter',
        ];

        return $classes;
    }
174
175
    /**
     * Register one directive of the current rule for the current user-agent.
     *
     * @param string $directive Directive name; must be a key of directiveClasses()
     * @return void
     * @throws XRobotsTagParserException If the handler does not implement directiveInterface
     */
    protected function addRule($directive)
    {
        $userAgent = $this->currentUserAgent;
        if (!isset($this->rules[$userAgent])) {
            $this->rules[$userAgent] = [];
        }

        // Handler classes live in the "directives" sub-namespace of this class
        $handlerClass = sprintf('\\%s\\directives\\%s', __CLASS__, $this->directiveClasses()[$directive]);
        $handler = new $handlerClass($this->currentRule);
        if (!$handler instanceof XRobotsTagParser\directives\directiveInterface) {
            throw new XRobotsTagParserException('Unsupported directive class');
        }

        $entry = [$handler->getDirective() => $handler->getValue()];
        $this->rules[$userAgent] = array_merge($this->rules[$userAgent], $entry);
    }
194
195
    /**
     * Reset the per-rule state so the next rule starts from a clean slate.
     *
     * @return void
     */
    protected function cleanup()
    {
        $this->currentUserAgent = self::USERAGENT_DEFAULT;
        $this->currentRule = '';
    }
205
206
    /**
     * Return the rules that apply to the configured User-Agent.
     *
     * Rules of the default user-agent are merged first, then those of the
     * matching user-agent, so the latter win on identical directive keys.
     *
     * @param bool $raw When true, return the merged rules without passing them through Rebuild
     * @return array
     */
    public function getRules($raw = false)
    {
        $applicable = [];
        foreach ([self::USERAGENT_DEFAULT, $this->userAgentMatch] as $userAgent) {
            if (isset($this->rules[$userAgent])) {
                $applicable = array_merge($applicable, $this->rules[$userAgent]);
            }
        }

        if ($raw) {
            return $applicable;
        }

        $rebuild = new Rebuild($applicable);
        return $rebuild->getResult();
    }
230
231
    /**
     * Export every parsed rule, grouped by user-agent.
     *
     * @return array Rules keyed by user-agent; the default user-agent uses the key ''
     */
    public function export()
    {
        $allRules = $this->rules;

        return $allRules;
    }
240
241
    /**
     * Get the meaning of a directive.
     *
     * @param string $directive Directive name, e.g. one of the DIRECTIVE_* constants
     * @return string
     * @throws XRobotsTagParserException If the directive is unknown or its handler
     *                                   does not implement directiveInterface
     */
    public function getDirectiveMeaning($directive)
    {
        $classes = $this->directiveClasses();
        if (!in_array($directive, array_keys($classes))) {
            throw new XRobotsTagParserException('Unknown directive');
        }

        $shortName = $classes[$directive];
        $handlerClass = sprintf('\\%s\\directives\\%s', __CLASS__, $shortName);
        // The handler is constructed with its own short class name as the rule string
        $handler = new $handlerClass($shortName);
        if (!$handler instanceof XRobotsTagParser\directives\directiveInterface) {
            throw new XRobotsTagParserException('Unsupported directive class');
        }

        return $handler->getMeaning();
    }
260
}
261