Completed
Push — master ( 4cd08d...5753af )
by Jan-Petter
04:55
created

XRobotsTagParser::parse()   B

Complexity

Conditions 5
Paths 6

Size

Total Lines 16
Code Lines 9

Duplication

Lines 0
Ratio 0 %

Importance

Changes 9
Bugs 1 Features 0
Metric Value
c 9
b 1
f 0
dl 0
loc 16
rs 8.8571
cc 5
eloc 9
nc 6
nop 0
1
<?php
2
namespace vipnytt;
3
4
/**
5
 * X-Robots-Tag HTTP header parser class
6
 *
7
 * @author VIP nytt ([email protected])
8
 * @author Jan-Petter Gundersen ([email protected])
9
 *
10
 * Project:
11
 * @link https://github.com/VIPnytt/X-Robots-Tag-parser
12
 * @license https://opensource.org/licenses/MIT MIT license
13
 *
14
 * Specification:
15
 * @link https://developers.google.com/webmasters/control-crawl-index/docs/robots_meta_tag#using-the-x-robots-tag-http-header
16
 */
17
18
use GuzzleHttp;
19
use vipnytt\XRobotsTagParser\Exceptions\XRobotsTagParserException;
20
use vipnytt\XRobotsTagParser\Rebuild;
21
use vipnytt\XRobotsTagParser\URLParser;
22
use vipnytt\XRobotsTagParser\UserAgentParser;
23
24
/**
 * Parses X-Robots-Tag HTTP headers into directive rules grouped by User-Agent.
 *
 * Headers are taken from the `headers` option when one is supplied; otherwise
 * they are requested from the URL with a HEAD request through Guzzle.
 */
class XRobotsTagParser
{
    const HEADER_RULE_IDENTIFIER = 'x-robots-tag';
    // Key under which rules without a User-Agent prefix are stored
    const USERAGENT_DEFAULT = '';

    const DIRECTIVE_ALL = 'all';
    const DIRECTIVE_NONE = 'none';
    const DIRECTIVE_NO_ARCHIVE = 'noarchive';
    const DIRECTIVE_NO_FOLLOW = 'nofollow';
    const DIRECTIVE_NO_IMAGE_INDEX = 'noimageindex';
    const DIRECTIVE_NO_INDEX = 'noindex';
    const DIRECTIVE_NO_ODP = 'noodp';
    const DIRECTIVE_NO_SNIPPET = 'nosnippet';
    const DIRECTIVE_NO_TRANSLATE = 'notranslate';
    const DIRECTIVE_UNAVAILABLE_AFTER = 'unavailable_after';

    // Encoded URL, the User-Agent in effect, and the effective configuration
    protected $url = '';
    protected $userAgent = self::USERAGENT_DEFAULT;
    protected $config = [];

    // Headers being parsed plus the state of the rule currently being read
    protected $headers = [];
    protected $currentRule = '';
    protected $currentUserAgent = self::USERAGENT_DEFAULT;

    // Options exactly as passed to the constructor, and the parsed rules
    protected $options = [];
    protected $rules = [];

    /**
     * Constructor
     *
     * Validates and encodes the URL, applies the options, parses the headers
     * and finally selects which parsed User-Agent group applies.
     *
     * @param string $url
     * @param string $userAgent
     * @param array $options Recognised keys: `headers` (array of headers to parse
     *                       instead of requesting them) and `guzzle` (request options)
     * @throws XRobotsTagParserException
     */
    public function __construct($url, $userAgent = self::USERAGENT_DEFAULT, array $options = [])
    {
        // Parse URL
        $urlParser = new URLParser(trim($url));
        if (!$urlParser->isValid()) {
            throw new XRobotsTagParserException('Invalid URL');
        }
        // Encode URL
        $this->url = $urlParser->encode();
        // Set any optional options. The rest of the class reads $this->config,
        // so the options have to end up there; previously they never did and
        // both `headers` and `guzzle` were silently ignored.
        $this->options = $options;
        $this->config = array_merge($this->config, $options);
        if (isset($this->config['headers']) && is_array($this->config['headers'])) {
            $this->headers = $this->config['headers'];
        }
        // Expose the requested User-Agent while parsing, so that getHeaders()
        // sends it instead of the empty default. It is replaced below.
        $this->userAgent = $userAgent;
        // Parse rules
        $this->parse();
        // Set User-Agent
        $parser = new UserAgentParser($userAgent);
        $this->userAgent = $parser->match(array_keys($this->rules), self::USERAGENT_DEFAULT);
    }

    /**
     * Parse HTTP headers
     *
     * Requests the headers when none are present, then hands the value of
     * every X-Robots-Tag header over to detectDirectives().
     *
     * @return void
     * @throws XRobotsTagParserException
     */
    protected function parse()
    {
        if (empty($this->headers)) {
            // Keep the requested headers; the return value used to be discarded
            $requested = $this->getHeaders();
            if (is_array($requested)) {
                $this->headers = $requested;
            }
        }
        foreach ($this->normalizeHeaders($this->headers) as $header) {
            $parts = array_map('trim', explode(':', mb_strtolower($header), 2));
            if (count($parts) < 2 || $parts[0] !== self::HEADER_RULE_IDENTIFIER) {
                // Header is not a rule
                continue;
            }
            $this->currentRule = $parts[1];
            $this->detectDirectives();
        }
    }

    /**
     * Flatten headers into a list of raw "Name: value" lines
     *
     * Accepts both a plain list of raw header lines and a map of header name
     * to value(s), which is the shape returned by a Guzzle response.
     *
     * @param array $headers
     * @return string[]
     */
    private function normalizeHeaders(array $headers)
    {
        $lines = [];
        foreach ($headers as $name => $values) {
            // Integer keys mean the entry already is a complete header line
            $prefix = is_string($name) ? $name . ': ' : '';
            foreach ((array)$values as $value) {
                if (is_scalar($value)) {
                    $lines[] = $prefix . $value;
                }
            }
        }
        return $lines;
    }

    /**
     * Request the HTTP headers from an URL
     *
     * Sends a HEAD request using the `guzzle` configuration, adding the current
     * User-Agent as request header unless one is configured already.
     *
     * @return array Headers as returned by the Guzzle response
     * @throws XRobotsTagParserException
     */
    protected function getHeaders()
    {
        if (!filter_var($this->url, FILTER_VALIDATE_URL)) {
            throw new XRobotsTagParserException('Passed URL not valid according to the filter_var function');
        }
        try {
            if (!isset($this->config['guzzle']['headers']['User-Agent'])) {
                $this->config['guzzle']['headers']['User-Agent'] = $this->userAgent;
            }
            $client = new GuzzleHttp\Client();
            $res = $client->head($this->url, $this->config['guzzle']);
            return $res->getHeaders();
        } catch (GuzzleHttp\Exception\TransferException $e) {
            throw new XRobotsTagParserException($e->getMessage());
        }
    }

    /**
     * Detect directives in rule
     *
     * A rule is a comma separated list of directives. When the first entry has
     * a `name:` prefix which is not a known directive, that name is taken as
     * the User-Agent the rule applies to.
     *
     * @return void
     * @throws XRobotsTagParserException
     */
    protected function detectDirectives()
    {
        // Looked up once per rule instead of once per comparison
        $classes = $this->directiveClasses();
        $directives = array_map('trim', explode(',', $this->currentRule));
        $pair = array_map('trim', explode(':', $directives[0], 2));
        if (count($pair) === 2 && !array_key_exists($pair[0], $classes)) {
            $this->currentUserAgent = $pair[0];
            $directives[0] = $pair[1];
        }
        foreach ($directives as $rule) {
            // Only the part before any `:` names the directive
            $directive = trim(explode(':', $rule, 2)[0]);
            if (array_key_exists($directive, $classes)) {
                $this->addRule($classes[$directive]);
            }
        }
        $this->cleanup();
    }

    /**
     * Array of directives and their class names
     *
     * @return array Directive name => short class name
     */
    protected function directiveClasses()
    {
        return [
            self::DIRECTIVE_ALL => 'All',
            self::DIRECTIVE_NO_ARCHIVE => 'NoArchive',
            self::DIRECTIVE_NO_FOLLOW => 'NoFollow',
            self::DIRECTIVE_NO_IMAGE_INDEX => 'NoImageIndex',
            self::DIRECTIVE_NO_INDEX => 'NoIndex',
            self::DIRECTIVE_NONE => 'None',
            self::DIRECTIVE_NO_ODP => 'NoODP',
            self::DIRECTIVE_NO_SNIPPET => 'NoSnippet',
            self::DIRECTIVE_NO_TRANSLATE => 'NoTranslate',
            self::DIRECTIVE_UNAVAILABLE_AFTER => 'UnavailableAfter',
        ];
    }

    /**
     * Add rule
     *
     * Instantiates the directive class with the current rule and stores its
     * directive/value pair under the current User-Agent.
     *
     * @param string $directive Short class name, as listed by directiveClasses()
     * @return void
     * @throws XRobotsTagParserException
     */
    protected function addRule($directive)
    {
        if (!isset($this->rules[$this->currentUserAgent])) {
            $this->rules[$this->currentUserAgent] = [];
        }
        // A class name held in a string is never resolved against the current
        // namespace, so it has to be spelled out in full to refer to the same
        // namespace as the interface checked below.
        $class = ltrim(__NAMESPACE__ . "\\XRobotsTagParser\\directives\\$directive", '\\');
        if (!class_exists($class)) {
            throw new XRobotsTagParserException('Unsupported directive class');
        }
        $object = new $class($this->currentRule);
        if (!$object instanceof XRobotsTagParser\directives\directiveInterface) {
            throw new XRobotsTagParserException('Unsupported directive class');
        }
        $this->rules[$this->currentUserAgent] = array_merge($this->rules[$this->currentUserAgent], [$object->getDirective() => $object->getValue()]);
    }

    /**
     * Cleanup before next rule is read
     *
     * @return void
     */
    protected function cleanup()
    {
        $this->currentRule = '';
        $this->currentUserAgent = self::USERAGENT_DEFAULT;
    }

    /**
     * Return all applicable rules
     *
     * Rules of the default User-Agent come first; rules of the matching
     * User-Agent are merged on top and win on conflicting directives.
     *
     * @param bool $raw Return the merged rules as-is instead of passing them through Rebuild
     * @return array
     */
    public function getRules($raw = false)
    {
        $rules = [];
        // Default UserAgent
        if (isset($this->rules[self::USERAGENT_DEFAULT])) {
            $rules = array_merge($rules, $this->rules[self::USERAGENT_DEFAULT]);
        }
        // Matching UserAgent
        if (isset($this->rules[$this->userAgent])) {
            $rules = array_merge($rules, $this->rules[$this->userAgent]);
        }
        if (!$raw) {
            $rebuild = new Rebuild($rules);
            $rules = $rebuild->getResult();
        }
        // Result
        return $rules;
    }

    /**
     * Export all rules for all UserAgents
     *
     * @return array User-Agent => rules
     */
    public function export()
    {
        return $this->rules;
    }
}
234