1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace Bee4\RobotsTxt; |
4
|
|
|
|
5
|
|
|
use Bee4\RobotsTxt\Exception\RuntimeException; |
6
|
|
|
|
7
|
|
|
/**
 * Class Parser
 * Take the content of a robots.txt file and transform it to rules
 *
 * @copyright Bee4 2015
 * @author    Stephane HULARD <[email protected]>
 */
class Parser
{
    /**
     * Transform file content to structured Rules
     *
     * The content is split on CR / LF characters (empty lines are dropped)
     * and every remaining line is inspected in order:
     *  - a line whose first character is `#` is ignored;
     *  - `User-Agent: <name>` closes the rule currently being built (if any),
     *    registers it, and starts a new empty Rule for `<name>`;
     *  - `Allow: <pattern>` / `Disallow: <pattern>` are appended to the rule
     *    currently being built;
     *  - an `Allow` / `Disallow` line met before any `User-Agent` line has no
     *    rule to attach to and is skipped;
     *  - any other line is ignored.
     * Directive names are matched case-insensitively and must be followed by
     * a colon and exactly one space.
     *
     * @param string|Content $content Raw robots.txt text or a Content wrapper
     * @return Rules The collected rules, one entry per `User-Agent` line
     * @throws RuntimeException When $content is neither a string nor a Content
     */
    public static function parse($content)
    {
        if (is_string($content)) {
            $content = new Content($content);
        }
        if (!($content instanceof Content)) {
            throw new RuntimeException(
                'You must use a `string` or a `Content` instance to the `Parser`!'
            );
        }

        $rules = new Rules();
        $userAgent = $rule = null;

        // Split once up front instead of iterating with strtok(): strtok keeps
        // a single process-wide cursor, so any nested strtok() call made while
        // the loop runs would silently corrupt the iteration.
        $lines = preg_split('/[\r\n]+/', $content->get(), -1, PREG_SPLIT_NO_EMPTY);

        foreach ($lines as $line) {
            // Comment line
            if (strpos($line, '#') === 0) {
                continue;
            }

            if (preg_match('/^User-Agent\: (.*)$/i', $line, $matches)) {
                // A new group starts: flush the one being built, if any
                if ($rule instanceof Rule) {
                    $rules->add($userAgent, $rule);
                }
                $userAgent = $matches[1];
                $rule = new Rule();
                continue;
            }

            // Directive found before the first User-Agent line: there is no
            // rule to attach it to, so skip it instead of calling a method
            // on null.
            if (!($rule instanceof Rule)) {
                continue;
            }

            if (preg_match('/^Allow: (.*)$/i', $line, $matches)) {
                $rule->allow($matches[1]);
            } elseif (preg_match('/^Disallow: (.*)$/i', $line, $matches)) {
                $rule->disallow($matches[1]);
            }
        }

        // Handle the last group, which no following User-Agent line flushed
        if ($rule instanceof Rule) {
            $rules->add($userAgent, $rule);
        }

        return $rules;
    }
}
61
|
|
|
|