1
|
|
|
<?php |
2
|
|
|
namespace vipnytt\RobotsTxtParser; |
3
|
|
|
|
4
|
|
|
use vipnytt\RobotsTxtParser\Client\Download; |
5
|
|
|
use vipnytt\RobotsTxtParser\Client\UserAgentClient; |
6
|
|
|
use vipnytt\RobotsTxtParser\Parser\StatusCodeParser; |
7
|
|
|
use vipnytt\UserAgentParser; |
8
|
|
|
|
9
|
|
|
/**
 * Class Client
 *
 * Extends the robots.txt Parser with a base URL and an HTTP status code.
 * When no content is supplied, the robots.txt is fetched through the
 * Download client and the status code, body and encoding of that download
 * are used instead of the constructor arguments.
 *
 * @link https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt
 * @link https://yandex.com/support/webmaster/controlling-robot/robots-txt.xml
 * @link http://www.robotstxt.org/robotstxt.html
 * @link https://www.w3.org/TR/html4/appendix/notes.html#h-B.4.1.1
 *
 * @package vipnytt\RobotsTxtParser
 */
class Client extends Parser
{
    /**
     * HTTP status code parser
     * @var StatusCodeParser
     */
    protected $statusCodeParser;

    /**
     * Robots.txt base
     * @var string
     */
    protected $baseUrl;

    /**
     * Status code
     * @var int|null
     */
    protected $statusCode;

    /**
     * Client constructor.
     *
     * If $content is null, the robots.txt is downloaded from $baseUrl and the
     * supplied $statusCode and $encoding are replaced by the values reported
     * by the download.
     *
     * @param string $baseUrl Base URL the robots.txt belongs to
     * @param int|null $statusCode HTTP status code of the supplied content
     * @param string|null $content Raw robots.txt content, or null to download it
     * @param string $encoding Character encoding of the content
     * @param int $byteLimit Byte limit, passed on to the parent parser
     */
    public function __construct($baseUrl, $statusCode = null, $content = null, $encoding = self::ENCODING, $byteLimit = self::BYTE_LIMIT)
    {
        $this->baseUrl = $baseUrl;
        $this->statusCode = $statusCode;
        if ($content === null) {
            // Nothing supplied by the caller: fetch it, and trust the
            // downloaded response over the constructor arguments.
            $client = new Download($this->baseUrl);
            $this->statusCode = $client->getStatusCode();
            $content = $client->getBody();
            $encoding = $client->getEncoding();
        }
        parent::__construct($content, $encoding, $byteLimit);
    }

    /**
     * Get sitemaps
     *
     * @return array Exported sitemap entries, or an empty array if there are none
     */
    public function getSitemaps()
    {
        // Export once and reuse the result instead of exporting twice.
        $export = $this->sitemap->export();
        return isset($export[self::DIRECTIVE_SITEMAP]) ? $export[self::DIRECTIVE_SITEMAP] : [];
    }

    /**
     * Get host
     *
     * @return string|null First exported host entry, or null if there is none
     */
    public function getHost()
    {
        $export = $this->host->export();
        // Checking offset 0 directly also covers an empty host list, which
        // would otherwise raise an undefined-offset notice.
        return isset($export[self::DIRECTIVE_HOST][0]) ? $export[self::DIRECTIVE_HOST][0] : null;
    }

    /**
     * Get Clean-param
     *
     * @return array Exported Clean-param entries, or an empty array if there are none
     */
    public function getCleanParam()
    {
        $export = $this->cleanParam->export();
        // Return an empty array rather than null, so the result always
        // matches the documented array return type (as getSitemaps() does).
        return isset($export[self::DIRECTIVE_CLEAN_PARAM]) ? $export[self::DIRECTIVE_CLEAN_PARAM] : [];
    }

    /**
     * Return an User-agent instance, for future usage
     *
     * The given string is lower-cased and matched against the user-agent
     * groups known to the parser. If the matcher returns false, the rules
     * stored under self::USER_AGENT are used instead.
     *
     * @param string $string User-agent to look up
     * @return UserAgentClient
     */
    public function userAgent($string = self::USER_AGENT)
    {
        $userAgentParser = new UserAgentParser(mb_strtolower($string));
        $userAgent = $userAgentParser->match($this->userAgent->userAgents);
        if ($userAgent === false) {
            // No group matched: fall back to the default group.
            $userAgent = self::USER_AGENT;
        }
        $rules = [
            self::DIRECTIVE_ALLOW => $this->userAgent->allow[$userAgent],
            self::DIRECTIVE_DISALLOW => $this->userAgent->disallow[$userAgent],
            self::DIRECTIVE_CRAWL_DELAY => $this->userAgent->crawlDelay[$userAgent],
            self::DIRECTIVE_CACHE_DELAY => $this->userAgent->cacheDelay[$userAgent],
        ];
        return new UserAgentClient($rules, $userAgent, $this->baseUrl, $this->statusCode);
    }
}
121
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.