1
|
|
|
<?php |
2
|
|
|
/** |
3
|
|
|
* vipnytt/RobotsTxtParser |
4
|
|
|
* |
5
|
|
|
* @link https://github.com/VIPnytt/RobotsTxtParser |
6
|
|
|
* @license https://github.com/VIPnytt/RobotsTxtParser/blob/master/LICENSE The MIT License (MIT) |
7
|
|
|
*/ |
8
|
|
|
|
9
|
|
|
namespace vipnytt\RobotsTxtParser; |
10
|
|
|
|
11
|
|
|
use vipnytt\RobotsTxtParser\Handler\PhpAddOnTrait; |
12
|
|
|
|
13
|
|
|
/**
 * Class Import
 *
 * Rebuilds robots.txt content from a previously exported rule array and hands
 * that content to the parent client, so an exported rule set can be loaded
 * back in without the original robots.txt file.
 *
 * @package vipnytt\RobotsTxtParser
 */
class Import extends TxtClient
{
    use PhpAddOnTrait;

    /**
     * Root level export template
     *
     * Supplies a default for every root level directive, so the builders can
     * read each key without checking that it exists first.
     */
    const TEMPLATE_ROOT = [
        self::DIRECTIVE_HOST => null,
        self::DIRECTIVE_CLEAN_PARAM => [],
        self::DIRECTIVE_SITEMAP => [],
        self::DIRECTIVE_USER_AGENT => [],
    ];

    /**
     * User-agent level export template
     *
     * Supplies a default for every directive of a single user-agent group.
     */
    const TEMPLATE_SUB = [
        self::DIRECTIVE_ROBOT_VERSION => null,
        self::DIRECTIVE_VISIT_TIME => [],
        self::DIRECTIVE_NO_INDEX => [
            self::DIRECTIVE_HOST => [],
            'path' => [],
            self::DIRECTIVE_CLEAN_PARAM => [],
        ],
        self::DIRECTIVE_DISALLOW => [
            self::DIRECTIVE_HOST => [],
            'path' => [],
            self::DIRECTIVE_CLEAN_PARAM => [],
        ],
        self::DIRECTIVE_ALLOW => [
            self::DIRECTIVE_HOST => [],
            'path' => [],
            self::DIRECTIVE_CLEAN_PARAM => [],
        ],
        self::DIRECTIVE_CRAWL_DELAY => null,
        self::DIRECTIVE_CACHE_DELAY => null,
        self::DIRECTIVE_REQUEST_RATE => [],
        self::DIRECTIVE_COMMENT => [],
    ];

    /**
     * The imported array after it has been merged with the templates
     *
     * @var array
     */
    private $array;

    /**
     * Import constructor.
     *
     * Merges the export with the root template, merges every user-agent group
     * with the user-agent template, renders the result to text (one directive
     * per line, joined with PHP_EOL) and passes that text to the parent
     * constructor.
     *
     * NOTE(review): arrayMergeRecursiveEx() is not defined in this class
     * (presumably provided by PhpAddOnTrait); it is assumed to overlay the
     * second argument on top of the first - confirm against the trait.
     *
     * @param array $export Previously exported rule array
     * @param string $baseUri Base URI handed to the parent constructor
     */
    public function __construct(array $export, $baseUri = 'https://example.com')
    {
        $this->array = $this->arrayMergeRecursiveEx(self::TEMPLATE_ROOT, $export);
        foreach (array_keys($this->array[self::DIRECTIVE_USER_AGENT]) as $userAgent) {
            $this->array[self::DIRECTIVE_USER_AGENT][$userAgent] = $this->arrayMergeRecursiveEx(
                self::TEMPLATE_SUB,
                $this->array[self::DIRECTIVE_USER_AGENT][$userAgent]
            );
        }
        // Root level directives are rendered first, the user-agent groups last.
        $lines = array_merge(
            $this->buildHost($this->array[self::DIRECTIVE_HOST]),
            $this->buildCleanParam($this->array[self::DIRECTIVE_CLEAN_PARAM]),
            $this->buildGenericArray($this->array[self::DIRECTIVE_SITEMAP], self::DIRECTIVE_SITEMAP),
            $this->buildUserAgent($this->array[self::DIRECTIVE_USER_AGENT])
        );
        parent::__construct($baseUri, null, implode(PHP_EOL, $lines));
    }

    /**
     * Host
     *
     * Prefixes every host with the host directive. A non-array value is
     * wrapped in a single-element array first; this includes the template
     * default of null, which therefore still produces one line with an empty
     * value.
     *
     * @param string[]|string|null $array
     * @return string[]
     */
    private function buildHost($array)
    {
        if (!is_array($array)) {
            $array = [$array];
        }
        // The pattern '/^/' matches at the start of every string, so this
        // simply prepends the directive to each element.
        return preg_filter('/^/', self::DIRECTIVE_HOST . ':', $array);
    }

    /**
     * Clean-param
     *
     * Produces one line per parameter/path combination, formatted as
     * `<directive>:<parameter> <path>`.
     *
     * @param string[][] $array Lists of paths, keyed by parameter
     * @return string[]
     */
    private function buildCleanParam($array)
    {
        $result = [];
        foreach ($array as $param => $paths) {
            foreach ($paths as $path) {
                $result[] = self::DIRECTIVE_CLEAN_PARAM . ':' . $param . ' ' . $path;
            }
        }
        return $result;
    }

    /**
     * Comment | Sitemap
     *
     * Prefixes every value with the given directive.
     *
     * @param string[] $array
     * @param string $directive
     * @return string[]
     */
    private function buildGenericArray($array, $directive)
    {
        return preg_filter('/^/', $directive . ':', $array);
    }

    /**
     * User-agent
     *
     * Renders every user-agent group: the user-agent line itself, followed by
     * the lines of each directive in a fixed order.
     *
     * @param array $array Rule sets, keyed by user-agent
     * @return string[]
     */
    private function buildUserAgent($array)
    {
        $result = [];
        foreach ($array as $userAgent => $rules) {
            $result = array_merge(
                $result,
                [self::DIRECTIVE_USER_AGENT . ':' . $userAgent],
                $this->buildGenericString($rules[self::DIRECTIVE_ROBOT_VERSION], self::DIRECTIVE_ROBOT_VERSION),
                $this->buildVisitTime($rules[self::DIRECTIVE_VISIT_TIME]),
                $this->buildAllow($rules[self::DIRECTIVE_NO_INDEX], self::DIRECTIVE_NO_INDEX),
                $this->buildAllow($rules[self::DIRECTIVE_DISALLOW], self::DIRECTIVE_DISALLOW),
                $this->buildAllow($rules[self::DIRECTIVE_ALLOW], self::DIRECTIVE_ALLOW),
                $this->buildGenericString($rules[self::DIRECTIVE_CRAWL_DELAY], self::DIRECTIVE_CRAWL_DELAY),
                $this->buildGenericString($rules[self::DIRECTIVE_CACHE_DELAY], self::DIRECTIVE_CACHE_DELAY),
                $this->buildRequestRate($rules[self::DIRECTIVE_REQUEST_RATE]),
                $this->buildGenericArray($rules[self::DIRECTIVE_COMMENT], self::DIRECTIVE_COMMENT)
            );
        }
        return $result;
    }

    /**
     * Cache-delay | Crawl-delay | Robot-version
     *
     * Always returns exactly one line, also when the value is null; in that
     * case the line consists of the directive and an empty value.
     *
     * @param float|int|string|null $value
     * @param string $directive
     * @return string[]
     */
    private function buildGenericString($value, $directive)
    {
        return [$directive . ':' . $value];
    }

    /**
     * Visit-time
     *
     * Produces one `<directive>:<from>-<to>` line per pair.
     *
     * @param array[] $array List of pairs, each with a 'from' and a 'to' key
     * @return string[]
     */
    private function buildVisitTime($array)
    {
        $result = [];
        foreach ($array as $pair) {
            $result[] = self::DIRECTIVE_VISIT_TIME . ':' . $pair['from'] . '-' . $pair['to'];
        }
        return $result;
    }

    /**
     * Allow / Disallow / NoIndex
     *
     * Prefixes the inline host lines, the inline clean-param lines and the
     * plain paths with the given directive. The host and clean-param lines
     * already carry their own directive, so they end up nested, e.g.
     * `<directive>:<host directive>:<value>`.
     *
     * @param array $array Rule set with a host, a 'path' and a clean-param key
     * @param string $directive
     * @return string[]
     */
    private function buildAllow($array, $directive)
    {
        return preg_filter('/^/', $directive . ':', array_merge(
            $this->buildHost($array[self::DIRECTIVE_HOST]),
            $this->buildCleanParam($array[self::DIRECTIVE_CLEAN_PARAM]),
            $array['path']
        ));
    }

    /**
     * Request-rate
     *
     * Produces one `<directive>:1/<rate>s` line per entry. The time span
     * ` <from>-<to>` is appended only when both 'from' and 'to' are set.
     *
     * @param array[] $array List of entries with a 'rate' key and optional 'from' / 'to' keys
     * @return string[]
     */
    private function buildRequestRate($array)
    {
        $result = [];
        foreach ($array as $pair) {
            $string = self::DIRECTIVE_REQUEST_RATE . ':1/' . $pair['rate'] . 's';
            if (isset($pair['from']) &&
                isset($pair['to'])
            ) {
                $string .= ' ' . $pair['from'] . '-' . $pair['to'];
            }
            $result[] = $string;
        }
        return $result;
    }

    /**
     * Get the imported data that was ignored
     *
     * Compares the merged import array with the array returned by export().
     * Both sides are passed through arrayFilterRecursive() and sorted with
     * array_multisort() before arrayDiffAssocRecursive() is applied, with the
     * import as its first argument.
     *
     * NOTE(review): the three helpers and export() are defined outside this
     * class; the result is presumably whatever is part of the import but
     * missing from the parsed export - confirm against their implementations.
     *
     * @return array
     */
    public function getIgnoredImportData()
    {
        $source = $this->array;
        $source = $this->arrayFilterRecursive($source);
        array_multisort($source);
        $parsed = $this->arrayFilterRecursive($this->export());
        array_multisort($parsed);
        return $this->arrayDiffAssocRecursive($source, $parsed);
    }
}
237
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.