1
|
|
|
<?php |
2
|
|
|
/** |
3
|
|
|
* Class for filtering duplicate URLs according to Yandex Clean-Param specifications. |
4
|
|
|
* |
5
|
|
|
* @author VIP nytt ([email protected]) |
6
|
|
|
* @author Jan-Petter Gundersen ([email protected]) |
7
|
|
|
* |
8
|
|
|
* Project: |
9
|
|
|
* @link https://github.com/VIPnytt/CleanParam-URL-filter |
10
|
|
|
* @license https://opensource.org/licenses/MIT MIT license |
11
|
|
|
* |
12
|
|
|
* Clean-Param directive specifications: |
13
|
|
|
* @link https://yandex.com/support/webmaster/controlling-robot/robots-txt.xml#clean-param |
14
|
|
|
*/ |
15
|
|
|
|
16
|
|
|
namespace vipnytt; |
17
|
|
|
|
18
|
|
|
use vipnytt\CleanParamFilter\URLParser; |
19
|
|
|
|
20
|
|
|
class CleanParamFilter |
21
|
|
|
{ |
22
|
|
|
// Clean-Param set |
23
|
|
|
private $cleanParam = []; |
24
|
|
|
|
25
|
|
|
// URL set |
26
|
|
|
private $urls = []; |
27
|
|
|
|
28
|
|
|
// Status |
29
|
|
|
private $filtered = false; |
30
|
|
|
|
31
|
|
|
// Approved and duplicate URLs |
32
|
|
|
private $approved = []; |
33
|
|
|
private $duplicate = []; |
34
|
|
|
|
35
|
|
|
// Invalid URLs |
36
|
|
|
private $invalid = []; |
37
|
|
|
|
38
|
|
|
/**
 * Constructor
 *
 * Sorts the supplied URLs, validates each of them and stores the encoded
 * form of every valid URL grouped by its host name. URLs that fail the
 * validation are collected separately, exactly as they were supplied.
 *
 * @param array $urls - URLs to filter
 */
public function __construct($urls)
{
    sort($urls);
    foreach ($urls as $rawUrl) {
        $parser = new URLParser(trim($rawUrl));
        if ($parser->isValid()) {
            // valid: store the encoded URL under its host
            $encoded = $parser->encode();
            $this->urls[parse_url($encoded, PHP_URL_HOST)][] = $encoded;
        } else {
            // invalid: remember the untouched input
            $this->invalid[] = $rawUrl;
        }
    }
}
57
|
|
|
|
58
|
|
|
/**
 * Get every URL that survived the duplicate filtering
 *
 * Runs the filter before the list is handed out.
 *
 * @return array - approved URLs
 */
public function listApproved()
{
    // make sure the result lists are populated
    $this->filter();

    return $this->approved;
}
68
|
|
|
|
69
|
|
|
/**
 * Filter URLs
 *
 * Groups the stored URLs by host and by path (ignoring one trailing slash),
 * removes the duplicates within each group and fills the lists of approved
 * and duplicate URLs. The work is skipped while the `filtered` flag is set.
 *
 * @return void
 */
private function filter()
{
    // skip the filtering process if it's already done
    if ($this->filtered) {
        return;
    }
    $urlsByHost = [];
    // seeded with an empty list so array_merge() always receives an argument
    $parsed = [[]];
    foreach ($this->urls as $host => $urlArray) {
        // prepare each individual URL
        foreach ($urlArray as $url) {
            $path = parse_url($url, PHP_URL_PATH);
            // parse_url() yields null when the URL has no path component
            if (is_string($path) && substr($path, -1) === '/') {
                $path = substr_replace($path, '', -1);
            }
            $urlsByHost[$host][$path][$url] = $this->prepareURL($url);
        }
        // filter every path group of the current host
        foreach ($urlsByHost[$host] as $array) {
            $parsed[] = $this->filterDuplicates($array, $host);
        }
    }
    // generate lists of URLs for 3rd party usage.
    // array_values() drops the host-name keys: PHP 8 would otherwise treat
    // them as named arguments and throw.
    $allURLs = call_user_func_array('array_merge', array_merge([[]], array_values($this->urls)));
    $this->approved = call_user_func_array('array_merge', $parsed);
    $this->duplicate = array_diff($allURLs, $this->approved);
    // Sort the result arrays
    sort($this->approved);
    sort($this->duplicate);
    // remember that the result lists are up to date
    $this->filtered = true;
}
105
|
|
|
|
106
|
|
|
/**
 * Normalise a URL so that equivalent URLs compare equal
 *
 * The port is dropped when it equals the default TCP port of the scheme,
 * and the query parameters are put in alphabetical order. The URL is then
 * rebuilt from its parts.
 *
 * @param string $url
 * @return string
 */
private function prepareURL($url)
{
    $parts = parse_url($url);
    // a port equal to the scheme's default carries no information
    if (isset($parts['port']) && isset($parts['scheme'])) {
        $schemeDefault = getservbyname($parts['scheme'], 'tcp');
        if (is_int($schemeDefault) && $parts['port'] == $schemeDefault) {
            $parts['port'] = null;
        }
    }
    // alphabetical parameter order
    if (isset($parts['query'])) {
        $pairs = explode('&', $parts['query']);
        sort($pairs);
        $parts['query'] = implode('&', $pairs);
    }
    return $this->unParseURL($parts);
}
131
|
|
|
|
132
|
|
|
/**
 * Assemble a URL string from parse_url()-style parts
 *
 * Only scheme, host, port, path and query are used; a missing path
 * becomes "/". Any other keys in the array are ignored.
 *
 * @param array $parsedURL
 * @return string
 */
private function unParseURL($parsedURL)
{
    $url = '';
    if (isset($parsedURL['scheme'])) {
        $url .= $parsedURL['scheme'] . '://';
    }
    if (isset($parsedURL['host'])) {
        $url .= $parsedURL['host'];
    }
    if (isset($parsedURL['port'])) {
        $url .= ':' . $parsedURL['port'];
    }
    // fall back to the root path
    if (isset($parsedURL['path'])) {
        $url .= $parsedURL['path'];
    } else {
        $url .= '/';
    }
    if (isset($parsedURL['query'])) {
        $url .= '?' . $parsedURL['query'];
    }
    return $url;
}
147
|
|
|
|
148
|
|
|
/**
 * Filter duplicate URLs
 *
 * Walks the URLs in the given order. Each URL is compared with every URL
 * accepted so far, after the Clean-Param parameters found in the current
 * URL have been stripped from both sides; it is accepted only when no
 * accepted URL matches.
 *
 * A single pass is enough: a second pass would compare every accepted URL
 * with itself and therefore never change the result.
 *
 * @param array $array - URLs to filter, original URL => prepared URL
 * @param string $host - Hostname
 * @return array - original URLs which are not duplicates
 */
private function filterDuplicates($array, $host)
{
    $unique = [];
    foreach ($array as $url => $sorted) {
        $params = $this->findCleanParam($sorted, $host);
        $selected = $this->stripParam($sorted, $params);
        // Check against already accepted URLs
        $isDuplicate = false;
        foreach ($unique as $accepted) {
            if ($selected === $this->stripParam($accepted, $params)) {
                $isDuplicate = true;
                break;
            }
        }
        if (!$isDuplicate) {
            $unique[$url] = $sorted;
        }
    }
    return array_keys($unique);
}
181
|
|
|
|
182
|
|
|
/**
 * Find CleanParam parameters in provided URL
 *
 * Looks up the Clean-Param rules registered for the host, keeps those whose
 * path matches the path of the URL, and returns every rule parameter that
 * occurs in the URL as "?name=" or "&name=". Each parameter is returned
 * once, even when several rules or both prefixes match.
 *
 * @param string $url
 * @param string $host
 * @return array - names of the parameters found
 */
private function findCleanParam($url, $host)
{
    // check if CleanParam is set for current host
    if (!isset($this->cleanParam[$host])) {
        return [];
    }
    // the path of the URL is the same for every rule, parse it once
    $urlPath = parse_url($url, PHP_URL_PATH);
    $paramsFound = [];
    foreach ($this->cleanParam[$host] as $path => $cleanParam) {
        // make sure the path matches
        if (!$this->checkPath($path, $urlPath)) {
            continue;
        }
        foreach ($cleanParam as $param) {
            if (strpos($url, '?' . $param . '=') !== false
                || strpos($url, '&' . $param . '=') !== false
            ) {
                // keyed by name to avoid duplicate entries
                $paramsFound[$param] = $param;
            }
        }
    }
    return array_values($paramsFound);
}
213
|
|
|
|
214
|
|
|
/**
 * Check if path matches
 *
 * Treats the Clean-Param path as a prefix of the URL path, with "*" as a
 * wildcard for any sequence of characters, which is how the Clean-Param
 * specification referenced in the file header describes it. All other
 * characters are matched literally and the comparison is anchored at the
 * start of the URL path.
 *
 * @param string $path - Clean-Param path (prefix pattern)
 * @param string $prefix - path of the URL to test
 * @return bool
 */
private function checkPath($path, $prefix)
{
    $pathParser = new URLParser($path);
    $path = $pathParser->encode();
    // quote every regex meta character (including the delimiter), then turn
    // the quoted wildcard back into "match anything"
    $pattern = str_replace('\*', '.*', preg_quote($path, '@'));
    // a URL without a path component arrives as null
    return preg_match('@^' . $pattern . '@', (string)$prefix) === 1;
}
233
|
|
|
|
234
|
|
|
/**
 * Strip provided parameters from URL
 *
 * Removes every "?name=value" / "&name=value" pair whose name is listed in
 * $paramArray, then repairs the query string delimiters. Parameter names
 * are matched case-sensitively, as findCleanParam() does. All positions
 * are byte offsets, so they agree with substr_replace().
 *
 * @param string $url - URL to check
 * @param array $paramArray - parameters to remove
 * @return string
 */
private function stripParam($url, $paramArray)
{
    foreach ($paramArray as $param) {
        foreach (['?', '&'] as $prefix) {
            $needle = $prefix . $param . '=';
            // repeat until no occurrence is left; the URL shrinks each round
            while (($posParam = strpos($url, $needle)) !== false) {
                // the pair ends at the next '&', or at the end of the URL
                $posDelimiter = strpos($url, '&', $posParam + 1);
                $len = ($posDelimiter !== false) ? $posDelimiter - $posParam : strlen($url) - $posParam;
                $url = substr_replace($url, '', $posParam, $len);
            }
        }
    }
    // fix any newly caused URL format problems
    return $this->fixURL($url);
}
262
|
|
|
|
263
|
|
|
/**
 * Fix damaged URL query string
 *
 * If the URL has no "?" but does contain "&", the first "&" becomes the
 * "?". Afterwards a trailing "&", "?" and "/" are removed, each at most
 * once and in that order. Positions are byte offsets, matching
 * substr_replace(), so multibyte characters in the URL are left intact.
 *
 * @param string $url
 * @return string
 */
private static function fixURL($url)
{
    // if ? is missing, but & exists, switch
    if (strpos($url, '?') === false) {
        $posAmp = strpos($url, '&');
        if ($posAmp !== false) {
            $url = substr_replace($url, '?', $posAmp, 1);
        }
    }
    // Strip last character
    foreach (['&', '?', '/'] as $char) {
        if (substr($url, -1) === $char) {
            $url = substr_replace($url, '', -1);
        }
    }
    return $url;
}
284
|
|
|
|
285
|
|
|
/**
 * Get every URL that was rejected as a duplicate
 *
 * Runs the filter before the list is handed out.
 *
 * @return array - duplicate URLs
 */
public function listDuplicate()
{
    // make sure the result lists are populated
    $this->filter();

    return $this->duplicate;
}
295
|
|
|
|
296
|
|
|
/**
 * Get every URL that did not pass the validation in the constructor
 *
 * @return array - invalid URLs, as they were supplied
 */
public function listInvalid()
{
    return $this->invalid;
}
305
|
|
|
|
306
|
|
|
/**
 * Add CleanParam
 *
 * Registers one or more parameters (separated by "&") as Clean-Param for
 * the given path and host, and marks the filter result as outdated. When
 * no host is given, the host of the stored URLs is used; this only works
 * if the URLs belong to a single host, otherwise a warning is raised and
 * nothing is added.
 *
 * @param string $param - parameter(s)
 * @param string $path - path the param is valid for
 * @param string $host - limit to a single hostname
 * @return void
 */
public function addCleanParam($param, $path = '/', $host = null)
{
    if (!isset($host)) {
        $hosts = array_keys($this->urls);
        if (count($hosts) > 1) {
            trigger_error("Missing host parameter for param `$param`. Required because of URLs from multiple hosts is being filtered.", E_USER_WARNING);
            return;
        }
        // use host from URLs; read by position rather than key(), which
        // depends on the array's internal pointer
        $host = isset($hosts[0]) ? $hosts[0] : '';
    }
    $urlParser = new URLParser($path);
    $encodedPath = $urlParser->encode();
    foreach (explode('&', $param) as $parameter) {
        // "a&&b" or an empty string would otherwise register a nameless parameter
        if ($parameter === '') {
            continue;
        }
        $this->cleanParam[$host][$encodedPath][$parameter] = $parameter;
    }
    // the rule set changed, the URLs have to be filtered again
    $this->filtered = false;
}
331
|
|
|
} |
332
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.