1
|
|
|
<?php |
2
|
|
|
declare(strict_types = 1); |
3
|
|
|
namespace hexydec\agentzero; |
4
|
|
|
|
5
|
|
|
class urls {

	/**
	 * Builds the lookup table used to detect URLs inside a token
	 *
	 * @return array<string,props> A map keyed by the substring to look for, where each value is a props object holding the match mode and the callback that produces the properties to set
	 */
	public static function get() : array {

		// handles tokens containing a URL that starts with "http://", "https://" or "www."
		$extract = function (string $value, int $i, array $tokens) : ?array {

			// find where the URL begins, trying each prefix in order of priority
			$start = \stripos($value, 'http://');
			if ($start === false) {
				$start = \stripos($value, 'https://');
			}
			if ($start === false) {
				$start = \stripos($value, 'www.');
			}
			if ($start === false) {
				return null;
			}

			// the URL runs up to the first ")", "," or space, then trailing "?" and "+" are stripped
			$length = \strcspn($value, '), ', $start);
			$url = \rtrim(\substr($value, $start, $length), '?+');

			// when a preceding token exists, pass it to crawlers::getApp() for extra properties
			$data = [];
			if ($i > 0) {
				$data = crawlers::getApp($tokens[$i - 1]);
			}

			// values returned by crawlers::getApp() take precedence over these defaults
			$defaults = [
				'type' => 'robot',
				'url' => $url,
				'category' => empty($data['app']) ? 'scraper' : 'crawler'
			];
			return \array_merge($defaults, $data);
		};

		// handles bare hostnames: a space separated word containing at least two dots
		$hostname = function (string $value) : ?array {
			$allowed = '0123456789qwertyuiopasdfghjklzxcvbnm-';
			foreach (\explode(' ', $value) AS $item) {

				// skip words starting with "com." and words with fewer than two dots
				if (\str_starts_with($item, 'com.') || \substr_count($item, '.') < 2) {
					continue;
				}

				// every dot separated part must start with a letter and only contain the allowed characters
				foreach (\explode('.', $item) AS $part) {
					$valid = \ctype_alpha(\substr($part, 0, 1)) && \strspn($part, $allowed) === \strlen($part);
					if (!$valid) {
						return null;
					}
				}
				return [
					'url' => $item
				];
			}
			return null;
		};

		return [
			'http://' => new props('any', $extract),
			'https://' => new props('any', $extract),
			'www.' => new props('start', $extract),
			'.com' => new props('any', $extract),
			'.' => new props('any', $hostname)
		];
	}
}