1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace PiedWeb\UrlHarvester; |
4
|
|
|
|
5
|
|
|
/**
 * Collects the links found in a harvested document.
 *
 * Elements are selected from the harvest's DOM with a CSS selector; each one
 * whose URL attribute looks like a web link is wrapped in a Link object.
 */
class ExtractLinks
{
    /** Selector matching only anchors that carry an href attribute. */
    public const SELECT_A = 'a[href]';

    /** Selector matching every element that carries an href or a src attribute. */
    public const SELECT_ALL = '[href],[src]';

    /** @var Harvest */
    private $harvest;

    /** @var string */
    private $selector;

    /**
     * Extracts the links of a harvested document.
     *
     * @param Harvest $harvest  source of the DOM and of the base URL used to resolve links
     * @param string  $selector CSS selector of the elements to inspect (see the SELECT_* constants)
     *
     * @return array list of Link objects, empty when no element yields a web link
     */
    public static function get(Harvest $harvest, $selector = self::SELECT_A): array
    {
        $self = new self();

        $self->selector = $selector;
        $self->harvest = $harvest;

        return $self->extractLinks();
    }

    /**
     * Instances are only created through self::get().
     */
    private function __construct()
    {
    }

    /**
     * Builds a Link for every selected element whose URL can be extracted.
     *
     * @return array list of Link objects
     */
    private function extractLinks(): array
    {
        $links = [];

        // NOTE(review): getDom() presumably returns a Symfony DomCrawler; confirm.
        // When nothing is iterated below, the empty array is returned as is.
        $elements = $this->harvest->getDom()->filter($this->selector);

        foreach ($elements as $element) {
            $url = $this->extractUrl($element);

            if (null !== $url) {
                $links[] = new Link($url, $this->harvest, $element);
            }
        }

        return $links;
    }

    /**
     * Reads the URL carried by an element and resolves it against the harvested URL.
     *
     * The attribute names to look at are derived from the selector
     * (e.g. '[href],[src]' gives 'href' then 'src'); the first attribute
     * actually present on the element wins.
     *
     * @return string|null resolved url, or null when the element carries none of
     *                     the attributes or its value is not a web link
     */
    private function extractUrl(\DOMElement $element): ?string
    {
        $attributes = explode(',', str_replace(['a[', '*[', '[', ']'], '', $this->selector));

        $url = null;
        foreach ($attributes as $attribute) {
            $attribute = trim($attribute);

            // getAttribute() returns '' (never null) for a missing attribute, so
            // presence must be tested explicitly; otherwise the first attribute
            // always wins and e.g. src is never read for '[href],[src]'.
            if ('' !== $attribute && $element->hasAttribute($attribute)) {
                $url = $element->getAttribute($attribute);

                break;
            }
        }

        if (null === $url || !self::isWebLink($url)) {
            return null;
        }

        return $this->harvest->url()->resolve($url);
    }

    /**
     * Tells whether a raw attribute value looks like a web link.
     *
     * Accepted: an optional http:// or https:// host part followed by a path
     * made only of word characters and ~ , ; - . / ? % & + # =. Anything
     * containing other characters (e.g. the ':' of "mailto:" or "javascript:")
     * is rejected. Note that the empty string matches the pattern.
     *
     * @param string $url raw value to test
     *
     * @return int|false raw preg_match() result: 1 on match, 0 otherwise, false on regex error
     */
    public static function isWebLink(string $url)
    {
        return preg_match('@^((?:(http:|https:)//([\w\d-]+\.)+[\w\d-]+){0,1}(/?[\w~,;\-\./?%&+#=]*))$@', $url);
    }
}
78
|
|
|
|