1 | <?php |
||
2 | |||
3 | namespace PiedWeb\UrlHarvester; |
||
4 | |||
/**
 * Collects the web links carried by the elements of a harvested document.
 *
 * The class cannot be instantiated from outside (private constructor); use the
 * static entry point {@see ExtractLinks::get()}.
 */
class ExtractLinks
{
    /** CSS selector matching anchors that carry an href attribute. */
    public const SELECT_A = 'a[href]';

    /** CSS selector matching any element that carries an href or a src attribute. */
    public const SELECT_ALL = '[href],[src]';

    /** @var Harvest */
    private $harvest;

    /** @var string */
    private $selector;

    /**
     * Extracts the links of `$harvest` matched by `$selector`.
     *
     * @param Harvest $harvest  source of the DOM to scan and of the URL resolver
     * @param string  $selector CSS selector; the attribute names it mentions
     *                          (e.g. `href`, `src`) are the ones read on each element
     *
     * @return Link[] one Link per matched element holding a usable web URL
     */
    public static function get(Harvest $harvest, $selector = self::SELECT_A): array
    {
        $self = new self();

        $self->selector = $selector;
        $self->harvest = $harvest;

        return $self->extractLinks();
    }

    /**
     * Private on purpose: instances are only built by {@see ExtractLinks::get()}.
     */
    private function __construct()
    {
    }

    /**
     * Filters the harvest's DOM with the selector and wraps every usable URL in a Link.
     *
     * @return Link[] empty when the iteration yields no element with a usable URL
     */
    private function extractLinks(): array
    {
        $links = [];
        $elements = $this->harvest->getDom()->filter($this->selector);

        foreach ($elements as $element) {
            // extractUrl() only accepts elements; skip any other kind of node
            // instead of failing with a TypeError.
            if (! $element instanceof \DOMElement) {
                continue;
            }

            $url = $this->extractUrl($element);
            if (null !== $url) {
                $links[] = new Link($url, $this->harvest, $element);
            }
        }

        return $links;
    }

    /**
     * Reads the first non-empty attribute named in the selector and resolves it.
     *
     * @return string|null the value returned by the harvest's URL resolver
     *                     (presumably an absolute URL - confirm against `resolve()`),
     *                     or null when the element has no such attribute or its
     *                     value is not a web link
     */
    private function extractUrl(\DOMElement $element): ?string
    {
        $url = null;
        foreach ($this->attributeNames() as $attribute) {
            $value = $element->getAttribute($attribute);
            // Compare against '' explicitly: a plain truthiness test would
            // discard the valid relative link "0".
            if ('' !== $value) {
                $url = $value;

                break;
            }
        }

        if (null === $url || ! self::isWebLink($url)) {
            return null;
        }

        return $this->harvest->url()->resolve($url);
    }

    /**
     * Lists the attribute names referenced by the selector, in order of appearance.
     *
     * Each `[name...]` group contributes `name`, whatever precedes the bracket
     * (tag name, `*`, whitespace after a comma) and whatever operator/value follows
     * the name. `a[href]` gives `['href']`, `[href],[src]` gives `['href', 'src']`.
     *
     * @return string[]
     */
    private function attributeNames(): array
    {
        if (! preg_match_all('/\[\s*([^\s\]=~|^$*]+)/', (string) $this->selector, $matches)) {
            return [];
        }

        return $matches[1];
    }

    /**
     * Tells whether `$url` looks like a link to a web resource: an http(s) URL
     * or a scheme-less path/query/fragment.
     *
     * Values such as `mailto:...` or `javascript:...` are rejected because the
     * pattern does not allow a colon outside of the `http:`/`https:` prefix.
     *
     * NOTE(review): for the same reason a URL with an explicit port or with a
     * colon in its path is rejected too, and the empty string is accepted -
     * confirm this is intended before relying on it.
     *
     * @return bool true when the whole string matches the pattern
     */
    public static function isWebLink(string $url): bool
    {
        // preg_match() yields 1, 0 or false; expose a real boolean to callers.
        return 1 === preg_match('@^((?:(http:|https:)//([\w\d-]+\.)+[\w\d-]+){0,1}(/?[\w~,;\-\./?%&+#=]*))$@', $url);
    }
}
||
78 |