PiedWeb /
UrlHarvester
| 1 | <?php |
||
| 2 | |||
| 3 | namespace PiedWeb\UrlHarvester; |
||
| 4 | |||
/**
 * Collects the links found in a harvested document.
 *
 * Elements are selected with a CSS selector; for each matched element the
 * first non-empty attribute named in that selector (e.g. `href`, `src`) is
 * taken as the candidate URL, validated with {@see ExtractLinks::isWebLink()}
 * and resolved through the harvest's URL object.
 */
class ExtractLinks
{
    /** Selector matching anchors that carry an `href` attribute. */
    public const SELECT_A = 'a[href]';

    /** Selector matching any element carrying an `href` or `src` attribute. */
    public const SELECT_ALL = '[href],[src]';

    /** @var Harvest */
    private $harvest;

    /** @var string */
    private $selector;

    /** @var string[] attribute names read on each matched element, in selector order */
    private $attributes;

    /**
     * Extracts every web link matched by $selector in the harvested document.
     *
     * @param Harvest $harvest  harvest providing the DOM (`getDom()`) and the base URL (`url()`)
     * @param string  $selector CSS selector; the attribute names it mentions are the ones read
     *
     * @return Link[]
     */
    public static function get(Harvest $harvest, $selector = self::SELECT_A): array
    {
        return (new self($harvest, $selector))->extractLinks();
    }

    /**
     * Private on purpose: instances only exist for the duration of {@see get()}.
     *
     * @param Harvest $harvest
     * @param string  $selector
     */
    private function __construct(Harvest $harvest, $selector)
    {
        $this->harvest = $harvest;
        $this->selector = $selector;
        // Parsed once here rather than for every matched element.
        $this->attributes = self::attributeNames((string) $selector);
    }

    /**
     * Builds one Link per matched element that carries a usable URL.
     *
     * @return Link[] empty when the selector matches nothing
     */
    private function extractLinks(): array
    {
        $links = [];
        $elements = $this->harvest->getDom()->filter($this->selector);

        foreach ($elements as $element) {
            // extractUrl() only accepts elements; skip anything else rather than raising a TypeError.
            if (! $element instanceof \DOMElement) {
                continue;
            }

            $url = $this->extractUrl($element);
            if (null !== $url) {
                $links[] = new Link($url, $this->harvest, $element);
            }
        }

        return $links;
    }

    /**
     * Reads the first non-empty link attribute of $element and resolves it.
     *
     * @return string|null absolute url, or null when the element has no usable web link
     */
    private function extractUrl(\DOMElement $element): ?string
    {
        $url = '';
        foreach ($this->attributes as $attribute) {
            $url = $element->getAttribute($attribute);
            // Compare against '' explicitly: a truthiness test would discard the valid link "0".
            if ('' !== $url) {
                break;
            }
        }

        if ('' === $url || ! self::isWebLink($url)) {
            return null;
        }

        return $this->harvest->url()->resolve($url);
    }

    /**
     * Lists the attribute names referenced between brackets in a CSS selector.
     *
     * `a[href]` gives `['href']`, `[href],[src]` gives `['href', 'src']` and
     * `img[src], a[href^="http"]` gives `['src', 'href']`.
     *
     * @return string[] unique names in order of appearance; empty when the selector names no attribute
     */
    private static function attributeNames(string $selector): array
    {
        preg_match_all('/\[\s*([\w:-]+)/', $selector, $matches);

        return array_values(array_unique($matches[1]));
    }

    /**
     * Tells whether $url looks like an http(s) URL or a scheme-less (relative) path.
     *
     * Anything carrying another scheme (`mailto:`, `javascript:`, `tel:`...) is rejected
     * because `:` is only accepted as part of the `http:`/`https:` prefix.
     * Note that the empty string matches the pattern.
     *
     * @return int|false raw preg_match() result: 1 on match, 0 otherwise, false on regex error
     */
    public static function isWebLink(string $url)
    {
        return preg_match('@^((?:(http:|https:)//([\w\d-]+\.)+[\w\d-]+){0,1}(/?[\w~,;\-\./?%&+#=]*))$@', $url);
    }
}
||
| 78 |