Issues (8)

src/ExtractLinks.php (1 issue)

1
<?php
2
3
namespace PiedWeb\UrlHarvester;
4
5
/**
 * Collects the links found in a harvested document.
 *
 * Usage goes through the static entry point {@see ExtractLinks::get()}; the
 * constructor is private, so instances only exist for the duration of one call.
 */
class ExtractLinks
{
    /** Selector matching anchors that carry an `href` attribute. */
    public const SELECT_A = 'a[href]';

    /** Selector matching every element carrying an `href` or a `src` attribute. */
    public const SELECT_ALL = '[href],[src]';

    /** @var Harvest */
    private $harvest;

    /** @var string */
    private $selector;

    /**
     * Extracts the links matched by `$selector` in the given harvest.
     *
     * @param Harvest $harvest  harvest whose DOM is searched and whose URL is
     *                          used to resolve the extracted values
     * @param string  $selector selector handed to the DOM `filter()` call; the
     *                          attribute names read on each element are taken
     *                          from its bare `[attr]` parts
     *
     * @return array list of Link objects, empty when nothing usable is found
     */
    public static function get(Harvest $harvest, $selector = self::SELECT_A): array
    {
        $self = new self();

        $self->selector = $selector;
        $self->harvest = $harvest;

        return $self->extractLinks();
    }

    /**
     * Private on purpose: instances are only created by {@see ExtractLinks::get()}.
     */
    private function __construct()
    {
    }

    /**
     * Builds one Link per matched element that yields a web URL.
     *
     * @return array
     */
    private function extractLinks(): array
    {
        $links = [];

        // NOTE(review): presumably filter() returns an empty iterable when nothing
        // matches, in which case the loop is skipped — confirm against the DOM library.
        $elements = $this->harvest->getDom()->filter($this->selector);

        foreach ($elements as $element) {
            $url = $this->extractUrl($element);

            // Elements without a usable web URL are left out of the result.
            if (null !== $url) {
                $links[] = new Link($url, $this->harvest, $element);
            }
        }

        return $links;
    }

    /**
     * Reads the first non-empty attribute named by the selector and resolves it.
     *
     * @return string|null absolute url, or null when no attribute holds a web link
     */
    private function extractUrl(\DomElement $element): ?string
    {
        // Initialised up front so the variable exists even when the selector
        // names no attribute at all.
        $url = null;

        foreach ($this->getAttributeNames() as $attribute) {
            $value = $element->getAttribute($attribute);
            // Truthiness check kept from the original: '' and '0' are both skipped.
            if ($value) {
                $url = $value;

                break;
            }
        }

        if (null === $url || ! self::isWebLink($url)) {
            return null;
        }

        return $this->harvest->url()->resolve($url);
    }

    /**
     * Lists the attribute names used as bare presence tests (`[attr]`) in the selector.
     *
     * Parsing the brackets, rather than stripping characters from the selector,
     * keeps tag names out of the result: `area[href]` and `link[href]` both give
     * `href`. Attribute tests carrying an operator (`[rel="x"]`, `[href^="/"]`)
     * are ignored so that a filtering attribute is never mistaken for the URL.
     *
     * @return string[] names in order of appearance, without duplicates
     */
    private function getAttributeNames(): array
    {
        if (! preg_match_all('/\[\s*([\w:-]+)\s*\]/', (string) $this->selector, $matches)) {
            return [];
        }

        return array_values(array_unique($matches[1]));
    }

    /**
     * Tells whether `$url` looks like an http(s) URL or a relative/absolute path.
     *
     * Anything containing a character outside the pattern (for instance the `:`
     * of `mailto:` or `javascript:`) does not match.
     *
     * @return int|false raw preg_match() result: 1 on match, 0 otherwise, false on regex error
     */
    public static function isWebLink(string $url)
    {
        return preg_match('@^((?:(http:|https:)//([\w\d-]+\.)+[\w\d-]+){0,1}(/?[\w~,;\-\./?%&+#=]*))$@', $url);
    }
}
78