Completed
Push — master ( 6a5c93...71d76d )
by Jeroen
03:37
created

Diglett::getUrl()   A

Complexity

Conditions 6
Paths 9

Size

Total Lines 27
Code Lines 18

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 6
eloc 18
nc 9
nop 0
dl 0
loc 27
rs 9.0444
c 0
b 0
f 0
1
<?php
2
3
namespace Jerodev\Diglett;
4
5
use Closure;
6
use Jerodev\Diglett\CssFilters\ICssFilter;
7
use Symfony\Component\DomCrawler\Crawler;
8
9
/**
 *  Wrapper around a Symfony DomCrawler that selects nodes through the
 *  extended css selector syntax understood by CssFilterParser and offers
 *  helpers to read text, attributes and urls from the selected nodes.
 */
class Diglett
{
    /**
     *  The Symfony DomCrawler to work with.
     *
     *  @var Crawler|null
     */
    private $crawler;

    /**
     *  The css selector parser.
     *
     *  @var CssFilterParser
     */
    private $cssFilterParser;

    /**
     *  Create a diglett instance from a Symfony Crawler.
     *
     *  @param Crawler|null $crawler    The nodes to work on, null behaves like an empty collection
     *  @param ICssFilter[] $cssFilters An array of extra ICssFilter classes to filter on
     */
    public function __construct(?Crawler $crawler = null, array $cssFilters = [])
    {
        $this->crawler = $crawler;
        $this->cssFilterParser = new CssFilterParser($cssFilters);
    }

    /**
     *  Get the underlying crawler object.
     *
     *  @return Crawler|null
     */
    public function getCrawler(): ?Crawler
    {
        return $this->crawler;
    }

    /**
     *  Perform a closure function on matched nodes for a selector and return as array.
     *
     *  @param string $selector
     *  @param Closure $closure A function to perform on the list of nodes, it receives
     *                          a Diglett wrapping a single node and the index of that node
     *
     *  @return array An array of results returned by the closure, empty when nothing matched
     */
    public function each(string $selector, Closure $closure): array
    {
        $matches = $this->filter($selector);

        // filter() can end up without a crawler at all; there is nothing to iterate then.
        if ($matches->nodeCount() === 0) {
            return [];
        }

        return $matches->getCrawler()->each(function ($node, $index) use ($closure, $matches) {
            return $closure($matches->withCrawler($node), $index);
        });
    }

    /**
     *  Use special css selectors to filter on the current node collection.
     *
     *  @param string $selector
     *
     *  @return Diglett A new instance wrapping the filtered nodes, this instance is left untouched
     */
    public function filter(string $selector): self
    {
        $parsedSelector = $this->cssFilterParser->parse($selector);

        $crawler = $this->crawler;
        foreach ($parsedSelector as $part) {
            // Nothing left to narrow down, the remaining parts cannot match anything.
            if ($crawler === null || $crawler->count() === 0) {
                break;
            }

            $css = $part->getSelector();
            if (!empty($css)) {
                $crawler = $crawler->filter($css);
            }

            foreach ($part->getFunctions() as $function) {
                $crawler = $function->filterNodes($crawler);

                // A filter function may give up entirely; stop processing the selector.
                if ($crawler === null) {
                    break 2;
                }
            }
        }

        return $this->withCrawler($crawler);
    }

    /**
     *  Use special css selectors to fetch several values.
     *
     *  @param string[] $selectors
     *
     *  @return array The result of getText() for every selector, indexed by the keys of $selectors
     */
    public function getTexts(array $selectors): array
    {
        $results = [];
        foreach ($selectors as $key => $selector) {
            $results[$key] = $this->getText($selector);
        }

        return $results;
    }

    /**
     *  Get the value for a single special css selector.
     *
     *  A selector ending in `{name}` reads the attribute `name` instead of the
     *  text content, for example `a.more{href}`.
     *
     *  @param string|null $selector The selector to apply first, null to read from the current nodes
     *
     *  @return string|null Null when no node matched
     */
    public function getText(?string $selector = null): ?string
    {
        $attribute = null;
        $diglett = $this;

        if ($selector !== null) {
            // Split a trailing `{attribute}` suffix from the actual selector.
            $suffix = strstr($selector, '{');
            if ($suffix !== false && substr($suffix, -1) === '}') {
                $attribute = substr($suffix, 1, -1);
                $selector = (string) substr($selector, 0, -strlen($suffix));
            }

            $diglett = $this->filter($selector);
        }

        if ($diglett->nodeCount() === 0) {
            return null;
        }

        $crawler = $diglett->getCrawler();

        return $attribute === null ? $crawler->text() : $crawler->attr($attribute);
    }

    /**
     *  Fetch urls from the selected nodes (a[href], img[src]).
     *
     *  @param string|null $selector The selector to apply first, null to search the current nodes
     *
     *  @return string[] A list of resolved urls, nodes without a usable url are skipped
     */
    public function getUrls(?string $selector = null): array
    {
        $diglett = $selector !== null ? $this->filter($selector) : $this;

        if ($diglett->nodeCount() === 0) {
            return [];
        }

        $urls = $diglett->each('a, img', function (Diglett $match) {
            return $match->getUrl();
        });

        // array_filter keeps the original keys; reindex so the result stays a plain list.
        return array_values(array_filter($urls, function ($url) {
            return $url !== null;
        }));
    }

    /**
     *  Fetch the url from the current main node if available.
     *
     *  Only `a` (href) and `img` (src) nodes carry a url. Relative urls are
     *  resolved against the uri of the crawler.
     *
     *  @return string|null Null when there is no node, the node is of another
     *                      type or the attribute is missing or blank
     */
    public function getUrl(): ?string
    {
        if ($this->nodeCount() === 0) {
            return null;
        }

        $node = $this->crawler->first();
        switch (strtolower($node->nodeName())) {
            case 'a':
                $url = $node->attr('href');
                break;

            case 'img':
                $url = $node->attr('src');
                break;

            default:
                $url = null;
        }

        if ($url === null) {
            return null;
        }

        // Surrounding whitespace is not part of a url attribute value.
        $url = trim($url);
        if ($url === '') {
            return null;
        }

        return self::resolveUrl($url, (string) $this->crawler->getUri());
    }

    /**
     *  Find the node count on the current crawler instance.
     *
     *  @return int Zero when there is no crawler
     */
    public function nodeCount(): int
    {
        if ($this->crawler === null) {
            return 0;
        }

        return $this->crawler->count();
    }

    /**
     *  Create a copy of this instance that works on another set of nodes.
     *
     *  The copy shares the css filter parser, so extra filters passed to the
     *  constructor stay available on instances returned by filter() and each().
     *
     *  @param Crawler|null $crawler
     *
     *  @return Diglett
     */
    private function withCrawler(?Crawler $crawler): self
    {
        $copy = clone $this;
        $copy->crawler = $crawler;

        return $copy;
    }

    /**
     *  Turn a url found in a document into an absolute url.
     *
     *  Dot segments (`./`, `../`) are not normalised, and query-only or
     *  fragment-only references are treated like any other relative path.
     *
     *  @param string $url     A non-empty url as found in the document
     *  @param string $baseUri The uri of the document, empty when unknown
     *
     *  @return string
     */
    private static function resolveUrl(string $url, string $baseUri): string
    {
        // Urls that carry their own scheme (http:, https:, mailto:, data:, ...) are complete.
        if (preg_match('~^[a-z][a-z0-9+.-]*:~i', $url) === 1) {
            return $url;
        }

        // Without a base there is nothing to resolve against.
        if ($baseUri === '') {
            return $url;
        }

        // Query string and fragment of the base must not take part in path handling.
        $base = preg_split('~[?#]~', $baseUri, 2)[0];

        // "scheme://host/" : the first three slash separated segments of the base.
        $origin = implode('/', array_slice(explode('/', $base), 0, 3)) . '/';

        // Protocol-relative url: only the scheme of the base is inherited.
        if (strncmp($url, '//', 2) === 0) {
            return (string) strstr($base, '//', true) . $url;
        }

        // Path relative to the root of the host.
        if ($url[0] === '/') {
            return $origin . ltrim($url, '/');
        }

        // Path relative to the directory of the current document.
        $lastSlash = strrpos($base, '/');
        $directory = $lastSlash === false ? '' : substr($base, 0, $lastSlash + 1);

        // A base without any path ("http://example.com") only has the slashes of "://".
        if (substr($directory, -3) === '://') {
            $directory = $origin;
        }

        return $directory . $url;
    }
}
207