Diglett::getCrawler()   A
last analyzed

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 1
nc 1
nop 0
dl 0
loc 3
rs 10
c 0
b 0
f 0
1
<?php
2
3
namespace Jerodev\Diglett;
4
5
use Closure;
6
use Jerodev\Diglett\CssFilters\ICssFilter;
7
use Symfony\Component\DomCrawler\Crawler;
8
9
/**
 *  Thin wrapper around a Symfony DomCrawler that adds the extended css
 *  selector syntax understood by CssFilterParser, plus a few helpers for
 *  extracting texts and urls from the selected nodes.
 */
class Diglett
{
    /**
     *  The Symfony DomCrawler to work with.
     *
     *  @var Crawler|null
     */
    private $crawler;

    /**
     *  The extra css filters this instance was created with.
     *  Kept so instances derived from this one can be given the same filters.
     *
     *  @var ICssFilter[]
     */
    private $cssFilters;

    /**
     *  The css selector parser.
     *
     *  @var CssFilterParser
     */
    private $cssFilterParser;

    /**
     *  Create a diglett instance from a Symfony Crawler.
     *
     *  @param Crawler|null $crawler
     *  @param ICssFilter[] $cssFilters An array of extra ICssFilter classes to filter on
     */
    public function __construct(?Crawler $crawler = null, array $cssFilters = [])
    {
        $this->crawler = $crawler;
        $this->cssFilters = $cssFilters;
        $this->cssFilterParser = new CssFilterParser($cssFilters);
    }

    /**
     *  Get the underlying crawler object.
     *
     *  @return Crawler|null
     */
    public function getCrawler(): ?Crawler
    {
        return $this->crawler;
    }

    /**
     *  Perform a closure function on matched nodes for a selector and return as array.
     *
     *  The closure receives a Diglett wrapping the single matched node and the
     *  zero-based position of that node.
     *
     *  @param string $selector
     *  @param Closure $closure A function to perform on the list of nodes
     *
     *  @return array An array of results returned by the closure
     */
    public function each(string $selector, Closure $closure): array
    {
        $nodes = $this->filter($selector);
        if ($nodes->nodeCount() === 0) {
            return [];
        }

        return $nodes->getCrawler()->each(function ($crawler, $i) use ($closure) {
            // Derive instead of `new self` so the extra css filters stay available.
            return $closure($this->derive($crawler), $i);
        });
    }

    /**
     *  Use special css selectors to filter on the current node collection.
     *
     *  Every parsed selector part is applied in order: first its plain css
     *  selector (when it has one), then each of its filter functions.
     *  Processing stops as soon as no nodes are left.
     *
     *  @param string $selector
     *
     *  @return Diglett A new instance wrapping the resulting nodes
     */
    public function filter(string $selector): self
    {
        $parsedSelector = $this->cssFilterParser->parse($selector);

        $crawler = $this->getCrawler();
        foreach ($parsedSelector as $part) {
            if ($crawler === null || $crawler->count() === 0) {
                break;
            }

            if (!empty($part->getSelector())) {
                $crawler = $crawler->filter($part->getSelector());
            }

            foreach ($part->getFunctions() as $function) {
                $crawler = $function->filterNodes($crawler);
                if ($crawler === null) {
                    // A filter function returned nothing: stop processing all remaining parts.
                    break 2;
                }
            }
        }

        return $this->derive($crawler);
    }

    /**
     *  Use special css selectors to fetch several values.
     *
     *  @param string[] $selectors
     *
     *  @return array The result of getText() for every selector, using the keys of the input array
     */
    public function getTexts(array $selectors): array
    {
        $results = [];
        foreach ($selectors as $key => $value) {
            $results[$key] = $this->getText($value);
        }

        return $results;
    }

    /**
     *  Get the value for a single special css selector.
     *
     *  A selector ending in `{name}` returns the value of attribute `name`
     *  instead of the node text, e.g. `a.link{href}`.
     *
     *  @param string|null $selector The selector to apply first, or null to read from the current nodes
     *
     *  @return string|null Null when no nodes are selected
     */
    public function getText(?string $selector = null): ?string
    {
        $attribute = null;
        $diglett = $this;

        if ($selector !== null) {
            if (($attr = strstr($selector, '{')) && $attr[-1] === '}') {
                // substr() returns false instead of '' on PHP 7 when nothing is left.
                $selector = (string) substr($selector, 0, strlen($attr) * -1);
                $attribute = substr($attr, 1, -1);
            }

            $diglett = $this->filter($selector);
        }

        if ($diglett->nodeCount() === 0) {
            return null;
        }

        $crawler = $diglett->getCrawler();

        return $attribute === null ? $crawler->text() : $crawler->attr($attribute);
    }

    /**
     *  Fetch urls from the selected nodes (a[href], img[src]).
     *
     *  @param string|null $selector Optional selector to narrow the current nodes down first
     *
     *  @return array The urls that were found; empty values are dropped and the original keys are kept
     */
    public function getUrls(?string $selector = null): array
    {
        $diglett = $selector !== null ? $this->filter($selector) : $this;

        if ($diglett->nodeCount() === 0) {
            return [];
        }

        $urls = $diglett->each('a, img', function ($d) {
            return $d->getUrl();
        });

        return array_filter($urls);
    }

    /**
     *  Fetch the url from the current main node if available.
     *
     *  Reads `href` for `a`, `src` for `img` and `content` for `meta` nodes.
     *  Relative values are resolved against the uri of the crawler.
     *
     *  @return string|null Null when there is no node, the node is of another type or the attribute is missing
     */
    public function getUrl(): ?string
    {
        if ($this->nodeCount() === 0) {
            return null;
        }

        $crawler = $this->getCrawler();
        $node = $crawler->first();

        $url = null;
        switch (strtolower($node->nodeName())) {
            case 'a':
                $url = $node->attr('href');
                break;

            case 'img':
                $url = $node->attr('src');
                break;

            case 'meta':
                $url = $node->attr('content');
                break;
        }

        if ($url === null || $url === '') {
            return $url;
        }

        return $this->resolveUrl($url, $crawler->getUri());
    }

    /**
     *  Find the node count on the current crawler instance.
     *
     *  @return int 0 when there is no crawler
     */
    public function nodeCount(): int
    {
        return $this->crawler === null ? 0 : $this->crawler->count();
    }

    /**
     *  Create a new instance for another crawler that keeps the extra css filters of this one.
     *
     *  @param Crawler|null $crawler
     *
     *  @return Diglett
     */
    private function derive(?Crawler $crawler): self
    {
        return new self($crawler, $this->cssFilters ?? []);
    }

    /**
     *  Turn a url reference found in a document into an absolute url.
     *
     *  Handles references that already have a scheme, protocol-relative
     *  (`//host/path`), root-relative (`/path`), query-only (`?a=b`),
     *  fragment-only (`#id`) and path-relative references.
     *  Dot segments (`./`, `../`) are not normalised.
     *
     *  @param string $url A non-empty url reference
     *  @param string|null $baseUri The uri of the document the reference was found in
     *
     *  @return string
     */
    private function resolveUrl(string $url, ?string $baseUri): string
    {
        // Without a base there is nothing to resolve against.
        if ($baseUri === null || $baseUri === '') {
            return $url;
        }

        // Anything that starts with a scheme (http:, HTTPS:, mailto:, data:, ...) is already absolute.
        if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $url) === 1) {
            return $url;
        }

        $withoutFragment = explode('#', $baseUri, 2)[0];
        $base = explode('?', $withoutFragment, 2)[0];

        if ($url[0] === '#') {
            return $withoutFragment . $url;
        }

        if ($url[0] === '?') {
            return $base . $url;
        }

        // Split the base into "scheme://authority" and the path that follows it.
        $scheme = '';
        $root = '';
        $path = $base;
        if (preg_match('#^([a-z][a-z0-9+.\-]*:)//[^/]*#i', $base, $match) === 1) {
            $scheme = $match[1];
            $root = $match[0];
            $path = (string) substr($base, strlen($root));
        }

        if (strncmp($url, '//', 2) === 0) {
            return $scheme . $url;
        }

        if ($url[0] === '/') {
            return $root . $url;
        }

        // Path-relative: replace everything after the last slash of the base path.
        $lastSlash = strrpos($path, '/');
        if ($lastSlash === false) {
            $directory = $root !== '' ? '/' : '';
        } else {
            $directory = substr($path, 0, $lastSlash + 1);
        }

        return $root . $directory . $url;
    }
}
220