Passed
Push — master ( 6c3d4f...94c35e )
by Jeroen
02:03
created

Diglett::filter()   B

Complexity

Conditions 7
Paths 2

Size

Total Lines 23
Code Lines 12

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 7
eloc 12
nc 2
nop 1
dl 0
loc 23
rs 8.8333
c 0
b 0
f 0
1
<?php
2
3
namespace Jerodev\Diglett;
4
5
use Closure;
6
use Jerodev\Diglett\CssFilters\ICssFilter;
7
use Symfony\Component\DomCrawler\Crawler;
8
9
/**
 *  Wrapper around a Symfony DomCrawler that filters nodes using the selectors
 *  understood by CssFilterParser and extracts texts, attributes and urls.
 */
class Diglett
{
    /**
     *  Maps the (lowercase) node names that can carry a url to the attribute holding it.
     */
    private const URL_ATTRIBUTES = [
        'a'   => 'href',
        'img' => 'src',
    ];

    /**
     *  The Symfony DomCrawler to work with.
     *
     *  @var Crawler|null
     */
    private $crawler;

    /**
     *  The css selector parser.
     *
     *  @var CssFilterParser
     */
    private $cssFilterParser;

    /**
     *  Create a diglett instance from a Symfony Crawler.
     *
     *  @param Crawler|null $crawler
     *  @param ICssFilter[] $cssFilters An array of extra ICssFilter classes to filter on
     */
    public function __construct(?Crawler $crawler = null, array $cssFilters = [])
    {
        $this->crawler = $crawler;
        $this->cssFilterParser = new CssFilterParser($cssFilters);
    }

    /**
     *  Get the underlying crawler object.
     *
     *  @return Crawler|null
     */
    public function getCrawler(): ?Crawler
    {
        return $this->crawler;
    }

    /**
     *  Perform a closure function on matched nodes for a selector and return as array.
     *
     *  The closure receives a Diglett instance wrapping a single matched node and
     *  the index of that node. When there is no crawler to iterate, an empty
     *  array is returned.
     *
     *  @param string $selector
     *  @param Closure $closure A function to perform on the list of nodes
     *
     *  @return array An array of results returned by the closure
     */
    public function each(string $selector, Closure $closure): array
    {
        $filtered = $this->filter($selector);

        $crawler = $filtered->getCrawler();
        if ($crawler === null) {
            // Nothing to iterate; calling each() on null would be a fatal error.
            return [];
        }

        return $crawler->each(function ($node, $i) use ($closure, $filtered) {
            return $closure($filtered->withCrawler($node), $i);
        });
    }

    /**
     *  Use special css selectors to filter on the current node collection.
     *
     *  The selector is split into parts by the css filter parser. For every part
     *  the plain css selector (if any) is applied first, followed by the filter
     *  functions of that part. Filtering stops as soon as there is no crawler
     *  left or the crawler holds no nodes.
     *
     *  @param string $selector
     *
     *  @return Diglett A new instance wrapping the filtered crawler
     */
    public function filter(string $selector): self
    {
        $parsedSelector = $this->cssFilterParser->parse($selector);

        $crawler = $this->crawler;
        foreach ($parsedSelector as $part) {
            if ($crawler === null || $crawler->count() === 0) {
                break;
            }

            if (!empty($part['selector'])) {
                $crawler = $crawler->filter($part['selector']);
            }

            foreach ($part['functions'] as $function) {
                $crawler = $function->filterNodes($crawler);
                if ($crawler === null) {
                    // No crawler left, so no remaining function or part can be applied.
                    break 2;
                }
            }
        }

        return $this->withCrawler($crawler);
    }

    /**
     *  Use special css selectors to fetch several values.
     *
     *  @param string[] $selectors
     *
     *  @return array The result of getText() for every selector, indexed by the keys of $selectors
     */
    public function getTexts(array $selectors): array
    {
        $results = [];
        foreach ($selectors as $key => $value) {
            $results[$key] = $this->getText($value);
        }

        return $results;
    }

    /**
     *  Get the value for a single special css selector.
     *
     *  A selector ending in `{name}` reads the attribute `name` instead of the
     *  text; that suffix is removed before the selector is used for filtering.
     *  Without a selector the value is read from the current crawler.
     *
     *  @param string|null $selector
     *
     *  @return string|null The text or attribute value, null when no nodes match
     */
    public function getText(?string $selector = null): ?string
    {
        $attribute = null;
        $diglett = $this;

        if ($selector !== null) {
            $selector = preg_replace_callback(
                '/\{(.*?)\}$/',
                function ($matches) use (&$attribute) {
                    $attribute = $matches[1] ?? null;

                    // Remove the attribute suffix from the selector.
                    return '';
                },
                $selector
            );

            $diglett = $this->filter($selector);
        }

        if ($diglett->nodeCount() === 0) {
            return null;
        }

        $crawler = $diglett->getCrawler();

        return $attribute === null ? $crawler->text() : $crawler->attr($attribute);
    }

    /**
     *  Fetch urls from the selected nodes (a[href], img[src]).
     *
     *  Matched nodes other than `a` and `img` are ignored. Urls that are not
     *  absolute are resolved against the uri of the crawler.
     *
     *  @param string $selector
     *
     *  @return string[]
     */
    public function getUrls(string $selector): array
    {
        $diglett = $this->filter($selector);
        if ($diglett->nodeCount() === 0) {
            return [];
        }

        $crawler = $diglett->getCrawler();
        $baseUri = (string) $crawler->getUri();

        return $crawler
            ->reduce(static function ($node) {
                return isset(self::URL_ATTRIBUTES[strtolower($node->nodeName())]);
            })
            ->each(static function ($node) use ($baseUri) {
                $attribute = self::URL_ATTRIBUTES[strtolower($node->nodeName())];

                return self::resolveUrl($node->attr($attribute), $baseUri);
            });
    }

    /**
     *  Find the node count on the current crawler instance.
     *
     *  @return int The number of nodes, 0 when there is no crawler
     */
    public function nodeCount(): int
    {
        if ($this->crawler === null) {
            return 0;
        }

        return $this->crawler->count();
    }

    /**
     *  Create a copy of this instance that wraps another crawler.
     *
     *  The copy is a shallow clone, so it shares the css filter parser of this
     *  instance and therefore knows the extra css filters given to the constructor.
     *
     *  @param Crawler|null $crawler
     *
     *  @return Diglett
     */
    private function withCrawler(?Crawler $crawler): self
    {
        $copy = clone $this;
        $copy->crawler = $crawler;

        return $copy;
    }

    /**
     *  Turn a url found in a document into an absolute url.
     *
     *  @param string|null $url The raw attribute value, null when the attribute is missing
     *  @param string $baseUri The uri of the document the url was found in
     *
     *  @return string
     */
    private static function resolveUrl(?string $url, string $baseUri): string
    {
        $url = (string) $url;

        // A url that starts with a scheme (http:, https:, mailto:, ...) is already absolute.
        if (preg_match('~^[a-z][a-z0-9+.\-]*:~i', $url) === 1) {
            return $url;
        }

        // Query string and fragment of the base never take part in the resolution.
        $base = (string) substr($baseUri, 0, strcspn($baseUri, '?#'));

        // Split "scheme://authority" from the path of the base, when the base has one.
        $scheme = '';
        $origin = '';
        if (preg_match('~^([a-z][a-z0-9+.\-]*:)//[^/]*~i', $base, $matches) === 1) {
            $scheme = $matches[1];
            $origin = $matches[0];
        }

        // Protocol relative url: only the scheme of the base is reused.
        if (strncmp($url, '//', 2) === 0) {
            return $scheme.$url;
        }

        // Root relative url: resolved against the origin of the base.
        if ($url !== '' && $url[0] === '/') {
            return $origin.$url;
        }

        // Relative url: resolved against the directory of the base path. A missing or
        // empty attribute ends up here as well and yields that directory.
        $path = (string) substr($base, strlen($origin));
        $lastSlash = strrpos($path, '/');
        if ($lastSlash !== false) {
            $directory = substr($path, 0, $lastSlash + 1);
        } else {
            // A base like "http://host" has no path; its directory is the root.
            $directory = $origin === '' ? '' : '/';
        }

        return $origin.$directory.$url;
    }
}
198