Passed
Push — master ( 0e0034...2c0b4d )
by Jeroen
01:17
created

Diglett::__construct()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 2
nc 1
nop 2
dl 0
loc 4
rs 10
c 0
b 0
f 0
1
<?php
2
3
namespace Jerodev\Diglett;
4
5
use Closure;
6
use Jerodev\Diglett\CssFilters\ICssFilter;
7
use Symfony\Component\DomCrawler\Crawler;
8
9
/**
 *  Thin wrapper around a Symfony DomCrawler that understands the extended
 *  css selectors handled by CssFilterParser and offers helpers to pull texts,
 *  attributes and urls out of the matched nodes.
 */
class Diglett
{
    /**
     *  The Symfony DomCrawler to work with.
     *
     *  @var Crawler|null
     */
    private $crawler;

    /**
     *  The extra css filters this instance was created with.
     *  Kept so instances produced by filter() understand the same filters.
     *
     *  @var ICssFilter[]
     */
    private $cssFilters;

    /**
     *  The css selector parser.
     *
     *  @var CssFilterParser
     */
    private $cssFilterParser;

    /**
     *  Create a diglett instance from a Symfony Crawler.
     *
     *  @param Crawler|null $crawler
     *  @param ICssFilter[] $cssFilters An array of extra ICssFilter classes to filter on
     */
    public function __construct(?Crawler $crawler = null, array $cssFilters = [])
    {
        $this->crawler = $crawler;
        $this->cssFilters = $cssFilters;
        $this->cssFilterParser = new CssFilterParser($cssFilters);
    }

    /**
     *  Get the underlying crawler object.
     *
     *  @return Crawler|null
     */
    public function getCrawler(): ?Crawler
    {
        return $this->crawler;
    }

    /**
     *  Perform a closure function on matched nodes for a selector and return as array.
     *
     *  @param string $selector
     *  @param Closure $closure A function to perform on each matched node
     *
     *  @return array The values returned by the closure, one per matched node;
     *                empty when there is no crawler to search in
     */
    public function each(string $selector, Closure $closure): array
    {
        $crawler = $this->filter($selector)->getCrawler();

        // filter() hands back a crawler-less instance when there was nothing
        // to search in; calling each() on null would be a fatal error.
        if ($crawler === null) {
            return [];
        }

        return $crawler->each($closure);
    }

    /**
     *  Use special css selectors to filter on the current node collection.
     *
     *  @param string $selector
     *
     *  @return Diglett A new instance wrapping the filtered nodes and using
     *                  the same extra css filters as this one
     */
    public function filter(string $selector): self
    {
        $crawler = $this->crawler;

        foreach ($this->cssFilterParser->parse($selector) as $part) {
            // Nothing left to narrow down, the remaining parts cannot match.
            if ($crawler === null || $crawler->count() === 0) {
                break;
            }

            $crawler = $crawler->filter($part['selector']);

            foreach ($part['functions'] as $function) {
                $crawler = $function->filterNodes($crawler);
                if ($crawler === null) {
                    // A filter function discarded everything: stop both loops.
                    break 2;
                }
            }
        }

        // Forward the extra filters so chained calls keep understanding them.
        return new self($crawler, $this->cssFilters);
    }

    /**
     *  Use special css selectors to fetch several values.
     *
     *  @param string[] $selectors
     *
     *  @return array The getText() result for every selector, under the same key
     */
    public function getTexts(array $selectors): array
    {
        $results = [];
        foreach ($selectors as $key => $selector) {
            $results[$key] = $this->getText($selector);
        }

        return $results;
    }

    /**
     *  Get the value for a single special css selector.
     *
     *  A trailing "{name}" on the selector requests the attribute "name" of
     *  the matched node instead of its text.
     *
     *  @param string $selector
     *
     *  @return string|null Null when nothing matched
     */
    public function getText(string $selector): ?string
    {
        $attribute = null;
        $stripped = preg_replace_callback(
            '/\{(.*?)\}$/',
            function (array $matches) use (&$attribute): string {
                $attribute = $matches[1];

                // Remove the "{...}" suffix from the selector.
                return '';
            },
            $selector
        );

        // preg_replace_callback() yields null when PCRE fails; there is no
        // usable selector in that case.
        if ($stripped === null) {
            return null;
        }

        $crawler = $this->filter($stripped)->getCrawler();
        if ($crawler === null || $crawler->count() === 0) {
            return null;
        }

        return $attribute === null ? $crawler->text() : $crawler->attr($attribute);
    }

    /**
     *  Fetch urls from the selected nodes (a[href], img[src]).
     *
     *  Urls are made absolute against the uri of the crawler. Nodes that are
     *  neither "a" nor "img", or that carry no (or an empty) url attribute,
     *  are skipped.
     *
     *  @param string $selector
     *
     *  @return string[]
     */
    public function getUrls(string $selector): array
    {
        $crawler = $this->filter($selector)->getCrawler();
        if ($crawler === null || $crawler->count() === 0) {
            return [];
        }

        $base = $this->splitBaseUri((string) $crawler->getUri());
        $attributes = ['a' => 'href', 'img' => 'src'];

        $urls = $crawler->each(function (Crawler $node) use ($base, $attributes) {
            $attribute = $attributes[strtolower($node->nodeName())] ?? null;
            if ($attribute === null) {
                return null;
            }

            $url = trim((string) $node->attr($attribute));

            return $url === '' ? null : $this->toAbsoluteUrl($url, $base);
        });

        return array_values(array_filter($urls, function ($url) {
            return $url !== null;
        }));
    }

    /**
     *  Find the node count on the current crawler instance.
     *
     *  @return int Zero when there is no crawler
     */
    public function nodeCount(): int
    {
        return $this->crawler === null ? 0 : $this->crawler->count();
    }

    /**
     *  Derive the prefixes needed to turn document urls into absolute ones.
     *  Assumes a "scheme://host/path" shaped uri; this is not a full
     *  RFC 3986 resolver.
     *
     *  @param string $uri The uri of the crawled document, may be empty
     *
     *  @return string[] Keys: "scheme" (e.g. "http:" or ""), "root"
     *                   (e.g. "http://host/") and "directory" (e.g. "http://host/dir/")
     */
    private function splitBaseUri(string $uri): array
    {
        // Query string and fragment play no part in resolving paths, and a
        // "/" inside them must not be mistaken for a path separator.
        $page = (string) substr($uri, 0, strcspn($uri, '?#'));

        $scheme = parse_url($uri, PHP_URL_SCHEME);
        $root = implode('/', array_slice(explode('/', $page), 0, 3)).'/';

        $lastSlash = strrpos($page, '/');
        $directory = $lastSlash === false ? '' : (string) substr($page, 0, $lastSlash + 1);
        if (substr($directory, -2) === '//') {
            // The last slash belongs to "scheme://": the uri has no path.
            $directory = $root;
        }

        return [
            'scheme'    => is_string($scheme) ? $scheme.':' : '',
            'root'      => $root,
            'directory' => $directory,
        ];
    }

    /**
     *  Make a single, non-empty url absolute using the prefixes from splitBaseUri().
     *
     *  @param string   $url
     *  @param string[] $base
     *
     *  @return string
     */
    private function toAbsoluteUrl(string $url, array $base): string
    {
        // Already carries a scheme (http:, https:, mailto:, ...): leave as is.
        if (preg_match('/^[a-z][a-z0-9+.\-]*:/i', $url) === 1) {
            return $url;
        }

        // Protocol-relative: only the scheme of the document is missing.
        if (strncmp($url, '//', 2) === 0) {
            return $base['scheme'].$url;
        }

        // Relative to the root of the host.
        if ($url[0] === '/') {
            return $base['root'].ltrim($url, '/');
        }

        // Relative to the directory of the document.
        return $base['directory'].$url;
    }
}
188