Passed
Push — master ( bbc68c...66b206 )
by Jeroen
03:18
created

Diglett::nodeCount()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 7
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 2
eloc 3
nc 2
nop 0
dl 0
loc 7
rs 10
c 0
b 0
f 0
1
<?php
2
3
namespace Jerodev\Diglett;
4
5
use Jerodev\Diglett\CssFilters\ICssFilter;
6
use Symfony\Component\DomCrawler\Crawler;
7
8
class Diglett
{
    /**
     *  Maps the (lower-cased) node names getUrls() understands to the
     *  attribute that carries their url.
     */
    private const URL_ATTRIBUTES = [
        'a'   => 'href',
        'img' => 'src',
    ];

    /**
     *  The Symfony DomCrawler to work with.
     *
     *  @var Crawler|null
     */
    private $crawler;

    /**
     *  The extra css filters exactly as they were handed to the constructor.
     *  Kept so instances created by filter() can be built with the same filters.
     *
     *  @var ICssFilter[]
     */
    private $cssFilters;

    /**
     *  The css selector parser
     *
     *  @var CssFilterParser
     */
    private $cssFilterParser;

    /**
     *  Create a diglett instance from a Symfony Crawler.
     *
     *  @param Crawler|null $crawler
     *  @param ICssFilter[] $cssFilters An array of extra ICssFilter classes to filter on
     */
    public function __construct(?Crawler $crawler = null, array $cssFilters = [])
    {
        $this->crawler = $crawler;
        $this->cssFilters = $cssFilters;
        $this->cssFilterParser = new CssFilterParser($cssFilters);
    }

    /**
     *  Get the underlying crawler object
     *
     *  @return Crawler|null
     */
    public function getCrawler(): ?Crawler
    {
        return $this->crawler;
    }

    /**
     *  Use special css selectors to filter on the current node collection
     *
     *  The selector is parsed into parts; every part is applied as a regular
     *  css filter on the crawler, followed by the filter functions attached to
     *  that part. Processing stops as soon as there is no crawler left or the
     *  crawler holds no nodes.
     *
     *  @param string $selector
     *  @return Diglett A new instance wrapping the filtered crawler (possibly null)
     */
    public function filter(string $selector): Diglett
    {
        $parsedSelector = $this->cssFilterParser->parse($selector);

        $crawler = $this->getCrawler();
        foreach ($parsedSelector as $part) {
            if ($crawler === null || $crawler->count() === 0) {
                break;
            }

            $crawler = $crawler->filter($part['selector']);

            foreach ($part['functions'] as $function) {
                $crawler = $function->filterNodes($crawler);
                if ($crawler === null) {
                    // Nothing left to hand to the next function; the outer
                    // loop stops on its next null check.
                    break;
                }
            }
        }

        // Pass the extra filters along, otherwise the returned instance would
        // be constructed without them.
        return new self($crawler, $this->cssFilters);
    }

    /**
     *  Use special css selectors to fetch several values
     *
     *  @param string[] $selectors
     *  @return array The getText() result for every selector, stored under the selector's original key
     */
    public function getTexts(array $selectors): array
    {
        $results = [];
        foreach ($selectors as $key => $value) {
            $results[$key] = $this->getText($value);
        }

        return $results;
    }

    /**
     *  Get the value for a single special css selector
     *
     *  A trailing "{name}" on the selector is stripped before filtering and
     *  makes this method return the attribute "name" instead of the text.
     *
     *  @param string $selector
     *  @return string|null Null when the selector matches no nodes
     */
    public function getText(string $selector): ?string
    {
        $attribute = null;
        $selector = preg_replace_callback(
            '/\{(.*?)\}$/',
            static function ($matches) use (&$attribute) {
                $attribute = $matches[1] ?? null;

                // Remove the "{...}" suffix from the selector.
                return '';
            },
            $selector
        );

        $diglett = $this->filter($selector);
        if ($diglett->nodeCount() === 0) {
            return null;
        }

        $crawler = $diglett->getCrawler();

        return $attribute === null ? $crawler->text() : $crawler->attr($attribute);
    }

    /**
     *  Fetch urls from the selected nodes (a[href], img[src])
     *
     *  Nodes that are not "a" or "img", and nodes whose url attribute is
     *  missing or empty, are skipped. Urls without a scheme are resolved
     *  against the uri of the crawler.
     *
     *  @param string $selector
     *  @return string[]
     */
    public function getUrls(string $selector): array
    {
        $diglett = $this->filter($selector);
        if ($diglett->nodeCount() === 0) {
            return [];
        }

        $crawler = $diglett->getCrawler();
        // The cast turns a missing (null) uri into an empty string.
        $baseUri = (string) $crawler->getUri();

        return $crawler
            ->reduce(static function ($node) {
                return self::extractUrl($node) !== null;
            })
            ->each(static function ($node) use ($baseUri) {
                return self::resolveUrl((string) self::extractUrl($node), $baseUri);
            });
    }

    /**
     *  Find the node count on the current crawler instance
     *
     *  @return int 0 when there is no crawler
     */
    public function nodeCount(): int
    {
        if ($this->crawler === null) {
            return 0;
        }

        return $this->crawler->count();
    }

    /**
     *  Read the raw url attribute of a single node.
     *
     *  @param Crawler $node
     *  @return string|null Null for unsupported node names and for missing or empty attributes
     */
    private static function extractUrl(Crawler $node): ?string
    {
        $attribute = self::URL_ATTRIBUTES[strtolower($node->nodeName())] ?? null;
        if ($attribute === null) {
            return null;
        }

        $url = $node->attr($attribute);

        return ($url === null || $url === '') ? null : $url;
    }

    /**
     *  Turn a url found in the document into an absolute url.
     *
     *  This is not a full RFC 3986 resolver: "../" segments and bare
     *  "?query" / "#fragment" references are simply appended to the directory
     *  of the base uri. The base uri is expected to look like
     *  "scheme://host[/path]".
     *
     *  @param string $url     The url as found in the document
     *  @param string $baseUri The uri of the document, may be empty
     *  @return string
     */
    private static function resolveUrl(string $url, string $baseUri): string
    {
        // Without a base there is nothing to resolve against, and a url that
        // carries its own scheme (http:, https:, mailto:, ...) is complete.
        if ($url === '' || $baseUri === '' || preg_match('/^[a-z][a-z0-9+.\-]*:/i', $url) === 1) {
            return $url;
        }

        // Cut the query string and fragment off first, so a "/" inside them
        // can not be mistaken for a path separator.
        $base = substr($baseUri, 0, strcspn($baseUri, '?#'));
        $segments = explode('/', $base);

        // "scheme:", "" and "host" glued back together: "scheme://host/".
        $root = implode('/', array_slice($segments, 0, 3)) . '/';

        // Protocol relative url: only the scheme of the base is borrowed.
        if (strncmp($url, '//', 2) === 0) {
            return $segments[0] . $url;
        }

        if ($url[0] === '/') {
            return $root . ltrim($url, '/');
        }

        // A base without any path ("scheme://host") has no directory of its
        // own, so the root is used instead.
        $directory = count($segments) > 3
            ? substr($base, 0, strrpos($base, '/') + 1)
            : $root;

        return $directory . $url;
    }
}
176