Passed
Push — master ( b88b49...ad326c )
by Jeroen
01:33
created

Diglett::getTexts()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 9
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 2
eloc 4
nc 2
nop 1
dl 0
loc 9
rs 10
c 0
b 0
f 0
1
<?php
2
3
namespace Jerodev\Diglett;
4
5
use Symfony\Component\DomCrawler\Crawler;
6
7
class Diglett
{
    /**
     *  The Symfony DomCrawler to work with.
     *
     *  @var Crawler
     */
    private $crawler;

    /**
     *  The css selector parser
     *
     *  @var CssFilterParser
     */
    private $cssFilterParser;

    /**
     *  Create a diglett instance from a Symfony Crawler.
     *
     *  @param Crawler $crawler The crawler holding the node collection to query
     *  @param array $cssFilters An array of extra ICssFilter classes to filter on
     */
    public function __construct(Crawler $crawler, array $cssFilters = [])
    {
        $this->crawler = $crawler;
        $this->cssFilterParser = new CssFilterParser($cssFilters);
    }

    /**
     *  Get the underlying crawler object
     *
     *  @return Crawler
     */
    public function getCrawler(): Crawler
    {
        return $this->crawler;
    }

    /**
     *  Use special css selectors to filter on the current node collection
     *
     *  The selector is split into parts by the CssFilterParser. Each part is
     *  applied as a regular css selector first, after which the filter
     *  functions attached to that part narrow down the result.
     *
     *  @param string $selector
     *  @return Crawler|null The matched nodes, or null as soon as a filter function returns null
     */
    public function filter(string $selector): ?Crawler
    {
        $crawler = $this->getCrawler();

        foreach ($this->cssFilterParser->parse($selector) as $part)
        {
            $crawler = $crawler->filter($part['selector']);

            foreach ($part['functions'] as $function)
            {
                $crawler = $function->filterNodes($crawler);
                if ($crawler === null)
                {
                    return null;
                }
            }

            // Nothing left to match against, the remaining parts cannot add nodes.
            if ($crawler->count() === 0)
            {
                break;
            }
        }

        return $crawler;
    }

    /**
     *  Use special css selectors to fetch several values
     *
     *  @param array $selectors Selectors as accepted by getText(), the keys are free to choose
     *  @return array The result of getText() for every selector, indexed by the keys of $selectors
     */
    public function getTexts(array $selectors): array
    {
        $results = [];
        foreach ($selectors as $key => $value)
        {
            $results[$key] = $this->getText($value);
        }

        return $results;
    }

    /**
     *  Get the value for a single special css selector
     *
     *  A selector ending in `{name}` returns the value of attribute `name`
     *  instead of the node text.
     *
     *  @param string $selector
     *  @return string|null Null when the selector matches no nodes
     */
    public function getText(string $selector): ?string
    {
        $attribute = null;

        // Cut the trailing "{attribute}" off the selector and remember the attribute name.
        $stripped = preg_replace_callback(
            '/\{(.*?)\}$/',
            function (array $matches) use (&$attribute): string {
                $attribute = $matches[1];

                return '';
            },
            $selector
        );

        // preg_replace_callback() returns null on a PCRE error; fall back to the untouched selector.
        $crawler = $this->filter($stripped ?? $selector);
        if ($crawler === null || $crawler->count() === 0)
        {
            return null;
        }

        return $attribute === null ? $crawler->text() : $crawler->attr($attribute);
    }

    /**
     *  Fetch urls from the selected nodes (a[href], img[src])
     *
     *  Nodes other than `a` and `img`, and nodes missing their url attribute,
     *  are skipped. Urls that already carry a scheme (http:, mailto:, data:, ...)
     *  are returned untouched; all other urls are resolved against the uri of
     *  the crawler. When the crawler has no usable `scheme://host` uri the raw
     *  attribute values are returned.
     *
     *  @param string $selector
     *  @return array The urls of the matched nodes, empty when nothing matches
     */
    public function getUrls(string $selector): array
    {
        $crawler = $this->filter($selector);
        if ($crawler === null || $crawler->count() === 0)
        {
            return [];
        }

        // The attribute holding the url for every supported tag.
        $urlAttributes = ['a' => 'href', 'img' => 'src'];

        $uri = (string) $crawler->getUri();
        $hasBase = preg_match('/^[a-z][a-z0-9+.\-]*:\/\//i', $uri) === 1;

        // Drop the query string and fragment first: both may contain slashes
        // that must not be mistaken for path separators.
        $path = substr($uri, 0, strcspn($uri, '?#'));

        // explode() always returns an array, unlike preg_split() which can return false.
        $segments = explode('/', $path);
        $scheme = $segments[0];
        $absolute = implode('/', array_slice($segments, 0, 3)) . '/';

        // Without a path after the host, the directory of the document is the site root.
        $relative = count($segments) > 3
            ? substr($path, 0, strrpos($path, '/') + 1)
            : $absolute;

        return $crawler
            ->reduce(function ($node) use ($urlAttributes) {
                $attribute = $urlAttributes[strtolower($node->nodeName())] ?? null;

                return $attribute !== null && $node->attr($attribute) !== null;
            })
            ->each(function ($node) use ($urlAttributes, $hasBase, $scheme, $absolute, $relative) {
                $url = trim($node->attr($urlAttributes[strtolower($node->nodeName())]));

                // Already absolute, or nothing to resolve against.
                if (!$hasBase || preg_match('/^[a-z][a-z0-9+.\-]*:/i', $url) === 1)
                {
                    return $url;
                }

                // Protocol relative url: only the scheme of the document is inherited.
                if (strncmp($url, '//', 2) === 0)
                {
                    return $scheme . $url;
                }

                // Root relative url.
                if (strncmp($url, '/', 1) === 0)
                {
                    return $absolute . ltrim($url, '/');
                }

                // Relative to the directory of the current document.
                return $relative . $url;
            });
    }
}
167