These results are based on our legacy PHP analysis; consider migrating to our new PHP analysis engine instead. Learn more
1 | <?php |
||
2 | |||
3 | declare(strict_types=1); |
||
4 | |||
5 | /* |
||
6 | * (c) Christian Gripp <[email protected]> |
||
7 | * |
||
8 | * For the full copyright and license information, please view the LICENSE |
||
9 | * file that was distributed with this source code. |
||
10 | */ |
||
11 | |||
12 | namespace Core23\LastFm\Crawler; |
||
13 | |||
14 | use Core23\LastFm\Connection\ConnectionInterface; |
||
15 | use Core23\LastFm\Exception\CrawlException; |
||
16 | use Core23\LastFm\Model\Event; |
||
17 | use Core23\LastFm\Model\Image; |
||
18 | use Core23\LastFm\Model\Venue; |
||
19 | use Core23\LastFm\Model\VenueAddress; |
||
20 | use DateTime; |
||
21 | use Exception; |
||
22 | use Symfony\Component\DomCrawler\Crawler; |
||
23 | |||
/**
 * Base class for crawlers that scrape Last.fm HTML pages.
 *
 * It fetches page bodies through the injected connection and offers helpers
 * that turn DOM fragments into model objects (events, venues, images) or
 * scalar values (urls, strings, dates).
 */
abstract class AbstractCrawler
{
    public const URL_PREFIX = 'http://last.fm';

    public const NEWLINE = "\n";

    /**
     * @var ConnectionInterface
     */
    private $connection;

    /**
     * @param ConnectionInterface $connection
     */
    public function __construct(ConnectionInterface $connection)
    {
        $this->connection = $connection;
    }

    /**
     * Crawls a url.
     *
     * @param string $url
     * @param array  $params
     *
     * @return Crawler|null a crawler for the page body, or null if the connection returned no content
     */
    final protected function crawl(string $url, array $params = []): ?Crawler
    {
        if ($content = $this->connection->getPageBody($url, $params)) {
            return new Crawler($content);
        }

        return null;
    }

    /**
     * Parses an event list item node.
     *
     * @param Crawler       $node
     * @param DateTime|null $datetime event date; read from the node's "time" element when null
     *
     * @throws CrawlException if the event url, id or date cannot be read
     *
     * @return Event
     */
    final protected function parseEvent(Crawler $node, ?DateTime $datetime = null): Event
    {
        $eventNode = $node->filter('.events-list-item-event--title a');

        $url = $this->parseUrl($eventNode);

        if (null === $url) {
            throw new CrawlException('Error parsing event id.');
        }

        // The id is the digit run after the last "/<digits>" occurrence in the url.
        // An explicit match avoids casting an unrelated url to int when nothing matches.
        if (1 !== preg_match('/.*\/(\d+)/', $url, $matches)) {
            throw new CrawlException('Error parsing event id.');
        }

        $id = (int) $matches[1];

        if (0 === $id) {
            throw new CrawlException('Error parsing event id.');
        }

        if (null === $datetime) {
            $datetime = $this->parseEventDate($node);
        }

        $venue = $this->parseVenue($node->filter('.events-list-item-venue'));

        return new Event(
            $id,
            $this->parseString($eventNode) ?? '',
            $datetime,
            $url,
            $venue
        );
    }

    /**
     * Parses a venue node.
     *
     * @param Crawler $node
     *
     * @return Venue|null null if the node contains no venue title
     */
    final protected function parseVenue(Crawler $node): ?Venue
    {
        $title = $this->parseString($node->filter('.events-list-item-venue--title'));

        if (null === $title) {
            return null;
        }

        $city    = $this->parseString($node->filter('.events-list-item-venue--city'));
        $country = $this->parseString($node->filter('.events-list-item-venue--country'));

        return new Venue($title, null, null, new VenueAddress(
            null,
            null,
            $city,
            $country
        ));
    }

    /**
     * Parses a url node.
     *
     * Urls starting with a single slash are prefixed with URL_PREFIX.
     *
     * @param Crawler $node
     * @param string  $attr name of the attribute that holds the url
     *
     * @return string|null null if the node list is empty or the attribute is missing/empty
     */
    final protected function parseUrl(Crawler $node, string $attr = 'href'): ?string
    {
        if (0 === $node->count()) {
            return null;
        }

        if ($url = $node->attr($attr)) {
            // Only site-relative paths get the prefix; protocol-relative urls ("//host/path")
            // already name their host and must not be rewritten.
            return preg_replace('/^\/(?!\/)/', static::URL_PREFIX.'/', $url);
        }

        return null;
    }

    /**
     * Parses an image node.
     *
     * @param Crawler $node
     *
     * @return Image|null null if no image url could be read from the "src" attribute
     */
    final protected function parseImage(Crawler $node): ?Image
    {
        $src = $this->parseUrl($node, 'src');

        // Strict comparison: parseUrl signals "no url" with null only.
        if (null === $src) {
            return null;
        }

        return new Image($src);
    }

    /**
     * Parses a string node.
     *
     * The "content" attribute is used when present; otherwise the node text
     * (or, for multiline, the node html with paragraphs and line breaks
     * converted to NEWLINE) is used. Tags are stripped and the result trimmed.
     *
     * @param Crawler $node
     * @param bool    $multiline
     *
     * @return string|null null if the node list is empty
     */
    final protected function parseString(Crawler $node, bool $multiline = false): ?string
    {
        if (0 === $node->count()) {
            return null;
        }

        $content = $node->attr('content');

        if (null === $content) {
            if ($multiline) {
                $content = $node->html();
                $content = (string) preg_replace('/<p[^>]*?>/', '', $content);
                $content = str_replace('</p>', static::NEWLINE, $content);
                $content = (string) preg_replace('/<br\s?\/?>/i', static::NEWLINE, $content);
            } else {
                $content = $node->text();
            }
        }

        return trim(strip_tags($content));
    }

    /**
     * Parses a date node.
     *
     * @param Crawler $node
     *
     * @throws Exception if the node content is not a valid date string
     *
     * @return DateTime|null null if the node list is empty or the node has no content
     */
    final protected function parseDate(Crawler $node): ?DateTime
    {
        $content = $this->parseString($node);

        // An empty string would make DateTime silently fall back to the current time.
        if (null === $content || '' === $content) {
            return null;
        }

        return new DateTime($content);
    }

    /**
     * Reads the event date from the "datetime" attribute of the node's "time" element.
     *
     * @param Crawler $node
     *
     * @throws CrawlException if the element or attribute is missing, empty or not a valid date
     *
     * @return DateTime
     */
    private function parseEventDate(Crawler $node): DateTime
    {
        $timeNode = $node->filter('time');
        $value    = 0 === $timeNode->count() ? null : $timeNode->attr('datetime');

        // A null value would raise a TypeError under strict_types and an empty
        // value would be read as "now"; both are reported as crawl errors instead.
        if (null === $value || '' === trim($value)) {
            throw new CrawlException('Error reading event date');
        }

        try {
            return new DateTime($value);
        } catch (Exception $exception) {
            throw new CrawlException('Error reading event date', $exception->getCode(), $exception);
        }
    }
}
||
212 |
In PHP, under loose comparison (like `==`, or `!=`, or `switch` conditions), values of different types might be equal.
For `string` values, the empty string `''` is a special case; in particular, the following results might be unexpected: