1 | <?php |
||
15 | abstract class AbstractCrawler |
||
16 | { |
||
17 | public const URL_PREFIX = 'http://last.fm'; |
||
18 | |||
19 | public const NEWLINE = "\n"; |
||
20 | |||
21 | /** |
||
22 | * @var ConnectionInterface |
||
23 | */ |
||
24 | protected $connection; |
||
25 | |||
26 | /** |
||
27 | * AbstractCrawler constructor. |
||
28 | * |
||
29 | * @param ConnectionInterface $connection |
||
30 | */ |
||
31 | public function __construct(ConnectionInterface $connection) |
||
35 | |||
/**
 * Crawls a URL.
 *
 * Fetches the page body for the given URL through the injected connection
 * and wraps it in a Crawler instance.
 *
 * @param string $url URL of the page to fetch
 *
 * @return Crawler|null a crawler over the fetched body, or null when the
 *                      connection returns an empty (falsy) body
 */
final protected function crawl(string $url): ?Crawler
{
    $body = $this->connection->getPageBody($url);

    // A falsy body (null, false, '' or '0') is treated as "nothing to crawl".
    return $body ? new Crawler($body) : null;
}
||
51 | |||
/**
 * Parses a URL node.
 *
 * Reads the given attribute from the node and turns it into an absolute URL:
 *  - root-relative paths ("/music/foo") are prefixed with static::URL_PREFIX;
 *  - protocol-relative URLs ("//host/path") keep their own host and only
 *    inherit the scheme of static::URL_PREFIX;
 *  - anything else is returned unchanged.
 *
 * @param Crawler $node node list to read the attribute from
 * @param string  $attr name of the attribute holding the URL
 *
 * @return null|string the resolved URL, or null when the node list is empty
 *                     or the attribute is missing or empty (falsy)
 */
final protected function parseUrl(Crawler $node, string $attr = 'href'): ?string
{
    if (0 === $node->count()) {
        return null;
    }

    $url = $node->attr($attr);

    if (!$url) {
        return null;
    }

    // Protocol-relative reference: it already names its host, so prepending
    // URL_PREFIX would produce a broken "http://last.fm//host/path" URL.
    // Only the scheme is inherited from the base URL (RFC 3986, section 5.2).
    if (0 === strpos($url, '//')) {
        $scheme = parse_url(static::URL_PREFIX, PHP_URL_SCHEME);

        return $scheme ? $scheme.':'.$url : $url;
    }

    // Root-relative path: resolve it against the site prefix.
    if ('/' === $url[0]) {
        return static::URL_PREFIX.$url;
    }

    return $url;
}
||
72 | |||
73 | /** |
||
74 | * Parses an image node. |
||
75 | * |
||
76 | * @param Crawler $node |
||
77 | * |
||
78 | * @return null|string |
||
79 | */ |
||
80 | final protected function parseImage(Crawler $node) : ? string |
||
84 | |||
/**
 * Parses a string node.
 *
 * The node's "content" attribute is used when it is present. Otherwise the
 * value comes from the node's text, or - in multiline mode - from its HTML,
 * with paragraph ends and <br> tags converted to static::NEWLINE. In every
 * case any remaining tags are stripped and the result is trimmed.
 *
 * @param Crawler $node      node list to extract the string from
 * @param bool    $multiline whether to keep paragraph and line breaks as newlines
 *
 * @return null|string the cleaned string, or null when the node list is empty
 */
final protected function parseString(Crawler $node, bool $multiline = false): ?string
{
    if (0 === $node->count()) {
        return null;
    }

    $attribute = $node->attr('content');

    // An explicit "content" attribute always wins over the node body.
    if (null !== $attribute) {
        return trim(strip_tags($attribute));
    }

    if (!$multiline) {
        return trim(strip_tags($node->text()));
    }

    // Multiline: drop opening <p> tags, then turn paragraph ends and <br>
    // into newlines before the remaining markup is stripped.
    $markup = $node->html();
    $markup = preg_replace('/<p[^>]*?>/', '', $markup);
    $markup = str_replace('</p>', static::NEWLINE, $markup);
    $markup = preg_replace('/<br\s?\/?>/i', static::NEWLINE, $markup);

    return trim(strip_tags($markup));
}
||
114 | |||
115 | /** |
||
116 | * Parses a date node. |
||
117 | * |
||
118 | * @param Crawler $node |
||
119 | * |
||
120 | * @return \DateTime|null |
||
121 | */ |
||
122 | final protected function parseDate(Crawler $node) : ? \DateTime |
||
132 | } |
||
133 |