1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace Zrashwani\NewsScrapper\Adapters; |
4
|
|
|
|
5
|
|
|
use Symfony\Component\DomCrawler\Crawler; |
6
|
|
|
use Zrashwani\NewsScrapper\Selector; |
7
|
|
|
|
8
|
|
|
/**
 * Adapter to extract page data from an unstructured HTML document.
 *
 * Each piece of data (title, author, body, ...) is located with a
 * caller-supplied selector, which may be either a CSS selector or an XPath
 * expression. When a selector is not configured, or yields an empty result,
 * most extractors delegate to a DefaultAdapter instance instead.
 *
 * @author Zeid Rashwani <zrashwani.com>
 */
class CustomAdapter extends AbstractAdapter
{
    /** @var string|null selector (CSS or XPath) locating the author */
    private $authorSelector;

    /** @var string|null selector (CSS or XPath) locating the article body */
    private $bodySelector;

    /** @var string|null selector (CSS or XPath) locating the description */
    private $descriptionSelector;

    /** @var string|null selector (CSS or XPath) locating the main image */
    private $imageSelector;

    /** @var string|null selector (CSS or XPath) locating the keywords */
    private $keywordsSelector;

    /** @var string|null selector (CSS or XPath) locating the publish date */
    private $publishDateSelector;

    /** @var string|null selector (CSS or XPath) locating the title */
    private $titleSelector;

    /**
     * Adapter used to fill in the missing selectors data by default values.
     *
     * @var DefaultAdapter $fallbackAdapter
     */
    private $fallbackAdapter;

    /**
     * Creates the adapter together with its DefaultAdapter fallback.
     */
    public function __construct()
    {
        $this->fallbackAdapter = new DefaultAdapter();
    }

    /**
     * Sets the selector used to locate the author.
     *
     * @param string $selector CSS selector or XPath expression
     * @return $this
     */
    public function setAuthorSelector($selector)
    {
        $this->authorSelector = $selector;

        return $this;
    }

    /**
     * Sets the selector used to locate the article body.
     *
     * @param string $selector CSS selector or XPath expression
     * @return $this
     */
    public function setBodySelector($selector)
    {
        $this->bodySelector = $selector;

        return $this;
    }

    /**
     * Sets the selector used to locate the description.
     *
     * @param string $selector CSS selector or XPath expression
     * @return $this
     */
    public function setDescriptionSelector($selector)
    {
        $this->descriptionSelector = $selector;

        return $this;
    }

    /**
     * Sets the selector used to locate the main image.
     *
     * @param string $selector CSS selector or XPath expression
     * @return $this
     */
    public function setImageSelector($selector)
    {
        $this->imageSelector = $selector;

        return $this;
    }

    /**
     * Sets the selector used to locate the keywords.
     *
     * @param string $selector CSS selector or XPath expression
     * @return $this
     */
    public function setKeywordsSelector($selector)
    {
        $this->keywordsSelector = $selector;

        return $this;
    }

    /**
     * Sets the selector used to locate the publish date.
     *
     * @param string $selector CSS selector or XPath expression
     * @return $this
     */
    public function setPublishDateSelector($selector)
    {
        $this->publishDateSelector = $selector;

        return $this;
    }

    /**
     * Sets the selector used to locate the title.
     *
     * @param string $selector CSS selector or XPath expression
     * @return $this
     */
    public function setTitleSelector($selector)
    {
        $this->titleSelector = $selector;

        return $this;
    }

    /**
     * Extracts the author using the author selector, delegating to the
     * fallback adapter when the selector gives an empty result.
     *
     * @param Crawler $crawler
     * @return mixed selector content, or whatever the fallback adapter returns
     */
    public function extractAuthor(Crawler $crawler)
    {
        return $this->extractTextOrFallback($crawler, $this->authorSelector, 'extractAuthor');
    }

    /**
     * Extracts the article body using the body selector.
     *
     * Unlike the other extractors, no fallback adapter is consulted here: the
     * selector result (possibly null) is passed straight to normalizeHtml().
     *
     * @param Crawler $crawler
     * @return mixed result of normalizeHtml() (defined outside this class)
     */
    public function extractBody(Crawler $crawler)
    {
        $ret = $this->getElementText($crawler, $this->bodySelector);

        return $this->normalizeHtml($ret);
    }

    /**
     * Extracts the description using the description selector, delegating to
     * the fallback adapter when the selector gives an empty result.
     *
     * @param Crawler $crawler
     * @return mixed selector content, or whatever the fallback adapter returns
     */
    public function extractDescription(Crawler $crawler)
    {
        return $this->extractTextOrFallback($crawler, $this->descriptionSelector, 'extractDescription');
    }

    /**
     * Extracts the image link using the image selector, delegating to the
     * fallback adapter when no selector is set or it gives an empty result.
     *
     * @param Crawler $crawler
     * @return mixed result of normalizeLink() for a non-empty value, null otherwise
     */
    public function extractImage(Crawler $crawler)
    {
        // Initialised explicitly so the emptiness check below never reads an
        // undefined variable when no image selector has been configured.
        $ret = null;

        if (empty($this->imageSelector) === false) {
            $ret = $this->getSrcByImgSelector($crawler, $this->imageSelector);
        }

        if (empty($ret) === true) {
            $ret = $this->fallbackAdapter->extractImage($crawler);
        }

        if (empty($ret) === true) {
            return null;
        }

        return $this->normalizeLink($ret);
    }

    /**
     * Extracts keywords using the keywords selector.
     *
     * A non-empty selector result is split on commas and passed through
     * normalizeKeywords(); an empty one is delegated to the fallback adapter,
     * whose result is returned as-is.
     *
     * @param Crawler $crawler
     * @return mixed
     */
    public function extractKeywords(Crawler $crawler)
    {
        $ret = $this->getElementText($crawler, $this->keywordsSelector);

        if (empty($ret) === true) {
            return $this->fallbackAdapter->extractKeywords($crawler);
        }

        return $this->normalizeKeywords(explode(',', $ret));
    }

    /**
     * Extracts the publish date using the publish date selector, delegating
     * to the fallback adapter when the selector gives an empty result.
     *
     * @param Crawler $crawler
     * @return mixed selector content, or whatever the fallback adapter returns
     */
    public function extractPublishDate(Crawler $crawler)
    {
        return $this->extractTextOrFallback($crawler, $this->publishDateSelector, 'extractPublishDate');
    }

    /**
     * Extracts the title using the title selector, delegating to the fallback
     * adapter when the selector gives an empty result.
     *
     * @param Crawler $crawler
     * @return mixed selector content, or whatever the fallback adapter returns
     */
    public function extractTitle(Crawler $crawler)
    {
        return $this->extractTextOrFallback($crawler, $this->titleSelector, 'extractTitle');
    }

    /**
     * Shared "selector first, fallback adapter second" lookup used by the
     * extractors that return the selector content unmodified.
     *
     * @param Crawler     $crawler
     * @param string|null $selector       CSS selector or XPath expression
     * @param string      $fallbackMethod name of the fallback adapter method
     *                                    to call when the selector result is empty
     * @return mixed
     */
    private function extractTextOrFallback(Crawler $crawler, $selector, $fallbackMethod)
    {
        $ret = $this->getElementText($crawler, $selector);

        if (empty($ret) === true) {
            $ret = $this->fallbackAdapter->$fallbackMethod($crawler);
        }

        return $ret;
    }

    /**
     * Gets the content of the element(s) matched by a selector (CSS selector
     * or XPath).
     *
     * With the default closure, the value returned is the html() of the LAST
     * matched node, because every matched node overwrites the previous
     * result. When a custom closure is supplied it is invoked for every
     * matched node, but this method itself returns null, since only the
     * default closure writes to the result variable.
     *
     * @param Crawler       $crawler
     * @param string        $selector
     * @param \Closure|null $extractClosure callback function to be used for extraction
     * @return string|null null when the selector is empty or nothing matched
     */
    protected function getElementText(Crawler $crawler, $selector, $extractClosure = null)
    {
        if (empty($selector) === true) {
            return null;
        }

        $ret = null;
        if ($extractClosure === null) {
            // Captured by reference so the closure can hand its result back.
            $extractClosure = function (Crawler $node) use (&$ret) {
                $ret = $node->html();
            };
        }

        if (Selector::isCSS($selector)) {
            $crawler->filter($selector)
                ->each($extractClosure);
        } else {
            $crawler->filterXPath($selector)
                ->each($extractClosure);
        }

        return $ret;
    }
}
176
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.