1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace Zrashwani\NewsScrapper\Adapters; |
4
|
|
|
|
5
|
|
|
use \Symfony\Component\DomCrawler\Crawler; |
6
|
|
|
|
7
|
|
|
/** |
8
|
|
|
 * Adapter to extract news based on open graph protocol specifications |
9
|
|
|
* @link http://ogp.me/ open graph meta data specifications |
10
|
|
|
* @author Zeid Rashwani <zrashwani.com> |
11
|
|
|
*/ |
12
|
|
|
class OpenGraphAdapter extends AbstractAdapter
{
    /**
     * Minimum width/height (in pixels) below which an image is considered
     * too small to be a representative article image.
     */
    const MIN_IMAGE_DIMENSION = 200;

    /**
     * extract title information from crawler object
     *
     * Prefers the og:title meta tag; falls back to the last <h1>, then to
     * the document <title>.
     *
     * @param Crawler $crawler
     * @return string|null null when no candidate title is found
     */
    public function extractTitle(Crawler $crawler)
    {
        $ret = $this->lastMatchAttribute($crawler, "//head/meta[@property='og:title']", 'content');

        //fallback in case document don't have og:title
        if (empty($ret) === true) {
            $ret = $this->lastMatchText($crawler, '//h1');
        }

        if (empty($ret) === true) {
            $ret = $this->lastMatchText($crawler, '//head/title');
        }

        return $ret;
    }

    /**
     * extract image url from crawler open graph
     *
     * Prefers an og:image meta tag that is not "small"; otherwise scans all
     * <img> tags and keeps the largest image exceeding the minimum size.
     *
     * @param Crawler $crawler
     * @return string|null normalized absolute image url, or null if none found
     */
    public function extractImage(Crawler $crawler)
    {
        $ret = null;

        $crawler->filterXPath("//head/meta[@property='og:image']")
            ->each(
                function (Crawler $node) use (&$ret) {
                    if ($this->getCheckSmallImage($node->attr('content')) === false) { //not small image size
                        $ret = $node->attr('content');
                    }
                }
            );

        if (empty($ret) === true) {
            $crawler->filterXPath('//img')
                ->each(
                    function (Crawler $node) use (&$ret) {
                        $img_src = $this->normalizeLink($node->attr('src'));
                        list($width, $height) = $this->getImageDimensions($img_src);

                        // dimensions of the current best candidate (0x0 when none yet)
                        $width_org = $height_org = 0;
                        if (empty($ret) === false) {
                            list($width_org, $height_org) = $this->getImageDimensions($ret);
                        }

                        if ($width > $width_org && $height > $height_org
                            && $width > self::MIN_IMAGE_DIMENSION
                            && $height > self::MIN_IMAGE_DIMENSION //min size of the image amended
                        ) {
                            $ret = $img_src;
                        }
                    }
                );
        }

        if (empty($ret) === false) {
            $ret = $this->normalizeLink($ret);
        }

        return $ret;
    }

    /**
     * extract description from og:description meta tag
     *
     * @param Crawler $crawler
     * @return string|null
     */
    public function extractDescription(Crawler $crawler)
    {
        return $this->lastMatchAttribute($crawler, "//head/meta[@property='og:description']", 'content');
    }

    /**
     * extract keywords out of crawler object
     *
     * @param Crawler $crawler
     * @return array list of keywords; empty array when none found
     */
    public function extractKeywords(Crawler $crawler)
    {
        $ret = array();

        $crawler->filterXPath("//head/meta[@property='og:keywords']")
            ->each(
                function (Crawler $node) use (&$ret) {
                    $node_txt = trim($node->attr('content'));
                    if (!empty($node_txt)) {
                        $ret = explode(',', $node_txt);
                    }
                }
            );

        return $ret;
    }

    /**
     * extract article body; open graph carries no body content
     *
     * @param Crawler $crawler
     * @return null
     */
    public function extractBody(Crawler $crawler)
    {
        //No body can be extracted from open graph protocol
        return null;
    }

    /**
     * extract publish date from article:published_time meta tag
     *
     * @param Crawler $crawler
     * @return string|null ISO8601-formatted date, or null when absent
     */
    public function extractPublishDate(Crawler $crawler)
    {
        $date_str = $this->lastMatchAttribute(
            $crawler,
            "//head/meta[@property='article:published_time']",
            'content'
        );

        if (is_null($date_str)) {
            return null;
        }

        $ret = new \DateTime($date_str);

        return $ret->format(\DateTime::ISO8601);
    }

    /**
     * extract author from article:author meta tag
     *
     * @param Crawler $crawler
     * @return string|null
     */
    public function extractAuthor(Crawler $crawler)
    {
        return $this->lastMatchAttribute($crawler, "//head/meta[@property='article:author']", 'content');
    }

    /**
     * check whether an image is smaller than the minimum accepted dimensions
     *
     * @param string $imageUrl
     * @return bool true when the image is too small (or unreadable)
     */
    public function getCheckSmallImage($imageUrl)
    {
        list($width_org, $height_org) = $this->getImageDimensions($imageUrl);

        return ($width_org < self::MIN_IMAGE_DIMENSION || $height_org < self::MIN_IMAGE_DIMENSION);
    }

    /**
     * fetch width/height of an image url, guarding against fetch failure
     *
     * getimagesize() returns false (and emits a warning) when the resource
     * cannot be read; in that case [0, 0] is returned so callers treat the
     * image as too small instead of triggering list() notices.
     *
     * @param string $imageUrl
     * @return array [width, height] in pixels; [0, 0] on failure
     */
    protected function getImageDimensions($imageUrl)
    {
        $url = pathinfo($imageUrl);
        // suppress the warning; the false return is handled explicitly below
        $size = @getimagesize($url['dirname'].'/'.urlencode($url['basename']));

        if ($size === false) {
            return array(0, 0);
        }

        return array($size[0], $size[1]);
    }

    /**
     * return the given attribute of the LAST node matching the xpath
     * (matches the original each()-overwrite behavior), or null if no match
     *
     * @param Crawler $crawler
     * @param string  $xpath
     * @param string  $attribute
     * @return string|null
     */
    private function lastMatchAttribute(Crawler $crawler, $xpath, $attribute)
    {
        $value = null;

        $crawler->filterXPath($xpath)
            ->each(
                function (Crawler $node) use (&$value, $attribute) {
                    $value = $node->attr($attribute);
                }
            );

        return $value;
    }

    /**
     * return the text of the LAST node matching the xpath, or null if no match
     *
     * @param Crawler $crawler
     * @param string  $xpath
     * @return string|null
     */
    private function lastMatchText(Crawler $crawler, $xpath)
    {
        $value = null;

        $crawler->filterXPath($xpath)
            ->each(
                function (Crawler $node) use (&$value) {
                    $value = $node->text();
                }
            );

        return $value;
    }
}
194
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.