1
|
|
|
<?php |
2
|
|
|
/** |
3
|
|
|
* Main Class to manipulate dom |
4
|
|
|
* |
5
|
|
|
* PHP version 5.4 |
6
|
|
|
* |
7
|
|
|
* @category GLICER |
8
|
|
|
* @package GlHtml |
9
|
|
|
* @author Emmanuel ROECKER |
10
|
|
|
* @author Rym BOUCHAGOUR |
11
|
|
|
* @copyright 2015 GLICER |
12
|
|
|
* @license MIT |
13
|
|
|
* @link http://dev.glicer.com/ |
14
|
|
|
* |
15
|
|
|
* Created : 19/02/15 |
16
|
|
|
* File : GlHtml.php |
17
|
|
|
* |
18
|
|
|
*/ |
19
|
|
|
|
20
|
|
|
|
21
|
|
|
namespace GlHtml; |
22
|
|
|
|
23
|
|
|
use Symfony\Component\CssSelector\CssSelector; |
24
|
|
|
use Symfony\Component\CssSelector\CssSelectorConverter; |
25
|
|
|
|
26
|
|
|
/** |
27
|
|
|
* Class GlHtml |
28
|
|
|
* @package GlHtml |
29
|
|
|
*/ |
30
|
|
|
class GlHtml |
31
|
|
|
{ |
32
|
|
|
/** |
33
|
|
|
* @var \DOMDocument |
34
|
|
|
*/ |
35
|
|
|
private $dom; |
36
|
|
|
|
37
|
|
|
/** |
38
|
|
|
* @var string |
39
|
|
|
*/ |
40
|
|
|
private $html; |
41
|
|
|
|
42
|
|
|
/**
 * Parse an HTML string into a DOM document and keep the normalized source text.
 *
 * Line endings are unified first. The normalized text is stored as-is for the raw-text
 * scans done elsewhere in the class, and a UTF-8 to HTML-entities converted copy is loaded
 * into the DOMDocument. libxml errors raised while parsing are collected internally and
 * discarded, then the previous libxml error mode is restored.
 *
 * @param string $html
 */
public function __construct($html)
{
    $this->html = self::fixNewlines($html);
    $this->dom = new \DOMDocument();

    // Collect libxml parse errors internally instead of emitting warnings.
    $previousErrorMode = libxml_use_internal_errors(true);

    $entityEncoded = mb_convert_encoding($this->html, 'HTML-ENTITIES', 'UTF-8');
    $this->dom->loadHTML($entityEncoded);

    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorMode);
}
57
|
|
|
|
58
|
|
|
/**
 * Unify newlines to "\n".
 *
 * "\r\n" pairs are replaced first, then any remaining lone "\r".
 *
 * @param string $text
 *
 * @return string the text with unified newlines
 */
private static function fixNewlines($text)
{
    // str_replace applies the search entries in order, so "\r\n" is handled before "\r".
    return str_replace(["\r\n", "\r"], "\n", $text);
}
72
|
|
|
|
73
|
|
|
/**
 * Return every DOM element matching a CSS selector.
 *
 * The selector is translated to an XPath expression with the Symfony CssSelector component
 * and evaluated against the parsed document.
 *
 * @param string $selector CSS 3 Selector
 *
 * @return GlHtmlNode[] matching nodes, in query order; an empty array when nothing matches
 */
public function get($selector)
{
    // Prefer CssSelectorConverter: in component versions that ship both classes the static
    // CssSelector class is the deprecated one. Fall back to it only when the converter is absent.
    if (class_exists('Symfony\Component\CssSelector\CssSelectorConverter')) {
        $converter = new CssSelectorConverter();
        $expression = $converter->toXPath($selector);
    } else {
        $expression = CssSelector::toXPath($selector);
    }

    $xpath = new \DOMXPath($this->dom);
    $nodes = $xpath->query($expression);
    if ($nodes === false) {
        // DOMXPath::query() returns false when it cannot evaluate the expression.
        return [];
    }

    $glnodes = [];
    foreach ($nodes as $node) {
        $glnodes[] = new GlHtmlNode($node);
    }

    return $glnodes;
}
99
|
|
|
|
100
|
|
|
/**
 * Apply a list of attributes to every element matching the selector.
 *
 * @param string $selector   CSS selector of the elements to update
 * @param array  $attributes passed unchanged to GlHtmlNode::setAttributes() for each match
 */
public function setAttributes($selector, array $attributes)
{
    foreach ($this->get($selector) as $matched) {
        $matched->setAttributes($attributes);
    }
}
114
|
|
|
|
115
|
|
|
/**
 * Call delete() on every element matching the selector.
 *
 * @param string $selector CSS selector of the elements to delete
 */
public function delete($selector)
{
    foreach ($this->get($selector) as $matched) {
        $matched->delete();
    }
}
125
|
|
|
|
126
|
|
|
/**
 * Serialize the current DOM document back to HTML.
 *
 * @return string result of DOMDocument::saveHTML() on the internal document
 */
public function html()
{
    $document = $this->dom;

    return $document->saveHTML();
}
133
|
|
|
|
134
|
|
|
/**
 * Return the text of the document's <body> element.
 *
 * @return string whatever GlHtmlNode::getText() yields for the first <body> element
 *                (presumably a string - confirm against GlHtmlNode); an empty string
 *                when the document has no <body>
 */
public function getText()
{
    $body = $this->get("body");

    // A document can end up without a <body>; avoid dereferencing a missing element.
    if (count($body) === 0) {
        return '';
    }

    return $body[0]->getText();
}
140
|
|
|
|
141
|
|
|
/**
 * Collect the non-blank values of one attribute over all elements of one tag.
 *
 * Each value is stored in $links under itself as key, so duplicates collapse.
 *
 * @param string $tagname   selector passed to get(), e.g. "a" or "img"
 * @param string $attribute attribute to read on each matched element
 * @param array  $links     accumulator, modified in place
 */
private function getLinksByTagAttribute($tagname, $attribute, array &$links)
{
    foreach ($this->get($tagname) as $element) {
        $value = $element->getAttribute($attribute);

        // Skip missing or whitespace-only attribute values.
        if ($value === null || trim($value) === '') {
            continue;
        }

        $links[$value] = $value;
    }
}
156
|
|
|
|
157
|
|
|
/**
 * Collect the links referenced by the document.
 *
 * Sources, in order:
 *  - href/src attributes of link, a, script, iframe and img elements;
 *  - every http/https/ftp/ftps URL found in the raw HTML text that passes FILTER_VALIDATE_URL;
 *  - when $all is true, every double-quoted string in the raw HTML that contains "/" or "."
 *    and no space.
 *
 * Entries for which parse_url() finds neither a scheme+host pair nor a path are removed.
 *
 * @param bool $all if true get url in text and params
 *
 * @return array links, each stored under itself as key (so the list is de-duplicated)
 */
public function getLinks($all = false)
{
    $links = [];

    $tagSources = [
        ["link", "href"],
        ["a", "href"],
        ["script", "src"],
        ["iframe", "src"],
        ["img", "src"],
    ];
    foreach ($tagSources as $tagSource) {
        $this->getLinksByTagAttribute($tagSource[0], $tagSource[1], $links);
    }

    // Get all strings starting with a known scheme. The closing delimiter is only asserted
    // with a lookahead, not consumed: consuming it would stop the next URL from matching
    // when a single delimiter separates two URLs (e.g. "http://a http://b ").
    $regexUrl = '/[">\s]+((http|https|ftp|ftps)\:\/\/(.*?))(?=["<\s])/';
    $urls = null;
    if (preg_match_all($regexUrl, $this->html, $urls) > 0) {
        foreach ($urls[1] as $url) {
            if (filter_var($url, FILTER_VALIDATE_URL)) {
                $links[$url] = $url;
            }
        }
    }

    if ($all) {
        // Get all quoted params which can be a url.
        $regexParam = '/["](.*?)["]/';
        $params = [];
        if (preg_match_all($regexParam, $this->html, $params) > 0) {
            foreach ($params[1] as $param) {
                $param = trim($param);
                if (strpbrk($param, "/.") === false) {
                    continue;
                }
                if (strpbrk($param, " ") !== false) {
                    continue;
                }
                $links[$param] = $param;
            }
        }
    }

    // Drop anything parse_url() cannot resolve to scheme+host or to a path.
    foreach ($links as $link) {
        $parts = parse_url($link);
        $hasSchemeAndHost = isset($parts['host']) && isset($parts['scheme']);
        if (!$hasSchemeAndHost && !isset($parts['path'])) {
            unset($links[$link]);
        }
    }

    return $links;
}
208
|
|
|
|
209
|
|
|
/**
 * Return the sentences of the document, led by its title and meta description.
 *
 * The list starts from the <body> element's getSentences() result (empty when there is no
 * body). A non-blank meta description is prepended, then a non-blank title is prepended,
 * so the final order is: title, description, body sentences.
 *
 * @return array
 */
public function getSentences()
{
    $bodyNodes = $this->get("body");
    $sentences = (count($bodyNodes) > 0) ? $bodyNodes[0]->getSentences() : [];

    $metaNodes = $this->get('meta[name="description"]');
    if (count($metaNodes) > 0) {
        $descriptionText = trim($metaNodes[0]->getAttribute("content"));
        if ($descriptionText !== '') {
            array_unshift($sentences, $descriptionText);
        }
    }

    $titleNodes = $this->get('title');
    if (count($titleNodes) > 0) {
        $titleText = trim($titleNodes[0]->getText());
        if ($titleText !== '') {
            array_unshift($sentences, $titleText);
        }
    }

    return $sentences;
}
236
|
|
|
|
237
|
|
|
/**
 * List the heading elements (h1, h2, ...) found under <body>.
 *
 * The body's callChild() is given a callback that records every node whose name matches
 * "h" followed by digits, together with that numeric level.
 *
 * @return GlHtmlSummary[] one entry per heading, in the order callChild() visits them;
 *                         an empty array when the document has no <body>
 */
public function getSummary()
{
    $bodyNodes = $this->get("body");

    // A document can end up without a <body>; avoid dereferencing a missing element.
    if (count($bodyNodes) === 0) {
        return [];
    }

    $summary = [];
    $callback = function (GlHtmlNode $childNode) use (&$summary) {
        $nodeName = $childNode->getName();

        if (preg_match('/^h(\d+)$/', $nodeName, $matches)) {
            $summary[] = new GlHtmlSummary($childNode, $matches[1]);
        }
    };

    $bodyNodes[0]->callChild($callback);

    return $summary;
}
257
|
|
|
|
258
|
|
|
/**
 * Fold a flat list of headings into a nested tree, starting at index $start.
 *
 * Each heading is stored in $summaryTree under its text as
 * ['id' => <id attribute>, 'children' => [...]]. When the next heading has a greater level,
 * it and its followers are converted recursively into the current heading's 'children'.
 * When the next heading has a smaller level, the method returns early with $start still on
 * the heading just stored.
 *
 * @param int   $start       index of the heading to process; advanced by reference
 * @param array $summary     flat list of heading entries exposing getNode() and getLevel()
 * @param array $summaryTree tree level being filled, modified in place
 *
 * @return int number of further tree levels the caller still has to leave (0 when none)
 */
private function convertHToTree(&$start, $summary, array &$summaryTree)
{
    $total = count($summary);

    for (; $start < $total; $start++) {
        $heading = $summary[$start];
        $label = $heading->getNode()->getText();
        $anchor = $heading->getNode()->getAttribute('id');

        $summaryTree[$label] = ['id' => $anchor, 'children' => []];

        // Last heading: nothing to compare against.
        if ($start + 1 >= $total) {
            continue;
        }

        $currentLevel = $heading->getLevel();
        $nextLevel = $summary[$start + 1]->getLevel();

        if ($nextLevel > $currentLevel) {
            // Deeper heading follows: descend into this heading's children.
            $start++;
            $levelsToLeave = $this->convertHToTree($start, $summary, $summaryTree[$label]['children']);
            if ($levelsToLeave > 0) {
                return $levelsToLeave - 1;
            }
        } elseif ($nextLevel < $currentLevel) {
            // Shallower heading follows: tell the callers how many extra levels to unwind.
            return $currentLevel - $nextLevel - 1;
        }
    }

    return 0;
}
295
|
|
|
|
296
|
|
|
/**
 * Build a nested table of contents from the document's headings.
 *
 * @return array tree keyed by heading text; each entry is
 *               ['id' => <heading id attribute>, 'children' => <same structure>]
 */
public function getSummaryTree()
{
    $summary = $this->getSummary();
    $total = count($summary);
    $start = 0;
    $summaryTree = [];

    // convertHToTree() returns early, with $start left on the last heading it stored, when
    // a heading is followed by a shallower one (e.g. an h2 followed by an h1 at top level).
    // Resume after that heading until every heading is consumed, so later ones are not lost.
    while ($start < $total) {
        $this->convertHToTree($start, $summary, $summaryTree);
        $start++;
    }

    reset($summaryTree); //reset array pointer

    return $summaryTree;
}
306
|
|
|
} |
307
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.