|
1
|
|
|
<?php |
|
2
|
|
|
|
|
3
|
|
|
namespace voku\helper; |
|
4
|
|
|
|
|
5
|
|
|
/**
 * Class HtmlMin
 *
 * Minifies HTML markup: removes comments, collapses whitespace in text nodes,
 * drops default / empty attributes and sorts the remaining attributes so that
 * gzip can do better work.
 *
 * Inspired by:
 * - JS: https://github.com/kangax/html-minifier/blob/gh-pages/src/htmlminifier.js
 * - PHP: https://github.com/searchturbine/phpwee-php-minifier
 * - PHP: https://github.com/zaininnari/html-minifier
 * - Java: https://code.google.com/archive/p/htmlcompressor/
 *
 * @package voku\helper
 */
class HtmlMin
{
    /**
     * MIME types that mark a <script> as executable JavaScript.
     *
     * @see https://mathiasbynens.be/demo/javascript-mime-type
     * @see https://developer.mozilla.org/en/docs/Web/HTML/Element/script#attr-type
     *
     * @var array
     */
    private static $executableScriptsMimeTypes = array(
        'text/javascript',
        'text/ecmascript',
        'text/jscript',
        'application/javascript',
        'application/x-javascript',
        'application/ecmascript',
    );

    /**
     * Attribute names that are written without a value in the minified output.
     *
     * @var array
     */
    private static $booleanAttributes = array(
        'allowfullscreen',
        'async',
        'autofocus',
        'autoplay',
        'checked',
        'compact',
        'controls',
        'declare',
        'default',
        'defaultchecked',
        'defaultmuted',
        'defaultselected',
        'defer',
        'disabled',
        'enabled',
        'formnovalidate',
        'hidden',
        'indeterminate',
        'inert',
        'ismap',
        'itemscope',
        'loop',
        'multiple',
        'muted',
        'nohref',
        'noresize',
        'noshade',
        'novalidate',
        'nowrap',
        'open',
        'pauseonexit',
        'readonly',
        'required',
        'reversed',
        'scoped',
        'seamless',
        'selected',
        'sortable',
        'truespeed',
        'typemustmatch',
        'visible',
    );

    /**
     * A random md5-hash, generated via "Bootup::get_random_bytes()".
     *
     * It is used to build a unique placeholder value for boolean attributes,
     * which is stripped again from the final output in minify().
     *
     * @var string
     */
    protected $randomHash;

    /**
     * HtmlMin constructor.
     */
    public function __construct()
    {
        $this->randomHash = md5(Bootup::get_random_bytes(16));
    }

    /**
     * Minify the given HTML.
     *
     * If the processed markup turns out to be longer than the (trimmed) input,
     * the trimmed input is used instead; the final space-removal passes are
     * applied in either case.
     *
     * @param string $html
     *
     * @return string the minified html, or an empty string for empty / whitespace-only input
     */
    public function minify($html)
    {
        $html = trim((string)$html);

        // compare against '' explicitly: the string "0" is valid input
        if ($html === '') {
            return '';
        }

        $origHtml = $html;
        $origHtmlLength = UTF8::strlen($html);

        $dom = new HtmlDomParser();
        $dom->getDocument()->preserveWhiteSpace = false;
        $dom->getDocument()->formatOutput = false;

        $dom->loadHtml($html);
        $document = $dom->getDocument();
        $xpath = new \DOMXPath($document);

        $this->removeComments($xpath);
        $document->normalizeDocument();

        $this->collapseWhitespace($xpath);
        $document->normalizeDocument();

        $this->trimAroundBlockElements($xpath);
        $document->normalizeDocument();

        $this->optimizeAllAttributes($dom);
        $document->normalizeDocument();

        // ------------------------------------

        $html = UTF8::cleanup($dom->html());

        // final clean-up
        $html = str_replace(
            array(
                'html>' . "\n",
                "\n" . '<html',
                '<!doctype',
                // placeholder value of boolean attributes, see optimizeAttributes()
                '="delete-this-' . $this->randomHash . '"',
            ),
            array(
                'html>',
                '<html',
                '<!DOCTYPE',
                '',
            ),
            $html
        );

        // never return something longer than what we started with
        if ($origHtmlLength < UTF8::strlen($html)) {
            $html = $origHtml;
        }

        // Remove a space that is directly followed by ">"
        $html = preg_replace('/ (>)/', '$1', $html);
        // Remove a space that is directly preceded by "<"
        $html = preg_replace('/(<) /', '$1', $html);
        // Remove a single space between ">" and "<"
        $html = preg_replace('/(>) (<)/', '>$2', $html);

        return $html;
    }

    /**
     * Remove all comment nodes, except those whose text starts with "["
     * (e.g. "[if IE]").
     *
     * @param \DOMXPath $xpath
     */
    private function removeComments(\DOMXPath $xpath)
    {
        foreach ($xpath->query('//comment()') as $comment) {
            if (strpos($comment->nodeValue, '[') !== 0) {
                $comment->parentNode->removeChild($comment);
            }
        }
    }

    /**
     * Collapse runs of two or more whitespace characters into a single space
     * in every text node whose node path does not contain one of the
     * whitespace-sensitive tags.
     *
     * @param \DOMXPath $xpath
     */
    private function collapseWhitespace(\DOMXPath $xpath)
    {
        $skip = array('style', 'pre', 'code', 'script', 'textarea');

        foreach ($xpath->query('//text()') as $textNode) {
            /* @var $textNode \DOMNode */
            $path = $textNode->getNodePath();

            foreach ($skip as $tag) {
                if (strpos($path, "/$tag") !== false) {
                    // the text lives below a protected element, leave it untouched
                    continue 2;
                }
            }

            $textNode->nodeValue = preg_replace("/\s{2,}/", ' ', $textNode->nodeValue);
        }
    }

    /**
     * Trim the text nodes at the inner and outer edges of block-like elements.
     *
     * Only elements that have child nodes are processed; for those, the first
     * and last child as well as the previous and next sibling are trimmed when
     * they are text nodes.
     *
     * @param \DOMXPath $xpath
     */
    private function trimAroundBlockElements(\DOMXPath $xpath)
    {
        $nodes = $xpath->query('//div|//p|//nav|//footer|//article|//script|//hr|//br');

        foreach ($nodes as $node) {
            if (!count($node->childNodes)) {
                continue;
            }

            $candidates = array(
                $node->firstChild,
                $node->lastChild,
                $node->previousSibling,
                $node->nextSibling,
            );

            foreach ($candidates as $candidate) {
                if ($candidate !== null && $candidate->nodeType === XML_TEXT_NODE) {
                    $candidate->nodeValue = trim($candidate->nodeValue);
                }
            }
        }
    }

    /**
     * Run optimizeAttributes() on every element found in the document.
     *
     * @param HtmlDomParser $dom
     */
    private function optimizeAllAttributes(HtmlDomParser $dom)
    {
        foreach ($dom->find('*') as $element) {
            if (count($element) > 1) {
                foreach ($element as $e) {
                    $this->optimizeAttributes($e);
                }
            } else {
                $this->optimizeAttributes($element);
            }
        }
    }

    /**
     * Sort HTML-Attributes, so that gzip can do better work
     * and remove some default attributes.
     *
     * Every attribute is removed from the element first and the surviving ones
     * are re-added in alphabetical order. Boolean attributes get a placeholder
     * value ("delete-this-<randomHash>") that minify() strips from the output.
     *
     * @param SimpleHtmlDom $element
     *
     * @return bool false if the element has no attributes, true otherwise
     */
    private function optimizeAttributes(SimpleHtmlDom $element)
    {
        $attributes = $element->getAllAttributes();

        if (!$attributes) {
            return false;
        }

        // TODO: protect inline css / js (<script> / <style> without "src")

        $attrs = array();
        foreach ((array)$attributes as $attrName => $attrValue) {

            if (in_array($attrName, self::$booleanAttributes, true)) {
                $attrs[$attrName] = 'delete-this-' . $this->randomHash;
                $element->{$attrName} = null;
                continue;
            }

            $isUrlAttribute = ($attrName === 'href' || $attrName === 'src' || $attrName === 'action');
            $isExternal = (isset($attributes['rel']) && $attributes['rel'] === 'external');
            $isBlankTarget = (isset($attributes['target']) && $attributes['target'] === '_blank');

            if ($isUrlAttribute && !$isExternal && !$isBlankTarget) {
                $attrValue = $this->stripHttpScheme($attrValue);
            }

            if ($this->optimizeAttributesFilters($element->tag, $attrName, $attrValue, $attributes)) {
                $element->{$attrName} = null;
                continue;
            }

            $attrs[$attrName] = $this->sortCssClasses($attrName, $attrValue);
            $element->{$attrName} = null;
        }

        ksort($attrs);
        foreach ($attrs as $attrName => $attrValue) {
            $element->setAttribute($attrName, $attrValue, true);
        }

        return true;
    }

    /**
     * Turn a leading "http://" into the protocol-relative "//".
     *
     * Only the scheme at the very start of the value is touched, so URLs that
     * are embedded later in the value (e.g. in a query string) stay intact.
     *
     * @param string $url
     *
     * @return string
     */
    private function stripHttpScheme($url)
    {
        $url = (string)$url;

        if (strncmp($url, 'http://', 7) === 0) {
            // cut off "http:" and keep the "//"
            return substr($url, 5);
        }

        return $url;
    }

    /**
     * Check if the attribute (key / value) is default and can be skipped.
     *
     * @param string $tag
     * @param string $attrName
     * @param string $attrValue
     * @param array  $allAttr   all attributes of the element (name => value)
     *
     * @return bool true if the attribute can be removed
     */
    private function optimizeAttributesFilters($tag, $attrName, $attrValue, $allAttr)
    {
        // remove default
        if ($tag === 'script' && $attrName === 'language' && $attrValue === 'javascript') {
            return true;
        }

        // remove default
        if ($tag === 'form' && $attrName === 'method' && $attrValue === 'get') {
            return true;
        }

        // remove default
        if ($tag === 'input' && $attrName === 'type' && $attrValue === 'text') {
            return true;
        }

        // remove default
        if ($tag === 'area' && $attrName === 'shape' && $attrValue === 'rect') {
            return true;
        }

        // remove deprecated charset-attribute (the Browser will use the charset from the HTTP-Header, anyway)
        if ($tag === 'script' && $attrName === 'charset' && !isset($allAttr['src'])) {
            return true;
        }

        // remove deprecated anchor-jump
        if ($tag === 'a' && $attrName === 'name' && isset($allAttr['id'])) {
            return true;
        }

        // remove "type=text/css" for css links
        if ($tag === 'link' && $attrName === 'type' && $attrValue === 'text/css' && isset($allAttr['rel']) && $allAttr['rel'] === 'stylesheet') {
            return true;
        }

        // remove deprecated script-mime-types
        if ($tag === 'script' && $attrName === 'type' && isset($allAttr['src']) && in_array($attrValue, self::$executableScriptsMimeTypes, true)) {
            return true;
        }

        // remove empty value from <input>
        if ($tag === 'input' && $attrName === 'value' && $attrValue === '') {
            return true;
        }

        // remove some empty attribute
        if ($attrValue === '' && preg_match('/^(?:class|id|style|title|lang|dir|on(?:focus|blur|change|click|dblclick|mouse(?:down|up|over|move|out)|key(?:press|down|up)))$/', $attrName)) {
            return true;
        }

        return false;
    }

    /**
     * Sort the class names of a "class" attribute and normalise the
     * whitespace between them to a single space.
     *
     * Values of any other attribute, and empty values, are returned unchanged.
     *
     * @param string $attrName
     * @param string $attrValue
     *
     * @return string
     */
    private function sortCssClasses($attrName, $attrValue)
    {
        if ($attrName !== 'class' || !$attrValue) {
            return $attrValue;
        }

        // split on any whitespace and drop empty tokens; a class named "0" is kept
        $classes = preg_split('/\s+/', $attrValue, -1, PREG_SPLIT_NO_EMPTY);
        if ($classes === false) {
            return $attrValue;
        }

        sort($classes);

        return implode(' ', $classes);
    }
}
|
384
|
|
|
|
Sometimes obsolete code just ends up commented out instead of removed. In this case it is better to remove the code once you have checked you do not need it.
The code might also have been commented out for debugging purposes. In this case it is vital that someone uncomments it again or your project may behave in very unexpected ways in production.
This check looks for comments that seem to be mostly valid code and reports them.