Completed
Push — master ( 634334...8dad79 )
by Lars
02:41
created

HtmlMin::__construct()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 8
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 5
CRAP Score 1

Importance

Changes 0
Metric Value
dl 0
loc 8
ccs 5
cts 5
cp 1
rs 9.4285
c 0
b 0
f 0
cc 1
eloc 5
nc 1
nop 0
crap 1
1
<?php
2
3
namespace voku\helper;
4
5
/**
6
 * Class HtmlMin
7
 *
8
 * Inspired by:
9
 * - JS: https://github.com/kangax/html-minifier/blob/gh-pages/src/htmlminifier.js
10
 * - PHP: https://github.com/searchturbine/phpwee-php-minifier
11
 * - PHP: https://github.com/WyriHaximus/HtmlCompress
12
 * - PHP: https://github.com/zaininnari/html-minifier
13
 * - Java: https://code.google.com/archive/p/htmlcompressor/
14
 *
15
 * @package voku\helper
16
 */
17
class HtmlMin
18
{
19
  /**
20
   * // https://mathiasbynens.be/demo/javascript-mime-type
21
   * // https://developer.mozilla.org/en/docs/Web/HTML/Element/script#attr-type
22
   *
23
   * @var array
24
   */
25
  private static $executableScriptsMimeTypes = array(
26
      'text/javascript',
27
      'text/ecmascript',
28
      'text/jscript',
29
      'application/javascript',
30
      'application/x-javascript',
31
      'application/ecmascript',
32
  );
33
34
  /**
35
   * @var array
36
   */
37
  private static $booleanAttributes = array(
38
      'allowfullscreen',
39
      'async',
40
      'autofocus',
41
      'autoplay',
42
      'checked',
43
      'compact',
44
      'controls',
45
      'declare',
46
      'default',
47
      'defaultchecked',
48
      'defaultmuted',
49
      'defaultselected',
50
      'defer',
51
      'disabled',
52
      'enabled',
53
      'formnovalidate',
54
      'hidden',
55
      'indeterminate',
56
      'inert',
57
      'ismap',
58
      'itemscope',
59
      'loop',
60
      'multiple',
61
      'muted',
62
      'nohref',
63
      'noresize',
64
      'noshade',
65
      'novalidate',
66
      'nowrap',
67
      'open',
68
      'pauseonexit',
69
      'readonly',
70
      'required',
71
      'reversed',
72
      'scoped',
73
      'seamless',
74
      'selected',
75
      'sortable',
76
      'truespeed',
77
      'typemustmatch',
78
      'visible',
79
  );
80
81
  /**
82
   * A random md5-hash, generated via "random_bytes()".
83
   *
84
   * @var string
85
   */
86
  private $randomHash;
87
88
  /**
89
   * @var array
90 21
   */
91
  private $protectedChildNodes;
92 21
93 21
  /**
94
   * @var array
95
   */
96
  private static $skipTagsForRemoveWhitespace = array('style', 'pre', 'code', 'script', 'textarea');
97
98
  /**
99
   * @var string
100 21
   */
101
  private $protectedChildNodesHelper;
102 21
103 21
  /**
104
   * @var string
105
   */
106
  private $booleanAttributesHelper;
107 21
108 21
  /**
   * HtmlMin constructor.
   *
   * Starts with an empty list of protected child nodes and builds both
   * placeholder names from one md5-hash of the 16 bytes returned by
   * "Bootup::get_random_bytes()".
   */
  public function __construct()
  {
    $hash = md5(Bootup::get_random_bytes(16));

    $this->randomHash = $hash;
    $this->protectedChildNodes = array();

    // both helper names share the same hash suffix
    $this->booleanAttributesHelper = 'html-min--delete-this-' . $hash;
    $this->protectedChildNodesHelper = 'html-min--saved-content-' . $hash;
  }
119 19
120 19
  /**
   * Minify a HTML-string.
   *
   * The input is trimmed, loaded into a DOM and passed through the DOM helpers
   * of this class (protect tags, optimize attributes, remove comments, remove
   * whitespace, trim tags). The serialized result is then cleaned up via
   * regular expressions and the protected content is put back.
   *
   * If the result is longer than the (trimmed) input, the trimmed input is
   * returned instead.
   *
   * @param string $html
   *
   * @return string
   */
  public function minify($html)
  {
    $html = trim((string)$html);

    // compare strictly: a loose check would treat the input "0" as empty
    if ($html === '') {
      return '';
    }

    // init
    $this->protectedChildNodes = array();
    $origHtml = $html;
    $origHtmlLength = UTF8::strlen($html);

    $dom = new HtmlDomParser();
    $dom->getDocument()->preserveWhiteSpace = false; // remove redundant white space
    $dom->getDocument()->formatOutput = false; // do not format the output with indentation
    $dom->loadHtml($html);

    $dom = $this->protectTagsInDom($dom);
    $dom = $this->optimizeAttributesInDom($dom);
    $dom = $this->removeCommentsInDom($dom);
    $dom = $this->removeWhitespaceInDom($dom);
    $dom = $this->trimTagsInDom($dom);

    $html = $dom->html();

    // -------------------------------------------------------------------------
    // Trim whitespace from html-string. [protected html is still protected]
    // -------------------------------------------------------------------------

    // Remove a space that is followed by >
    $html = preg_replace('/ (>)/', '$1', $html);
    // Remove a space that is preceded by <
    $html = preg_replace('/(<) /', '$1', $html);
    // Remove a space that is between > and <
    $html = preg_replace('/(>) (<)/', '>$2', $html);

    // -------------------------------------------------------------------------
    // Restore protected HTML-code.
    // -------------------------------------------------------------------------

    $html = preg_replace_callback(
        '/<(?<element>'. $this->protectedChildNodesHelper . ')(?<attributes> [^>]*)?>(?<value>.*?)<\/' . $this->protectedChildNodesHelper . '>/',
        array($this, 'restoreProtectedHtml'),
        $html
    );
    $html = $dom::putReplacedBackToPreserveHtmlEntities($html);

    // ------------------------------------
    // final clean-up
    // ------------------------------------

    $html = UTF8::cleanup($html);

    $html = str_replace(
        array(
            'html>' . "\n",
            "\n" . '<html',
            '<!doctype',
            // strips the helper value, so that only the attribute name is left
            '="' . $this->booleanAttributesHelper . '"',
            // left-over closing tags of the placeholder element
            '</' . $this->protectedChildNodesHelper . '>',
        ),
        array(
            'html>',
            '<html',
            '<!DOCTYPE',
            '',
            '',
        ),
        $html
    );

    // left-over opening tags of the placeholder element (with or without attributes)
    $html = preg_replace(
        '/<(?:' . $this->protectedChildNodesHelper . ')(?: [^>]*)?>/',
        '',
        $html
    );

    // ------------------------------------
    // check if compression worked
    // ------------------------------------

    if ($origHtmlLength < UTF8::strlen($html)) {
      $html = $origHtml;
    }

    return $html;
  }
222 19
223
  /**
   * Prevent changes of inline "styles" and "scripts".
   *
   * The value of every child node of a matching element is moved into
   * "$this->protectedChildNodes" and an empty placeholder element, named after
   * "$this->protectedChildNodesHelper", is appended instead. The placeholder
   * stores the list index in its "data-html-min--saved-content" attribute.
   *
   * @param HtmlDomParser $dom
   *
   * @return HtmlDomParser
   */
  private function protectTagsInDom(HtmlDomParser $dom)
  {
    $index = 0;

    foreach ($dom->find('script, style') as $element) {

      // elements with a "src" attribute are left untouched
      if (in_array($element->tag, array('script', 'style'), true)) {
        $attributes = $element->getAllAttributes();
        if (isset($attributes['src'])) {
          continue;
        }
      }

      // move the value of each child node into the protected list
      $node = $element->getNode();
      while ($node->childNodes->length > 0) {
        $firstChild = $node->firstChild;
        $this->protectedChildNodes[$index][] = $firstChild->nodeValue;
        $node->removeChild($firstChild);
      }

      // leave a placeholder behind that remembers the list index
      /* @var $placeholder \DOMElement */
      $placeholder = $element->getNode()->appendChild(
          new \DOMElement($this->protectedChildNodesHelper)
      );
      $placeholder->setAttribute('data-html-min--saved-content', $index);

      ++$index;
    }

    return $dom;
  }
261 8
262
  /**
   * Optimize HTML-tag attributes in the dom.
   *
   * Calls "optimizeAttributes()" for every element that is found via "*".
   *
   * @param HtmlDomParser $dom
   *
   * @return HtmlDomParser
   */
  private function optimizeAttributesInDom(HtmlDomParser $dom)
  {
    $elements = $dom->find('*');

    foreach ($elements as $node) {
      // needs to be a variable: "optimizeAttributes()" takes it by reference
      $nodeAttributes = $node->getAllAttributes();

      $this->optimizeAttributes($node, $nodeAttributes);
    }

    return $dom;
  }
279
280
  /**
   * Remove comments in the dom.
   *
   * A comment whose text starts with "[" is kept (presumably to preserve
   * conditional comments like "[if IE]" - TODO confirm); all other comments
   * are removed from their parent node.
   *
   * @param HtmlDomParser $dom
   *
   * @return HtmlDomParser
   */
  private function removeCommentsInDom(HtmlDomParser $dom)
  {
    foreach ($dom->find('//comment()') as $wrapper) {
      $commentNode = $wrapper->getNode();

      // keep comments that start with "["
      if (strpos($commentNode->nodeValue, '[') === 0) {
        continue;
      }

      $commentNode->parentNode->removeChild($commentNode);
    }

    $dom->getDocument()->normalizeDocument();

    return $dom;
  }
301
302 8
  /**
   * Trim tags in the dom.
   *
   * For every matched element that reports child nodes, the first child, the
   * last child, the previous sibling and the next sibling are inspected, and
   * each of them that is a text node gets its value trimmed.
   *
   * @param HtmlDomParser $dom
   *
   * @return HtmlDomParser
   */
  private function trimTagsInDom(HtmlDomParser $dom)
  {
    foreach ($dom->find('//div|//p|//nav|//footer|//article|//script|//hr|//br') as $wrapper) {
      $node = $wrapper->getNode();

      /** @noinspection PhpParamsInspection */
      if (!(count($node->childNodes) > 0)) {
        continue;
      }

      // collect all four neighbours first, then trim the text nodes among them
      $neighbours = array(
          $node->firstChild,
          $node->lastChild,
          $node->previousSibling,
          $node->nextSibling,
      );

      foreach ($neighbours as $neighbour) {
        // nodeType 3 is a text node
        if ($neighbour !== null && $neighbour->nodeType === 3) {
          $neighbour->nodeValue = trim($neighbour->nodeValue);
        }
      }
    }

    $dom->getDocument()->normalizeDocument();

    return $dom;
  }
338 1
339
  /**
   * Remove whitespace from dom-nodes.
   *
   * Every run of two or more whitespace characters in a text node is replaced
   * by one space. Text nodes whose node path contains "/" followed by one of
   * the names from "self::$skipTagsForRemoveWhitespace" are not changed.
   *
   * @param HtmlDomParser $dom
   *
   * @return HtmlDomParser
   */
  private function removeWhitespaceInDom(HtmlDomParser $dom)
  {
    foreach ($dom->find('//text()') as $wrapper) {
      $textNode = $wrapper->getNode();
      $path = $textNode->getNodePath();

      foreach (self::$skipTagsForRemoveWhitespace as $tagName) {
        if (strpos($path, '/' . $tagName) !== false) {
          // skip this text node completely
          continue 2;
        }
      }

      $textNode->nodeValue = preg_replace("/\s{2,}/", ' ', $textNode->nodeValue);
    }

    $dom->getDocument()->normalizeDocument();

    return $dom;
  }
372 4
373 4
  /**
   * Callback function for preg_replace_callback use.
   *
   * Reads the quoted numeric id from the matched attributes and returns the
   * concatenated entries that are stored under this id in
   * "$this->protectedChildNodes". An empty string is returned if no id can be
   * read or if nothing is stored for the id.
   *
   * @param  array $matches PREG matches
   *
   * @return string
   */
  private function restoreProtectedHtml($matches)
  {
    // without a readable id there is nothing to restore (and no index to access)
    if (
        !isset($matches['attributes'])
        ||
        !preg_match('/.*"(?<id>\d*)"/', $matches['attributes'], $matchesInner)
    ) {
      return '';
    }

    $id = $matchesInner['id'];
    if (!isset($this->protectedChildNodes[$id])) {
      return '';
    }

    return implode('', $this->protectedChildNodes[$id]);
  }
393
394
  /**
   * Sort HTML-Attributes, so that gzip can do better work
   *  and remove some default attributes.
   *
   * Every attribute is removed from the element first; the attributes that
   * are kept are added again, sorted by name. Boolean attributes get
   * "$this->booleanAttributesHelper" as temporary value.
   *
   * @param SimpleHtmlDom $element
   * @param array         $attributs
   *
   * @return bool false if there are no attributes, otherwise true
   */
  private function optimizeAttributes(SimpleHtmlDom $element, &$attributs)
  {
    if (empty($attributs)) {
      return false;
    }

    // these checks do not depend on the current attribute, so evaluate them once
    $isExternal = isset($attributs['rel']) && $attributs['rel'] === 'external';
    $isBlankTarget = isset($attributs['target']) && $attributs['target'] === '_blank';
    $urlAttributes = array('href', 'src', 'action');

    $attrs = array();
    foreach ($attributs as $attrName => $attrValue) {

      if (in_array($attrName, self::$booleanAttributes, true)) {
        $attrs[$attrName] = $this->booleanAttributesHelper;
        $element->{$attrName} = null;
        continue;
      }

      // Only a leading "http://" is the scheme of the url; replacing every
      // occurrence would also rewrite urls that are part of e.g. a query string.
      if (
          !$isExternal
          &&
          !$isBlankTarget
          &&
          in_array($attrName, $urlAttributes, true)
          &&
          strpos($attrValue, 'http://') === 0
      ) {
        $attrValue = '//' . substr($attrValue, 7);
      }

      if ($this->optimizeAttributesFilters($element->tag, $attrName, $attrValue, $attributs)) {
        $element->{$attrName} = null;
        continue;
      }

      $attrValue = $this->sortCssClasses($attrName, $attrValue);

      $attrs[$attrName] = $attrValue;
      $element->{$attrName} = null;
    }

    // add the remaining attributes again, sorted by name
    ksort($attrs);
    foreach ($attrs as $attrName => $attrValue) {
      $element->setAttribute($attrName, $attrValue, true);
    }

    return true;
  }
446
447
  /**
   * Check if the attribute (key / value) is default and can be skipped.
   *
   * @param string $tag
   * @param string $attrName
   * @param string $attrValue
   * @param array  $allAttr all attributes of the element (name => value)
   *
   * @return bool true if the attribute can be removed
   */
  private function optimizeAttributesFilters($tag, $attrName, $attrValue, $allAttr)
  {
    $hasSrc = isset($allAttr['src']);

    if ($tag === 'script') {
      // default language
      if ($attrName === 'language' && $attrValue === 'javascript') {
        return true;
      }

      // deprecated charset-attribute (the Browser will use the charset from the HTTP-Header, anyway)
      if ($attrName === 'charset' && !$hasSrc) {
        return true;
      }

      // deprecated script-mime-types
      if ($attrName === 'type' && $hasSrc && in_array($attrValue, self::$executableScriptsMimeTypes, true)) {
        return true;
      }
    } elseif ($tag === 'form') {
      // default method
      if ($attrName === 'method' && $attrValue === 'get') {
        return true;
      }
    } elseif ($tag === 'input') {
      // default type
      if ($attrName === 'type' && $attrValue === 'text') {
        return true;
      }

      // empty value
      if ($attrName === 'value' && $attrValue === '') {
        return true;
      }
    } elseif ($tag === 'area') {
      // default shape
      if ($attrName === 'shape' && $attrValue === 'rect') {
        return true;
      }
    } elseif ($tag === 'a') {
      // deprecated anchor-jump
      if ($attrName === 'name' && isset($allAttr['id'])) {
        return true;
      }
    } elseif ($tag === 'link') {
      // "type=text/css" for css links
      if (
          $attrName === 'type'
          &&
          $attrValue === 'text/css'
          &&
          isset($allAttr['rel'])
          &&
          $allAttr['rel'] === 'stylesheet'
      ) {
        return true;
      }
    }

    // some empty attributes can be removed from every tag
    return $attrValue === ''
           &&
           preg_match('/^(?:class|id|style|title|lang|dir|on(?:focus|blur|change|click|dblclick|mouse(?:down|up|over|move|out)|key(?:press|down|up)))$/', $attrName);
  }
511
512
  /**
   * Sort the css-classes of a "class" attribute.
   *
   * Values of other attributes, and falsy values, are returned unchanged.
   * The class names are split on space, tab, line feed, form feed and
   * carriage return; empty tokens are dropped and the sorted names are
   * joined with a single space.
   *
   * @param string $attrName
   * @param string $attrValue
   *
   * @return string
   */
  private function sortCssClasses($attrName, $attrValue)
  {
    if ($attrName !== 'class' || !$attrValue) {
      return $attrValue;
    }

    // PREG_SPLIT_NO_EMPTY only drops empty tokens, so a class named "0" is kept
    $classes = preg_split('/[ \t\n\f\r]+/', $attrValue, -1, PREG_SPLIT_NO_EMPTY);
    if ($classes === false) {
      // splitting failed, so leave the value as it is
      return $attrValue;
    }

    sort($classes);

    return implode(' ', $classes);
  }
541
}
542