Completed
Push — master ( 9cd978...fb0caf )
by Lars
01:58
created

HtmlMin::minifyHtmlDom()   B

Complexity

Conditions 6
Paths 20

Size

Total Lines 62
Code Lines 19

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 42

Importance

Changes 0
Metric Value
dl 0
loc 62
ccs 0
cts 0
cp 0
rs 8.6652
c 0
b 0
f 0
cc 6
eloc 19
nc 20
nop 2
crap 42

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include: Extract Method.

1
<?php
2
3
namespace voku\helper;
4
5
/**
6
 * Class HtmlMin
7
 *
8
 * Inspired by:
9
 * - JS: https://github.com/kangax/html-minifier/blob/gh-pages/src/htmlminifier.js
10
 * - PHP: https://github.com/searchturbine/phpwee-php-minifier
11
 * - PHP: https://github.com/WyriHaximus/HtmlCompress
12
 * - PHP: https://github.com/zaininnari/html-minifier
13
 * - Java: https://code.google.com/archive/p/htmlcompressor/
14
 *
15
 * @package voku\helper
16
 */
17
class HtmlMin
18
{
19
  /**
20
   * @var string
21
   */
22
  private static $regExSpace = "/[[:space:]]{2,}|[\r\n]+/u";
23
24
  /**
25
   * @var array
26
   */
27
  private static $optional_end_tags = array(
28
      'html',
29
      'head',
30
      'body',
31
  );
32
33
  /**
34
   * // https://mathiasbynens.be/demo/javascript-mime-type
35
   * // https://developer.mozilla.org/en/docs/Web/HTML/Element/script#attr-type
36
   *
37
   * @var array
38
   */
39
  private static $executableScriptsMimeTypes = array(
40
      'text/javascript'          => '',
41
      'text/ecmascript'          => '',
42
      'text/jscript'             => '',
43
      'application/javascript'   => '',
44
      'application/x-javascript' => '',
45
      'application/ecmascript'   => '',
46
  );
47
48
  private static $selfClosingTags = array(
49
      'area',
50
      'base',
51
      'basefont',
52
      'br',
53
      'col',
54
      'command',
55
      'embed',
56
      'frame',
57
      'hr',
58
      'img',
59
      'input',
60
      'isindex',
61
      'keygen',
62
      'link',
63
      'meta',
64
      'param',
65
      'source',
66
      'track',
67
      'wbr',
68
  );
69
70
  private static $trimWhitespaceFromTags = array(
71
      'article' => '',
72
      'br'      => '',
73
      'div'     => '',
74
      'footer'  => '',
75
      'hr'      => '',
76
      'nav'     => '',
77
      'p'       => '',
78
      'script'  => '',
79
  );
80
81
  /**
82
   * @var array
83
   */
84
  private static $booleanAttributes = array(
85
      'allowfullscreen' => '',
86
      'async'           => '',
87
      'autofocus'       => '',
88
      'autoplay'        => '',
89
      'checked'         => '',
90
      'compact'         => '',
91
      'controls'        => '',
92
      'declare'         => '',
93
      'default'         => '',
94
      'defaultchecked'  => '',
95
      'defaultmuted'    => '',
96
      'defaultselected' => '',
97
      'defer'           => '',
98
      'disabled'        => '',
99
      'enabled'         => '',
100
      'formnovalidate'  => '',
101
      'hidden'          => '',
102
      'indeterminate'   => '',
103
      'inert'           => '',
104
      'ismap'           => '',
105
      'itemscope'       => '',
106
      'loop'            => '',
107
      'multiple'        => '',
108
      'muted'           => '',
109
      'nohref'          => '',
110
      'noresize'        => '',
111
      'noshade'         => '',
112
      'novalidate'      => '',
113
      'nowrap'          => '',
114
      'open'            => '',
115
      'pauseonexit'     => '',
116
      'readonly'        => '',
117
      'required'        => '',
118
      'reversed'        => '',
119
      'scoped'          => '',
120
      'seamless'        => '',
121
      'selected'        => '',
122
      'sortable'        => '',
123
      'truespeed'       => '',
124
      'typemustmatch'   => '',
125
      'visible'         => '',
126
  );
127
  /**
128
   * @var array
129
   */
130
  private static $skipTagsForRemoveWhitespace = array(
131
      'code',
132
      'pre',
133
      'script',
134
      'style',
135
      'textarea',
136 23
  );
137
138 23
  /**
139 23
   * @var array
140
   */
141 23
  private $protectedChildNodes = array();
142 23
143 23
  /**
144
   * @var string
145
   */
146
  private $protectedChildNodesHelper = 'html-min--voku--saved-content';
147
148
  /**
149
   * @var bool
150 23
   */
151
  private $doOptimizeViaHtmlDomParser = true;
152 23
153 23
  /**
154 1
   * @var bool
155
   */
156
  private $doOptimizeAttributes = true;
157 23
158 23
  /**
159 3
   * @var bool
160
   */
161
  private $doRemoveComments = true;
162
163 20
  /**
164 20
   * @var bool
165 20
   */
166
  private $doRemoveWhitespaceAroundTags = true;
167 20
168 20
  /**
169 20
   * @var bool
170
   */
171 20
  private $doRemoveHttpPrefixFromAttributes = false;
172
173 20
174 20
  /**
175 11
   * @var array
176 11
   */
177 11
  private $domainsToRemoveHttpPrefixFromAttributes = array(
178
      'google.com',
179 11
      'google.de',
180
  );
181
182
  /**
183
   * @var bool
184
   */
185
  private $doSortCssClassNames = true;
186 11
187
  /**
188 11
   * @var bool
189
   */
190 11
  private $doSortHtmlAttributes = true;
191
192
  /**
193
   * @var bool
194
   */
195
  private $doRemoveDeprecatedScriptCharsetAttribute = true;
196 11
197 11
  /**
198 11
   * @var bool
199
   */
200 11
  private $doRemoveDefaultAttributes = false;
201 11
202
  /**
203
   * @var bool
204
   */
205
  private $doRemoveDeprecatedAnchorName = true;
206
207 11
  /**
208
   * @var bool
209 11
   */
210
  private $doRemoveDeprecatedTypeFromStylesheetLink = true;
211 11
212 11
  /**
213 11
   * @var bool
214 11
   */
215 11
  private $doRemoveDeprecatedTypeFromScriptTag = true;
216 11
217
  /**
218 11
   * @var bool
219 11
   */
220 11
  private $doRemoveValueFromEmptyInput = true;
221 11
222 11
  /**
223 11
   * @var bool
224
   */
225 11
  private $doRemoveEmptyAttributes = true;
226
227 11
  /**
228
   * @var bool
229 11
   */
230 11
  private $doSumUpWhitespace = true;
231
232 11
  /**
233 11
   * @var bool
234
   */
235 11
  private $doRemoveSpacesBetweenTags = false;
236
237 11
  /**
   * HtmlMin constructor.
   *
   * Intentionally empty: every option of this class already has a default value on its property.
   */
  public function __construct()
  {
  }
243
244
  /**
   * Enable or disable the per-element attribute optimization (optimizeAttributes()) done by minifyHtmlDom().
   *
   * @param bool $doOptimizeAttributes
   *
   * @return $this
   */
  public function doOptimizeAttributes($doOptimizeAttributes = true)
  {
    // The flag is evaluated with "=== true", so a truthy non-bool (e.g. 1) must be normalized to a real bool.
    $this->doOptimizeAttributes = (bool)$doOptimizeAttributes;

    return $this;
  }
255
256
  /**
   * Enable or disable the DOM based minification step (minifyHtmlDom()) inside minify().
   *
   * @param bool $doOptimizeViaHtmlDomParser
   *
   * @return $this
   */
  public function doOptimizeViaHtmlDomParser($doOptimizeViaHtmlDomParser = true)
  {
    // The flag is evaluated with "=== true", so a truthy non-bool (e.g. 1) must be normalized to a real bool.
    $this->doOptimizeViaHtmlDomParser = (bool)$doOptimizeViaHtmlDomParser;

    return $this;
  }
267
268
  /**
   * Enable or disable the removeComments() pass that minifyHtmlDom() runs on the parsed document.
   *
   * @param bool $doRemoveComments
   *
   * @return $this
   */
  public function doRemoveComments($doRemoveComments = true)
  {
    // The flag is evaluated with "=== true", so a truthy non-bool (e.g. 1) must be normalized to a real bool.
    $this->doRemoveComments = (bool)$doRemoveComments;

    return $this;
  }
279 9
280 9
  /**
   * Set the "doRemoveDefaultAttributes" option.
   *
   * @param bool $doRemoveDefaultAttributes
   *
   * @return $this
   */
  public function doRemoveDefaultAttributes($doRemoveDefaultAttributes = true)
  {
    // The property is documented as bool and other flags of this class are compared with "=== true";
    // presumably this one is checked the same way, so truthy non-bool input (e.g. 1) is normalized here.
    $this->doRemoveDefaultAttributes = (bool)$doRemoveDefaultAttributes;

    return $this;
  }
291
292
  /**
   * Set the "doRemoveDeprecatedAnchorName" option.
   *
   * @param bool $doRemoveDeprecatedAnchorName
   *
   * @return $this
   */
  public function doRemoveDeprecatedAnchorName($doRemoveDeprecatedAnchorName = true)
  {
    // The property is documented as bool and other flags of this class are compared with "=== true";
    // presumably this one is checked the same way, so truthy non-bool input (e.g. 1) is normalized here.
    $this->doRemoveDeprecatedAnchorName = (bool)$doRemoveDeprecatedAnchorName;

    return $this;
  }
303
304
  /**
   * Set the "doRemoveDeprecatedScriptCharsetAttribute" option.
   *
   * @param bool $doRemoveDeprecatedScriptCharsetAttribute
   *
   * @return $this
   */
  public function doRemoveDeprecatedScriptCharsetAttribute($doRemoveDeprecatedScriptCharsetAttribute = true)
  {
    // The property is documented as bool and other flags of this class are compared with "=== true";
    // presumably this one is checked the same way, so truthy non-bool input (e.g. 1) is normalized here.
    $this->doRemoveDeprecatedScriptCharsetAttribute = (bool)$doRemoveDeprecatedScriptCharsetAttribute;

    return $this;
  }
315
316
  /**
   * Set the "doRemoveDeprecatedTypeFromScriptTag" option.
   *
   * @param bool $doRemoveDeprecatedTypeFromScriptTag
   *
   * @return $this
   */
  public function doRemoveDeprecatedTypeFromScriptTag($doRemoveDeprecatedTypeFromScriptTag = true)
  {
    // The property is documented as bool and other flags of this class are compared with "=== true";
    // presumably this one is checked the same way, so truthy non-bool input (e.g. 1) is normalized here.
    $this->doRemoveDeprecatedTypeFromScriptTag = (bool)$doRemoveDeprecatedTypeFromScriptTag;

    return $this;
  }
327
328
  /**
   * Set the "doRemoveDeprecatedTypeFromStylesheetLink" option.
   *
   * @param bool $doRemoveDeprecatedTypeFromStylesheetLink
   *
   * @return $this
   */
  public function doRemoveDeprecatedTypeFromStylesheetLink($doRemoveDeprecatedTypeFromStylesheetLink = true)
  {
    // The property is documented as bool and other flags of this class are compared with "=== true";
    // presumably this one is checked the same way, so truthy non-bool input (e.g. 1) is normalized here.
    $this->doRemoveDeprecatedTypeFromStylesheetLink = (bool)$doRemoveDeprecatedTypeFromStylesheetLink;

    return $this;
  }
339
340
  /**
   * Set the "doRemoveEmptyAttributes" option.
   *
   * @param bool $doRemoveEmptyAttributes
   *
   * @return $this
   */
  public function doRemoveEmptyAttributes($doRemoveEmptyAttributes = true)
  {
    // The property is documented as bool and other flags of this class are compared with "=== true";
    // presumably this one is checked the same way, so truthy non-bool input (e.g. 1) is normalized here.
    $this->doRemoveEmptyAttributes = (bool)$doRemoveEmptyAttributes;

    return $this;
  }
351 9
352
  /**
   * Enable or disable the replacement of "http://" with "//" in "href", "src" and "action" attributes,
   * as performed by optimizeAttributes().
   *
   * @param bool $doRemoveHttpPrefixFromAttributes
   *
   * @return $this
   */
  public function doRemoveHttpPrefixFromAttributes($doRemoveHttpPrefixFromAttributes = true)
  {
    // The flag is evaluated with "=== true", so a truthy non-bool (e.g. 1) must be normalized to a real bool.
    $this->doRemoveHttpPrefixFromAttributes = (bool)$doRemoveHttpPrefixFromAttributes;

    return $this;
  }
363
364
  /**
   * Enable or disable the removal of a single space between ">" and "<" in minify().
   *
   * @param bool $doRemoveSpacesBetweenTags
   *
   * @return $this
   */
  public function doRemoveSpacesBetweenTags($doRemoveSpacesBetweenTags = true)
  {
    // The flag is evaluated with "=== true", so a truthy non-bool (e.g. 1) must be normalized to a real bool.
    $this->doRemoveSpacesBetweenTags = (bool)$doRemoveSpacesBetweenTags;

    return $this;
  }
375
376
  /**
   * Set the "doRemoveValueFromEmptyInput" option.
   *
   * @param bool $doRemoveValueFromEmptyInput
   *
   * @return $this
   */
  public function doRemoveValueFromEmptyInput($doRemoveValueFromEmptyInput = true)
  {
    // The property is documented as bool and other flags of this class are compared with "=== true";
    // presumably this one is checked the same way, so truthy non-bool input (e.g. 1) is normalized here.
    $this->doRemoveValueFromEmptyInput = (bool)$doRemoveValueFromEmptyInput;

    return $this;
  }
387
388
  /**
   * Enable or disable the removeWhitespaceAroundTags() call that minifyHtmlDom() makes for every element.
   *
   * @param bool $doRemoveWhitespaceAroundTags
   *
   * @return $this
   */
  public function doRemoveWhitespaceAroundTags($doRemoveWhitespaceAroundTags = true)
  {
    // The flag is evaluated with "=== true", so a truthy non-bool (e.g. 1) must be normalized to a real bool.
    $this->doRemoveWhitespaceAroundTags = (bool)$doRemoveWhitespaceAroundTags;

    return $this;
  }
399 20
400
  /**
   * Enable or disable the sortCssClassNames() call that optimizeAttributes() makes for every attribute.
   *
   * @param bool $doSortCssClassNames
   *
   * @return $this
   */
  public function doSortCssClassNames($doSortCssClassNames = true)
  {
    // The flag is evaluated with "=== true", so a truthy non-bool (e.g. 1) must be normalized to a real bool.
    $this->doSortCssClassNames = (bool)$doSortCssClassNames;

    return $this;
  }
411 3
412 3
  /**
   * Enable or disable the sorting of HTML attributes in optimizeAttributes().
   *
   * @param bool $doSortHtmlAttributes
   *
   * @return $this
   */
  public function doSortHtmlAttributes($doSortHtmlAttributes = true)
  {
    // The flag is evaluated with "=== true", so a truthy non-bool (e.g. 1) must be normalized to a real bool.
    $this->doSortHtmlAttributes = (bool)$doSortHtmlAttributes;

    return $this;
  }
423 20
424
  /**
   * Enable or disable the sumUpWhitespace() pass that minifyHtmlDom() runs on the parsed document.
   *
   * @param bool $doSumUpWhitespace
   *
   * @return $this
   */
  public function doSumUpWhitespace($doSumUpWhitespace = true)
  {
    // The flag is evaluated with "=== true", so a truthy non-bool (e.g. 1) must be normalized to a real bool.
    $this->doSumUpWhitespace = (bool)$doSumUpWhitespace;

    return $this;
  }
435 11
436
  /**
   * Serialize the attributes of a DOM node into an HTML attribute string.
   *
   * Boolean attributes (see self::$booleanAttributes) are written as a bare name. Every other attribute is written
   * as name=value; the quotes around the value are left out when the value is non-empty and contains none of the
   * characters that require quoting.
   *
   * @param \DOMNode $node
   *
   * @return string the attributes separated by single spaces, or an empty string if the node has none
   */
  private function domNodeAttributesToString(\DOMNode $node)
  {
    if ($node->attributes === null) {
      return '';
    }

    $parts = array();
    foreach ($node->attributes as $attribute) {
      $name = $attribute->name;

      if (isset(self::$booleanAttributes[$name])) {
        $parts[] = $name;
        continue;
      }

      $value = $attribute->value;

      // Remove quotes around attribute values, when allowed (<p class="foo"> → <p class=foo>)
      // http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html#attributes-0
      //
      // "=== 0" on purpose: a failing preg_match() returns false and must not be read as "no quotes needed".
      if ($value !== '' && \preg_match('/["\'=<>` \t\r\n\f]/', $value) === 0) {
        $parts[] = $name . '=' . $value;
        continue;
      }

      // Pick a delimiter that does not occur in the value, otherwise the value would end the attribute early.
      if (\strpos($value, '"') === false) {
        $quoted = '"' . $value . '"';
      } elseif (\strpos($value, "'") === false) {
        $quoted = "'" . $value . "'";
      } else {
        $quoted = '"' . \str_replace('"', '&quot;', $value) . '"';
      }

      $parts[] = $name . '=' . $quoted;
    }

    return \implode(' ', $parts);
  }
460 11
461 11
  /**
   * Check whether the end tag of the given element may be left out of the serialized HTML.
   *
   * Implemented so far: the tags listed in self::$optional_end_tags, plus the rules for <li> and <p>.
   * Every other element (and every node that is not an element) keeps its end tag.
   *
   * @param \DOMNode $node
   *
   * @return bool true if the end tag can be omitted
   */
  private function domNodeClosingTagOptional(\DOMNode $node)
  {
    // "tagName" only exists on DOMElement, not on every DOMNode.
    if (!$node instanceof \DOMElement) {
      return false;
    }

    $tag_name = $node->tagName;

    // TODO: check the spec
    //
    // https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-omission
    //
    // <html> may be omitted if first thing inside is not comment
    // <head> may be omitted if first thing inside is an element
    // <body> may be omitted if first thing inside is not space, comment, <meta>, <link>, <script>, <style> or <template>
    // <colgroup> may be omitted if first thing inside is <col>
    // <tbody> may be omitted if first thing inside is <tr>
    // An <li> element's end tag may be omitted if the li element is immediately followed by another li element or if there is no more content in the parent element.
    // A <dt> element's end tag may be omitted if the dt element is immediately followed by another dt element or a dd element.
    // A <dd> element's end tag may be omitted if the dd element is immediately followed by another dd element or a dt element, or if there is no more content in the parent element.
    // A <p> element's end tag may be omitted if the p element is immediately followed by an address, article, aside, blockquote, details, div, dl, fieldset, figcaption, figure, footer, form, h1, h2, h3, h4, h5, h6, header, hgroup, hr, main, menu, nav, ol, p, pre, section, table, or ul element, or if there is no more content in the parent element and the parent element is an HTML element that is not an a, audio, del, ins, map, noscript, or video element, or an autonomous custom element.
    // An <rp> element's end tag may be omitted if the rp element is immediately followed by an rt or rp element, or if there is no more content in the parent element.
    // An <optgroup> element's end tag may be omitted if the optgroup element is immediately followed by another optgroup element, or if there is no more content in the parent element.
    // An <option> element's end tag may be omitted if the option element is immediately followed by another option element, or if it is immediately followed by an optgroup element, or if there is no more content in the parent element.
    // A <colgroup> element's start tag may be omitted if the first thing inside the colgroup element is a col element, and if the element is not immediately preceded by another colgroup element whose end tag has been omitted. (It can't be omitted if the element is empty.)
    // A <colgroup> element's end tag may be omitted if the colgroup element is not immediately followed by ASCII whitespace or a comment.
    // A <caption> element's end tag may be omitted if the caption element is not immediately followed by ASCII whitespace or a comment.
    // A <thead> element's end tag may be omitted if the thead element is immediately followed by a tbody or tfoot element.
    // A <tbody> element's start tag may be omitted if the first thing inside the tbody element is a tr element, and if the element is not immediately preceded by a tbody, thead, or tfoot element whose end tag has been omitted. (It can't be omitted if the element is empty.)
    // A <tbody> element's end tag may be omitted if the tbody element is immediately followed by a tbody or tfoot element, or if there is no more content in the parent element.
    // A <tfoot> element's end tag may be omitted if there is no more content in the parent element.
    // A <tr> element's end tag may be omitted if the tr element is immediately followed by another tr element, or if there is no more content in the parent element.
    // A <td> element's end tag may be omitted if the td element is immediately followed by a td or th element, or if there is no more content in the parent element.
    // A <th> element's end tag may be omitted if the th element is immediately followed by a td or th element, or if there is no more content in the parent element.
    //
    // <-- However, a start tag must never be omitted if it has any attributes.

    if (in_array($tag_name, self::$optional_end_tags, true)) {
      return true;
    }

    if ($tag_name !== 'li' && $tag_name !== 'p') {
      return false;
    }

    // Non-whitespace text between this element and the next element (or the end of the parent) would become part of
    // this element once its end tag is gone, so the end tag has to stay in that case.
    for (
        $sibling = $node->nextSibling;
        $sibling !== null && !($sibling instanceof \DOMElement);
        $sibling = $sibling->nextSibling
    ) {
      if ($sibling instanceof \DOMText && \trim($sibling->data) !== '') {
        return false;
      }
    }

    // Either null (no further element in the parent) or a \DOMElement.
    $nextSibling = $this->getNextSiblingOfTypeDOMElement($node);

    if ($tag_name === 'li') {
      return $nextSibling === null || $nextSibling->tagName === 'li';
    }

    // From here on: <p>
    if ($nextSibling === null) {
      $parent = $node->parentNode;
      if ($parent === null) {
        return false;
      }

      // "nodeName" exists on every node type, unlike "tagName" (the parent may be the document itself).
      $parentName = $parent->nodeName;

      return !in_array($parentName, array('a', 'audio', 'del', 'ins', 'map', 'noscript', 'video'), true)
             &&
             // autonomous custom elements always contain a hyphen in their name
             \strpos($parentName, '-') === false;
    }

    return in_array(
        $nextSibling->tagName,
        array(
            'address',
            'article',
            'aside',
            'blockquote',
            'details',
            'dir',
            'div',
            'dl',
            'fieldset',
            'figcaption',
            'figure',
            'footer',
            'form',
            'h1',
            'h2',
            'h3',
            'h4',
            'h5',
            'h6',
            'header',
            'hgroup',
            'hr',
            'main',
            'menu',
            'nav',
            'ol',
            'p',
            'pre',
            'section',
            'table',
            'ul',
        ),
        true
    );
  }
565 3
566 11
  /**
   * Serialize all child nodes of the given node (recursively) into an HTML string.
   *
   * - Elements are written with their attributes; the end tag is skipped for the tags in self::$selfClosingTags
   *   and whenever domNodeClosingTagOptional() allows it.
   * - Whitespace-only text is reduced to one space between two siblings and dropped at the edges of the parent.
   * - Comments are written back as "<!--...-->".
   * - The document type node is not written by this method.
   *
   * @param \DOMNode $node
   *
   * @return string
   *
   * @throws \Exception if a child node of an unsupported type is found
   */
  protected function domNodeToString(\DOMNode $node)
  {
    // init
    $htmlstr = '';

    foreach ($node->childNodes as $child) {

      if ($child instanceof \DOMDocumentType) {
        // the doctype is skipped here on purpose
        continue;
      }

      if ($child instanceof \DOMElement) {

        $htmlstr .= trim('<' . $child->tagName . ' ' . $this->domNodeAttributesToString($child));
        $htmlstr .= '>' . $this->domNodeToString($child);

        if (
            // void elements (e.g. <br>) must never get an end tag: "</br>" is parsed as a second <br>
            !in_array($child->tagName, self::$selfClosingTags, true)
            &&
            !$this->domNodeClosingTagOptional($child)
        ) {
          $htmlstr .= '</' . $child->tagName . '>';
        }

      } else if ($child instanceof \DOMText) {

        if ($child->isWhitespaceInElementContent()) {
          if (
              $child->previousSibling !== null
              &&
              $child->nextSibling !== null
          ) {
            $htmlstr .= ' ';
          }
        } else {
          $htmlstr .= $child->wholeText;
        }

      } else if ($child instanceof \DOMComment) {

        // DOMComment has no "wholeText" property; the content lives in "data" and needs its delimiters back.
        $htmlstr .= '<!--' . $child->data . '-->';

      } else {

        throw new \Exception('Error by: ' . print_r($child, true));

      }
    }

    return $htmlstr;
  }
613
614
  /**
   * Find the closest following sibling of the given node that is an element.
   *
   * @param \DOMNode $node
   *
   * @return \DOMElement|null null if no element follows the node
   */
  protected function getNextSiblingOfTypeDOMElement(\DOMNode $node)
  {
    $sibling = $node->nextSibling;

    // skip text nodes, comments, ... until an element shows up or the siblings run out
    while ($sibling !== null && !($sibling instanceof \DOMElement)) {
      $sibling = $sibling->nextSibling;
    }

    return $sibling;
  }
622
623
  /**
   * Check if the given string is a conditional comment.
   *
   * INFO: conditional comments are no longer supported as of IE 10
   *
   * <!--[if expression]> HTML <![endif]-->
   * <![if expression]> HTML <![endif]>
   *
   * @param string $comment
   *
   * @return bool true if the string starts with "[if ...]" or ends with "[endif]"
   */
  private function isConditionalComment($comment)
  {
    return (bool)preg_match('/^\[if [^\]]+\]/', $comment)
           ||
           (bool)preg_match('/\[endif\]$/', $comment);
  }
647
648
  /**
   * Minify the given HTML.
   *
   * Steps: optional DOM based minification, whitespace clean-up inside tags, restoring of the protected HTML and
   * HTML entities, and a final clean-up. If the result is longer than the trimmed input, the trimmed input is
   * returned instead.
   *
   * @param string $html
   * @param bool   $decodeUtf8Specials <p>Use this only in special cases, e.g. for PHP 5.3</p>
   *
   * @return string the minified HTML, or an empty string for empty / whitespace-only input
   */
  public function minify($html, $decodeUtf8Specials = false)
  {
    $html = trim((string)$html);

    // Strict comparison on purpose: a loose check would also treat the valid input "0" as empty.
    if ($html === '') {
      return '';
    }

    // reset
    $this->protectedChildNodes = array();

    // save old content
    $origHtml = $html;
    $origHtmlLength = UTF8::strlen($html);

    // -------------------------------------------------------------------------
    // Minify the HTML via "HtmlDomParser"
    // -------------------------------------------------------------------------

    if ($this->doOptimizeViaHtmlDomParser === true) {
      $html = $this->minifyHtmlDom($html, $decodeUtf8Specials);
    }

    // -------------------------------------------------------------------------
    // Trim whitespace from html-string. [protected html is still protected]
    // -------------------------------------------------------------------------

    // Remove extra white-space(s) between HTML attribute(s)
    $html = (string)\preg_replace_callback(
        '#<([^\/\s<>!]+)(?:\s+([^<>]*?)\s*|\s*)(\/?)>#',
        function ($matches) {
          return '<' . $matches[1] . (string)\preg_replace('#([^\s=]+)(\=([\'"]?)(.*?)\3)?(\s+|$)#s', ' $1$2', $matches[2]) . $matches[3] . '>';
        },
        $html
    );


    if ($this->doRemoveSpacesBetweenTags === true) {
      // Remove spaces that are between > and <
      $html = (string)\preg_replace('/(>) (<)/', '>$2', $html);
    }

    // -------------------------------------------------------------------------
    // Restore protected HTML-code.
    // -------------------------------------------------------------------------

    $html = (string)\preg_replace_callback(
        '/<(?<element>' . $this->protectedChildNodesHelper . ')(?<attributes> [^>]*)?>(?<value>.*?)<\/' . $this->protectedChildNodesHelper . '>/',
        array($this, 'restoreProtectedHtml'),
        $html
    );

    // -------------------------------------------------------------------------
    // Restore protected HTML-entities.
    // -------------------------------------------------------------------------

    if ($this->doOptimizeViaHtmlDomParser === true) {
      $html = HtmlDomParser::putReplacedBackToPreserveHtmlEntities($html);
    }

    // ------------------------------------
    // Final clean-up
    // ------------------------------------

    $html = UTF8::cleanup($html);

    // remove line breaks directly before / after the <html> and <head> tags
    $html = \str_replace(
        array(
            'html>' . "\n",
            "\n" . '<html',
            'html/>' . "\n",
            "\n" . '</html',
            'head>' . "\n",
            "\n" . '<head',
            'head/>' . "\n",
            "\n" . '</head',
        ),
        array(
            'html>',
            '<html',
            'html/>',
            '</html',
            'head>',
            '<head',
            'head/>',
            '</head',
        ),
        $html
    );

    // self closing tags, don't need a trailing slash ...
    $replace = array();
    $replacement = array();
    foreach (self::$selfClosingTags as $selfClosingTag) {
      $replace[] = '<' . $selfClosingTag . '/>';
      $replacement[] = '<' . $selfClosingTag . '>';
      $replace[] = '<' . $selfClosingTag . ' />';
      $replacement[] = '<' . $selfClosingTag . '>';
    }
    $html = \str_replace(
        $replace,
        $replacement,
        $html
    );

    // ------------------------------------
    // check if compression worked
    // ------------------------------------

    if ($origHtmlLength < UTF8::strlen($html)) {
      $html = $origHtml;
    }

    return $html;
  }
779
780
  /**
   * Minify the HTML with the help of the "HtmlDomParser" and return the serialized result.
   *
   * Order of the passes: protect tags, remove comments (optional), sum up whitespace (optional), then per element
   * attribute optimization (optional) and whitespace removal around tags (optional).
   *
   * @param string $html
   * @param bool   $decodeUtf8Specials passed through to HtmlDomParser::fixHtmlOutput()
   *
   * @return string
   */
  private function minifyHtmlDom($html, $decodeUtf8Specials)
  {
    // init dom
    $parser = new HtmlDomParser();
    $parser->getDocument()->preserveWhiteSpace = false; // remove redundant white space
    $parser->getDocument()->formatOutput = false; // do not format the output with indentation

    // load dom
    $parser->loadHtml($html);

    // Protect HTML tags and conditional comments.
    $parser = $this->protectTags($parser);

    // Remove default HTML comments. [protected html is still protected]
    if ($this->doRemoveComments === true) {
      $parser = $this->removeComments($parser);
    }

    // Sum-Up extra whitespace from the Dom. [protected html is still protected]
    if ($this->doSumUpWhitespace === true) {
      $parser = $this->sumUpWhitespace($parser);
    }

    foreach ($parser->find('*') as $domElement) {

      // Optimize html attributes. [protected html is still protected]
      if ($this->doOptimizeAttributes === true) {
        $this->optimizeAttributes($domElement);
      }

      // Remove whitespace around tags. [protected html is still protected]
      if ($this->doRemoveWhitespaceAroundTags === true) {
        $this->removeWhitespaceAroundTags($domElement);
      }
    }

    // Convert the Dom into a string.
    //
    // NOTE(review): static analysis reports fixHtmlOutput() as "protected" in HtmlDomParser, which would make
    // this call fail from here — confirm the visibility in the installed version of the parser.
    return $parser->fixHtmlOutput(
        $this->domNodeToString($parser->getDocument()),
        $decodeUtf8Specials
    );
  }
848
849
  /**
   * Optimize the attributes of a single element.
   *
   * Depending on the enabled options this method:
   * - strips a leading "http:" scheme from "href" / "src" / "action" values
   *   (unless the element has rel="external" or target="_blank"),
   * - removes every attribute for which "removeAttributeHelper()" returns true,
   * - sorts the css-class-names,
   * - re-adds the remaining attributes in alphabetical order, so that gzip can do better work.
   *
   * Attributes listed as keys in "self::$booleanAttributes" are left untouched.
   *
   * @param SimpleHtmlDom $element
   *
   * @return bool <p>false if "getAllAttributes()" returned null, true otherwise</p>
   */
  private function optimizeAttributes(SimpleHtmlDom $element)
  {
    $attributes = $element->getAllAttributes();
    if ($attributes === null) {
      return false;
    }

    $attrs = array();
    foreach ((array)$attributes as $attrName => $attrValue) {

      if (isset(self::$booleanAttributes[$attrName])) {
        continue;
      }

      // keep the untouched value, so that a modification can be detected later on
      $attrValueOriginal = $attrValue;

      // -------------------------------------------------------------------------
      // Remove optional "http:"-prefix from attributes.
      // -------------------------------------------------------------------------

      if ($this->doRemoveHttpPrefixFromAttributes === true) {
        if (
            ($attrName === 'href' || $attrName === 'src' || $attrName === 'action')
            &&
            !(isset($attributes['rel']) && $attributes['rel'] === 'external')
            &&
            !(isset($attributes['target']) && $attributes['target'] === '_blank')
            &&
            \strpos($attrValue, 'http://') === 0
        ) {
          // Only the scheme at the very start of the value is optional: a "http://"
          // further inside (e.g. in a query-string parameter) is data and must not
          // be rewritten. Dropping the 5 chars of "http:" leaves the "//..." form.
          $attrValue = \substr($attrValue, 5);
        }
      }

      if ($this->removeAttributeHelper($element->tag, $attrName, $attrValue, $attributes)) {
        $element->{$attrName} = null;
        continue;
      }

      // -------------------------------------------------------------------------
      // Sort css-class-names, for better gzip results.
      // -------------------------------------------------------------------------

      if ($this->doSortCssClassNames === true) {
        $attrValue = $this->sortCssClassNames($attrName, $attrValue);
      }

      if ($this->doSortHtmlAttributes === true) {
        // the attribute is removed here and re-added (sorted) after the loop
        $attrs[$attrName] = $attrValue;
        $element->{$attrName} = null;
      } elseif ($attrValue !== $attrValueOriginal) {
        // Without sorting nothing is re-added after the loop, so a modified value
        // has to be written back right away, otherwise the optimization is lost.
        // The value goes through the same helper call as the sorted attributes below.
        $element->setAttribute($attrName, HtmlDomParser::replaceToPreserveHtmlEntities($attrValue), true);
      }
    }

    // -------------------------------------------------------------------------
    // Sort html-attributes, for better gzip results.
    // -------------------------------------------------------------------------

    if ($this->doSortHtmlAttributes === true) {
      \ksort($attrs);
      foreach ($attrs as $attrName => $attrValue) {
        $attrValue = HtmlDomParser::replaceToPreserveHtmlEntities($attrValue);
        $element->setAttribute($attrName, $attrValue, true);
      }
    }

    return true;
  }
919
920
  /**
   * Swap the content of "script" / "style" elements and of conditional comments
   * for placeholders, so that the following minify-steps can not change them.
   *
   * Every original content is stored in "$this->protectedChildNodes" under a
   * running number, and the same number is written into the placeholder.
   *
   * @param HtmlDomParser $dom
   *
   * @return HtmlDomParser <p>the same dom instance that was passed in</p>
   */
  private function protectTags(HtmlDomParser $dom)
  {
    $helperTag = $this->protectedChildNodesHelper;
    $index = 0;

    foreach ($dom->find('script, style') as $tagElement) {

      $isScriptOrStyle = ($tagElement->tag === 'script' || $tagElement->tag === 'style');
      if ($isScriptOrStyle) {
        // elements with a "src"-attribute are skipped
        $tagAttributes = $tagElement->getAllAttributes();
        if (isset($tagAttributes['src'])) {
          continue;
        }
      }

      $this->protectedChildNodes[$index] = $tagElement->text();
      $tagElement->getNode()->nodeValue = '<' . $helperTag . ' data-' . $helperTag . '="' . $index . '"></' . $helperTag . '>';

      ++$index;
    }

    $dom->getDocument()->normalizeDocument();

    foreach ($dom->find('//comment()') as $commentElement) {
      $commentText = $commentElement->text();

      // only conditional comments are protected, every other comment is ignored here
      if ($this->isConditionalComment($commentText) === false) {
        continue;
      }

      $this->protectedChildNodes[$index] = '<!--' . $commentText . '-->';

      /* @var $commentNode \DOMComment */
      $commentNode = $commentElement->getNode();
      $placeholder = new \DOMText('<' . $helperTag . ' data-' . $helperTag . '="' . $index . '"></' . $helperTag . '>');
      $commentElement->getNode()->parentNode->replaceChild($placeholder, $commentNode);

      ++$index;
    }

    $dom->getDocument()->normalizeDocument();

    return $dom;
  }
972
973
  /**
   * Decide whether an attribute can be dropped from its element.
   *
   * Every rule is only evaluated if the matching "doRemove..."-option of this
   * instance is enabled; the first rule that matches wins.
   *
   * @param string $tag       <p>tag-name of the element</p>
   * @param string $attrName  <p>name of the attribute to check</p>
   * @param string $attrValue <p>value of the attribute to check</p>
   * @param array  $allAttr   <p>all attributes of the element (name => value)</p>
   *
   * @return bool <p>true if the attribute can be removed, false otherwise</p>
   */
  private function removeAttributeHelper($tag, $attrName, $attrValue, $allAttr)
  {
    // attributes that only repeat a default value
    if ($this->doRemoveDefaultAttributes === true) {
      $isDefaultValue = ($tag === 'script' && $attrName === 'language' && $attrValue === 'javascript')
                        ||
                        ($tag === 'form' && $attrName === 'method' && $attrValue === 'get')
                        ||
                        ($tag === 'input' && $attrName === 'type' && $attrValue === 'text')
                        ||
                        ($tag === 'area' && $attrName === 'shape' && $attrValue === 'rect');

      if ($isDefaultValue) {
        return true;
      }
    }

    // deprecated charset-attribute on inline scripts (the browser will use the charset from the HTTP-Header, anyway)
    if (
        $this->doRemoveDeprecatedScriptCharsetAttribute === true
        &&
        $tag === 'script' && $attrName === 'charset' && !isset($allAttr['src'])
    ) {
      return true;
    }

    // deprecated anchor-jump: the "name" only repeats the "id"
    if (
        $this->doRemoveDeprecatedAnchorName === true
        &&
        $tag === 'a' && $attrName === 'name' && isset($allAttr['id']) && $allAttr['id'] === $attrValue
    ) {
      return true;
    }

    // "type=text/css" on stylesheet links
    if (
        $this->doRemoveDeprecatedTypeFromStylesheetLink === true
        &&
        $tag === 'link' && $attrName === 'type' && $attrValue === 'text/css' && isset($allAttr['rel']) && $allAttr['rel'] === 'stylesheet'
    ) {
      return true;
    }

    // deprecated script-mime-types on external scripts
    if (
        $this->doRemoveDeprecatedTypeFromScriptTag === true
        &&
        $tag === 'script' && $attrName === 'type' && isset($allAttr['src'], self::$executableScriptsMimeTypes[$attrValue])
    ) {
      return true;
    }

    // 'value=""' on <input type="text">
    if (
        $this->doRemoveValueFromEmptyInput === true
        &&
        $tag === 'input' && $attrName === 'value' && $attrValue === '' && isset($allAttr['type']) && $allAttr['type'] === 'text'
    ) {
      return true;
    }

    // some attributes are useless if they are empty
    if (
        $this->doRemoveEmptyAttributes === true
        &&
        \trim($attrValue) === ''
        &&
        \preg_match('/^(?:class|id|style|title|lang|dir|on(?:focus|blur|change|click|dblclick|mouse(?:down|up|over|move|out)|key(?:press|down|up)))$/', $attrName)
    ) {
      return true;
    }

    return false;
  }
1049
1050
  /**
   * Remove the comment-nodes from the dom.
   *
   * Comments that contain a "[" are kept (presumably to preserve conditional
   * comments like "[if IE]").
   *
   * @param HtmlDomParser $dom
   *
   * @return HtmlDomParser <p>the same dom instance that was passed in</p>
   */
  private function removeComments(HtmlDomParser $dom)
  {
    foreach ($dom->find('//comment()') as $commentElement) {
      $commentNode = $commentElement->getNode();
      $commentText = $commentNode->nodeValue;

      if (\strpos($commentText, '[') !== false) {
        continue;
      }

      $commentNode->parentNode->removeChild($commentNode);
    }

    $dom->getDocument()->normalizeDocument();

    return $dom;
  }
1071
1072
  /**
   * Collapse the whitespace at the edges of an element.
   *
   * Only elements whose tag-name is a key of "self::$trimWhitespaceFromTags" and
   * that have at least one child-node are processed. For those, the first child,
   * the last child, the previous sibling and the next sibling are checked: if
   * such a node is a text-node, every match of "self::$regExSpace" in it is
   * replaced by a single space.
   *
   * @param SimpleHtmlDom $element
   *
   * @return void
   */
  private function removeWhitespaceAroundTags(SimpleHtmlDom $element)
  {
    if (!isset(self::$trimWhitespaceFromTags[$element->tag])) {
      return;
    }

    $node = $element->getNode();
    if ($node->childNodes->length <= 0) {
      return;
    }

    $candidates = array(
        $node->firstChild,
        $node->lastChild,
        $node->previousSibling,
        $node->nextSibling,
    );

    // the candidates are objects, so there is no need to iterate by reference
    foreach ($candidates as $candidate) {
      if ($candidate === null || $candidate->nodeType !== \XML_TEXT_NODE) {
        continue;
      }

      // "preg_replace()" returns null on error (e.g. invalid UTF-8 with the "u"-modifier),
      // in that case the text is kept as it is instead of being wiped
      $collapsed = \preg_replace(self::$regExSpace, ' ', $candidate->nodeValue);
      if ($collapsed !== null) {
        $candidate->nodeValue = $collapsed;
      }
    }
  }
1103
1104
  /**
   * Callback function for preg_replace_callback use.
   *
   * Reads the number between the last pair of double quotes in
   * "$matches['attributes']" and returns the content that was stored under this
   * number in "$this->protectedChildNodes".
   *
   * @param array $matches <p>PREG matches, the named group "attributes" is used</p>
   *
   * @return string <p>the protected html, or an empty string if nothing was stored for the placeholder</p>
   */
  private function restoreProtectedHtml($matches)
  {
    // without a usable id there is nothing to restore (this also prevents an
    // "undefined index"-notice for a non-matching placeholder)
    if (
        !isset($matches['attributes'])
        ||
        \preg_match('/.*"(?<id>\d*)"/', $matches['attributes'], $matchesInner) !== 1
    ) {
      return '';
    }

    $id = $matchesInner['id'];
    if (!isset($this->protectedChildNodes[$id])) {
      return '';
    }

    return (string)$this->protectedChildNodes[$id];
  }
1122
1123
  /**
   * Store the given domain-list in "$this->domainsToRemoveHttpPrefixFromAttributes".
   *
   * @param array $domainsToRemoveHttpPrefixFromAttributes
   *
   * @return $this <p>fluent interface</p>
   */
  public function setDomainsToRemoveHttpPrefixFromAttributes($domainsToRemoveHttpPrefixFromAttributes)
  {
    $this->domainsToRemoveHttpPrefixFromAttributes = $domainsToRemoveHttpPrefixFromAttributes;

    return $this;
  }
1134
1135
  /**
   * Sort and de-duplicate the css-class-names of a "class"-attribute.
   *
   * Every other attribute, and an empty value, is returned unchanged. The
   * class-names are split on the whitespace characters that HTML allows between
   * tokens (space, tab, line feed, form feed, carriage return) and are joined
   * with a single space again.
   *
   * @param string $attrName  <p>name of the attribute</p>
   * @param string $attrValue <p>value of the attribute</p>
   *
   * @return string
   */
  private function sortCssClassNames($attrName, $attrValue)
  {
    if ($attrName !== 'class' || !$attrValue) {
      return $attrValue;
    }

    // "PREG_SPLIT_NO_EMPTY" drops only really empty tokens, so that a class
    // named "0" survives (a plain truthiness check would remove it)
    $classes = \preg_split('/[ \t\n\f\r]+/', $attrValue, -1, \PREG_SPLIT_NO_EMPTY);
    if ($classes === false) {
      // keep the value as it is, if the split failed
      return $attrValue;
    }

    $classes = \array_unique($classes);
    \sort($classes);

    return \implode(' ', $classes);
  }
1165
1166
  /**
   * Sum-up extra whitespace from dom-nodes.
   *
   * In every text-node of the dom, each match of "self::$regExSpace" is replaced
   * by a single space. A text-node is skipped if its node-path contains "/"
   * followed by one of the entries of "self::$skipTagsForRemoveWhitespace".
   *
   * @param HtmlDomParser $dom
   *
   * @return HtmlDomParser <p>the same dom instance that was passed in</p>
   */
  private function sumUpWhitespace(HtmlDomParser $dom)
  {
    $textnodes = $dom->find('//text()');
    foreach ($textnodes as $textnodeWrapper) {
      /* @var $textnode \DOMNode */
      $textnode = $textnodeWrapper->getNode();
      // "getNodePath()" can return null, the cast keeps "strpos()" on a string
      $xp = (string)$textnode->getNodePath();

      $doSkip = false;
      foreach (self::$skipTagsForRemoveWhitespace as $pattern) {
        if (\strpos($xp, "/$pattern") !== false) {
          $doSkip = true;
          break;
        }
      }
      if ($doSkip) {
        continue;
      }

      // "preg_replace()" returns null on error (e.g. invalid UTF-8 with the "u"-modifier),
      // in that case the text is kept as it is instead of being wiped
      $collapsed = \preg_replace(self::$regExSpace, ' ', $textnode->nodeValue);
      if ($collapsed !== null) {
        $textnode->nodeValue = $collapsed;
      }
    }

    $dom->getDocument()->normalizeDocument();

    return $dom;
  }
1199
}
1200