Completed
Push — master ( ecebd3...709799 )
by Lars
01:29
created

HtmlMin::domNodeClosingTagOptional()   D

Complexity

Conditions 16
Paths 209

Size

Total Lines 137
Code Lines 72

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 25
CRAP Score 16

Importance

Changes 0
Metric Value
dl 0
loc 137
ccs 25
cts 25
cp 1
rs 4.4123
c 0
b 0
f 0
cc 16
eloc 72
nc 209
nop 1
crap 16

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

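Commonly applied refactorings include Extract Method; when the method carries many parameters or temporary variables, Replace Temp with Query, Introduce Parameter Object and Preserve Whole Object can help as well.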

1
<?php
2
3
declare(strict_types=1);
4
5
namespace voku\helper;
6
7
/**
8
 * Class HtmlMin
9
 *
10
 * Inspired by:
11
 * - JS: https://github.com/kangax/html-minifier/blob/gh-pages/src/htmlminifier.js
12
 * - PHP: https://github.com/searchturbine/phpwee-php-minifier
13
 * - PHP: https://github.com/WyriHaximus/HtmlCompress
14
 * - PHP: https://github.com/zaininnari/html-minifier
15
 * - Java: https://code.google.com/archive/p/htmlcompressor/
16
 *
17
 * @package voku\helper
18
 */
19
class HtmlMin
20
{
21
  /**
22
   * @var string
23
   */
24
  private static $regExSpace = "/[[:space:]]{2,}|[\r\n]+/u";
25
26
  /**
27
   * @var array
28
   */
29
  private static $optional_end_tags = [
30
      'html',
31
      'head',
32
      'body',
33
  ];
34
35
  /**
36
   * // https://mathiasbynens.be/demo/javascript-mime-type
37
   * // https://developer.mozilla.org/en/docs/Web/HTML/Element/script#attr-type
38
   *
39
   * @var array
40
   */
41
  private static $executableScriptsMimeTypes = [
42
      'text/javascript'          => '',
43
      'text/ecmascript'          => '',
44
      'text/jscript'             => '',
45
      'application/javascript'   => '',
46
      'application/x-javascript' => '',
47
      'application/ecmascript'   => '',
48
  ];
49
50
  private static $selfClosingTags = [
51
      'area',
52
      'base',
53
      'basefont',
54
      'br',
55
      'col',
56
      'command',
57
      'embed',
58
      'frame',
59
      'hr',
60
      'img',
61
      'input',
62
      'isindex',
63
      'keygen',
64
      'link',
65
      'meta',
66
      'param',
67
      'source',
68
      'track',
69
      'wbr',
70
  ];
71
72
  private static $trimWhitespaceFromTags = [
73
      'article' => '',
74
      'br'      => '',
75
      'div'     => '',
76
      'footer'  => '',
77
      'hr'      => '',
78
      'nav'     => '',
79
      'p'       => '',
80
      'script'  => '',
81
  ];
82
83
  /**
84
   * @var array
85
   */
86
  private static $booleanAttributes = [
87
      'allowfullscreen' => '',
88
      'async'           => '',
89
      'autofocus'       => '',
90
      'autoplay'        => '',
91
      'checked'         => '',
92
      'compact'         => '',
93
      'controls'        => '',
94
      'declare'         => '',
95
      'default'         => '',
96
      'defaultchecked'  => '',
97
      'defaultmuted'    => '',
98
      'defaultselected' => '',
99
      'defer'           => '',
100
      'disabled'        => '',
101
      'enabled'         => '',
102
      'formnovalidate'  => '',
103
      'hidden'          => '',
104
      'indeterminate'   => '',
105
      'inert'           => '',
106
      'ismap'           => '',
107
      'itemscope'       => '',
108
      'loop'            => '',
109
      'multiple'        => '',
110
      'muted'           => '',
111
      'nohref'          => '',
112
      'noresize'        => '',
113
      'noshade'         => '',
114
      'novalidate'      => '',
115
      'nowrap'          => '',
116
      'open'            => '',
117
      'pauseonexit'     => '',
118
      'readonly'        => '',
119
      'required'        => '',
120
      'reversed'        => '',
121
      'scoped'          => '',
122
      'seamless'        => '',
123
      'selected'        => '',
124
      'sortable'        => '',
125
      'truespeed'       => '',
126
      'typemustmatch'   => '',
127
      'visible'         => '',
128
  ];
129
  /**
130
   * @var array
131
   */
132
  private static $skipTagsForRemoveWhitespace = [
133
      'code',
134
      'pre',
135
      'script',
136
      'style',
137
      'textarea',
138
  ];
139
140
  /**
141
   * @var array
142
   */
143
  private $protectedChildNodes = [];
144
145
  /**
146
   * @var string
147
   */
148
  private $protectedChildNodesHelper = 'html-min--voku--saved-content';
149
150
  /**
151
   * @var bool
152
   */
153
  private $doOptimizeViaHtmlDomParser = true;
154
155
  /**
156
   * @var bool
157
   */
158
  private $doOptimizeAttributes = true;
159
160
  /**
161
   * @var bool
162
   */
163
  private $doRemoveComments = true;
164
165
  /**
166
   * @var bool
167
   */
168
  private $doRemoveWhitespaceAroundTags = true;
169
170
  /**
171
   * @var bool
172
   */
173
  private $doRemoveHttpPrefixFromAttributes = false;
174
175
176
  /**
177
   * @var array
178
   */
179
  private $domainsToRemoveHttpPrefixFromAttributes = [
180
      'google.com',
181
      'google.de',
182
  ];
183
184
  /**
185
   * @var bool
186
   */
187
  private $doSortCssClassNames = true;
188
189
  /**
190
   * @var bool
191
   */
192
  private $doSortHtmlAttributes = true;
193
194
  /**
195
   * @var bool
196
   */
197
  private $doRemoveDeprecatedScriptCharsetAttribute = true;
198
199
  /**
200
   * @var bool
201
   */
202
  private $doRemoveDefaultAttributes = false;
203
204
  /**
205
   * @var bool
206
   */
207
  private $doRemoveDeprecatedAnchorName = true;
208
209
  /**
210
   * @var bool
211
   */
212
  private $doRemoveDeprecatedTypeFromStylesheetLink = true;
213
214
  /**
215
   * @var bool
216
   */
217
  private $doRemoveDeprecatedTypeFromScriptTag = true;
218
219
  /**
220
   * @var bool
221
   */
222
  private $doRemoveValueFromEmptyInput = true;
223
224
  /**
225
   * @var bool
226
   */
227
  private $doRemoveEmptyAttributes = true;
228
229
  /**
230
   * @var bool
231
   */
232
  private $doSumUpWhitespace = true;
233
234
  /**
235
   * @var bool
236
   */
237
  private $doRemoveSpacesBetweenTags = false;
238
239
  /**
   * HtmlMin constructor.
   *
   * Intentionally empty: every option is initialised by its property default
   * and can be changed afterwards through the fluent do*() setters.
   */
  public function __construct()
  {
  }
245
246
  /**
   * Enable or disable the per-element attribute optimization.
   *
   * When enabled, minifyHtmlDom() calls optimizeAttributes() for every element.
   *
   * @param bool $doOptimizeAttributes
   *
   * @return $this
   */
  public function doOptimizeAttributes(bool $doOptimizeAttributes = true)
  {
    $this->doOptimizeAttributes = $doOptimizeAttributes;

    return $this;
  }
257
258
  /**
   * Enable or disable the DOM based optimization.
   *
   * When disabled, minify() skips minifyHtmlDom() as well as the HTML-entity
   * restore step and only applies its string-level clean-ups.
   *
   * @param bool $doOptimizeViaHtmlDomParser
   *
   * @return $this
   */
  public function doOptimizeViaHtmlDomParser(bool $doOptimizeViaHtmlDomParser = true)
  {
    $this->doOptimizeViaHtmlDomParser = $doOptimizeViaHtmlDomParser;

    return $this;
  }
269
270
  /**
   * Enable or disable the removal of HTML comments.
   *
   * When enabled, minifyHtmlDom() runs removeComments() on the parsed document.
   *
   * @param bool $doRemoveComments
   *
   * @return $this
   */
  public function doRemoveComments(bool $doRemoveComments = true)
  {
    $this->doRemoveComments = $doRemoveComments;

    return $this;
  }
281
282
  /**
   * Set the "remove default attributes" option.
   *
   * Note: the property itself defaults to false, whereas calling this setter
   * without an argument switches the option on.
   *
   * @param bool $doRemoveDefaultAttributes
   *
   * @return $this
   */
  public function doRemoveDefaultAttributes(bool $doRemoveDefaultAttributes = true)
  {
    $this->doRemoveDefaultAttributes = $doRemoveDefaultAttributes;

    return $this;
  }
293
294
  /**
   * Set the "remove deprecated anchor name" option.
   *
   * @param bool $doRemoveDeprecatedAnchorName
   *
   * @return $this
   */
  public function doRemoveDeprecatedAnchorName(bool $doRemoveDeprecatedAnchorName = true)
  {
    $this->doRemoveDeprecatedAnchorName = $doRemoveDeprecatedAnchorName;

    return $this;
  }
305
306
  /**
   * Set the "remove deprecated script charset attribute" option.
   *
   * @param bool $doRemoveDeprecatedScriptCharsetAttribute
   *
   * @return $this
   */
  public function doRemoveDeprecatedScriptCharsetAttribute(bool $doRemoveDeprecatedScriptCharsetAttribute = true)
  {
    $this->doRemoveDeprecatedScriptCharsetAttribute = $doRemoveDeprecatedScriptCharsetAttribute;

    return $this;
  }
317
318
  /**
   * Set the "remove deprecated type from script tag" option.
   *
   * @param bool $doRemoveDeprecatedTypeFromScriptTag
   *
   * @return $this
   */
  public function doRemoveDeprecatedTypeFromScriptTag(bool $doRemoveDeprecatedTypeFromScriptTag = true)
  {
    $this->doRemoveDeprecatedTypeFromScriptTag = $doRemoveDeprecatedTypeFromScriptTag;

    return $this;
  }
329
330
  /**
   * Set the "remove deprecated type from stylesheet link" option.
   *
   * @param bool $doRemoveDeprecatedTypeFromStylesheetLink
   *
   * @return $this
   */
  public function doRemoveDeprecatedTypeFromStylesheetLink(bool $doRemoveDeprecatedTypeFromStylesheetLink = true)
  {
    $this->doRemoveDeprecatedTypeFromStylesheetLink = $doRemoveDeprecatedTypeFromStylesheetLink;

    return $this;
  }
341
342
  /**
   * Set the "remove empty attributes" option.
   *
   * @param bool $doRemoveEmptyAttributes
   *
   * @return $this
   */
  public function doRemoveEmptyAttributes(bool $doRemoveEmptyAttributes = true)
  {
    $this->doRemoveEmptyAttributes = $doRemoveEmptyAttributes;

    return $this;
  }
353
354
  /**
   * Enable or disable the removal of the "http:" prefix from attribute values.
   *
   * When enabled, optimizeAttributes() rewrites "http://" to "//" in "href",
   * "src" and "action" values, unless the element carries rel="external" or
   * target="_blank".
   *
   * Note: the property itself defaults to false, whereas calling this setter
   * without an argument switches the option on.
   *
   * @param bool $doRemoveHttpPrefixFromAttributes
   *
   * @return $this
   */
  public function doRemoveHttpPrefixFromAttributes(bool $doRemoveHttpPrefixFromAttributes = true)
  {
    $this->doRemoveHttpPrefixFromAttributes = $doRemoveHttpPrefixFromAttributes;

    return $this;
  }
365
366
  /**
   * Enable or disable the removal of spaces between tags.
   *
   * When enabled, minify() removes a single space that sits between ">" and "<".
   *
   * Note: the property itself defaults to false, whereas calling this setter
   * without an argument switches the option on.
   *
   * @param bool $doRemoveSpacesBetweenTags
   *
   * @return $this
   */
  public function doRemoveSpacesBetweenTags(bool $doRemoveSpacesBetweenTags = true)
  {
    $this->doRemoveSpacesBetweenTags = $doRemoveSpacesBetweenTags;

    return $this;
  }
377
378
  /**
   * Set the "remove value from empty input" option.
   *
   * @param bool $doRemoveValueFromEmptyInput
   *
   * @return $this
   */
  public function doRemoveValueFromEmptyInput(bool $doRemoveValueFromEmptyInput = true)
  {
    $this->doRemoveValueFromEmptyInput = $doRemoveValueFromEmptyInput;

    return $this;
  }
389
390
  /**
   * Enable or disable the removal of whitespace around tags.
   *
   * When enabled, minifyHtmlDom() calls removeWhitespaceAroundTags() for every
   * element.
   *
   * @param bool $doRemoveWhitespaceAroundTags
   *
   * @return $this
   */
  public function doRemoveWhitespaceAroundTags(bool $doRemoveWhitespaceAroundTags = true)
  {
    $this->doRemoveWhitespaceAroundTags = $doRemoveWhitespaceAroundTags;

    return $this;
  }
401
402
  /**
   * Set the "sort css class names" option.
   *
   * @param bool $doSortCssClassNames
   *
   * @return $this
   */
  public function doSortCssClassNames(bool $doSortCssClassNames = true)
  {
    $this->doSortCssClassNames = $doSortCssClassNames;

    return $this;
  }
413
414
  /**
   * Set the "sort html attributes" option.
   *
   * @param bool $doSortHtmlAttributes
   *
   * @return $this
   */
  public function doSortHtmlAttributes(bool $doSortHtmlAttributes = true)
  {
    $this->doSortHtmlAttributes = $doSortHtmlAttributes;

    return $this;
  }
425
426
  /**
   * Enable or disable the whitespace sum-up pass.
   *
   * When enabled, minifyHtmlDom() runs sumUpWhitespace() on the parsed document.
   *
   * @param bool $doSumUpWhitespace
   *
   * @return $this
   */
  public function doSumUpWhitespace(bool $doSumUpWhitespace = true)
  {
    $this->doSumUpWhitespace = $doSumUpWhitespace;

    return $this;
  }
437
438 21
  /**
   * Serialize the attributes of a node into a space separated string.
   *
   * - Boolean attributes (see self::$booleanAttributes) are written as the bare
   *   attribute name.
   * - Quotes around a value are omitted when allowed
   *   (<p class="foo"> → <p class=foo>), see
   *   http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html#attributes-0
   * - A value that has to be quoted is wrapped in double quotes; a double quote
   *   inside the value is written as "&quot;" so it cannot end the attribute
   *   early.
   *
   * @param \DOMNode $node
   *
   * @return string the serialized attributes without leading or trailing
   *                whitespace, or an empty string if there are none
   */
  private function domNodeAttributesToString(\DOMNode $node): string
  {
    if ($node->attributes === null) {
      return '';
    }

    $parts = [];
    foreach ($node->attributes as $attribute) {
      if (isset(self::$booleanAttributes[$attribute->name])) {
        $parts[] = $attribute->name;
        continue;
      }

      $attr_val = $attribute->value;

      // Strict "=== 0": if preg_match() fails it returns false, and then the
      // value is quoted, which is the safe choice.
      $omitquotes = $attr_val !== '' && \preg_match('/["\'=<>` \t\r\n\f]+/', $attr_val) === 0;

      if ($omitquotes) {
        $parts[] = $attribute->name . '=' . $attr_val;
      } else {
        $parts[] = $attribute->name . '="' . \str_replace('"', '&quot;', $attr_val) . '"';
      }
    }

    return \implode(' ', $parts);
  }
462
463 21
  /**
   * Decide whether the closing tag of the given element may be left out.
   *
   * Rules are taken from
   * https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-omission
   *
   * The closing tag is never reported as optional for <li>, <td> and <p> when
   * non-whitespace text follows the element before the next element sibling:
   * without the closing tag that text would become part of the element.
   *
   * @param \DOMNode $node
   *
   * @return bool true if the closing tag can be omitted; always false for
   *              nodes that are not elements
   */
  private function domNodeClosingTagOptional(\DOMNode $node): bool
  {
    // Only elements have a tag name (and a closing tag).
    if (!$node instanceof \DOMElement) {
      return false;
    }

    $tag_name = $node->tagName;

    // Implemented:
    //
    // A <p> element's end tag may be omitted if the p element is immediately followed by an address, article, aside, blockquote, details, div, dl, fieldset, figcaption, figure, footer, form, h1, h2, h3, h4, h5, h6, header, hgroup, hr, main, menu, nav, ol, p, pre, section, table, or ul element, or if there is no more content in the parent element and the parent element is an HTML element that is not an a, audio, del, ins, map, noscript, or video element, or an autonomous custom element.
    // An <li> element's end tag may be omitted if the li element is immediately followed by another li element or if there is no more content in the parent element.
    // A <td> element's end tag may be omitted if the td element is immediately followed by a td or th element, or if there is no more content in the parent element.

    // TODO:
    //
    // <html> may be omitted if first thing inside is not comment
    // <head> may be omitted if first thing inside is an element
    // <body> may be omitted if first thing inside is not space, comment, <meta>, <link>, <script>, <style> or <template>
    // <colgroup> may be omitted if first thing inside is <col>
    // <tbody> may be omitted if first thing inside is <tr>
    // A <dt> element's end tag may be omitted if the dt element is immediately followed by another dt element or a dd element.
    // A <dd> element's end tag may be omitted if the dd element is immediately followed by another dd element or a dt element, or if there is no more content in the parent element.
    // An <rp> element's end tag may be omitted if the rp element is immediately followed by an rt or rp element, or if there is no more content in the parent element.
    // An <optgroup> element's end tag may be omitted if the optgroup element is immediately followed by another optgroup element, or if there is no more content in the parent element.
    // An <option> element's end tag may be omitted if the option element is immediately followed by another option element, or if it is immediately followed by an optgroup element, or if there is no more content in the parent element.
    // A <colgroup> element's start tag may be omitted if the first thing inside the colgroup element is a col element, and if the element is not immediately preceded by another colgroup element whose end tag has been omitted. (It can't be omitted if the element is empty.)
    // A <colgroup> element's end tag may be omitted if the colgroup element is not immediately followed by ASCII whitespace or a comment.
    // A <caption> element's end tag may be omitted if the caption element is not immediately followed by ASCII whitespace or a comment.
    // A <thead> element's end tag may be omitted if the thead element is immediately followed by a tbody or tfoot element.
    // A <tbody> element's start tag may be omitted if the first thing inside the tbody element is a tr element, and if the element is not immediately preceded by a tbody, thead, or tfoot element whose end tag has been omitted. (It can't be omitted if the element is empty.)
    // A <tbody> element's end tag may be omitted if the tbody element is immediately followed by a tbody or tfoot element, or if there is no more content in the parent element.
    // A <tfoot> element's end tag may be omitted if there is no more content in the parent element.
    // A <tr> element's end tag may be omitted if the tr element is immediately followed by another tr element, or if there is no more content in the parent element.
    // A <th> element's end tag may be omitted if the th element is immediately followed by a td or th element, or if there is no more content in the parent element.
    //
    // <-- However, a start tag must never be omitted if it has any attributes.

    if (\in_array($tag_name, self::$optional_end_tags, true)) {
      return true;
    }

    if ($tag_name !== 'li' && $tag_name !== 'td' && $tag_name !== 'p') {
      return false;
    }

    // Text between this element and the next element sibling counts as
    // "content": keep the closing tag so the text is not pulled into the element.
    for ($sibling = $node->nextSibling; $sibling !== null && !$sibling instanceof \DOMElement; $sibling = $sibling->nextSibling) {
      if ($sibling instanceof \DOMText && \trim($sibling->data, " \t\n\r\f") !== '') {
        return false;
      }
    }

    $nextSibling = $this->getNextSiblingOfTypeDOMElement($node);
    $nextTagName = $nextSibling instanceof \DOMElement ? $nextSibling->tagName : null;

    if ($tag_name === 'li') {
      return $nextSibling === null || $nextTagName === 'li';
    }

    if ($tag_name === 'td') {
      return $nextSibling === null || $nextTagName === 'td' || $nextTagName === 'th';
    }

    // From here on: <p>

    if ($nextSibling === null) {
      $parent = $node->parentNode;
      if ($parent === null) {
        return false;
      }

      // A parent that is not an element (e.g. the document) has no tag name
      // and therefore never matches the exclusion list below.
      $parentTagName = $parent instanceof \DOMElement ? $parent->tagName : null;

      return !\in_array(
          $parentTagName,
          [
              'a',
              'audio',
              'del',
              'ins',
              'map',
              'noscript',
              'video',
          ],
          true
      );
    }

    return \in_array(
        $nextTagName,
        [
            'address',
            'article',
            'aside',
            'blockquote',
            'details',
            'dir',
            'div',
            'dl',
            'fieldset',
            'figcaption',
            'figure',
            'footer',
            'form',
            'h1',
            'h2',
            'h3',
            'h4',
            'h5',
            'h6',
            'header',
            'hgroup',
            'hr',
            'main',
            'menu',
            'nav',
            'ol',
            'p',
            'pre',
            'section',
            'table',
            'ul',
        ],
        true
    );
  }
600
601 21
  /**
   * Serialize the children of a node (recursively) into an HTML string.
   *
   * - Doctype nodes are skipped.
   * - Elements are written as start tag, serialized children and, unless
   *   domNodeClosingTagOptional() allows omitting it, the closing tag.
   * - Whitespace-only text nodes collapse to a single space, and only when
   *   they have both a previous and a next sibling; other text is written
   *   as it is.
   * - Comments are written as "<!--...-->".
   *
   * @param \DOMNode $node
   *
   * @return string
   *
   * @throws \Exception if a child node of an unsupported type is found
   */
  protected function domNodeToString(\DOMNode $node): string
  {
    // init
    $htmlstr = '';

    foreach ($node->childNodes as $child) {

      if ($child instanceof \DOMDocumentType) {
        // The doctype is not written by this method.
        continue;
      }

      if ($child instanceof \DOMElement) {
        $htmlstr .= trim('<' . $child->tagName . ' ' . $this->domNodeAttributesToString($child));
        $htmlstr .= '>' . $this->domNodeToString($child);

        if (!$this->domNodeClosingTagOptional($child)) {
          $htmlstr .= '</' . $child->tagName . '>';
        }

        continue;
      }

      if ($child instanceof \DOMText) {
        if (!$child->isWhitespaceInElementContent()) {
          // "data" is the text of this node only; "wholeText" would also
          // include adjacent text nodes, which the loop visits on their own.
          $htmlstr .= $child->data;
        } elseif ($child->previousSibling !== null && $child->nextSibling !== null) {
          $htmlstr .= ' ';
        }

        continue;
      }

      if ($child instanceof \DOMComment) {
        // DOMComment has no "wholeText"; its content lives in "data" and
        // does not include the comment delimiters.
        $htmlstr .= '<!--' . $child->data . '-->';

        continue;
      }

      throw new \Exception('Error by: ' . print_r($child, true));
    }

    return $htmlstr;
  }
648
649
  /**
   * Find the next sibling of the given node that is an element.
   *
   * Siblings of any other type (text, comments, ...) are skipped.
   *
   * @param \DOMNode $node
   *
   * @return \DOMElement|null the next element sibling, or null if there is none
   */
  protected function getNextSiblingOfTypeDOMElement(\DOMNode $node)
  {
    $sibling = $node->nextSibling;

    while ($sibling !== null && !$sibling instanceof \DOMElement) {
      $sibling = $sibling->nextSibling;
    }

    return $sibling;
  }
662
663
  /**
   * Check if the given string is a conditional comment.
   *
   * INFO: conditional comments are no longer supported since IE 10
   *
   * <!--[if expression]> HTML <![endif]-->
   * <![if expression]> HTML <![endif]>
   *
   * The string counts as a conditional comment if it starts with
   * "[if ...]" or ends with "[endif]".
   *
   * @param string $comment
   *
   * @return bool
   */
  private function isConditionalComment(string $comment): bool
  {
    return \preg_match('/^\[if [^\]]+\]/', $comment) === 1
           ||
           \preg_match('/\[endif\]$/', $comment) === 1;
  }
687
688
  /**
   * Minify the given HTML.
   *
   * Steps:
   * 1. optional DOM based optimization (see minifyHtmlDom())
   * 2. normalize the whitespace between the attributes of each tag
   * 3. optionally remove single spaces between tags
   * 4. restore protected HTML and (after a DOM run) protected HTML entities
   * 5. final string clean-ups (line breaks around html/head tags, trailing
   *    slash of self closing tags)
   *
   * If the result is longer than the trimmed input, the trimmed input is
   * returned instead.
   *
   * @param string $html
   * @param bool   $decodeUtf8Specials <p>Use this only in special cases, e.g. for PHP 5.3</p>
   *
   * @return string
   */
  public function minify($html, $decodeUtf8Specials = false): string
  {
    // Compare against '' explicitly: a loose check would also drop the input "0".
    $html = \trim((string)$html);
    if ($html === '') {
      return '';
    }

    // reset
    $this->protectedChildNodes = [];

    // save old content
    $origHtml = $html;
    $origHtmlLength = UTF8::strlen($html);

    // -------------------------------------------------------------------------
    // Minify the HTML via "HtmlDomParser"
    // -------------------------------------------------------------------------

    if ($this->doOptimizeViaHtmlDomParser === true) {
      $html = $this->minifyHtmlDom($html, $decodeUtf8Specials);
    }

    // -------------------------------------------------------------------------
    // Trim whitespace from html-string. [protected html is still protected]
    // -------------------------------------------------------------------------

    // Remove extra white-space(s) between HTML attribute(s)
    //
    // preg_* functions return null on failure; in that case the unmodified
    // input is kept instead of replacing the document with an empty string.
    $html = \preg_replace_callback(
        '#<([^\/\s<>!]+)(?:\s+([^<>]*?)\s*|\s*)(\/?)>#',
        static function ($matches) {
          $attributes = \preg_replace('#([^\s=]+)(\=([\'"]?)(.*?)\3)?(\s+|$)#s', ' $1$2', $matches[2]);
          if ($attributes === null) {
            return $matches[0];
          }

          return '<' . $matches[1] . $attributes . $matches[3] . '>';
        },
        $html
    ) ?? $html;

    if ($this->doRemoveSpacesBetweenTags === true) {
      // Remove spaces that are between > and <
      $html = \preg_replace('/(>) (<)/', '>$2', $html) ?? $html;
    }

    // -------------------------------------------------------------------------
    // Restore protected HTML-code.
    // -------------------------------------------------------------------------

    $html = \preg_replace_callback(
        '/<(?<element>' . $this->protectedChildNodesHelper . ')(?<attributes> [^>]*)?>(?<value>.*?)<\/' . $this->protectedChildNodesHelper . '>/',
        [$this, 'restoreProtectedHtml'],
        $html
    ) ?? $html;

    // -------------------------------------------------------------------------
    // Restore protected HTML-entities.
    // -------------------------------------------------------------------------

    if ($this->doOptimizeViaHtmlDomParser === true) {
      $html = HtmlDomParser::putReplacedBackToPreserveHtmlEntities($html);
    }

    // ------------------------------------
    // Final clean-up
    // ------------------------------------

    $html = UTF8::cleanup($html);

    $html = \str_replace(
        [
            'html>' . "\n",
            "\n" . '<html',
            'html/>' . "\n",
            "\n" . '</html',
            'head>' . "\n",
            "\n" . '<head',
            'head/>' . "\n",
            "\n" . '</head',
        ],
        [
            'html>',
            '<html',
            'html/>',
            '</html',
            'head>',
            '<head',
            'head/>',
            '</head',
        ],
        $html
    );

    // self closing tags, don't need a trailing slash ...
    $replace = [];
    $replacement = [];
    foreach (self::$selfClosingTags as $selfClosingTag) {
      $replace[] = '<' . $selfClosingTag . '/>';
      $replacement[] = '<' . $selfClosingTag . '>';
      $replace[] = '<' . $selfClosingTag . ' />';
      $replacement[] = '<' . $selfClosingTag . '>';
    }
    $html = \str_replace(
        $replace,
        $replacement,
        $html
    );

    // ------------------------------------
    // check if compression worked
    // ------------------------------------

    if ($origHtmlLength < UTF8::strlen($html)) {
      $html = $origHtml;
    }

    return $html;
  }
819
820
  /**
   * Minify the HTML with the help of the "HtmlDomParser".
   *
   * Order of the passes: protect tags, remove comments (optional), sum up
   * whitespace (optional), then per element: optimize attributes (optional)
   * and remove whitespace around tags (optional). Finally the document is
   * converted back into a string via domNodeToString().
   *
   * @param string $html               the already trimmed, non-empty HTML
   * @param bool   $decodeUtf8Specials passed through to HtmlDomParser::fixHtmlOutput()
   *
   * @return string
   */
  private function minifyHtmlDom($html, $decodeUtf8Specials): string
  {
    // init dom
    $dom = new HtmlDomParser();
    $dom->getDocument()->preserveWhiteSpace = false; // remove redundant white space
    $dom->getDocument()->formatOutput = false; // do not format the output with indentation

    // load dom
    $dom->loadHtml($html);

    // -------------------------------------------------------------------------
    // Protect HTML tags and conditional comments.
    // -------------------------------------------------------------------------

    $dom = $this->protectTags($dom);

    // -------------------------------------------------------------------------
    // Remove default HTML comments. [protected html is still protected]
    // -------------------------------------------------------------------------

    if ($this->doRemoveComments === true) {
      $dom = $this->removeComments($dom);
    }

    // -------------------------------------------------------------------------
    // Sum-Up extra whitespace from the Dom. [protected html is still protected]
    // -------------------------------------------------------------------------

    if ($this->doSumUpWhitespace === true) {
      $dom = $this->sumUpWhitespace($dom);
    }

    foreach ($dom->find('*') as $element) {

      // -------------------------------------------------------------------------
      // Optimize html attributes. [protected html is still protected]
      // -------------------------------------------------------------------------

      if ($this->doOptimizeAttributes === true) {
        $this->optimizeAttributes($element);
      }

      // -------------------------------------------------------------------------
      // Remove whitespace around tags. [protected html is still protected]
      // -------------------------------------------------------------------------

      if ($this->doRemoveWhitespaceAroundTags === true) {
        $this->removeWhitespaceAroundTags($element);
      }
    }

    // -------------------------------------------------------------------------
    // Convert the Dom into a string.
    // -------------------------------------------------------------------------

    return $dom->fixHtmlOutput(
        $this->domNodeToString($dom->getDocument()),
        $decodeUtf8Specials
    );
  }
888
889
  /**
   * Optimize the attributes of one element, so that gzip can do better work:
   * - boolean attributes are skipped
   * - the leading "http:" is removed from "href" / "src" / "action" (if enabled)
   * - attributes accepted by removeAttributeHelper() are removed
   * - css-class-names are sorted (if enabled)
   * - the remaining attributes are re-added in sorted order (if enabled)
   *
   * @param SimpleHtmlDom $element
   *
   * @return bool <p>false if the element reports no attributes (null), otherwise true</p>
   */
  private function optimizeAttributes(SimpleHtmlDom $element): bool
  {
    $attributes = $element->getAllAttributes();
    if ($attributes === null) {
      return false;
    }

    $attrs = [];
    foreach ((array)$attributes as $attrName => $attrValue) {

      if (isset(self::$booleanAttributes[$attrName])) {
        continue;
      }

      // remember the untouched value, so that a changed value can be detected below
      $originalAttrValue = $attrValue;

      // -------------------------------------------------------------------------
      // Remove optional "http:"-prefix from attributes.
      // -------------------------------------------------------------------------

      if (
          $this->doRemoveHttpPrefixFromAttributes === true
          &&
          ($attrName === 'href' || $attrName === 'src' || $attrName === 'action')
          &&
          !(isset($attributes['rel']) && $attributes['rel'] === 'external')
          &&
          !(isset($attributes['target']) && $attributes['target'] === '_blank')
          &&
          \is_string($attrValue)
          &&
          \strpos($attrValue, 'http://') === 0
      ) {
        // only the leading scheme is optional: a "http://" inside of the value
        // (e.g. an url in the query-string) must not be rewritten
        $attrValue = '//' . \substr($attrValue, 7);
      }

      if ($this->removeAttributeHelper($element->tag, $attrName, $attrValue, $attributes)) {
        $element->{$attrName} = null;
        continue;
      }

      // -------------------------------------------------------------------------
      // Sort css-class-names, for better gzip results.
      // -------------------------------------------------------------------------

      if ($this->doSortCssClassNames === true) {
        $attrValue = $this->sortCssClassNames($attrName, $attrValue);
      }

      if ($this->doSortHtmlAttributes === true) {
        // collect + remove the attribute, it is re-added in sorted order below
        $attrs[$attrName] = $attrValue;
        $element->{$attrName} = null;
      } elseif ($attrValue !== $originalAttrValue) {
        // without sorting the attribute stays in place, so a changed value
        // must be written back here, otherwise the optimization would be lost
        $element->setAttribute(
            $attrName,
            HtmlDomParser::replaceToPreserveHtmlEntities($attrValue),
            true
        );
      }
    }

    // -------------------------------------------------------------------------
    // Sort html-attributes, for better gzip results.
    // -------------------------------------------------------------------------

    if ($this->doSortHtmlAttributes === true) {
      \ksort($attrs);
      foreach ($attrs as $attrName => $attrValue) {
        $attrValue = HtmlDomParser::replaceToPreserveHtmlEntities($attrValue);
        $element->setAttribute($attrName, $attrValue, true);
      }
    }

    return true;
  }
959
960
  /**
   * Prevent changes of inline "styles", "scripts" and conditional comments.
   *
   * The content is stored in $this->protectedChildNodes and replaced in the dom by
   * a placeholder-tag (named after $this->protectedChildNodesHelper) which carries
   * the key of the stored content in its "data-…"-attribute.
   *
   * @param HtmlDomParser $dom
   *
   * @return HtmlDomParser
   */
  private function protectTags(HtmlDomParser $dom): HtmlDomParser
  {
    $helperTag = $this->protectedChildNodesHelper;

    // build the placeholder markup for one protected entry
    $buildPlaceholder = static function (int $id) use ($helperTag): string {
      return '<' . $helperTag . ' data-' . $helperTag . '="' . $id . '"></' . $helperTag . '>';
    };

    // key of the next protected entry
    $nextId = 0;

    foreach ($dom->find('script, style') as $element) {

      // elements with a "src"-attribute are skipped (external links)
      if (\in_array($element->tag, ['script', 'style'], true)) {
        $elementAttributes = $element->getAllAttributes();
        if (isset($elementAttributes['src'])) {
          continue;
        }
      }

      $this->protectedChildNodes[$nextId] = $element->text();
      $element->getNode()->nodeValue = $buildPlaceholder($nextId);

      ++$nextId;
    }

    $dom->getDocument()->normalizeDocument();

    foreach ($dom->find('//comment()') as $element) {
      $commentText = $element->text();

      // only conditional comments are protected, normal comments are skipped
      if ($this->isConditionalComment($commentText) === false) {
        continue;
      }

      $this->protectedChildNodes[$nextId] = '<!--' . $commentText . '-->';

      /* @var $commentNode \DOMComment */
      $commentNode = $element->getNode();
      $placeholderNode = new \DOMText($buildPlaceholder($nextId));
      $commentNode->parentNode->replaceChild($placeholderNode, $commentNode);

      ++$nextId;
    }

    $dom->getDocument()->normalizeDocument();

    return $dom;
  }
1012
1013
  /**
   * Decide if one attribute of a tag can be removed.
   *
   * Every rule is only active if the matching "do…"-option of this class is true.
   *
   * @param string $tag       <p>name of the tag the attribute belongs to</p>
   * @param string $attrName  <p>name of the attribute</p>
   * @param string $attrValue <p>value of the attribute</p>
   * @param array  $allAttr   <p>all attributes of the tag (name => value)</p>
   *
   * @return bool <p>true if the attribute can be removed</p>
   */
  private function removeAttributeHelper($tag, $attrName, $attrValue, $allAttr): bool
  {
    // remove attributes which only repeat a default value
    if ($this->doRemoveDefaultAttributes === true) {
      $defaultAttributes = [
          ['script', 'language', 'javascript'],
          ['form', 'method', 'get'],
          ['input', 'type', 'text'],
          ['area', 'shape', 'rect'],
      ];

      foreach ($defaultAttributes as list($defaultTag, $defaultName, $defaultValue)) {
        if ($tag === $defaultTag && $attrName === $defaultName && $attrValue === $defaultValue) {
          return true;
        }
      }
    }

    // remove deprecated charset-attribute from scripts without "src"
    // (the browser will use the charset from the HTTP-Header, anyway)
    if (
        $this->doRemoveDeprecatedScriptCharsetAttribute === true
        &&
        $tag === 'script'
        &&
        $attrName === 'charset'
        &&
        !isset($allAttr['src'])
    ) {
      return true;
    }

    // remove deprecated anchor-jump: the "name" only repeats the "id"
    if (
        $this->doRemoveDeprecatedAnchorName === true
        &&
        $tag === 'a'
        &&
        $attrName === 'name'
        &&
        isset($allAttr['id'])
        &&
        $allAttr['id'] === $attrValue
    ) {
      return true;
    }

    // remove "type=text/css" for css links
    if (
        $this->doRemoveDeprecatedTypeFromStylesheetLink === true
        &&
        $tag === 'link'
        &&
        $attrName === 'type'
        &&
        $attrValue === 'text/css'
        &&
        isset($allAttr['rel'])
        &&
        $allAttr['rel'] === 'stylesheet'
    ) {
      return true;
    }

    // remove deprecated script-mime-types (only for scripts with a "src"-attribute)
    if (
        $this->doRemoveDeprecatedTypeFromScriptTag === true
        &&
        $tag === 'script'
        &&
        $attrName === 'type'
        &&
        isset($allAttr['src'], self::$executableScriptsMimeTypes[$attrValue])
    ) {
      return true;
    }

    // remove 'value=""' from <input type="text">
    if (
        $this->doRemoveValueFromEmptyInput === true
        &&
        $tag === 'input'
        &&
        $attrName === 'value'
        &&
        $attrValue === ''
        &&
        isset($allAttr['type'])
        &&
        $allAttr['type'] === 'text'
    ) {
      return true;
    }

    // remove some attributes, if their value is empty (or only whitespace)
    if (
        $this->doRemoveEmptyAttributes === true
        &&
        \trim($attrValue) === ''
        &&
        \preg_match('/^(?:class|id|style|title|lang|dir|on(?:focus|blur|change|click|dblclick|mouse(?:down|up|over|move|out)|key(?:press|down|up)))$/', $attrName)
    ) {
      return true;
    }

    return false;
  }
1089
1090
  /**
   * Remove the comments from the dom.
   *
   * Comments which contain a "[" are kept, all other comments are removed.
   *
   * @param HtmlDomParser $dom
   *
   * @return HtmlDomParser
   */
  private function removeComments(HtmlDomParser $dom): HtmlDomParser
  {
    foreach ($dom->find('//comment()') as $commentWrapper) {
      $commentNode = $commentWrapper->getNode();

      // keep every comment with a "[" in its content
      if (\strpos($commentNode->nodeValue, '[') !== false) {
        continue;
      }

      $commentNode->parentNode->removeChild($commentNode);
    }

    $dom->getDocument()->normalizeDocument();

    return $dom;
  }
1111
1112
  /**
   * Sum-up the whitespace in and around one tag.
   *
   * Only tags listed in self::$trimWhitespaceFromTags which have child-nodes are
   * processed: the whitespace (see self::$regExSpace) in the first child, the last
   * child, the previous sibling and the next sibling is replaced by a single
   * space, if these nodes are text-nodes.
   *
   * @param SimpleHtmlDom $element
   *
   * @return void
   */
  private function removeWhitespaceAroundTags(SimpleHtmlDom $element)
  {
    if (!isset(self::$trimWhitespaceFromTags[$element->tag])) {
      return;
    }

    $node = $element->getNode();
    if ($node->childNodes->length === 0) {
      return;
    }

    $candidates = [
        $node->firstChild,
        $node->lastChild,
        $node->previousSibling,
        $node->nextSibling,
    ];

    // the nodes are objects, so there is no need to iterate by reference
    foreach ($candidates as $candidate) {
      if ($candidate === null || $candidate->nodeType !== \XML_TEXT_NODE) {
        continue;
      }

      $collapsed = \preg_replace(self::$regExSpace, ' ', $candidate->nodeValue);

      // preg_replace() returns null on error (e.g. malformed UTF-8 with the
      // "u"-modifier): keep the original text instead of wiping the node
      if ($collapsed !== null) {
        $candidate->nodeValue = $collapsed;
      }
    }
  }
1143
1144
  /**
   * Callback function for preg_replace_callback use.
   *
   * Reads the numeric key from the quoted value in $matches['attributes'] and
   * returns the content which is stored under this key in $this->protectedChildNodes.
   *
   * @param array $matches PREG matches
   *
   * @return string <p>the protected content, or an empty string if no key was found or nothing is stored for it</p>
   */
  private function restoreProtectedHtml($matches): string
  {
    $attributes = $matches['attributes'] ?? '';

    // no quoted key in the attributes: nothing to restore
    // (prevents the access on the undefined "id"-key of the inner matches)
    if (\preg_match('/.*"(?<id>\d*)"/', $attributes, $matchesInner) !== 1) {
      return '';
    }

    return (string)($this->protectedChildNodes[$matchesInner['id']] ?? '');
  }
1162
1163
  /**
   * Setter for $this->domainsToRemoveHttpPrefixFromAttributes (fluent interface).
   *
   * NOTE(review): presumably a list of domains for the "http:"-prefix removal,
   * the place where this property is read is not part of this section -> confirm.
   *
   * @param array $domainsToRemoveHttpPrefixFromAttributes
   *
   * @return $this
   */
  public function setDomainsToRemoveHttpPrefixFromAttributes($domainsToRemoveHttpPrefixFromAttributes)
  {
    // the value is stored as given, without validation
    $this->domainsToRemoveHttpPrefixFromAttributes = $domainsToRemoveHttpPrefixFromAttributes;

    return $this;
  }
1174
1175
  /**
   * Sort the css-class-names of a "class"-attribute and remove duplicates.
   *
   * The value of every other attribute and an empty value are returned unchanged.
   *
   * @param string $attrName  <p>name of the attribute</p>
   * @param string $attrValue <p>value of the attribute</p>
   *
   * @return string <p>unique + sorted class-names, separated by one space</p>
   */
  private function sortCssClassNames($attrName, $attrValue): string
  {
    if ($attrName !== 'class' || !$attrValue) {
      return $attrValue;
    }

    // class-names can be separated by every html space character, not only by " ",
    // and empty parts are dropped by the split itself, so that a class-name like
    // "0" is not lost by a falsy-check
    $classes = \preg_split('/[ \t\n\f\r]+/', $attrValue, -1, \PREG_SPLIT_NO_EMPTY);
    if ($classes === false) {
      return $attrValue;
    }

    $classes = \array_unique($classes);
    \sort($classes);

    return \implode(' ', $classes);
  }
1205
1206
  /**
   * Sum-up extra whitespace from dom-nodes.
   *
   * The whitespace (see self::$regExSpace) of every text-node is replaced by a
   * single space. Text-nodes are skipped, if their node-path contains one of the
   * entries from self::$skipTagsForRemoveWhitespace.
   *
   * @param HtmlDomParser $dom
   *
   * @return HtmlDomParser
   */
  private function sumUpWhitespace(HtmlDomParser $dom): HtmlDomParser
  {
    foreach ($dom->find('//text()') as $textnodeWrapper) {
      /* @var $textnode \DOMNode */
      $textnode = $textnodeWrapper->getNode();

      // getNodePath() can return null, strpos() needs a string (strict_types)
      $nodePath = (string)$textnode->getNodePath();

      foreach (self::$skipTagsForRemoveWhitespace as $pattern) {
        if (\strpos($nodePath, "/$pattern") !== false) {
          // the text is inside of a tag which must keep its whitespace
          continue 2;
        }
      }

      $collapsed = \preg_replace(self::$regExSpace, ' ', $textnode->nodeValue);

      // preg_replace() returns null on error (e.g. malformed UTF-8 with the
      // "u"-modifier): keep the original text instead of wiping the node
      if ($collapsed !== null) {
        $textnode->nodeValue = $collapsed;
      }
    }

    $dom->getDocument()->normalizeDocument();

    return $dom;
  }
1239
}
1240