Completed
Push — master ( 709799...b4b5ef )
by Lars
01:21
created

HtmlMin::domNodeClosingTagOptional()   F

Complexity

Conditions 25
Paths 4249

Size

Total Lines 169
Code Lines 90

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 31
CRAP Score 25.4288

Importance

Changes 0
Metric Value
dl 0
loc 169
ccs 31
cts 34
cp 0.9118
rs 2
c 0
b 0
f 0
cc 25
eloc 90
nc 4249
nop 1
crap 25.4288

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
declare(strict_types=1);
4
5
namespace voku\helper;
6
7
/**
8
 * Class HtmlMin
9
 *
10
 * Inspired by:
11
 * - JS: https://github.com/kangax/html-minifier/blob/gh-pages/src/htmlminifier.js
12
 * - PHP: https://github.com/searchturbine/phpwee-php-minifier
13
 * - PHP: https://github.com/WyriHaximus/HtmlCompress
14
 * - PHP: https://github.com/zaininnari/html-minifier
15
 * - Java: https://code.google.com/archive/p/htmlcompressor/
16
 *
17
 * @package voku\helper
18
 */
19
class HtmlMin
20
{
21
  /**
22
   * @var string
23
   */
24
  private static $regExSpace = "/[[:space:]]{2,}|[\r\n]+/u";
25
26
  /**
27
   * @var array
28
   */
29
  private static $optional_end_tags = [
30
      'html',
31
      'head',
32
      'body',
33
  ];
34
35
  /**
36
   * // https://mathiasbynens.be/demo/javascript-mime-type
37
   * // https://developer.mozilla.org/en/docs/Web/HTML/Element/script#attr-type
38
   *
39
   * @var array
40
   */
41
  private static $executableScriptsMimeTypes = [
42
      'text/javascript'          => '',
43
      'text/ecmascript'          => '',
44
      'text/jscript'             => '',
45
      'application/javascript'   => '',
46
      'application/x-javascript' => '',
47
      'application/ecmascript'   => '',
48
  ];
49
50
  private static $selfClosingTags = [
51
      'area',
52
      'base',
53
      'basefont',
54
      'br',
55
      'col',
56
      'command',
57
      'embed',
58
      'frame',
59
      'hr',
60
      'img',
61
      'input',
62
      'isindex',
63
      'keygen',
64
      'link',
65
      'meta',
66
      'param',
67
      'source',
68
      'track',
69
      'wbr',
70
  ];
71
72
  private static $trimWhitespaceFromTags = [
73
      'article' => '',
74
      'br'      => '',
75
      'div'     => '',
76
      'footer'  => '',
77
      'hr'      => '',
78
      'nav'     => '',
79
      'p'       => '',
80
      'script'  => '',
81
  ];
82
83
  /**
84
   * @var array
85
   */
86
  private static $booleanAttributes = [
87
      'allowfullscreen' => '',
88
      'async'           => '',
89
      'autofocus'       => '',
90
      'autoplay'        => '',
91
      'checked'         => '',
92
      'compact'         => '',
93
      'controls'        => '',
94
      'declare'         => '',
95
      'default'         => '',
96
      'defaultchecked'  => '',
97
      'defaultmuted'    => '',
98
      'defaultselected' => '',
99
      'defer'           => '',
100
      'disabled'        => '',
101
      'enabled'         => '',
102
      'formnovalidate'  => '',
103
      'hidden'          => '',
104
      'indeterminate'   => '',
105
      'inert'           => '',
106
      'ismap'           => '',
107
      'itemscope'       => '',
108
      'loop'            => '',
109
      'multiple'        => '',
110
      'muted'           => '',
111
      'nohref'          => '',
112
      'noresize'        => '',
113
      'noshade'         => '',
114
      'novalidate'      => '',
115
      'nowrap'          => '',
116
      'open'            => '',
117
      'pauseonexit'     => '',
118
      'readonly'        => '',
119
      'required'        => '',
120
      'reversed'        => '',
121
      'scoped'          => '',
122
      'seamless'        => '',
123
      'selected'        => '',
124
      'sortable'        => '',
125
      'truespeed'       => '',
126
      'typemustmatch'   => '',
127
      'visible'         => '',
128
  ];
129
  /**
130
   * @var array
131
   */
132
  private static $skipTagsForRemoveWhitespace = [
133
      'code',
134
      'pre',
135
      'script',
136
      'style',
137
      'textarea',
138
  ];
139
140
  /**
141
   * @var array
142
   */
143
  private $protectedChildNodes = [];
144
145
  /**
146
   * @var string
147
   */
148
  private $protectedChildNodesHelper = 'html-min--voku--saved-content';
149
150
  /**
151
   * @var bool
152
   */
153
  private $doOptimizeViaHtmlDomParser = true;
154
155
  /**
156
   * @var bool
157
   */
158
  private $doOptimizeAttributes = true;
159
160
  /**
161
   * @var bool
162
   */
163
  private $doRemoveComments = true;
164
165
  /**
166
   * @var bool
167
   */
168
  private $doRemoveWhitespaceAroundTags = true;
169
170
  /**
171
   * @var bool
172
   */
173
  private $doRemoveHttpPrefixFromAttributes = false;
174
175
176
  /**
177
   * @var array
178
   */
179
  private $domainsToRemoveHttpPrefixFromAttributes = [
180
      'google.com',
181
      'google.de',
182
  ];
183
184
  /**
185
   * @var bool
186
   */
187
  private $doSortCssClassNames = true;
188
189
  /**
190
   * @var bool
191
   */
192
  private $doSortHtmlAttributes = true;
193
194
  /**
195
   * @var bool
196
   */
197
  private $doRemoveDeprecatedScriptCharsetAttribute = true;
198
199
  /**
200
   * @var bool
201
   */
202
  private $doRemoveDefaultAttributes = false;
203
204
  /**
205
   * @var bool
206
   */
207
  private $doRemoveDeprecatedAnchorName = true;
208
209
  /**
210
   * @var bool
211
   */
212
  private $doRemoveDeprecatedTypeFromStylesheetLink = true;
213
214
  /**
215
   * @var bool
216
   */
217
  private $doRemoveDeprecatedTypeFromScriptTag = true;
218
219
  /**
220
   * @var bool
221
   */
222
  private $doRemoveValueFromEmptyInput = true;
223
224
  /**
225
   * @var bool
226
   */
227
  private $doRemoveEmptyAttributes = true;
228
229
  /**
230
   * @var bool
231
   */
232
  private $doSumUpWhitespace = true;
233
234
  /**
235
   * @var bool
236
   */
237
  private $doRemoveSpacesBetweenTags = false;
238
239
  /**
   * Create a minifier instance with the default option flags.
   *
   * The constructor itself performs no work; all defaults come from the
   * property declarations of this class.
   */
  public function __construct()
  {
  }
245
246
  /**
   * Switch the per-element attribute optimization on or off.
   *
   * When enabled, minifyHtmlDom() calls optimizeAttributes() for every element.
   *
   * @param bool $doOptimizeAttributes true to optimize attributes
   *
   * @return $this
   */
  public function doOptimizeAttributes(bool $doOptimizeAttributes = true)
  {
    $this->doOptimizeAttributes = $doOptimizeAttributes;

    return $this;
  }
257
258
  /**
   * Switch the DOM based minification pass on or off.
   *
   * When enabled, minify() runs minifyHtmlDom() on the input and afterwards
   * restores the HTML entities preserved by the "HtmlDomParser".
   *
   * @param bool $doOptimizeViaHtmlDomParser true to minify via the DOM parser
   *
   * @return $this
   */
  public function doOptimizeViaHtmlDomParser(bool $doOptimizeViaHtmlDomParser = true)
  {
    $this->doOptimizeViaHtmlDomParser = $doOptimizeViaHtmlDomParser;

    return $this;
  }
269
270
  /**
   * Switch the removal of HTML comments on or off.
   *
   * When enabled, minifyHtmlDom() passes the parsed DOM through removeComments().
   *
   * @param bool $doRemoveComments true to remove comments
   *
   * @return $this
   */
  public function doRemoveComments(bool $doRemoveComments = true)
  {
    $this->doRemoveComments = $doRemoveComments;

    return $this;
  }
281
282
  /**
   * Set the "remove default attributes" flag (the property defaults to false).
   *
   * NOTE(review): presumably evaluated while optimizing attributes - confirm
   * against optimizeAttributes().
   *
   * @param bool $doRemoveDefaultAttributes new value of the flag
   *
   * @return $this
   */
  public function doRemoveDefaultAttributes(bool $doRemoveDefaultAttributes = true)
  {
    $this->doRemoveDefaultAttributes = $doRemoveDefaultAttributes;

    return $this;
  }
293
294
  /**
   * Set the "remove deprecated anchor name" flag.
   *
   * NOTE(review): presumably evaluated while optimizing attributes - confirm
   * against optimizeAttributes().
   *
   * @param bool $doRemoveDeprecatedAnchorName new value of the flag
   *
   * @return $this
   */
  public function doRemoveDeprecatedAnchorName(bool $doRemoveDeprecatedAnchorName = true)
  {
    $this->doRemoveDeprecatedAnchorName = $doRemoveDeprecatedAnchorName;

    return $this;
  }
305
306
  /**
   * Set the "remove deprecated charset attribute from script tags" flag.
   *
   * NOTE(review): presumably evaluated while optimizing attributes - confirm
   * against optimizeAttributes().
   *
   * @param bool $doRemoveDeprecatedScriptCharsetAttribute new value of the flag
   *
   * @return $this
   */
  public function doRemoveDeprecatedScriptCharsetAttribute(bool $doRemoveDeprecatedScriptCharsetAttribute = true)
  {
    $this->doRemoveDeprecatedScriptCharsetAttribute = $doRemoveDeprecatedScriptCharsetAttribute;

    return $this;
  }
317
318
  /**
   * Set the "remove deprecated type attribute from script tags" flag.
   *
   * NOTE(review): presumably evaluated while optimizing attributes - confirm
   * against optimizeAttributes().
   *
   * @param bool $doRemoveDeprecatedTypeFromScriptTag new value of the flag
   *
   * @return $this
   */
  public function doRemoveDeprecatedTypeFromScriptTag(bool $doRemoveDeprecatedTypeFromScriptTag = true)
  {
    $this->doRemoveDeprecatedTypeFromScriptTag = $doRemoveDeprecatedTypeFromScriptTag;

    return $this;
  }
329
330
  /**
   * Set the "remove deprecated type attribute from stylesheet links" flag.
   *
   * NOTE(review): presumably evaluated while optimizing attributes - confirm
   * against optimizeAttributes().
   *
   * @param bool $doRemoveDeprecatedTypeFromStylesheetLink new value of the flag
   *
   * @return $this
   */
  public function doRemoveDeprecatedTypeFromStylesheetLink(bool $doRemoveDeprecatedTypeFromStylesheetLink = true)
  {
    $this->doRemoveDeprecatedTypeFromStylesheetLink = $doRemoveDeprecatedTypeFromStylesheetLink;

    return $this;
  }
341
342
  /**
   * Set the "remove empty attributes" flag.
   *
   * NOTE(review): presumably evaluated while optimizing attributes - confirm
   * against optimizeAttributes().
   *
   * @param bool $doRemoveEmptyAttributes new value of the flag
   *
   * @return $this
   */
  public function doRemoveEmptyAttributes(bool $doRemoveEmptyAttributes = true)
  {
    $this->doRemoveEmptyAttributes = $doRemoveEmptyAttributes;

    return $this;
  }
353
354
  /**
   * Set the "remove http prefix from attributes" flag (the property defaults to false).
   *
   * NOTE(review): presumably used together with
   * $domainsToRemoveHttpPrefixFromAttributes while optimizing attributes - confirm.
   *
   * @param bool $doRemoveHttpPrefixFromAttributes new value of the flag
   *
   * @return $this
   */
  public function doRemoveHttpPrefixFromAttributes(bool $doRemoveHttpPrefixFromAttributes = true)
  {
    $this->doRemoveHttpPrefixFromAttributes = $doRemoveHttpPrefixFromAttributes;

    return $this;
  }
365
366
  /**
   * Switch the removal of single spaces between tags on or off
   * (the property defaults to false).
   *
   * When enabled, minify() rewrites every "> <" sequence to "><".
   *
   * @param bool $doRemoveSpacesBetweenTags true to remove the spaces
   *
   * @return $this
   */
  public function doRemoveSpacesBetweenTags(bool $doRemoveSpacesBetweenTags = true)
  {
    $this->doRemoveSpacesBetweenTags = $doRemoveSpacesBetweenTags;

    return $this;
  }
377
378
  /**
   * Set the "remove value from empty input" flag.
   *
   * NOTE(review): presumably evaluated while optimizing attributes - confirm
   * against optimizeAttributes().
   *
   * @param bool $doRemoveValueFromEmptyInput new value of the flag
   *
   * @return $this
   */
  public function doRemoveValueFromEmptyInput(bool $doRemoveValueFromEmptyInput = true)
  {
    $this->doRemoveValueFromEmptyInput = $doRemoveValueFromEmptyInput;

    return $this;
  }
389
390
  /**
   * Switch the removal of whitespace around tags on or off.
   *
   * When enabled, minifyHtmlDom() calls removeWhitespaceAroundTags() for
   * every element.
   *
   * @param bool $doRemoveWhitespaceAroundTags true to remove the whitespace
   *
   * @return $this
   */
  public function doRemoveWhitespaceAroundTags(bool $doRemoveWhitespaceAroundTags = true)
  {
    $this->doRemoveWhitespaceAroundTags = $doRemoveWhitespaceAroundTags;

    return $this;
  }
401
402
  /**
   * Set the "sort css class names" flag.
   *
   * NOTE(review): presumably evaluated while optimizing attributes - confirm
   * against optimizeAttributes().
   *
   * @param bool $doSortCssClassNames new value of the flag
   *
   * @return $this
   */
  public function doSortCssClassNames(bool $doSortCssClassNames = true)
  {
    $this->doSortCssClassNames = $doSortCssClassNames;

    return $this;
  }
413
414
  /**
   * Set the "sort html attributes" flag.
   *
   * NOTE(review): presumably evaluated while optimizing attributes - confirm
   * against optimizeAttributes().
   *
   * @param bool $doSortHtmlAttributes new value of the flag
   *
   * @return $this
   */
  public function doSortHtmlAttributes(bool $doSortHtmlAttributes = true)
  {
    $this->doSortHtmlAttributes = $doSortHtmlAttributes;

    return $this;
  }
425
426
  /**
   * Switch the whitespace sum-up step on or off.
   *
   * When enabled, minifyHtmlDom() passes the parsed DOM through sumUpWhitespace().
   *
   * @param bool $doSumUpWhitespace true to sum up whitespace
   *
   * @return $this
   */
  public function doSumUpWhitespace(bool $doSumUpWhitespace = true)
  {
    $this->doSumUpWhitespace = $doSumUpWhitespace;

    return $this;
  }
437
438 21
  /**
   * Serialize the attributes of a DOM node into an HTML attribute string.
   *
   * Boolean attributes (see self::$booleanAttributes) are written as the bare
   * attribute name. All other attributes are written as name=value; the quotes
   * around the value are dropped when that is allowed
   * (<p class="foo"> → <p class=foo>).
   *
   * @param \DOMNode $node
   *
   * @return string the attributes separated by single spaces, '' if there are none
   */
  private function domNodeAttributesToString(\DOMNode $node): string
  {
    if ($node->attributes === null) {
      return '';
    }

    $attributeStrings = [];
    foreach ($node->attributes as $attribute) {
      $name = $attribute->name;

      if (isset(self::$booleanAttributes[$name])) {
        $attributeStrings[] = $name;
        continue;
      }

      $value = $attribute->value;

      # http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html#attributes-0
      // strict "=== 0": a failing preg_match() (false) must not be read as "no special chars"
      $omitQuotes = $value !== '' && \preg_match('/["\'=<>` \t\r\n\f]+/', $value) === 0;

      if ($omitQuotes) {
        $attributeStrings[] = $name . '=' . $value;
      } elseif (\strpos($value, '"') === false) {
        $attributeStrings[] = $name . '="' . $value . '"';
      } elseif (\strpos($value, "'") === false) {
        // the value contains a double quote: wrapping it in double quotes would
        // end the attribute early, so use single quotes instead
        $attributeStrings[] = $name . "='" . $value . "'";
      } else {
        // both quote characters occur in the value: keep double quotes and encode the inner ones
        $attributeStrings[] = $name . '="' . \str_replace('"', '&quot;', $value) . '"';
      }
    }

    return \trim(\implode(' ', $attributeStrings));
  }
462
463 21
  /**
   * Check whether the end tag of the given node may be left out of the output.
   *
   * The decision is based on the node name, the next sibling element (if any)
   * and, for <p>, the name of the parent node.
   *
   * @param \DOMNode $node
   *
   * @return bool true if the closing tag is optional
   */
  private function domNodeClosingTagOptional(\DOMNode $node): bool
  {
    // "nodeName" is defined for every \DOMNode (unlike "tagName") and holds the tag name for elements
    $tag_name = $node->nodeName;

    // https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-omission

    // Implemented:
    //
    // A <p> element's end tag may be omitted if the p element is immediately followed by an address, article, aside, blockquote, details, div, dl, fieldset, figcaption, figure, footer, form, h1, h2, h3, h4, h5, h6, header, hgroup, hr, main, menu, nav, ol, p, pre, section, table, or ul element, or if there is no more content in the parent element and the parent element is an HTML element that is not an a, audio, del, ins, map, noscript, or video element, or an autonomous custom element.
    // An <li> element's end tag may be omitted if the li element is immediately followed by another li element or if there is no more content in the parent element.
    // A <td> element's end tag may be omitted if the td element is immediately followed by a td or th element, or if there is no more content in the parent element.
    // An <option> element's end tag may be omitted if the option element is immediately followed by another option element, or if it is immediately followed by an optgroup element, or if there is no more content in the parent element.
    // A <tr> element's end tag may be omitted if the tr element is immediately followed by another tr element, or if there is no more content in the parent element.

    // TODO:
    //
    // <html> may be omitted if first thing inside is not comment
    // <head> may be omitted if first thing inside is an element
    // <body> may be omitted if first thing inside is not space, comment, <meta>, <link>, <script>, <style> or <template>
    // <colgroup> may be omitted if first thing inside is <col>
    // <tbody> may be omitted if first thing inside is <tr>
    // A <dt> element's end tag may be omitted if the dt element is immediately followed by another dt element or a dd element.
    // A <dd> element's end tag may be omitted if the dd element is immediately followed by another dd element or a dt element, or if there is no more content in the parent element.
    // An <rp> element's end tag may be omitted if the rp element is immediately followed by an rt or rp element, or if there is no more content in the parent element.
    // An <optgroup> element's end tag may be omitted if the optgroup element is immediately followed by another optgroup element, or if there is no more content in the parent element.
    // A <colgroup> element's start tag may be omitted if the first thing inside the colgroup element is a col element, and if the element is not immediately preceded by another colgroup element whose end tag has been omitted. (It can't be omitted if the element is empty.)
    // A <colgroup> element's end tag may be omitted if the colgroup element is not immediately followed by ASCII whitespace or a comment.
    // A <caption> element's end tag may be omitted if the caption element is not immediately followed by ASCII whitespace or a comment.
    // A <thead> element's end tag may be omitted if the thead element is immediately followed by a tbody or tfoot element.
    // A <tbody> element's start tag may be omitted if the first thing inside the tbody element is a tr element, and if the element is not immediately preceded by a tbody, thead, or tfoot element whose end tag has been omitted. (It can't be omitted if the element is empty.)
    // A <tbody> element's end tag may be omitted if the tbody element is immediately followed by a tbody or tfoot element, or if there is no more content in the parent element.
    // A <tfoot> element's end tag may be omitted if there is no more content in the parent element.
    // A <th> element's end tag may be omitted if the th element is immediately followed by a td or th element, or if there is no more content in the parent element.
    //
    // <-- However, a start tag must never be omitted if it has any attributes.

    if (\in_array($tag_name, self::$optional_end_tags, true)) {
      return true;
    }

    // tag name => names of the following elements that make the end tag optional
    $optionalBefore = [
        'li'     => ['li'],
        'tr'     => ['tr'],
        'td'     => ['td', 'th'],
        'option' => ['option', 'optgroup'],
        'p'      => [
            'address',
            'article',
            'aside',
            'blockquote',
            'details',
            'dir', // obsolete element, kept from the previous implementation
            'div',
            'dl',
            'fieldset',
            'figcaption',
            'figure',
            'footer',
            'form',
            'h1',
            'h2',
            'h3',
            'h4',
            'h5',
            'h6',
            'header',
            'hgroup',
            'hr',
            'main',
            'menu',
            'nav',
            'ol',
            'p',
            'pre',
            'section',
            'table',
            'ul',
        ],
    ];

    if (!isset($optionalBefore[$tag_name])) {
      return false;
    }

    $nextSibling = $this->getNextSiblingOfTypeDOMElement($node);

    if ($nextSibling === null) {
      // no more element content in the parent: always fine, except for <p> inside some parents
      if ($tag_name !== 'p') {
        return true;
      }

      return $node->parentNode !== null
             &&
             !\in_array(
                 // "nodeName" also works if the parent is not an element (e.g. the document itself)
                 $node->parentNode->nodeName,
                 [
                     'a',
                     'audio',
                     'del',
                     'ins',
                     'map',
                     'noscript',
                     'video',
                 ],
                 true
             );
    }

    return $nextSibling instanceof \DOMElement
           &&
           \in_array($nextSibling->tagName, $optionalBefore[$tag_name], true);
  }
632
633 21
  /**
   * Serialize the child nodes of the given node (recursively) into an HTML string.
   *
   * - Elements are written as start tag + serialized children; the end tag is
   *   skipped when domNodeClosingTagOptional() allows it.
   * - Whitespace-only text nodes collapse to a single space, and only if they
   *   have a previous and a next sibling.
   * - Comments are written back as "<!--...-->".
   * - Doctype nodes are skipped.
   *
   * @param \DOMNode $node
   *
   * @return string
   *
   * @throws \Exception if a child node of an unhandled type is found
   */
  protected function domNodeToString(\DOMNode $node): string
  {
    // init
    $htmlstr = '';

    foreach ($node->childNodes as $child) {

      if ($child instanceof \DOMDocumentType) {
        // doctype nodes are not serialized here (this branch was empty, marked "needed?")
        continue;
      }

      if ($child instanceof \DOMElement) {
        $htmlstr .= trim('<' . $child->tagName . ' ' . $this->domNodeAttributesToString($child));
        $htmlstr .= '>' . $this->domNodeToString($child);

        if (!$this->domNodeClosingTagOptional($child)) {
          $htmlstr .= '</' . $child->tagName . '>';
        }

        continue;
      }

      if ($child instanceof \DOMText) {
        if (!$child->isWhitespaceInElementContent()) {
          $htmlstr .= $child->wholeText;
        } elseif ($child->previousSibling !== null && $child->nextSibling !== null) {
          // whitespace between two siblings is significant, but one space is enough
          $htmlstr .= ' ';
        }

        continue;
      }

      if ($child instanceof \DOMComment) {
        // \DOMComment has no "wholeText" property; "data" holds the comment content,
        // the delimiters must be added again
        $htmlstr .= '<!--' . $child->data . '-->';

        continue;
      }

      throw new \Exception('Error by: ' . print_r($child, true));
    }

    return $htmlstr;
  }
680
681
  /**
   * Find the first following sibling of the given node that is a \DOMElement.
   *
   * Siblings of any other node type are skipped.
   *
   * @param \DOMNode $node
   *
   * @return \DOMNode|null the next element sibling or null if there is none
   */
  protected function getNextSiblingOfTypeDOMElement(\DOMNode $node)
  {
    for ($sibling = $node->nextSibling; $sibling !== null; $sibling = $sibling->nextSibling) {
      if ($sibling instanceof \DOMElement) {
        return $sibling;
      }
    }

    return null;
  }
694
695
  /**
   * Check if the given string is a conditional comment.
   *
   * A string counts as conditional comment if it starts with "[if ...]" or
   * ends with "[endif]".
   *
   * INFO: since IE >= 10 conditional comments are not working anymore
   *
   * <!--[if expression]> HTML <![endif]-->
   * <![if expression]> HTML <![endif]>
   *
   * @param string $comment
   *
   * @return bool
   */
  private function isConditionalComment($comment): bool
  {
    $markers = [
        '/^\[if [^\]]+\]/', // opening marker at the start
        '/\[endif\]$/',     // closing marker at the end
    ];

    foreach ($markers as $marker) {
      if (preg_match($marker, $comment)) {
        return true;
      }
    }

    return false;
  }
719
720
  /**
   * Minify the given HTML.
   *
   * Steps: optional DOM based minification, normalization of the whitespace
   * between attributes, optional removal of single spaces between tags,
   * restoring of protected HTML and HTML entities and a final clean-up.
   * If the result is longer than the (trimmed) input, the trimmed input is
   * returned instead.
   *
   * @param string $html
   * @param bool   $decodeUtf8Specials <p>Use this only in special cases, e.g. for PHP 5.3</p>
   *
   * @return string the minified html, '' for empty / whitespace-only input
   */
  public function minify($html, $decodeUtf8Specials = false): string
  {
    $html = \trim((string)$html);
    // compare with '' explicitly: a loose check would also throw away the valid input "0"
    if ($html === '') {
      return '';
    }

    // reset
    $this->protectedChildNodes = [];

    // save old content
    $origHtml = $html;
    $origHtmlLength = UTF8::strlen($html);

    // -------------------------------------------------------------------------
    // Minify the HTML via "HtmlDomParser"
    // -------------------------------------------------------------------------

    if ($this->doOptimizeViaHtmlDomParser === true) {
      $html = $this->minifyHtmlDom($html, $decodeUtf8Specials);
    }

    // -------------------------------------------------------------------------
    // Trim whitespace from html-string. [protected html is still protected]
    // -------------------------------------------------------------------------

    // Remove extra white-space(s) between HTML attribute(s)
    $html = (string)\preg_replace_callback(
        '#<([^\/\s<>!]+)(?:\s+([^<>]*?)\s*|\s*)(\/?)>#',
        function ($matches) {
          return '<' . $matches[1] . (string)\preg_replace('#([^\s=]+)(\=([\'"]?)(.*?)\3)?(\s+|$)#s', ' $1$2', $matches[2]) . $matches[3] . '>';
        },
        $html
    );

    if ($this->doRemoveSpacesBetweenTags === true) {
      // Remove spaces that are between > and <
      $html = (string)\preg_replace('/(>) (<)/', '>$2', $html);
    }

    // -------------------------------------------------------------------------
    // Restore protected HTML-code.
    // -------------------------------------------------------------------------

    $html = (string)\preg_replace_callback(
        '/<(?<element>' . $this->protectedChildNodesHelper . ')(?<attributes> [^>]*)?>(?<value>.*?)<\/' . $this->protectedChildNodesHelper . '>/',
        [$this, 'restoreProtectedHtml'],
        $html
    );

    // -------------------------------------------------------------------------
    // Restore protected HTML-entities.
    // -------------------------------------------------------------------------

    if ($this->doOptimizeViaHtmlDomParser === true) {
      $html = HtmlDomParser::putReplacedBackToPreserveHtmlEntities($html);
    }

    // ------------------------------------
    // Final clean-up
    // ------------------------------------

    $html = UTF8::cleanup($html);

    // remove line breaks around the <html> and <head> tags
    $html = \str_replace(
        [
            'html>' . "\n",
            "\n" . '<html',
            'html/>' . "\n",
            "\n" . '</html',
            'head>' . "\n",
            "\n" . '<head',
            'head/>' . "\n",
            "\n" . '</head',
        ],
        [
            'html>',
            '<html',
            'html/>',
            '</html',
            'head>',
            '<head',
            'head/>',
            '</head',
        ],
        $html
    );

    // self closing tags, don't need a trailing slash ...
    $replace = [];
    $replacement = [];
    foreach (self::$selfClosingTags as $selfClosingTag) {
      $replace[] = '<' . $selfClosingTag . '/>';
      $replacement[] = '<' . $selfClosingTag . '>';
      $replace[] = '<' . $selfClosingTag . ' />';
      $replacement[] = '<' . $selfClosingTag . '>';
    }
    $html = \str_replace(
        $replace,
        $replacement,
        $html
    );

    // ------------------------------------
    // check if compression worked
    // ------------------------------------

    if ($origHtmlLength < UTF8::strlen($html)) {
      $html = $origHtml;
    }

    return $html;
  }
851
852
  /**
   * Minify the given html via the "HtmlDomParser".
   *
   * The html is loaded into the dom parser, tags are protected, comments are
   * removed and whitespace is summed up (depending on the option flags), every
   * element is optimized and finally the dom is converted back into a string.
   *
   * @param string $html
   * @param bool   $decodeUtf8Specials passed through to HtmlDomParser::fixHtmlOutput()
   *
   * @return string
   */
  private function minifyHtmlDom($html, $decodeUtf8Specials): string
  {
    // init + load dom
    $dom = new HtmlDomParser();
    $dom->getDocument()->preserveWhiteSpace = false; // remove redundant white space
    $dom->getDocument()->formatOutput = false; // do not format the output with indentation
    $dom->loadHtml($html);

    // Protect HTML tags and conditional comments.
    $dom = $this->protectTags($dom);

    // Remove default HTML comments. [protected html is still protected]
    if ($this->doRemoveComments === true) {
      $dom = $this->removeComments($dom);
    }

    // Sum-Up extra whitespace from the Dom. [protected html is still protected]
    if ($this->doSumUpWhitespace === true) {
      $dom = $this->sumUpWhitespace($dom);
    }

    // the option flags do not change while iterating, so read them once
    $optimizeAttributes = $this->doOptimizeAttributes === true;
    $removeWhitespaceAroundTags = $this->doRemoveWhitespaceAroundTags === true;

    foreach ($dom->find('*') as $element) {

      // Optimize html attributes. [protected html is still protected]
      if ($optimizeAttributes) {
        $this->optimizeAttributes($element);
      }

      // Remove whitespace around tags. [protected html is still protected]
      if ($removeWhitespaceAroundTags) {
        $this->removeWhitespaceAroundTags($element);
      }
    }

    // Convert the Dom into a string.
    return $dom->fixHtmlOutput(
        $this->domNodeToString($dom->getDocument()),
        $decodeUtf8Specials
    );
  }
920
921
  /**
   * Optimize the attributes of one element.
   *
   * For every non-boolean attribute this
   * - strips a leading "http:" from "href" / "src" / "action" values (unless
   *   the element carries rel="external" or target="_blank"),
   * - removes the attribute when removeAttributeHelper() says so,
   * - sorts the css-class-names,
   * and finally re-adds the attributes in alphabetical order, so that gzip
   * can do better work.
   *
   * Rewritten values are written back to the element even when attribute
   * sorting is disabled.
   *
   * @param SimpleHtmlDom $element
   *
   * @return bool false if the element reports no attributes (null), otherwise true
   */
  private function optimizeAttributes(SimpleHtmlDom $element): bool
  {
    $attributes = $element->getAllAttributes();
    if ($attributes === null) {
      return false;
    }

    $attrs = [];
    foreach ((array)$attributes as $attrName => $attrValue) {

      // boolean attributes are left untouched
      if (isset(self::$booleanAttributes[$attrName])) {
        continue;
      }

      // remember the value as it is currently stored on the element
      $originalValue = $attrValue;

      // -------------------------------------------------------------------------
      // Remove optional "http:"-prefix from attributes.
      // -------------------------------------------------------------------------

      if ($this->doRemoveHttpPrefixFromAttributes === true) {
        if (
            ($attrName === 'href' || $attrName === 'src' || $attrName === 'action')
            &&
            !(isset($attributes['rel']) && $attributes['rel'] === 'external')
            &&
            !(isset($attributes['target']) && $attributes['target'] === '_blank')
            &&
            // only a leading scheme is a prefix; a "http://" later in the
            // value (e.g. inside a query string) must not be rewritten
            \strpos($attrValue, 'http://') === 0
        ) {
          // drop "http:" and keep the protocol-relative "//..."
          $attrValue = \substr($attrValue, 5);
        }
      }

      if ($this->removeAttributeHelper($element->tag, $attrName, $attrValue, $attributes)) {
        $element->{$attrName} = null;
        continue;
      }

      // -------------------------------------------------------------------------
      // Sort css-class-names, for better gzip results.
      // -------------------------------------------------------------------------

      if ($this->doSortCssClassNames === true) {
        $attrValue = $this->sortCssClassNames($attrName, $attrValue);
      }

      if ($this->doSortHtmlAttributes === true) {
        // detach now, re-attach in sorted order after the loop
        $attrs[$attrName] = $attrValue;
        $element->{$attrName} = null;
      } elseif ($attrValue !== $originalValue) {
        // no re-ordering requested: update the changed value in place,
        // otherwise the optimizations above would be silently lost
        $element->setAttribute(
            $attrName,
            HtmlDomParser::replaceToPreserveHtmlEntities($attrValue),
            true
        );
      }
    }

    // -------------------------------------------------------------------------
    // Sort html-attributes, for better gzip results.
    // -------------------------------------------------------------------------

    if ($this->doSortHtmlAttributes === true) {
      \ksort($attrs);
      foreach ($attrs as $attrName => $attrValue) {
        $attrValue = HtmlDomParser::replaceToPreserveHtmlEntities($attrValue);
        $element->setAttribute($attrName, $attrValue, true);
      }
    }

    return true;
  }
991
992
  /**
   * Shield inline "script" / "style" content and conditional comments from
   * the later minification steps.
   *
   * The original text is stored in $this->protectedChildNodes under a running
   * index and replaced in the DOM by a placeholder tag that carries this index
   * in a data-attribute.
   *
   * @param HtmlDomParser $dom
   *
   * @return HtmlDomParser the same parser instance that was passed in
   */
  private function protectTags(HtmlDomParser $dom): HtmlDomParser
  {
    $helperTag = $this->protectedChildNodesHelper;
    $placeholderFormat = '<%1$s data-%1$s="%2$s"></%1$s>';
    $index = 0;

    foreach ($dom->find('script, style') as $element) {

      // elements with a "src" attribute have no inline content to protect
      if ($element->tag === 'script' || $element->tag === 'style') {
        $elementAttributes = $element->getAllAttributes();
        if (isset($elementAttributes['src'])) {
          continue;
        }
      }

      $this->protectedChildNodes[$index] = $element->text();
      $element->getNode()->nodeValue = \sprintf($placeholderFormat, $helperTag, $index);

      ++$index;
    }

    $dom->getDocument()->normalizeDocument();

    foreach ($dom->find('//comment()') as $element) {
      $commentText = $element->text();

      // only conditional comments are protected, normal comments are skipped
      if ($this->isConditionalComment($commentText) !== false) {
        $this->protectedChildNodes[$index] = '<!--' . $commentText . '-->';

        /* @var $commentNode \DOMComment */
        $commentNode = $element->getNode();
        $commentNode->parentNode->replaceChild(
            new \DOMText(\sprintf($placeholderFormat, $helperTag, $index)),
            $commentNode
        );

        ++$index;
      }
    }

    $dom->getDocument()->normalizeDocument();

    return $dom;
  }
1044
1045
  /**
   * Decide whether an attribute is redundant and can be dropped.
   *
   * Every rule is guarded by its own "doRemove..." option; the first rule
   * that applies wins.
   *
   * @param string $tag       name of the element that owns the attribute
   * @param string $attrName  name of the attribute
   * @param string $attrValue value of the attribute
   * @param array  $allAttr   all attributes of the element (name => value)
   *
   * @return bool true if the attribute can be removed
   */
  private function removeAttributeHelper($tag, $attrName, $attrValue, $allAttr): bool
  {
    // attributes that only repeat the default value
    if ($this->doRemoveDefaultAttributes === true) {
      $defaults = [
          ['script', 'language', 'javascript'],
          ['form', 'method', 'get'],
          ['input', 'type', 'text'],
          ['area', 'shape', 'rect'],
      ];

      // strict lookup: tag, name and value must all be identical
      if (\in_array([$tag, $attrName, $attrValue], $defaults, true)) {
        return true;
      }
    }

    // deprecated charset-attribute on inline scripts (the browser will use the charset from the HTTP-Header, anyway)
    if (
        $this->doRemoveDeprecatedScriptCharsetAttribute === true
        && $tag === 'script'
        && $attrName === 'charset'
        && !isset($allAttr['src'])
    ) {
      return true;
    }

    // deprecated anchor-jump: "name" that only duplicates the "id"
    if (
        $this->doRemoveDeprecatedAnchorName === true
        && $tag === 'a'
        && $attrName === 'name'
        && isset($allAttr['id'])
        && $allAttr['id'] === $attrValue
    ) {
      return true;
    }

    // "type=text/css" on stylesheet links
    if (
        $this->doRemoveDeprecatedTypeFromStylesheetLink === true
        && $tag === 'link'
        && $attrName === 'type'
        && $attrValue === 'text/css'
        && isset($allAttr['rel'])
        && $allAttr['rel'] === 'stylesheet'
    ) {
      return true;
    }

    // deprecated script-mime-types on external scripts
    if (
        $this->doRemoveDeprecatedTypeFromScriptTag === true
        && $tag === 'script'
        && $attrName === 'type'
        && isset($allAttr['src'], self::$executableScriptsMimeTypes[$attrValue])
    ) {
      return true;
    }

    // 'value=""' on <input type="text">
    if (
        $this->doRemoveValueFromEmptyInput === true
        && $tag === 'input'
        && $attrName === 'value'
        && $attrValue === ''
        && isset($allAttr['type'])
        && $allAttr['type'] === 'text'
    ) {
      return true;
    }

    // some attributes are meaningless when they are empty
    return $this->doRemoveEmptyAttributes === true
           && \trim($attrValue) === ''
           && \preg_match('/^(?:class|id|style|title|lang|dir|on(?:focus|blur|change|click|dblclick|mouse(?:down|up|over|move|out)|key(?:press|down|up)))$/', $attrName) === 1;
  }
1121
1122
  /**
   * Delete comment nodes from the dom.
   *
   * Comments whose text contains a "[" are kept, every other comment node is
   * removed from its parent.
   *
   * @param HtmlDomParser $dom
   *
   * @return HtmlDomParser the same parser instance that was passed in
   */
  private function removeComments(HtmlDomParser $dom): HtmlDomParser
  {
    foreach ($dom->find('//comment()') as $wrappedComment) {
      $commentNode = $wrappedComment->getNode();

      // a "[" marks a comment that has to stay
      if (\strpos($commentNode->nodeValue, '[') !== false) {
        continue;
      }

      $commentNode->parentNode->removeChild($commentNode);
    }

    $dom->getDocument()->normalizeDocument();

    return $dom;
  }
1143
1144
  /**
   * Collapse whitespace in the text nodes directly inside and next to an element.
   *
   * Only elements listed in self::$trimWhitespaceFromTags that have at least
   * one child node are handled. For such an element the first child, the last
   * child, the previous sibling and the next sibling are inspected, and every
   * text node among them gets its whitespace collapsed via self::$regExSpace.
   *
   * @param SimpleHtmlDom $element
   *
   * @return void
   */
  private function removeWhitespaceAroundTags(SimpleHtmlDom $element)
  {
    if (!isset(self::$trimWhitespaceFromTags[$element->tag])) {
      return;
    }

    $node = $element->getNode();
    if ($node->childNodes->length < 1) {
      return;
    }

    $candidates = [
        $node->firstChild,
        $node->lastChild,
        $node->previousSibling,
        $node->nextSibling,
    ];

    // DOM nodes are objects, so iterating by value still modifies the real node
    foreach ($candidates as $candidate) {
      if ($candidate === null || $candidate->nodeType !== \XML_TEXT_NODE) {
        continue;
      }

      // preg_replace() returns null on error; keep the original text in that
      // case instead of wiping the node
      $collapsed = \preg_replace(self::$regExSpace, ' ', $candidate->nodeValue);
      if ($collapsed !== null) {
        $candidate->nodeValue = $collapsed;
      }
    }
  }
1175
1176
  /**
   * Callback function for preg_replace_callback use.
   *
   * Reads the numeric id from the quoted value inside $matches['attributes']
   * and returns the html that was stored under this id in
   * $this->protectedChildNodes.
   *
   * @param array $matches PREG matches, expected to contain the named group "attributes"
   *
   * @return string the protected html, or an empty string if no id could be
   *                extracted or nothing is stored for it
   */
  private function restoreProtectedHtml($matches): string
  {
    // without a usable id there is nothing to restore
    if (
        !isset($matches['attributes'])
        ||
        \preg_match('/.*"(?<id>\d*)"/', $matches['attributes'], $matchesInner) !== 1
    ) {
      return '';
    }

    $id = $matchesInner['id'];
    if (!isset($this->protectedChildNodes[$id])) {
      return '';
    }

    return '' . $this->protectedChildNodes[$id];
  }
1194
1195
  /**
   * Fluent setter for the "domainsToRemoveHttpPrefixFromAttributes" option.
   *
   * The given value is stored as-is, without validation.
   * NOTE(review): judging by the name, this list restricts the "http:"-prefix
   * removal to certain domains - confirm against the code that reads it.
   *
   * @param array $domainsToRemoveHttpPrefixFromAttributes
   *
   * @return $this
   */
  public function setDomainsToRemoveHttpPrefixFromAttributes($domainsToRemoveHttpPrefixFromAttributes)
  {
    $this->domainsToRemoveHttpPrefixFromAttributes = $domainsToRemoveHttpPrefixFromAttributes;

    // allow method chaining
    return $this;
  }
1206
1207
  /**
   * Sort and de-duplicate the class names of a "class" attribute.
   *
   * Values of other attributes and empty values are returned unchanged.
   * The class list is split on ASCII whitespace (space, tab, LF, FF, CR),
   * duplicates are dropped, the names are sorted and joined again with
   * single spaces.
   *
   * @param mixed $attrName  name of the attribute
   * @param mixed $attrValue value of the attribute (expected to be a string)
   *
   * @return string
   */
  private function sortCssClassNames($attrName, $attrValue): string
  {
    if ($attrName !== 'class' || !$attrValue) {
      return $attrValue;
    }

    // split on any whitespace run, so tabs and line breaks separate class
    // names too; empty tokens are dropped by the flag, which keeps a valid
    // class name like "0" (a plain truthiness check would discard it)
    $classes = \preg_split('/[ \t\n\f\r]+/', $attrValue, -1, \PREG_SPLIT_NO_EMPTY);
    if ($classes === false) {
      // regex failure: better to keep the value than to lose class names
      return $attrValue;
    }

    $classes = \array_unique($classes);
    \sort($classes);

    return \implode(' ', $classes);
  }
1237
1238
  /**
   * Sum-up extra whitespace from dom-nodes.
   *
   * Every text node gets its whitespace collapsed via self::$regExSpace,
   * except for text nodes whose node path contains "/<pattern>" for one of
   * the entries in self::$skipTagsForRemoveWhitespace.
   *
   * @param HtmlDomParser $dom
   *
   * @return HtmlDomParser the same parser instance that was passed in
   */
  private function sumUpWhitespace(HtmlDomParser $dom): HtmlDomParser
  {
    foreach ($dom->find('//text()') as $textnodeWrapper) {
      /* @var $textnode \DOMNode */
      $textnode = $textnodeWrapper->getNode();

      // getNodePath() may return null on error; without a path we cannot
      // tell whether the node must be skipped, so it is left untouched
      $xp = $textnode->getNodePath();
      if ($xp === null) {
        continue;
      }

      // text below one of the skip-tags keeps its whitespace
      foreach (self::$skipTagsForRemoveWhitespace as $pattern) {
        if (\strpos($xp, "/$pattern") !== false) {
          continue 2;
        }
      }

      // preg_replace() returns null on error; keep the original text in that
      // case instead of wiping the node
      $collapsed = \preg_replace(self::$regExSpace, ' ', $textnode->nodeValue);
      if ($collapsed !== null) {
        $textnode->nodeValue = $collapsed;
      }
    }

    $dom->getDocument()->normalizeDocument();

    return $dom;
  }
1271
}
1272