Completed
Push — master ( c927d1...313909 )
by Lars
04:16
created

HtmlMin::optimizeAttributes()   C

Complexity

Conditions 13
Paths 13

Size

Total Lines 45
Code Lines 28

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 22
CRAP Score 17.1345

Importance

Changes 0
Metric Value
dl 0
loc 45
ccs 22
cts 31
cp 0.7097
rs 5.1234
c 0
b 0
f 0
cc 13
eloc 28
nc 13
nop 1
crap 17.1345

How to fix   Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method: move a cohesive part of the method body into its own, well-named method.

1
<?php
2
3
namespace voku\helper;
4
5
/**
6
 * Class HtmlMin
7
 *
8
 * Inspired by:
9
 * - JS: https://github.com/kangax/html-minifier/blob/gh-pages/src/htmlminifier.js
10
 * - PHP: https://github.com/searchturbine/phpwee-php-minifier
11
 * - PHP: https://github.com/WyriHaximus/HtmlCompress
12
 * - PHP: https://github.com/zaininnari/html-minifier
13
 * - Java: https://code.google.com/archive/p/htmlcompressor/
14
 *
15
 * @package voku\helper
16
 */
17
class HtmlMin
18
{
19
  /**
20
   * // https://mathiasbynens.be/demo/javascript-mime-type
21
   * // https://developer.mozilla.org/en/docs/Web/HTML/Element/script#attr-type
22
   *
23
   * @var array
24
   */
25
  private static $executableScriptsMimeTypes = array(
26
      'text/javascript'          => '',
27
      'text/ecmascript'          => '',
28
      'text/jscript'             => '',
29
      'application/javascript'   => '',
30
      'application/x-javascript' => '',
31
      'application/ecmascript'   => '',
32
  );
33
34
  private static $selfClosingTags = array(
35
      'area',
36
      'base',
37
      'br',
38
      'col',
39
      'command',
40
      'embed',
41
      'hr',
42
      'img',
43
      'input',
44
      'keygen',
45
      'link',
46
      'meta',
47
      'param',
48
      'source',
49
      'track',
50
      'wbr',
51
  );
52
53
54
  /**
55
   * @var array
56
   */
57
  private static $booleanAttributes = array(
58
      'allowfullscreen' => '',
59
      'async'           => '',
60
      'autofocus'       => '',
61
      'autoplay'        => '',
62
      'checked'         => '',
63
      'compact'         => '',
64
      'controls'        => '',
65
      'declare'         => '',
66
      'default'         => '',
67
      'defaultchecked'  => '',
68
      'defaultmuted'    => '',
69
      'defaultselected' => '',
70
      'defer'           => '',
71
      'disabled'        => '',
72
      'enabled'         => '',
73
      'formnovalidate'  => '',
74
      'hidden'          => '',
75
      'indeterminate'   => '',
76
      'inert'           => '',
77
      'ismap'           => '',
78
      'itemscope'       => '',
79
      'loop'            => '',
80
      'multiple'        => '',
81
      'muted'           => '',
82
      'nohref'          => '',
83
      'noresize'        => '',
84
      'noshade'         => '',
85
      'novalidate'      => '',
86
      'nowrap'          => '',
87
      'open'            => '',
88
      'pauseonexit'     => '',
89
      'readonly'        => '',
90
      'required'        => '',
91
      'reversed'        => '',
92
      'scoped'          => '',
93
      'seamless'        => '',
94
      'selected'        => '',
95
      'sortable'        => '',
96
      'truespeed'       => '',
97
      'typemustmatch'   => '',
98
      'visible'         => '',
99
  );
100
  /**
101
   * @var array
102
   */
103
  private static $skipTagsForRemoveWhitespace = array(
104
      'style',
105
      'pre',
106
      'code',
107
      'script',
108
      'textarea',
109
  );
110
111
  /**
112
   * A random md5-hash, generated via "random_bytes()".
113
   *
114
   * @var string
115
   */
116
  private $randomHash;
117
118
  /**
119
   * @var array
120
   */
121
  private $protectedChildNodes;
122
123
  /**
124
   * @var string
125
   */
126
  private $protectedChildNodesHelper;
127
128
  /**
129
   * @var string
130
   */
131
  private $booleanAttributesHelper;
132
133
  /**
   * HtmlMin constructor.
   *
   * Generates a fresh md5 hash from 16 bytes returned by
   * Bootup::get_random_bytes() and derives the two placeholder names
   * (protected-content element name, boolean-attribute marker value) from it.
   */
  public function __construct()
  {
    $hash = md5(Bootup::get_random_bytes(16));

    $this->randomHash = $hash;
    $this->protectedChildNodes = array();

    // Both placeholders share the same hash suffix.
    $this->protectedChildNodesHelper = 'html-min--saved-content-' . $hash;
    $this->booleanAttributesHelper = 'html-min--delete-this-' . $hash;
  }
144
145
  /**
   * Minify a HTML string.
   *
   * Steps, in order:
   *  1. trim the input; an empty result short-circuits to ''
   *  2. load the markup into a HtmlDomParser and run the DOM passes
   *     (protect script/style content, optimize attributes, remove comments,
   *     collapse whitespace, trim text around selected tags)
   *  3. serialize the DOM and strip single spaces next to tag delimiters
   *  4. restore the protected content and the preserved HTML entities
   *  5. final string clean-up (placeholders, doctype casing, self-closing tags)
   *
   * If the result is longer than the (trimmed) input, the trimmed input is
   * returned unchanged.
   *
   * @param string $html
   *
   * @return string
   */
  public function minify($html)
  {
    // Compare against '' explicitly: a loose "!$html" check would also
    // throw away the valid (if unusual) input "0".
    $html = trim((string)$html);
    if ($html === '') {
      return '';
    }

    // init
    $this->protectedChildNodes = array();
    $origHtml = $html;
    $origHtmlLength = UTF8::strlen($html);

    $dom = new HtmlDomParser();
    $dom->getDocument()->preserveWhiteSpace = false; // remove redundant white space
    $dom->getDocument()->formatOutput = false; // do not format the output with indentation

    $dom->loadHtml($html);

    $dom = $this->protectTagsInDom($dom);
    $dom = $this->optimizeAttributesInDom($dom);
    $dom = $this->removeCommentsInDom($dom);
    $dom = $this->removeWhitespaceInDom($dom);
    $dom = $this->trimTagsInDom($dom);

    $html = $dom->html();

    // -------------------------------------------------------------------------
    // Trim whitespace from html-string. [protected html is still protected]
    // -------------------------------------------------------------------------

    // Remove a single space directly before ">"
    $html = preg_replace('/ (>)/', '$1', $html);
    // Remove a single space directly after "<"
    $html = preg_replace('/(<) /', '$1', $html);
    // Remove a single space between ">" and "<"
    $html = preg_replace('/(>) (<)/', '>$2', $html);

    // -------------------------------------------------------------------------
    // Restore protected HTML-code.
    // -------------------------------------------------------------------------

    $html = preg_replace_callback(
        '/<(?<element>' . $this->protectedChildNodesHelper . ')(?<attributes> [^>]*)?>(?<value>.*?)<\/' . $this->protectedChildNodesHelper . '>/',
        array($this, 'restoreProtectedHtml'),
        $html
    );
    $html = $dom::putReplacedBackToPreserveHtmlEntities($html);

    // ------------------------------------
    // final clean-up
    // ------------------------------------

    $html = UTF8::cleanup($html);

    $html = str_replace(
        array(
            'html>' . "\n",
            "\n" . '<html',
            '<!doctype',
            // boolean attributes were stored as name="<marker>"; dropping the
            // marker leaves the bare attribute name
            '="' . $this->booleanAttributesHelper . '"',
            '</' . $this->protectedChildNodesHelper . '>',
        ),
        array(
            'html>',
            '<html',
            '<!DOCTYPE',
            '',
            '',
        ),
        $html
    );

    // Remove left-over opening placeholder tags. The attribute part is an
    // optional NON-capturing group "(?: ...)"; the former "(:? ...)" was a typo
    // that created a capturing group with an optional leading colon.
    $html = preg_replace(
        '/<(?:' . $this->protectedChildNodesHelper . ')(?: [^>]*)?>/',
        '',
        $html
    );

    // The alternation of void elements never changes, so build it only once.
    static $cacheSelfClosingTags = null;
    if ($cacheSelfClosingTags === null) {
      $cacheSelfClosingTags = implode('|', self::$selfClosingTags);
    }
    // "<tag attrs></tag>" => "<tag attrs/>" for void elements
    $html = preg_replace('#<\b(' . $cacheSelfClosingTags . ')([^>]+)><\/\b\1>#', '<\\1\\2/>', $html);

    // ------------------------------------
    // check if compression worked
    // ------------------------------------

    if ($origHtmlLength < UTF8::strlen($html)) {
      $html = $origHtml;
    }

    return $html;
  }
253
254
  /**
   * Sort HTML-Attributes, so that gzip can do better work
   *  and remove some default attributes.
   *
   * Every attribute is first reset to null on the element; the attributes that
   * survive the filters are then written back in key-sorted order.
   *
   * - Boolean attributes get the internal marker value, which minify() strips
   *   from the final string again.
   * - A leading "http://" of href / src / action becomes "//", unless the
   *   element has rel="external" or target="_blank".
   * - Attributes accepted by optimizeAttributesFilters() are dropped.
   * - The value of "class" is normalized via sortCssClasses().
   *
   * @param SimpleHtmlDom $element
   *
   * @return bool false if getAllAttributes() returned null, true otherwise
   */
  private function optimizeAttributes(SimpleHtmlDom $element)
  {
    $attributes = $element->getAllAttributes();
    if ($attributes === null) {
      return false;
    }

    // Loop-invariant: links marked as external / opened in a new window keep
    // their explicit scheme.
    $keepScheme = (isset($attributes['rel']) && $attributes['rel'] === 'external')
                  ||
                  (isset($attributes['target']) && $attributes['target'] === '_blank');

    $attrs = array();
    foreach ((array)$attributes as $attrName => $attrValue) {

      if (isset(self::$booleanAttributes[$attrName])) {
        $attrs[$attrName] = $this->booleanAttributesHelper;
        $element->{$attrName} = null;
        continue;
      }

      if (
          !$keepScheme
          &&
          ($attrName === 'href' || $attrName === 'src' || $attrName === 'action')
          &&
          strpos($attrValue, 'http://') === 0
      ) {
        // Strip only the leading "http:" so that URLs embedded further inside
        // the value (e.g. "?redirect=http://...") are left untouched.
        $attrValue = substr($attrValue, 5);
      }

      if ($this->optimizeAttributesFilters($element->tag, $attrName, $attrValue, $attributes)) {
        $element->{$attrName} = null;
        continue;
      }

      $attrValue = $this->sortCssClasses($attrName, $attrValue);

      $attrs[$attrName] = $attrValue;
      $element->{$attrName} = null;
    }

    ksort($attrs);
    foreach ($attrs as $attrName => $attrValue) {
      // NOTE(review): static analysis reports replaceToPreserveHtmlEntities()
      // as protected in HtmlDomParser - confirm its visibility.
      $attrValue = HtmlDomParser::replaceToPreserveHtmlEntities($attrValue);
      $element->setAttribute($attrName, $attrValue, true);
    }

    return true;
  }
307
308
  /**
   * Decide whether an attribute is redundant (a default, deprecated or empty
   * value) and can therefore be removed from the tag.
   *
   * @param string $tag       name of the tag the attribute belongs to
   * @param string $attrName  name of the attribute
   * @param string $attrValue value of the attribute
   * @param array  $allAttr   all attributes of the tag (name => value)
   *
   * @return bool true if the attribute can be skipped
   */
  private function optimizeAttributesFilters($tag, $attrName, $attrValue, $allAttr)
  {
    if ($tag === 'script') {

      // default: language="javascript"
      if ($attrName === 'language' && $attrValue === 'javascript') {
        return true;
      }

      // deprecated charset-attribute on a script without "src"
      if ($attrName === 'charset' && !isset($allAttr['src'])) {
        return true;
      }

      // deprecated script-mime-types on a script with "src"
      if ($attrName === 'type' && isset($allAttr['src'], self::$executableScriptsMimeTypes[$attrValue])) {
        return true;
      }

    } elseif ($tag === 'form') {

      // default: method="get"
      if ($attrName === 'method' && $attrValue === 'get') {
        return true;
      }

    } elseif ($tag === 'input') {

      // default: type="text"
      if ($attrName === 'type' && $attrValue === 'text') {
        return true;
      }

      // empty value
      if ($attrName === 'value' && $attrValue === '') {
        return true;
      }

    } elseif ($tag === 'area') {

      // default: shape="rect"
      if ($attrName === 'shape' && $attrValue === 'rect') {
        return true;
      }

    } elseif ($tag === 'a') {

      // deprecated anchor-jump: "name" is dropped when an "id" exists
      if ($attrName === 'name' && isset($allAttr['id'])) {
        return true;
      }

    } elseif ($tag === 'link') {

      // type="text/css" on stylesheet links
      if ($attrName === 'type' && $attrValue === 'text/css' && isset($allAttr['rel']) && $allAttr['rel'] === 'stylesheet') {
        return true;
      }

    }

    // some attributes are pointless when empty, regardless of the tag
    return $attrValue === '' && preg_match('/^(?:class|id|style|title|lang|dir|on(?:focus|blur|change|click|dblclick|mouse(?:down|up|over|move|out)|key(?:press|down|up)))$/', $attrName);
  }
372
373
  /**
   * Run optimizeAttributes() on every element matched by the "*" selector.
   *
   * @param HtmlDomParser $dom
   *
   * @return HtmlDomParser the same parser instance that was passed in
   */
  private function optimizeAttributesInDom(HtmlDomParser $dom)
  {
    $allElements = $dom->find('*');

    foreach ($allElements as $currentElement) {
      $this->optimizeAttributes($currentElement);
    }

    return $dom;
  }
388
389
  /**
   * Prevent changes of inline "styles" and "scripts".
   *
   * The child nodes of every matching element without a "src" attribute are
   * removed from the DOM and their node values are stored in
   * $this->protectedChildNodes. A placeholder element carrying the storage
   * index is appended instead.
   *
   * @param HtmlDomParser $dom
   *
   * @return HtmlDomParser
   */
  private function protectTagsInDom(HtmlDomParser $dom)
  {
    $storageIndex = 0;

    foreach ($dom->find('script, style') as $element) {

      // skip external links
      $isScriptOrStyle = ($element->tag === 'script' || $element->tag === 'style');
      if ($isScriptOrStyle) {
        $attributes = $element->getAllAttributes();
        if (isset($attributes['src'])) {
          continue;
        }
      }

      // move the content out of the DOM, child by child
      $node = $element->getNode();
      while (($firstChild = $node->firstChild) !== null) {
        $this->protectedChildNodes[$storageIndex][] = $firstChild->nodeValue;
        $node->removeChild($firstChild);
      }

      // leave a placeholder that points to the stored content
      /* @var $placeholder \DOMElement */
      $placeholder = $element->getNode()->appendChild(
          new \DOMElement($this->protectedChildNodesHelper)
      );
      $placeholder->setAttribute('data-html-min--saved-content', $storageIndex);

      ++$storageIndex;
    }

    return $dom;
  }
427
428
  /**
   * Remove comments in the dom.
   *
   * Comments whose text starts with "[" are kept (presumably to preserve
   * conditional comments such as "[if IE]" - confirm).
   *
   * @param HtmlDomParser $dom
   *
   * @return HtmlDomParser
   */
  private function removeCommentsInDom(HtmlDomParser $dom)
  {
    $commentWrappers = $dom->find('//comment()');

    foreach ($commentWrappers as $wrapper) {
      $commentNode = $wrapper->getNode();

      if (strpos($commentNode->nodeValue, '[') === 0) {
        continue;
      }

      $commentNode->parentNode->removeChild($commentNode);
    }

    $dom->getDocument()->normalizeDocument();

    return $dom;
  }
449
450
  /**
   * Remove whitespace from dom-nodes.
   *
   * In every text node, each run of two or more whitespace characters is
   * replaced by a single space. Text nodes whose node path contains one of the
   * tags from self::$skipTagsForRemoveWhitespace are left untouched.
   *
   * @param HtmlDomParser $dom
   *
   * @return HtmlDomParser
   */
  private function removeWhitespaceInDom(HtmlDomParser $dom)
  {
    foreach ($dom->find('//text()') as $wrapper) {
      $textNode = $wrapper->getNode();
      $nodePath = $textNode->getNodePath();

      foreach (self::$skipTagsForRemoveWhitespace as $tagName) {
        if (strpos($nodePath, '/' . $tagName) !== false) {
          // whitespace is kept for this text node => go on with the next one
          continue 2;
        }
      }

      $textNode->nodeValue = preg_replace("/\s{2,}/", ' ', $textNode->nodeValue);
    }

    $dom->getDocument()->normalizeDocument();

    return $dom;
  }
483
484
  /**
   * Callback function for preg_replace_callback use.
   *
   * Reads the storage index from the placeholder's attribute string and
   * returns the concatenated content that protectTagsInDom() stored under
   * that index.
   *
   * @param  array $matches PREG matches (named group "attributes" is used)
   *
   * @return string the stored content, or '' if no index could be read or
   *                nothing is stored under it
   */
  private function restoreProtectedHtml($matches)
  {
    // Without a successful match there is no "id" group to read; bail out
    // instead of triggering an undefined-index notice.
    if (
        !isset($matches['attributes'])
        ||
        !preg_match('/.*"(?<id>\d*)"/', $matches['attributes'], $matchesInner)
    ) {
      return '';
    }

    $id = $matchesInner['id'];
    if (!isset($this->protectedChildNodes[$id])) {
      return '';
    }

    return implode('', $this->protectedChildNodes[$id]);
  }
504
505
  /**
   * Sort the class names of a "class" attribute and normalize the whitespace
   * between them to single spaces.
   *
   * Values of other attributes, and empty values, are returned unchanged.
   *
   * @param string $attrName
   * @param string $attrValue
   *
   * @return string
   */
  private function sortCssClasses($attrName, $attrValue)
  {
    if ($attrName !== 'class' || !$attrValue) {
      return $attrValue;
    }

    // Split on any whitespace (not only on a plain space) and drop empty
    // pieces only - a class literally named "0" must survive.
    $classes = preg_split('/\s+/', $attrValue, -1, PREG_SPLIT_NO_EMPTY);
    if (!is_array($classes)) {
      return $attrValue;
    }

    sort($classes);

    return implode(' ', $classes);
  }
534
535
  /**
   * Trim tags in the dom.
   *
   * For every matched element that has child nodes, the text nodes at its
   * inner edges (first / last child) and directly next to it (previous / next
   * sibling) are trimmed.
   *
   * @param HtmlDomParser $dom
   *
   * @return HtmlDomParser
   */
  private function trimTagsInDom(HtmlDomParser $dom)
  {
    $wrappers = $dom->find('//div|//p|//nav|//footer|//article|//script|//hr|//br');

    foreach ($wrappers as $wrapper) {
      $node = $wrapper->getNode();

      // Use DOMNodeList::$length (as protectTagsInDom() does): count() on a
      // DOMNodeList is only meaningful since the class became Countable.
      if ($node->childNodes->length === 0) {
        continue;
      }

      $candidates = array(
          $node->firstChild,
          $node->lastChild,
          $node->previousSibling,
          $node->nextSibling,
      );

      foreach ($candidates as $candidate) {
        if ($candidate !== null && $candidate->nodeType === XML_TEXT_NODE) {
          $candidate->nodeValue = trim($candidate->nodeValue);
        }
      }
    }

    $dom->getDocument()->normalizeDocument();

    return $dom;
  }
572
}
573