Completed
Push — master ( c442f2...6b8e64 )
by Lars
16:50 queued 14:16
created

HtmlMin::sortCssClasses()   B

Complexity

Conditions 6
Paths 5

Size

Total Lines 23
Code Lines 14

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 14
CRAP Score 6.0106

Importance

Changes 0
Metric Value
dl 0
loc 23
ccs 14
cts 15
cp 0.9333
rs 8.5906
c 0
b 0
f 0
cc 6
eloc 14
nc 5
nop 2
crap 6.0106
1
<?php
2
3
namespace voku\helper;
4
5
/**
 * Class HtmlMin
 *
 * Minifies an HTML document: strips comments, collapses whitespace in text
 * nodes, removes default / empty attributes and writes the remaining
 * attributes (and CSS classes) in sorted order.
 *
 * Inspired by:
 * - JS: https://github.com/kangax/html-minifier/blob/gh-pages/src/htmlminifier.js
 * - PHP: https://github.com/searchturbine/phpwee-php-minifier
 * - PHP: https://github.com/zaininnari/html-minifier
 * - Java: https://code.google.com/archive/p/htmlcompressor/
 *
 * @package voku\helper
 */
class HtmlMin
{
  /**
   * Script MIME types whose "type" attribute is dropped from <script src="...">.
   *
   * @see https://mathiasbynens.be/demo/javascript-mime-type
   * @see https://developer.mozilla.org/en/docs/Web/HTML/Element/script#attr-type
   *
   * @var array
   */
  private static $executableScriptsMimeTypes = array(
      'text/javascript',
      'text/ecmascript',
      'text/jscript',
      'application/javascript',
      'application/x-javascript',
      'application/ecmascript',
  );

  /**
   * Attribute names that are written without a value (e.g. <input checked>).
   *
   * @var array
   */
  private static $booleanAttributes = array(
      'allowfullscreen',
      'async',
      'autofocus',
      'autoplay',
      'checked',
      'compact',
      'controls',
      'declare',
      'default',
      'defaultchecked',
      'defaultmuted',
      'defaultselected',
      'defer',
      'disabled',
      'enabled',
      'formnovalidate',
      'hidden',
      'indeterminate',
      'inert',
      'ismap',
      'itemscope',
      'loop',
      'multiple',
      'muted',
      'nohref',
      'noresize',
      'noshade',
      'novalidate',
      'nowrap',
      'open',
      'pauseonexit',
      'readonly',
      'required',
      'reversed',
      'scoped',
      'seamless',
      'selected',
      'sortable',
      'truespeed',
      'typemustmatch',
      'visible',
  );

  /**
   * A random md5 hash, generated from random bytes.
   *
   * It is embedded in the placeholder value "delete-this-<hash>" that marks
   * boolean attributes, so that minify() can strip the value afterwards.
   *
   * @var string
   */
  protected $randomHash;

  /**
   * HtmlMin constructor.
   */
  public function __construct()
  {
    $this->randomHash = md5(Bootup::get_random_bytes(16));
  }

  /**
   * Minify the given HTML.
   *
   * If the processed markup ends up longer than the (trimmed) input, the
   * trimmed input is used instead before the final space clean-up.
   *
   * @param string $html
   *
   * @return string The minified HTML, or an empty string for empty / whitespace-only input.
   */
  public function minify($html)
  {
    $html = trim((string)$html);

    // Compare against '' explicitly: a loose "!$html" check would also
    // discard the valid, non-empty document "0".
    if ($html === '') {
      return '';
    }

    $origHtml = $html;
    $origHtmlLength = UTF8::strlen($html);

    $dom = new HtmlDomParser();
    $dom->getDocument()->preserveWhiteSpace = false;
    $dom->getDocument()->formatOutput = false;

    $dom->loadHtml($html);
    $xpath = new \DOMXPath($dom->getDocument());

    // Remove comments, but keep those starting with "[" (e.g. "[if IE]").
    foreach ($xpath->query('//comment()') as $comment) {
      $val = $comment->nodeValue;
      if (strpos($val, '[') !== 0) {
        $comment->parentNode->removeChild($comment);
      }
    }

    $dom->getDocument()->normalizeDocument();

    // Collapse runs of whitespace in text nodes, except for text whose node
    // path contains one of the whitespace-sensitive elements below.
    $textnodes = $xpath->query('//text()');
    $skip = array('style', 'pre', 'code', 'script', 'textarea');
    foreach ($textnodes as $t) {
      /* @var $t \DOMNode */
      $xp = $t->getNodePath();

      $doSkip = false;
      foreach ($skip as $pattern) {
        if (strpos($xp, "/$pattern") !== false) {
          $doSkip = true;
          break;
        }
      }

      if ($doSkip) {
        continue;
      }

      $t->nodeValue = preg_replace("/\s{2,}/", ' ', $t->nodeValue);
    }

    $dom->getDocument()->normalizeDocument();

    // Trim the text nodes at the inner and outer edges of these elements.
    $divnodes = $xpath->query('//div|//p|//nav|//footer|//article|//script|//hr|//br');
    foreach ($divnodes as $d) {
      $candidates = array();

      // NOTE(review): DOMNodeList is only Countable since PHP 7.2; on older
      // runtimes count() presumably always yields 1 here, so childless
      // elements (<hr>, <br>) would be handled differently - confirm which
      // behavior is intended before changing this check.
      if (count($d->childNodes)) {
        $candidates[] = $d->firstChild;
        $candidates[] = $d->lastChild;
        $candidates[] = $d->previousSibling;
        $candidates[] = $d->nextSibling;
      }

      foreach ($candidates as $c) {
        if ($c === null) {
          continue;
        }

        // 3 === XML_TEXT_NODE
        if ($c->nodeType === 3) {
          $c->nodeValue = trim($c->nodeValue);
        }
      }
    }

    $dom->getDocument()->normalizeDocument();

    $elements = $dom->find('*');
    foreach ($elements as $element) {
      if (count($element) > 1) {
        foreach ($element as $e) {
          $this->optimizeAttributes($e);
        }
      } else {
        $this->optimizeAttributes($element);
      }
    }

    $dom->getDocument()->normalizeDocument();

    // ------------------------------------

    $html = UTF8::cleanup($dom->html());

    // Final clean-up: drop the line breaks around the <html> tag, restore the
    // upper-case doctype and strip the placeholder value of boolean attributes.
    $html = str_replace(
        array(
            'html>' . "\n",
            "\n" . '<html',
            '<!doctype',
            '="delete-this-' . $this->randomHash . '"',
        ),
        array(
            'html>',
            '<html',
            '<!DOCTYPE',
            '',
        ),
        $html
    );

    // Never return something longer than what we were given.
    if ($origHtmlLength < UTF8::strlen($html)) {
      $html = $origHtml;
    }

    // Remove a single space directly before ">"
    $html = preg_replace('/ (>)/', '$1', $html);
    // Remove a single space directly after "<"
    $html = preg_replace('/(<) /', '$1', $html);
    // Remove a single space between ">" and "<"
    $html = preg_replace('/(>) (<)/', '>$2', $html);

    return $html;
  }

  /**
   * Sort HTML-Attributes, so that gzip can do better work
   *  and remove some default attributes.
   *
   * All attributes are first removed from the element and the surviving ones
   * are then written back in key order.
   *
   * @param SimpleHtmlDom $element
   *
   * @return bool false if the element has no attributes, true otherwise
   */
  private function optimizeAttributes(SimpleHtmlDom $element)
  {
    $attributes = $element->getAllAttributes();

    if (!$attributes) {
      return false;
    }

    // Links marked as external or opening in a new window keep their scheme.
    // This does not depend on the attribute being visited, so check it once.
    $keepScheme =
        (isset($attributes['rel']) && $attributes['rel'] === 'external')
        ||
        (isset($attributes['target']) && $attributes['target'] === '_blank');

    $attrs = array();
    foreach ((array)$attributes as $attrName => $attrValue) {

      if (in_array($attrName, self::$booleanAttributes, true)) {
        // Placeholder value; minify() strips it from the final output.
        $attrs[$attrName] = 'delete-this-' . $this->randomHash;
        $element->{$attrName} = null;
        continue;
      }

      // Make the URL protocol-relative. Only a leading "http://" is touched,
      // so URLs embedded later in the value (e.g. in a query string) survive.
      if (
          !$keepScheme
          &&
          ($attrName === 'href' || $attrName === 'src' || $attrName === 'action')
          &&
          strpos($attrValue, 'http://') === 0
      ) {
        $attrValue = substr($attrValue, 5);
      }

      if ($this->optimizeAttributesFilters($element->tag, $attrName, $attrValue, $attributes)) {
        $element->{$attrName} = null;
        continue;
      }

      $attrValue = $this->sortCssClasses($attrName, $attrValue);

      $attrs[$attrName] = $attrValue;
      $element->{$attrName} = null;
    }

    ksort($attrs);
    foreach ($attrs as $attrName => $attrValue) {
      $element->setAttribute($attrName, $attrValue, true);
    }

    return true;
  }

  /**
   * Check if the attribute (key / value) is default and can be skipped.
   *
   * @param string $tag
   * @param string $attrName
   * @param string $attrValue
   * @param array  $allAttr all attributes of the element (name => value)
   *
   * @return bool true if the attribute should be removed
   */
  private function optimizeAttributesFilters($tag, $attrName, $attrValue, $allAttr)
  {
    // remove default
    if ($tag === 'script' && $attrName === 'language' && $attrValue === 'javascript') {
      return true;
    }

    // remove default
    if ($tag === 'form' && $attrName === 'method' && $attrValue === 'get') {
      return true;
    }

    // remove default
    if ($tag === 'input' && $attrName === 'type' && $attrValue === 'text') {
      return true;
    }

    // remove default
    if ($tag === 'area' && $attrName === 'shape' && $attrValue === 'rect') {
      return true;
    }

    // remove deprecated charset-attribute (the Browser will use the charset from the HTTP-Header, anyway)
    if ($tag === 'script' && $attrName === 'charset' && !isset($allAttr['src'])) {
      return true;
    }

    // remove deprecated anchor-jump
    if ($tag === 'a' && $attrName === 'name' && isset($allAttr['id'])) {
      return true;
    }

    // remove "type=text/css" for css links
    if ($tag === 'link' && $attrName === 'type' && $attrValue === 'text/css' && isset($allAttr['rel']) && $allAttr['rel'] === 'stylesheet') {
      return true;
    }

    // remove deprecated script-mime-types
    if ($tag === 'script' && $attrName === 'type' && isset($allAttr['src']) && in_array($attrValue, self::$executableScriptsMimeTypes, true)) {
      return true;
    }

    // remove empty value from <input>
    if ($tag === 'input' && $attrName === 'value' && $attrValue === '') {
      return true;
    }

    // remove some empty attribute
    if ($attrValue === '' && preg_match('/^(?:class|id|style|title|lang|dir|on(?:focus|blur|change|click|dblclick|mouse(?:down|up|over|move|out)|key(?:press|down|up)))$/', $attrName)) {
      return true;
    }

    return false;
  }

  /**
   * Sort the class names of a "class" attribute; other attributes are
   * returned unchanged.
   *
   * @param string $attrName
   * @param string $attrValue
   *
   * @return string
   */
  private function sortCssClasses($attrName, $attrValue)
  {
    if ($attrName !== 'class' || !$attrValue) {
      return $attrValue;
    }

    // Split on any run of HTML whitespace (space, tab, LF, FF, CR) and drop
    // empty tokens. A plain truthiness test would also drop the class "0".
    $classes = preg_split('/[ \t\n\f\r]+/', $attrValue, -1, PREG_SPLIT_NO_EMPTY);
    if ($classes === false) {
      // The split failed: better to keep the value than to lose the classes.
      return $attrValue;
    }

    // Compare as plain strings, so numeric-looking class names are not
    // compared as numbers.
    sort($classes, SORT_STRING);

    return implode(' ', $classes);
  }
}
384