Completed
Push — master ( e3e355...4c22c7 )
by Tim
13:40
created

CleanHtmlService::rTrimLines()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 10

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 6

Importance

Changes 0
Metric Value
dl 0
loc 10
ccs 0
cts 7
cp 0
rs 9.9332
c 0
b 0
f 0
cc 2
nc 2
nop 1
crap 6
1
<?php
2
3
namespace HTML\Sourceopt\Service;
4
5
use HTML\Sourceopt\Manipulation\ManipulationInterface;
6
use HTML\Sourceopt\Manipulation\RemoveBlurScript;
7
use HTML\Sourceopt\Manipulation\RemoveComments;
8
use HTML\Sourceopt\Manipulation\RemoveGenerator;
9
use TYPO3\CMS\Core\Core\Environment;
10
use TYPO3\CMS\Core\SingletonInterface;
11
use TYPO3\CMS\Core\Utility\GeneralUtility;
12
13
/**
14
 * Service: Clean parsed HTML functionality
15
 * Based on the extension 'sourceopt'
16
 */
17
class CleanHtmlService implements SingletonInterface
18
{
19
20
    /**
21
     * Enable Debug comment in footer
22
     *
23
     * @var boolean
24
     */
25
    protected $debugComment = false;
26
27
    /**
28
     * Format Type
29
     *
30
     * @var integer
31
     */
32
    protected $formatType = 0;
33
34
    /**
35
     * Tab character
36
     *
37
     * @var string
38
     */
39
    protected $tab = "\t";
40
41
    /**
42
     * Newline character
43
     *
44
     * @var string
45
     */
46
    protected $newline = "\n";
47
48
    /**
49
     * Configured extra header comment
50
     *
51
     * @var string
52
     */
53
    protected $headerComment = '';
54
55
    /**
56
     * Empty space char
57
     * @var string
58
     */
59
    protected $emptySpaceChar = ' ';
60
61
    /**
     * Set variables based on given config
     *
     * The newline sequence is always re-derived from the operating system.
     * Every other option is only overwritten when the matching key is present
     * in the configuration; missing keys keep the value currently stored on
     * the service.
     *
     * @param array $config Keys read: 'formatHtml', 'formatHtml.' (with
     *                      'tabSize' and 'debugComment'), 'headerComment'
     *                      and 'dropEmptySpaceChar'
     *
     * @return void
     */
    public function setVariables(array $config)
    {
        // Set newline based on OS
        $this->newline = Environment::isWindows() ? "\r\n" : "\n";

        if (empty($config)) {
            return;
        }

        // "??" keeps absent keys from raising "undefined array key" warnings
        $formatHtml = $config['formatHtml'] ?? null;
        if ($formatHtml && is_numeric($formatHtml)) {
            $this->formatType = (int)$formatHtml;
        }

        $formatOptions = $config['formatHtml.'] ?? [];
        if (!is_array($formatOptions)) {
            $formatOptions = [];
        }

        $tabSize = $formatOptions['tabSize'] ?? null;
        if ($tabSize && is_numeric($tabSize)) {
            // The tab "character" becomes a run of spaces of the configured width
            $this->tab = str_pad('', (int)$tabSize, ' ');
        }

        if (isset($formatOptions['debugComment'])) {
            $this->debugComment = (bool)$formatOptions['debugComment'];
        }

        if (isset($config['headerComment'])) {
            $this->headerComment = $config['headerComment'];
        }

        if (!empty($config['dropEmptySpaceChar'])) {
            $this->emptySpaceChar = '';
        }
    }
99
100
    /**
     * Clean given HTML with formatter
     *
     * With a non-empty configuration the service is only active when the
     * 'enabled' flag is set and truthy; otherwise the HTML is returned
     * untouched. The header comment is inserted first, then the configured
     * manipulations run in a fixed order, and finally the markup is
     * re-formatted when a format type greater than 0 is configured.
     *
     * @param string $html
     * @param array $config
     *
     * @return string
     */
    public function clean($html, $config = [])
    {
        if (!empty($config)) {
            // empty() also covers a missing 'enabled' key without raising a warning
            if (empty($config['enabled'])) {
                return $html;
            }

            $this->setVariables($config);
        }

        if (!empty($this->headerComment)) {
            $this->includeHeaderComment($html);
        }

        // Configuration key => manipulation class, listed in execution order
        $availableManipulations = [
            'removeGenerator' => RemoveGenerator::class,
            'removeComments' => RemoveComments::class,
            'removeBlurScript' => RemoveBlurScript::class,
        ];

        foreach ($availableManipulations as $key => $className) {
            if (empty($config[$key])) {
                continue;
            }

            /** @var ManipulationInterface $manipulation */
            $manipulation = GeneralUtility::makeInstance($className);
            // Options for a manipulation live under the same key with a trailing dot
            $optionsKey = $key . '.';
            $configuration = isset($config[$optionsKey]) && is_array($config[$optionsKey])
                ? $config[$optionsKey]
                : [];
            $html = $manipulation->manipulate($html, $configuration);
        }

        if ($this->formatType > 0) {
            $html = $this->formatHtml($html);
        }

        return $html;
    }
148
149
    /**
     * Formats the (X)HTML code:
     *  - tabs according to the hierarchy of the tags
     *  - removes empty spaces between tags
     *  - removes linebreaks within tags (spares where necessary: pre, textarea, comments, ..)
     *  the configured format type selects one of six modes:
     *    0 => off
     *    1 => no line break at all  (code in one line)
     *    2 => minimalistic line breaks (structure defining box-elements)
     *    3 => aesthetic line breaks (important box-elements)
     *    4 => logic line breaks (all box-elements)
     *    5 => max line breaks (all elements)
     *
     * If one of the regular expressions fails, the unmodified input is returned.
     *
     * @param string $html
     *
     * @return string
     */
    protected function formatHtml($html)
    {
        $original = $html;

        // Save original formatted comments, pre, textarea, styles and java-scripts & replace them with markers.
        // Every match is swapped for its marker in one pass, at its own position, so a block whose text
        // also occurs inside a later block cannot corrupt that later block.
        $noFormat = []; // do not format these block elements
        $html = preg_replace_callback(
            '/(?s)((<!--.*?-->)|(<[ \n\r]*pre[^>]*>.*?<[ \n\r]*\/pre[^>]*>)|(<[ \n\r]*textarea[^>]*>.*?<[ \n\r]*\/textarea[^>]*>)|(<[ \n\r]*style[^>]*>.*?<[ \n\r]*\/style[^>]*>)|(<[ \n\r]*script[^>]*>.*?<[ \n\r]*\/script[^>]*>))/im',
            static function ($match) use (&$noFormat) {
                $noFormat[] = $match[0];

                return "\n<!-- ELEMENT " . (count($noFormat) - 1) . ' -->';
            },
            $html
        );
        if ($html === null) {
            return $original;
        }

        // define box elements for formatting
        $trueBoxElements = 'address|blockquote|center|dir|div|dl|fieldset|form|h1|h2|h3|h4|h5|h6|hr|isindex|menu|noframes|noscript|ol|p|pre|table|ul|article|aside|details|figcaption|figure|footer|header|hgroup|menu|nav|section';
        $functionalBoxElements = 'dd|dt|frameset|li|tbody|td|tfoot|th|thead|tr|colgroup';
        $usableBoxElements = 'applet|button|del|iframe|ins|map|object|script';
        $imagineBoxElements = 'html|body|head|meta|title|link|script|base|!--';
        $allBoxLikeElements = '(?>' . $trueBoxElements . '|' . $functionalBoxElements . '|' . $usableBoxElements . '|' . $imagineBoxElements . ')';
        $esteticBoxLikeElements = '(?>html|head|body|meta name|title|div|table|h1|h2|h3|h4|h5|h6|p|form|pre|center|!--)';
        $structureBoxLikeElements = '(?>html|head|body|div|!--)';

        // Element group that triggers line breaks for the configured format type
        // (null: format type 1, only an XML declaration forces a line break)
        $lineBreakElements = null;
        if ($this->formatType == 2) {
            $lineBreakElements = $structureBoxLikeElements; // minimalistic line break
        } elseif ($this->formatType == 3) {
            $lineBreakElements = $esteticBoxLikeElements; // aesthetic line break
        } elseif ($this->formatType >= 4) {
            $lineBreakElements = $allBoxLikeElements; // logical line break
        }

        // split html into its elements
        $fragments = preg_split(
            '/(<(?:[^<>]+(?:"[^"]*"|\'[^\']*\')?)+>)/',
            $html,
            -1,
            PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
        );

        if ($fragments === false) {
            // Return the untouched input; $html already contains placeholders at this point
            return $original;
        }

        // Tag prefixes that never open a nesting level (void elements, declarations, "< " in text)
        $nonNestingPrefixes = [
            '< ', '<img', '<source', '<br', '<hr', '<input', '<link', '<meta', '<col ',
            '<frame', '<isindex', '<param', '<area', '<base', '<!', '<?xml',
        ];

        // rebuild html
        $html = '';
        $tabs = 0;
        $previous = ''; // the first fragment has no predecessor
        foreach ($fragments as $fragment) {
            // whitespace-only fragments collapse to the configured empty space char
            $current = trim($fragment) !== '' ? $fragment : $this->emptySpaceChar;

            // check if the element should stand in a new line
            $newline = $this->startsWithAny($previous, ['<?xml'])
                || ($lineBreakElements !== null
                    && $this->isLineBreakBoundary($current, $previous, $lineBreakElements));

            // count down a tab
            if ($this->startsWithAny($current, ['</'])) {
                $tabs--;
            }

            // add tabs and line breaks in front of the current tag
            if ($newline) {
                // $tabs can drop below zero on unbalanced markup
                $html .= $this->newline . str_repeat($this->tab, max(0, $tabs));
            }

            // remove white spaces and line breaks and add current tag to the html-string
            if ($this->startsWithAny($previous, ['<pre', '<textarea', '<!--'])) {
                // remove white space after line ending in PRE / TEXTAREA / comment
                $html .= $this->rTrimLines($current);
            } elseif ($this->startsWithAny($current, ['<![CDATA[', '<?xml'])) {
                // remove multiple white space in CDATA / XML
                $html .= $this->killWhiteSpace($current);
            } else {
                // remove all line breaks
                $html .= $this->killLineBreaks($current);
            }

            // count up a tab
            if (
                $this->startsWithAny($current, ['<'])
                && !$this->startsWithAny($current, ['</'])
                && !$this->startsWithAny($current, $nonNestingPrefixes)
            ) {
                $tabs++;
            }

            $previous = $current;
        }

        // Remove empty lines
        if ($this->formatType > 1) {
            $this->removeEmptyLines($html);
        }

        // Restore saved comments, styles and java-scripts
        foreach ($noFormat as $i => $block) {
            // remove white space after line ending
            $html = str_replace("<!-- ELEMENT $i -->", $this->rTrimLines($block), $html);
        }

        // include debug comment at the end
        if ($tabs != 0 && $this->debugComment === true) {
            $html .= '<!--' . $tabs . " open elements found-->\r\n";
        }

        return $html;
    }

    /**
     * Tells whether a string begins with at least one of the given prefixes (case-sensitive)
     *
     * @param string $subject
     * @param string[] $prefixes
     *
     * @return bool
     */
    private function startsWithAny($subject, array $prefixes)
    {
        foreach ($prefixes as $prefix) {
            if (strncmp($subject, $prefix, strlen($prefix)) === 0) {
                return true;
            }
        }

        return false;
    }

    /**
     * Decides whether a line break belongs between two neighbouring fragments
     *
     * @param string $current Fragment about to be appended
     * @param string $previous Fragment appended before it ('' for the first one)
     * @param string $elements Regex group of the element names that cause line breaks
     *
     * @return bool
     */
    private function isLineBreakBoundary($current, $previous, $elements)
    {
        // this element has a line break before itself
        // (a self-closing tag is matched by the same pattern, so no separate check is needed)
        if (preg_match('/<' . $elements . '(.*)>/Usi', $current)) {
            return true;
        }

        // one element before is an element that has a line break after
        return preg_match('/<\/' . $elements . '(.*)>/Usi', $previous)
            || strncmp($previous, '<!--', 4) === 0
            || preg_match('/<' . $elements . '(.*) \/>/Usi', $previous);
    }
343
344
    /**
     * Remove ALL line breaks and multiple white space
     *
     * Line breaks are normalised to the configured newline sequence and then
     * dropped; every run of two or more whitespace characters is replaced by
     * a single space.
     *
     * @param string $html
     *
     * @return string
     */
    protected function killLineBreaks($html)
    {
        $html = $this->convNlOs($html);
        $html = str_replace($this->newline, '', $html);

        $collapsed = preg_replace('/\s\s+/u', ' ', $html);
        if ($collapsed === null) {
            // The /u modifier rejects invalid UTF-8 and makes preg_replace() return null,
            // which would silently drop the fragment; retry without UTF-8 validation.
            $collapsed = preg_replace('/\s\s+/', ' ', $html);
        }

        return $collapsed === null ? $html : $collapsed;
    }
358
359
    /**
     * Remove multiple white space, keeps line breaks
     *
     * Every line is trimmed, runs of whitespace inside a line are collapsed
     * to a single space and lines that are empty after trimming are dropped.
     *
     * @param string $html
     *
     * @return string
     */
    protected function killWhiteSpace($html)
    {
        $html = $this->convNlOs($html);

        $lines = [];
        // Collect into a fresh list instead of unsetting entries of the array being
        // iterated: a loop bounded by a live count() stops early once entries are removed.
        foreach (explode($this->newline, $html) as $line) {
            $line = trim($line);
            // Strict comparison keeps a line that consists of "0" only
            if ($line === '') {
                continue;
            }
            $lines[] = preg_replace('/\s\s+/', ' ', $line);
        }

        return implode($this->newline, $lines);
    }
381
382
    /**
     * Strip trailing white space from every line
     *
     * Leading and inner white space as well as the line breaks themselves are
     * kept; line breaks are normalised to the configured newline sequence.
     *
     * @param string $html
     *
     * @return string
     */
    protected function rTrimLines($html)
    {
        $lines = explode($this->newline, $this->convNlOs($html));

        return implode($this->newline, array_map('rtrim', $lines));
    }
399
400
    /**
     * Normalise all line breaks to the configured newline sequence
     *
     * Matches "\r\n", "\n" and "\r" and replaces each with the newline
     * property of this service.
     *
     * @param string $html
     *
     * @return string
     */
    protected function convNlOs($html)
    {
        // The outer parentheses act as the pattern delimiters here
        return preg_replace("(\r\n|\n|\r)", $this->newline, $html);
    }
412
413
    /**
     * Strip tabs everywhere and white space around every line
     *
     * All tab characters are removed, line breaks are converted for the
     * current operating system, and each line is trimmed on both sides.
     *
     * @param string $html Html-Code, modified in place
     *
     * @return void
     */
    protected function trimLines(&$html)
    {
        $withoutTabs = str_replace("\t", "", $html);

        // convert newlines according to the current OS
        $converted = Environment::isWindows()
            ? str_replace("\n", "\r\n", $withoutTabs)
            : str_replace("\r\n", "\n", $withoutTabs);

        $lines = array_map('trim', explode($this->newline, $converted));
        $html = implode($this->newline, $lines);
    }
434
435
    /**
     * Drop every line that is empty or contains white space only
     *
     * Lines are split and re-joined with the configured newline sequence.
     *
     * @param string $html Modified in place
     *
     * @return void
     */
    protected function removeEmptyLines(&$html)
    {
        $keptLines = array_filter(
            explode($this->newline, $html),
            static function ($line) {
                return trim($line) !== '';
            }
        );

        // implode() ignores the gaps array_filter() leaves in the keys
        $html = implode($this->newline, $keptLines);
    }
454
455
    /**
     * Remove new lines where unnecessary
     * spares line breaks within: pre, textarea, ...
     *
     * The HTML is left untouched if the split expression fails.
     *
     * @param string $html Modified in place
     *
     * @return void
     */
    protected function removeNewLines(&$html)
    {
        $splitArray = [
            'textarea',
            'pre'
        ]; // possibly also: span, script, style
        $pieces = preg_split('#(<(' . implode('|', $splitArray) . ').*>.*</\2>)#Uis', $html, -1, PREG_SPLIT_DELIM_CAPTURE);
        if ($pieces === false) {
            // Keep the input rather than wiping it when the expression cannot be evaluated
            return;
        }

        // With two capture groups the pieces repeat as: text, whole element, tag name
        $result = '';
        foreach ($pieces as $index => $piece) {
            switch ($index % 3) {
                case 1:
                    // complete <pre>/<textarea> element: keep as is
                    $result .= $piece;
                    break;
                case 2:
                    // tag name captured for the back reference: not part of the output
                    break;
                default:
                    $result .= $this->killLineBreaks($piece);
            }
        }
        $html = $result;
    }
478
479
    /**
     * Remove obsolete link schema
     *
     * Strips <link rel="schema.dc" ...> tags (quotes around the rel value
     * optional, case-insensitive). The HTML is left untouched if the
     * expression fails.
     *
     * @param string $html Modified in place
     *
     * @return void
     */
    protected function removeLinkSchema(&$html)
    {
        // The dot is escaped so only a literal "schema.dc" matches
        $cleaned = preg_replace('/<link rel="?schema\.dc"?.+?>/is', '', $html);
        if ($cleaned !== null) {
            $html = $cleaned;
        }
    }
490
491
    /**
     * Strip every occurrence of an empty alt attribute (alt="")
     *
     * @param string $html Modified in place
     *
     * @return void
     */
    protected function removeEmptyAltAtr(&$html)
    {
        $emptyAlt = 'alt=""';
        $html = str_replace($emptyAlt, '', $html);
    }
502
503
    /**
     * Replace the broken root link href=".html" with an empty href
     *
     * @param string $html Modified in place
     *
     * @return void
     */
    protected function removeRealUrlBrokenRootLink(&$html)
    {
        $brokenRootLink = 'href=".html"';
        $emptyLink = 'href=""';
        $html = str_replace($brokenRootLink, $emptyLink, $html);
    }
514
515
    /**
     * Insert the configured header comment into the HTML
     *
     * The comment is appended, on a new line indented by two tabs, to the
     * first <meta http-equiv ...> tag found. Nothing happens when no header
     * comment is configured.
     *
     * @param string $html Modified in place
     *
     * @return void
     */
    public function includeHeaderComment(&$html)
    {
        if (empty($this->headerComment)) {
            return;
        }

        $comment = $this->newline . $this->tab . $this->tab . '<!-- ' . $this->headerComment . '-->';

        $html = preg_replace_callback(
            '/<meta http-equiv(.*)>/Usi',
            static function ($matches) use ($comment) {
                return trim($matches[0] . $comment);
            },
            $html,
            1 // only the first meta tag receives the comment
        );
    }
528
}
529