Completed
Push — master ( 041c4d...dec97d )
by Tim
03:47
created
Classes/Service/CleanHtmlService.php 1 patch
Indentation   +517 added lines, -517 removed lines patch added patch discarded remove patch
@@ -13,521 +13,521 @@
 block discarded – undo
13 13
 class CleanHtmlService implements SingletonInterface
14 14
 {
15 15
 
16
-    /**
17
-     * Enable Debug comment in footer
18
-     *
19
-     * @var boolean
20
-     */
21
-    protected $debugComment = false;
22
-
23
-    /**
24
-     * Format Type
25
-     *
26
-     * @var integer
27
-     */
28
-    protected $formatType = 2;
29
-
30
-    /**
31
-     * Tab character
32
-     *
33
-     * @var string
34
-     */
35
-    protected $tab = "\t";
36
-
37
-    /**
38
-     * Newline character
39
-     *
40
-     * @var string
41
-     */
42
-    protected $newline = "\n";
43
-
44
-    /**
45
-     * Enable/disable UTF8 support
46
-     *
47
-     * @var boolean
48
-     */
49
-    protected $utf8 = true;
50
-
51
-    /**
52
-     * Configured extra header comment
53
-     *
54
-     * @var string
55
-     */
56
-    protected $headerComment = '';
57
-
58
-    /**
59
-     * Set variables based on given config
60
-     *
61
-     * @param array $config
62
-     *
63
-     * @return void
64
-     */
65
-    public function setVariables(array $config)
66
-    {
67
-        switch (TYPO3_OS) { // set newline
68
-            case 'WIN':
69
-                $this->newline = "\r\n";
70
-                break;
71
-            default:
72
-                $this->newline = "\n";
73
-        }
74
-
75
-        if (!empty($config)) {
76
-            if ($config['formatHtml'] && is_numeric($config['formatHtml'])) {
77
-                $this->formatType = (int)$config['formatHtml'];
78
-            }
79
-
80
-            if ($config['formatHtml.']['tabSize'] && is_numeric($config['formatHtml.']['tabSize'])) {
81
-                $this->tab = str_pad('', $config['formatHtml.']['tabSize'], ' ');
82
-            }
83
-
84
-            if (isset($config['enable_utf'])) {
85
-                $this->utf8 = (bool)$config['enable_utf-8_support'];
86
-            }
87
-
88
-            if (isset($config['formatHtml.']['debugComment'])) {
89
-                $this->debugComment = (bool)$config['formatHtml.']['debugComment'];
90
-            }
91
-
92
-            if (isset($config['headerComment'])) {
93
-                $this->headerComment = $config['headerComment'];
94
-            }
95
-        }
96
-    }
97
-
98
-    /**
99
-     * Clean given HTML with formatter
100
-     *
101
-     * @param string $html
102
-     * @param array  $config
103
-     *
104
-     * @return void
105
-     */
106
-    public function clean(&$html, $config = [])
107
-    {
108
-        if (!empty($config)) {
109
-            if ((bool)$config['enabled'] === false) {
110
-                return;
111
-            }
112
-
113
-            $this->setVariables($config);
114
-        }
115
-
116
-        $manipulations = [];
117
-
118
-        if (isset($config['removeGenerator']) && (bool)$config['removeGenerator']) {
119
-            $manipulations['removeGenerator'] = GeneralUtility::makeInstance('HTML\\Sourceopt\\Manipulation\\RemoveGenerator');
120
-        }
121
-
122
-        if (isset($config['removeComments']) && (bool)$config['removeComments']) {
123
-            $manipulations['removeComments'] = GeneralUtility::makeInstance('HTML\\Sourceopt\\Manipulation\\RemoveComments');
124
-        }
125
-
126
-        if (isset($config['removeBlurScript']) && (bool)$config['removeBlurScript']) {
127
-            $manipulations['removeBlurScript'] = GeneralUtility::makeInstance('HTML\\Sourceopt\\Manipulation\\RemoveBlurScript');
128
-        }
129
-
130
-        if (!empty($this->headerComment)) {
131
-            $this->includeHeaderComment($html);
132
-        }
133
-
134
-        foreach ($manipulations as $key => $manipulation) {
135
-            /** @var ManipulationInterface $manipulation */
136
-            $configuration = isset($config[$key . '.']) && is_array($config[$key . '.']) ? $config[$key . '.'] : [];
137
-            $html = $manipulation->manipulate($html, $configuration);
138
-        }
139
-
140
-        if ($this->formatType) {
141
-            $this->formatHtml($html);
142
-        }
143
-    }
144
-
145
-    /**
146
-     * Formats the (X)HTML code:
147
-     *  - taps according to the hirarchy of the tags
148
-     *  - removes empty spaces between tags
149
-     *  - removes linebreaks within tags (spares where necessary: pre, textarea, comments, ..)
150
-     *  choose from five options:
151
-     *    0 => off
152
-     *    1 => no line break at all  (code in one line)
153
-     *    2 => minimalistic line breaks (structure defining box-elements)
154
-     *    3 => aesthetic line breaks (important box-elements)
155
-     *    4 => logic line breaks (all box-elements)
156
-     *    5 => max line breaks (all elements)
157
-     *
158
-     * @param string $html
159
-     *
160
-     * @return void
161
-     */
162
-    protected function formatHtml(&$html)
163
-    {
164
-        // Save original formated comments, pre, textarea, styles and java-scripts & replace them with markers
165
-        preg_match_all(
166
-            '/(?s)((<!--.*?-->)|(<[ \n\r]*pre[^>]*>.*?<[ \n\r]*\/pre[^>]*>)|(<[ \n\r]*textarea[^>]*>.*?<[ \n\r]*\/textarea[^>]*>)|(<[ \n\r]*style[^>]*>.*?<[ \n\r]*\/style[^>]*>)|(<[ \n\r]*script[^>]*>.*?<[ \n\r]*\/script[^>]*>))/im',
167
-            $html,
168
-            $matches
169
-        );
170
-        $no_format = $matches[0]; // do not format these block elements
171
-        for ($i = 0; $i < count($no_format); $i++) {
172
-            $html = str_replace($no_format[$i], "\n<!-- ELEMENT $i -->", $html);
173
-        }
174
-
175
-        // define box elements for formatting
176
-        $trueBoxElements = 'address|blockquote|center|dir|div|dl|fieldset|form|h1|h2|h3|h4|h5|h6|hr|isindex|menu|noframes|noscript|ol|p|pre|table|ul|article|aside|details|figcaption|figure|footer|header|hgroup|menu|nav|section';
177
-        $functionalBoxElements = 'dd|dt|frameset|li|tbody|td|tfoot|th|thead|tr|colgroup';
178
-        $usableBoxElements = 'applet|button|del|iframe|ins|map|object|script';
179
-        $imagineBoxElements = 'html|body|head|meta|title|link|script|base|!--';
180
-        $allBoxLikeElements = '(?>' . $trueBoxElements . '|' . $functionalBoxElements . '|' . $usableBoxElements . '|' . $imagineBoxElements . ')';
181
-        $esteticBoxLikeElements = '(?>html|head|body|meta name|title|div|table|h1|h2|h3|h4|h5|h6|p|form|pre|center|!--)';
182
-        $structureBoxLikeElements = '(?>html|head|body|div|!--)';
183
-
184
-        // split html into it's elements
185
-        $html_array_temp = preg_split(
186
-            '/(<(?:[^<>]+(?:"[^"]*"|\'[^\']*\')?)+>)/',
187
-            $html,
188
-            -1,
189
-            PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
190
-        );
191
-        // remove empty lines
192
-        $html_array = [''];
193
-        $z = 1;
194
-        for ($x = 0; $x < count($html_array_temp); $x++) {
195
-            $t = trim($html_array_temp[$x]);
196
-            if ($t !== '') {
197
-                $html_array[$z] = $html_array_temp[$x];
198
-                $z++;
199
-                // if the trimmed line was empty but the original wasn't, search for inline element closing tags in the last $html_array element
200
-            } else {
201
-                // if ($t !== $html_array_temp[$x] && preg_match('/<\/' . $inlineElements . '( .*)? >/Usi', $html_array[$z - 1]) === 1)
202
-                $html_array[$z] = ' ';
203
-                $z++;
204
-            }
205
-        }
206
-
207
-        // rebuild html
208
-        $html = '';
209
-        $tabs = 0;
210
-        for ($x = 0; $x < count($html_array); $x++) {
211
-            // check if the element should stand in a new line
212
-            $newline = false;
213
-            if (substr($html_array[$x - 1], 0, 5) == '<?xml') {
214
-                $newline = true;
215
-            } elseif ($this->formatType == 2 && ( // minimalistic line break
216
-                    # this element has a line break before itself
217
-                    preg_match(
218
-                        '/<' . $structureBoxLikeElements . '(.*)>/Usi',
219
-                        $html_array[$x]
220
-                    ) || preg_match(
221
-                        '/<' . $structureBoxLikeElements . '(.*) \/>/Usi',
222
-                        $html_array[$x]
223
-                    ) || # one element before is a element that has a line break after
224
-                    preg_match(
225
-                        '/<\/' . $structureBoxLikeElements . '(.*)>/Usi',
226
-                        $html_array[$x - 1]
227
-                    ) || substr(
228
-                        $html_array[$x - 1],
229
-                        0,
230
-                        4
231
-                    ) == '<!--' || preg_match('/<' . $structureBoxLikeElements . '(.*) \/>/Usi', $html_array[$x - 1]))
232
-            ) {
233
-                $newline = true;
234
-            } elseif ($this->formatType == 3 && ( // aestetic line break
235
-                    # this element has a line break before itself
236
-                    preg_match(
237
-                        '/<' . $esteticBoxLikeElements . '(.*)>/Usi',
238
-                        $html_array[$x]
239
-                    ) || preg_match(
240
-                        '/<' . $esteticBoxLikeElements . '(.*) \/>/Usi',
241
-                        $html_array[$x]
242
-                    ) || # one element before is a element that has a line break after
243
-                    preg_match('/<\/' . $esteticBoxLikeElements . '(.*)>/Usi', $html_array[$x - 1]) || substr(
244
-                        $html_array[$x - 1],
245
-                        0,
246
-                        4
247
-                    ) == '<!--' || preg_match('/<' . $esteticBoxLikeElements . '(.*) \/>/Usi', $html_array[$x - 1]))
248
-            ) {
249
-                $newline = true;
250
-            } elseif ($this->formatType >= 4 && ( // logical line break
251
-                    # this element has a line break before itself
252
-                    preg_match(
253
-                        '/<' . $allBoxLikeElements . '(.*)>/Usi',
254
-                        $html_array[$x]
255
-                    ) || preg_match(
256
-                        '/<' . $allBoxLikeElements . '(.*) \/>/Usi',
257
-                        $html_array[$x]
258
-                    ) || # one element before is a element that has a line break after
259
-                    preg_match('/<\/' . $allBoxLikeElements . '(.*)>/Usi', $html_array[$x - 1]) || substr(
260
-                        $html_array[$x - 1],
261
-                        0,
262
-                        4
263
-                    ) == '<!--' || preg_match('/<' . $allBoxLikeElements . '(.*) \/>/Usi', $html_array[$x - 1]))
264
-            ) {
265
-                $newline = true;
266
-            }
267
-
268
-            // count down a tab
269
-            if (substr($html_array[$x], 0, 2) == '</') {
270
-                $tabs--;
271
-            }
272
-
273
-            // add tabs and line breaks in front of the current tag
274
-            if ($newline) {
275
-                $html .= $this->newline;
276
-                for ($y = 0; $y < $tabs; $y++) {
277
-                    $html .= $this->tab;
278
-                }
279
-            }
280
-
281
-            // remove white spaces and line breaks and add current tag to the html-string
282
-            if (substr($html_array[$x - 1], 0, 4) == '<pre' // remove white space after line ending in PRE / TEXTAREA / comment
283
-                || substr($html_array[$x - 1], 0, 9) == '<textarea' || substr($html_array[$x - 1], 0, 4) == '<!--'
284
-            ) {
285
-                $html .= $this->rTrimLines($html_array[$x]);
286
-            } elseif (substr($html_array[$x], 0, 9) == '<![CDATA[' // remove multiple white space in CDATA / XML
287
-                || substr($html_array[$x], 0, 5) == '<?xml'
288
-            ) {
289
-                $html .= $this->killWhiteSpace($html_array[$x]);
290
-            } else { // remove all line breaks
291
-                $html .= $this->killLineBreaks($html_array[$x]);
292
-            }
293
-
294
-            // count up a tab
295
-            if (substr($html_array[$x], 0, 1) == '<' && substr($html_array[$x], 1, 1) != '/') {
296
-                if (substr($html_array[$x], 1, 1) != ' ' && substr($html_array[$x], 1, 3) != 'img' && substr(
297
-                    $html_array[$x],
298
-                    1,
299
-                    2
300
-                ) != 'br' && substr($html_array[$x], 1, 2) != 'hr' && substr(
301
-                    $html_array[$x],
302
-                    1,
303
-                    5
304
-                ) != 'input' && substr($html_array[$x], 1, 4) != 'link' && substr(
305
-                    $html_array[$x],
306
-                    1,
307
-                    4
308
-                ) != 'meta' && substr($html_array[$x], 1, 4) != 'col ' && substr(
309
-                    $html_array[$x],
310
-                    1,
311
-                    5
312
-                ) != 'frame' && substr($html_array[$x], 1, 7) != 'isindex' && substr(
313
-                    $html_array[$x],
314
-                    1,
315
-                    5
316
-                ) != 'param' && substr($html_array[$x], 1, 4) != 'area' && substr(
317
-                    $html_array[$x],
318
-                    1,
319
-                    4
320
-                ) != 'base' && substr($html_array[$x], 0, 2) != '<!' && substr($html_array[$x], 0, 5) != '<?xml'
321
-                ) {
322
-                    $tabs++;
323
-                }
324
-            }
325
-        }
326
-
327
-        // Remove empty lines
328
-        if ($this->formatType > 1) {
329
-            $this->removeEmptyLines($html);
330
-        }
331
-
332
-        // Restore saved comments, styles and java-scripts
333
-        for ($i = 0; $i < count($no_format); $i++) {
334
-            $no_format[$i] = $this->rTrimLines($no_format[$i]); // remove white space after line ending
335
-            $html = str_replace("<!-- ELEMENT $i -->", $no_format[$i], $html);
336
-        }
337
-
338
-        // include debug comment at the end
339
-        if ($tabs != 0 && $this->debugComment === true) {
340
-            $html .= '<!--' . $tabs . " open elements found-->\r\n";
341
-        }
342
-    }
343
-
344
-    /**
345
-     * Remove ALL line breaks and multiple white space
346
-     *
347
-     * @param string $html
348
-     *
349
-     * @return string
350
-     */
351
-    protected function killLineBreaks($html)
352
-    {
353
-        $html = $this->convNlOs($html);
354
-        $html = str_replace($this->newline, "", $html);
355
-        // remove double empty spaces
356
-        if ($this->utf8 == true) {
357
-            $html = preg_replace('/\s\s+/u', ' ', $html);
358
-        } else {
359
-            $html = preg_replace('/\s\s+/', ' ', $html);
360
-        }
361
-        return $html;
362
-    }
363
-
364
-    /**
365
-     * Remove multiple white space, keeps line breaks
366
-     *
367
-     * @param string $html
368
-     *
369
-     * @return string
370
-     */
371
-    protected function killWhiteSpace($html)
372
-    {
373
-        $html = $this->convNlOs($html);
374
-        $temp = explode($this->newline, $html);
375
-        for ($i = 0; $i < count($temp); $i++) {
376
-            if (!trim($temp[$i])) {
377
-                unset($temp[$i]);
378
-            } else {
379
-                $temp[$i] = trim($temp[$i]);
380
-                $temp[$i] = preg_replace('/\s\s+/', ' ', $temp[$i]);
381
-            }
382
-        }
383
-        $html = implode($this->newline, $temp);
384
-        return $html;
385
-    }
386
-
387
-    /**
388
-     * Remove white space at the end of lines, keeps other white space and line breaks
389
-     *
390
-     * @param string $html
391
-     *
392
-     * @return string
393
-     */
394
-    protected function rTrimLines($html)
395
-    {
396
-        $html = $this->convNlOs($html);
397
-        $temp = explode($this->newline, $html);
398
-        for ($i = 0; $i < count($temp); $i++) {
399
-            $temp[$i] = rtrim($temp[$i]);
400
-        }
401
-        $html = implode($this->newline, $temp);
402
-        return $html;
403
-    }
404
-
405
-    /**
406
-     * Convert newlines according to the current OS
407
-     *
408
-     * @param string $html
409
-     *
410
-     * @return string
411
-     */
412
-    protected function convNlOs($html)
413
-    {
414
-        $html = preg_replace("(\r\n|\n|\r)", $this->newline, $html);
415
-        return $html;
416
-    }
417
-
418
-    /**
419
-     * Remove tabs and empty spaces before and after lines, transforms linebreaks system conform
420
-     *
421
-     * @param string $html Html-Code
422
-     *
423
-     * @return void
424
-     */
425
-    protected function trimLines(&$html)
426
-    {
427
-        $html = str_replace("\t", "", $html);
428
-        // convert newlines according to the current OS
429
-        if (TYPO3_OS == "WIN") {
430
-            $html = str_replace("\n", "\r\n", $html);
431
-        } else {
432
-            $html = str_replace("\r\n", "\n", $html);
433
-        }
434
-        $temp = explode($this->newline, $html);
435
-        $temp = array_map('trim', $temp);
436
-        $html = implode($this->newline, $temp);
437
-        unset($temp);
438
-    }
439
-
440
-    /**
441
-     * Remove empty lines
442
-     *
443
-     * @param string $html
444
-     *
445
-     * @return void
446
-     */
447
-    protected function removeEmptyLines(&$html)
448
-    {
449
-        $temp = explode($this->newline, $html);
450
-        $result = [];
451
-        for ($i = 0; $i < count($temp); ++$i) {
452
-            if ("" == trim($temp[$i])) {
453
-                continue;
454
-            }
455
-            $result[] = $temp[$i];
456
-        }
457
-        $html = implode($this->newline, $result);
458
-    }
459
-
460
-    /**
461
-     * Remove new lines where unnecessary
462
-     * spares line breaks within: pre, textarea, ...
463
-     *
464
-     * @param string $html
465
-     *
466
-     * @return void
467
-     */
468
-    protected function removeNewLines(&$html)
469
-    {
470
-        $splitArray = [
471
-            'textarea',
472
-            'pre'
473
-        ]; // eventuell auch: span, script, style
474
-        $peaces = preg_split('#(<(' . implode('|', $splitArray) . ').*>.*</\2>)#Uis', $html, -1, PREG_SPLIT_DELIM_CAPTURE);
475
-        $html = "";
476
-        for ($i = 0; $i < count($peaces); $i++) {
477
-            if (($i + 1) % 3 == 0) {
478
-                continue;
479
-            }
480
-            $html .= (($i - 1) % 3 != 0) ? $this->killLineBreaks($peaces[$i]) : $peaces[$i];
481
-        }
482
-    }
483
-
484
-    /**
485
-     * Remove obsolete link schema
486
-     *
487
-     * @param string $html
488
-     *
489
-     * @return void
490
-     */
491
-    protected function removeLinkSchema(&$html)
492
-    {
493
-        $html = preg_replace("/<link rel=\"?schema.dc\"?.+?>/is", "", $html);
494
-    }
495
-
496
-    /**
497
-     * Remove empty alt tags
498
-     *
499
-     * @param string $html
500
-     *
501
-     * @return void
502
-     */
503
-    protected function removeEmptyAltAtr(&$html)
504
-    {
505
-        $html = str_replace("alt=\"\"", "", $html);
506
-    }
507
-
508
-    /**
509
-     * Remove broken links in <a> tags
510
-     *
511
-     * @param string $html
512
-     *
513
-     * @return void
514
-     */
515
-    protected function removeRealUrlBrokenRootLink(&$html)
516
-    {
517
-        $html = str_replace('href=".html"', 'href=""', $html);
518
-    }
519
-
520
-    /**
521
-     * Include configured header comment in HTML content block
522
-     *
523
-     * @param $html
524
-     */
525
-    public function includeHeaderComment(&$html)
526
-    {
527
-        if (!empty($this->headerComment)) {
528
-            $html = preg_replace_callback('/<meta http-equiv(.*)>/Usi', function ($matches) {
529
-                return trim($matches[0] . $this->newline . $this->tab . $this->tab . '<!-- ' . $this->headerComment . '-->');
530
-            }, $html, 1);
531
-        }
532
-    }
16
+	/**
17
+	 * Enable Debug comment in footer
18
+	 *
19
+	 * @var boolean
20
+	 */
21
+	protected $debugComment = false;
22
+
23
+	/**
24
+	 * Format Type
25
+	 *
26
+	 * @var integer
27
+	 */
28
+	protected $formatType = 2;
29
+
30
+	/**
31
+	 * Tab character
32
+	 *
33
+	 * @var string
34
+	 */
35
+	protected $tab = "\t";
36
+
37
+	/**
38
+	 * Newline character
39
+	 *
40
+	 * @var string
41
+	 */
42
+	protected $newline = "\n";
43
+
44
+	/**
45
+	 * Enable/disable UTF8 support
46
+	 *
47
+	 * @var boolean
48
+	 */
49
+	protected $utf8 = true;
50
+
51
+	/**
52
+	 * Configured extra header comment
53
+	 *
54
+	 * @var string
55
+	 */
56
+	protected $headerComment = '';
57
+
58
+	/**
59
+	 * Set variables based on given config
60
+	 *
61
+	 * @param array $config
62
+	 *
63
+	 * @return void
64
+	 */
65
+	public function setVariables(array $config)
66
+	{
67
+		switch (TYPO3_OS) { // set newline
68
+			case 'WIN':
69
+				$this->newline = "\r\n";
70
+				break;
71
+			default:
72
+				$this->newline = "\n";
73
+		}
74
+
75
+		if (!empty($config)) {
76
+			if ($config['formatHtml'] && is_numeric($config['formatHtml'])) {
77
+				$this->formatType = (int)$config['formatHtml'];
78
+			}
79
+
80
+			if ($config['formatHtml.']['tabSize'] && is_numeric($config['formatHtml.']['tabSize'])) {
81
+				$this->tab = str_pad('', $config['formatHtml.']['tabSize'], ' ');
82
+			}
83
+
84
+			if (isset($config['enable_utf'])) {
85
+				$this->utf8 = (bool)$config['enable_utf-8_support'];
86
+			}
87
+
88
+			if (isset($config['formatHtml.']['debugComment'])) {
89
+				$this->debugComment = (bool)$config['formatHtml.']['debugComment'];
90
+			}
91
+
92
+			if (isset($config['headerComment'])) {
93
+				$this->headerComment = $config['headerComment'];
94
+			}
95
+		}
96
+	}
97
+
98
+	/**
99
+	 * Clean given HTML with formatter
100
+	 *
101
+	 * @param string $html
102
+	 * @param array  $config
103
+	 *
104
+	 * @return void
105
+	 */
106
+	public function clean(&$html, $config = [])
107
+	{
108
+		if (!empty($config)) {
109
+			if ((bool)$config['enabled'] === false) {
110
+				return;
111
+			}
112
+
113
+			$this->setVariables($config);
114
+		}
115
+
116
+		$manipulations = [];
117
+
118
+		if (isset($config['removeGenerator']) && (bool)$config['removeGenerator']) {
119
+			$manipulations['removeGenerator'] = GeneralUtility::makeInstance('HTML\\Sourceopt\\Manipulation\\RemoveGenerator');
120
+		}
121
+
122
+		if (isset($config['removeComments']) && (bool)$config['removeComments']) {
123
+			$manipulations['removeComments'] = GeneralUtility::makeInstance('HTML\\Sourceopt\\Manipulation\\RemoveComments');
124
+		}
125
+
126
+		if (isset($config['removeBlurScript']) && (bool)$config['removeBlurScript']) {
127
+			$manipulations['removeBlurScript'] = GeneralUtility::makeInstance('HTML\\Sourceopt\\Manipulation\\RemoveBlurScript');
128
+		}
129
+
130
+		if (!empty($this->headerComment)) {
131
+			$this->includeHeaderComment($html);
132
+		}
133
+
134
+		foreach ($manipulations as $key => $manipulation) {
135
+			/** @var ManipulationInterface $manipulation */
136
+			$configuration = isset($config[$key . '.']) && is_array($config[$key . '.']) ? $config[$key . '.'] : [];
137
+			$html = $manipulation->manipulate($html, $configuration);
138
+		}
139
+
140
+		if ($this->formatType) {
141
+			$this->formatHtml($html);
142
+		}
143
+	}
144
+
145
+	/**
146
+	 * Formats the (X)HTML code:
147
+	 *  - taps according to the hirarchy of the tags
148
+	 *  - removes empty spaces between tags
149
+	 *  - removes linebreaks within tags (spares where necessary: pre, textarea, comments, ..)
150
+	 *  choose from five options:
151
+	 *    0 => off
152
+	 *    1 => no line break at all  (code in one line)
153
+	 *    2 => minimalistic line breaks (structure defining box-elements)
154
+	 *    3 => aesthetic line breaks (important box-elements)
155
+	 *    4 => logic line breaks (all box-elements)
156
+	 *    5 => max line breaks (all elements)
157
+	 *
158
+	 * @param string $html
159
+	 *
160
+	 * @return void
161
+	 */
162
+	protected function formatHtml(&$html)
163
+	{
164
+		// Save original formated comments, pre, textarea, styles and java-scripts & replace them with markers
165
+		preg_match_all(
166
+			'/(?s)((<!--.*?-->)|(<[ \n\r]*pre[^>]*>.*?<[ \n\r]*\/pre[^>]*>)|(<[ \n\r]*textarea[^>]*>.*?<[ \n\r]*\/textarea[^>]*>)|(<[ \n\r]*style[^>]*>.*?<[ \n\r]*\/style[^>]*>)|(<[ \n\r]*script[^>]*>.*?<[ \n\r]*\/script[^>]*>))/im',
167
+			$html,
168
+			$matches
169
+		);
170
+		$no_format = $matches[0]; // do not format these block elements
171
+		for ($i = 0; $i < count($no_format); $i++) {
172
+			$html = str_replace($no_format[$i], "\n<!-- ELEMENT $i -->", $html);
173
+		}
174
+
175
+		// define box elements for formatting
176
+		$trueBoxElements = 'address|blockquote|center|dir|div|dl|fieldset|form|h1|h2|h3|h4|h5|h6|hr|isindex|menu|noframes|noscript|ol|p|pre|table|ul|article|aside|details|figcaption|figure|footer|header|hgroup|menu|nav|section';
177
+		$functionalBoxElements = 'dd|dt|frameset|li|tbody|td|tfoot|th|thead|tr|colgroup';
178
+		$usableBoxElements = 'applet|button|del|iframe|ins|map|object|script';
179
+		$imagineBoxElements = 'html|body|head|meta|title|link|script|base|!--';
180
+		$allBoxLikeElements = '(?>' . $trueBoxElements . '|' . $functionalBoxElements . '|' . $usableBoxElements . '|' . $imagineBoxElements . ')';
181
+		$esteticBoxLikeElements = '(?>html|head|body|meta name|title|div|table|h1|h2|h3|h4|h5|h6|p|form|pre|center|!--)';
182
+		$structureBoxLikeElements = '(?>html|head|body|div|!--)';
183
+
184
+		// split html into it's elements
185
+		$html_array_temp = preg_split(
186
+			'/(<(?:[^<>]+(?:"[^"]*"|\'[^\']*\')?)+>)/',
187
+			$html,
188
+			-1,
189
+			PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
190
+		);
191
+		// remove empty lines
192
+		$html_array = [''];
193
+		$z = 1;
194
+		for ($x = 0; $x < count($html_array_temp); $x++) {
195
+			$t = trim($html_array_temp[$x]);
196
+			if ($t !== '') {
197
+				$html_array[$z] = $html_array_temp[$x];
198
+				$z++;
199
+				// if the trimmed line was empty but the original wasn't, search for inline element closing tags in the last $html_array element
200
+			} else {
201
+				// if ($t !== $html_array_temp[$x] && preg_match('/<\/' . $inlineElements . '( .*)? >/Usi', $html_array[$z - 1]) === 1)
202
+				$html_array[$z] = ' ';
203
+				$z++;
204
+			}
205
+		}
206
+
207
+		// rebuild html
208
+		$html = '';
209
+		$tabs = 0;
210
+		for ($x = 0; $x < count($html_array); $x++) {
211
+			// check if the element should stand in a new line
212
+			$newline = false;
213
+			if (substr($html_array[$x - 1], 0, 5) == '<?xml') {
214
+				$newline = true;
215
+			} elseif ($this->formatType == 2 && ( // minimalistic line break
216
+					# this element has a line break before itself
217
+					preg_match(
218
+						'/<' . $structureBoxLikeElements . '(.*)>/Usi',
219
+						$html_array[$x]
220
+					) || preg_match(
221
+						'/<' . $structureBoxLikeElements . '(.*) \/>/Usi',
222
+						$html_array[$x]
223
+					) || # one element before is a element that has a line break after
224
+					preg_match(
225
+						'/<\/' . $structureBoxLikeElements . '(.*)>/Usi',
226
+						$html_array[$x - 1]
227
+					) || substr(
228
+						$html_array[$x - 1],
229
+						0,
230
+						4
231
+					) == '<!--' || preg_match('/<' . $structureBoxLikeElements . '(.*) \/>/Usi', $html_array[$x - 1]))
232
+			) {
233
+				$newline = true;
234
+			} elseif ($this->formatType == 3 && ( // aestetic line break
235
+					# this element has a line break before itself
236
+					preg_match(
237
+						'/<' . $esteticBoxLikeElements . '(.*)>/Usi',
238
+						$html_array[$x]
239
+					) || preg_match(
240
+						'/<' . $esteticBoxLikeElements . '(.*) \/>/Usi',
241
+						$html_array[$x]
242
+					) || # one element before is a element that has a line break after
243
+					preg_match('/<\/' . $esteticBoxLikeElements . '(.*)>/Usi', $html_array[$x - 1]) || substr(
244
+						$html_array[$x - 1],
245
+						0,
246
+						4
247
+					) == '<!--' || preg_match('/<' . $esteticBoxLikeElements . '(.*) \/>/Usi', $html_array[$x - 1]))
248
+			) {
249
+				$newline = true;
250
+			} elseif ($this->formatType >= 4 && ( // logical line break
251
+					# this element has a line break before itself
252
+					preg_match(
253
+						'/<' . $allBoxLikeElements . '(.*)>/Usi',
254
+						$html_array[$x]
255
+					) || preg_match(
256
+						'/<' . $allBoxLikeElements . '(.*) \/>/Usi',
257
+						$html_array[$x]
258
+					) || # one element before is a element that has a line break after
259
+					preg_match('/<\/' . $allBoxLikeElements . '(.*)>/Usi', $html_array[$x - 1]) || substr(
260
+						$html_array[$x - 1],
261
+						0,
262
+						4
263
+					) == '<!--' || preg_match('/<' . $allBoxLikeElements . '(.*) \/>/Usi', $html_array[$x - 1]))
264
+			) {
265
+				$newline = true;
266
+			}
267
+
268
+			// count down a tab
269
+			if (substr($html_array[$x], 0, 2) == '</') {
270
+				$tabs--;
271
+			}
272
+
273
+			// add tabs and line breaks in front of the current tag
274
+			if ($newline) {
275
+				$html .= $this->newline;
276
+				for ($y = 0; $y < $tabs; $y++) {
277
+					$html .= $this->tab;
278
+				}
279
+			}
280
+
281
+			// remove white spaces and line breaks and add current tag to the html-string
282
+			if (substr($html_array[$x - 1], 0, 4) == '<pre' // remove white space after line ending in PRE / TEXTAREA / comment
283
+				|| substr($html_array[$x - 1], 0, 9) == '<textarea' || substr($html_array[$x - 1], 0, 4) == '<!--'
284
+			) {
285
+				$html .= $this->rTrimLines($html_array[$x]);
286
+			} elseif (substr($html_array[$x], 0, 9) == '<![CDATA[' // remove multiple white space in CDATA / XML
287
+				|| substr($html_array[$x], 0, 5) == '<?xml'
288
+			) {
289
+				$html .= $this->killWhiteSpace($html_array[$x]);
290
+			} else { // remove all line breaks
291
+				$html .= $this->killLineBreaks($html_array[$x]);
292
+			}
293
+
294
+			// count up a tab
295
+			if (substr($html_array[$x], 0, 1) == '<' && substr($html_array[$x], 1, 1) != '/') {
296
+				if (substr($html_array[$x], 1, 1) != ' ' && substr($html_array[$x], 1, 3) != 'img' && substr(
297
+					$html_array[$x],
298
+					1,
299
+					2
300
+				) != 'br' && substr($html_array[$x], 1, 2) != 'hr' && substr(
301
+					$html_array[$x],
302
+					1,
303
+					5
304
+				) != 'input' && substr($html_array[$x], 1, 4) != 'link' && substr(
305
+					$html_array[$x],
306
+					1,
307
+					4
308
+				) != 'meta' && substr($html_array[$x], 1, 4) != 'col ' && substr(
309
+					$html_array[$x],
310
+					1,
311
+					5
312
+				) != 'frame' && substr($html_array[$x], 1, 7) != 'isindex' && substr(
313
+					$html_array[$x],
314
+					1,
315
+					5
316
+				) != 'param' && substr($html_array[$x], 1, 4) != 'area' && substr(
317
+					$html_array[$x],
318
+					1,
319
+					4
320
+				) != 'base' && substr($html_array[$x], 0, 2) != '<!' && substr($html_array[$x], 0, 5) != '<?xml'
321
+				) {
322
+					$tabs++;
323
+				}
324
+			}
325
+		}
326
+
327
+		// Remove empty lines
328
+		if ($this->formatType > 1) {
329
+			$this->removeEmptyLines($html);
330
+		}
331
+
332
+		// Restore saved comments, styles and java-scripts
333
+		for ($i = 0; $i < count($no_format); $i++) {
334
+			$no_format[$i] = $this->rTrimLines($no_format[$i]); // remove white space after line ending
335
+			$html = str_replace("<!-- ELEMENT $i -->", $no_format[$i], $html);
336
+		}
337
+
338
+		// include debug comment at the end
339
+		if ($tabs != 0 && $this->debugComment === true) {
340
+			$html .= '<!--' . $tabs . " open elements found-->\r\n";
341
+		}
342
+	}
343
+
344
/**
 * Remove ALL line breaks and collapse runs of white space into one space
 *
 * Line endings are normalised to the configured newline first, so a single
 * replace removes every variant (\r\n, \n, \r).
 *
 * @param string $html
 *
 * @return string
 */
protected function killLineBreaks($html)
{
	$html = str_replace($this->newline, '', $this->convNlOs($html));

	// With the "u" modifier preg_replace() returns null if the subject is not
	// valid UTF-8. Fall back to the byte-wise pattern instead of handing null
	// (i.e. an erased segment) back to the caller.
	$collapsed = ($this->utf8 == true) ? preg_replace('/\s\s+/u', ' ', $html) : null;
	if ($collapsed === null) {
		$collapsed = preg_replace('/\s\s+/', ' ', $html);
	}

	// Still null means PCRE failed altogether: keep the text uncollapsed.
	return ($collapsed === null) ? $html : $collapsed;
}
363
+
364
/**
 * Remove multiple white space, keeps line breaks
 *
 * Every line is trimmed, lines that are empty after trimming are dropped and
 * runs of white space inside the remaining lines become a single space.
 *
 * @param string $html
 *
 * @return string
 */
protected function killWhiteSpace($html)
{
	$lines = [];
	// foreach instead of for/count(): removing entries while the loop bound is
	// re-counted ended the loop early and left trailing lines untouched.
	foreach (explode($this->newline, $this->convNlOs($html)) as $line) {
		$line = trim($line);
		// Compare with '' explicitly; a line holding only "0" is content.
		if ($line === '') {
			continue;
		}
		$lines[] = preg_replace('/\s\s+/', ' ', $line);
	}
	return implode($this->newline, $lines);
}
386
+
387
/**
 * Strip trailing white space from every line
 *
 * Leading and inner white space as well as the line breaks themselves are kept.
 *
 * @param string $html
 *
 * @return string
 */
protected function rTrimLines($html)
{
	$lines = explode($this->newline, $this->convNlOs($html));
	return implode($this->newline, array_map('rtrim', $lines));
}
404
+
405
/**
 * Replace every line ending (\r\n, \n or \r) with the configured newline
 *
 * @param string $html
 *
 * @return string
 */
protected function convNlOs($html)
{
	// The parentheses are the pattern delimiters here, not a capture group.
	return preg_replace("(\r\n|\n|\r)", $this->newline, $html);
}
417
+
418
/**
 * Remove tabs and white space before and after lines, normalise line breaks
 *
 * @param string $html Html-Code, modified in place
 *
 * @return void
 */
protected function trimLines(&$html)
{
	$html = str_replace("\t", '', $html);

	// Reuse convNlOs() instead of a second, OS-constant based conversion: it
	// maps \r\n, \n and a bare \r to the configured newline in one pass, so
	// existing \r\n is not expanded to \r\r\n and the split below always uses
	// the same newline the text was normalised to.
	$lines = explode($this->newline, $this->convNlOs($html));
	$html = implode($this->newline, array_map('trim', $lines));
}
439
+
440
/**
 * Drop every line that is empty or contains white space only
 *
 * @param string $html Modified in place
 *
 * @return void
 */
protected function removeEmptyLines(&$html)
{
	$nonEmpty = array_filter(
		explode($this->newline, $html),
		function ($line) {
			return trim($line) !== '';
		}
	);
	$html = implode($this->newline, $nonEmpty);
}
459
+
460
/**
 * Remove new lines where unnecessary
 * spares line breaks within: pre, textarea, ...
 *
 * @param string $html Modified in place; left untouched if splitting fails
 *
 * @return void
 */
protected function removeNewLines(&$html)
{
	$splitArray = [
		'textarea',
		'pre'
	]; // possibly also: span, script, style
	$pieces = preg_split(
		'#(<(' . implode('|', $splitArray) . ').*>.*</\2>)#Uis',
		$html,
		-1,
		PREG_SPLIT_DELIM_CAPTURE
	);
	// preg_split() yields false on a PCRE error; keep the document as it is
	// rather than replacing it with an empty string.
	if ($pieces === false) {
		return;
	}

	// With two capture groups the pieces repeat in groups of three:
	// text outside the protected tags, the complete protected element, its tag name.
	$result = '';
	foreach ($pieces as $index => $piece) {
		switch ($index % 3) {
			case 0:
				$result .= $this->killLineBreaks($piece);
				break;
			case 1:
				$result .= $piece;
				break;
			default:
				// captured tag name only, not part of the document
				break;
		}
	}
	$html = $result;
}
483
+
484
/**
 * Strip <link rel="schema.dc" ...> tags (quotes around the rel value optional)
 *
 * @param string $html Modified in place
 *
 * @return void
 */
protected function removeLinkSchema(&$html)
{
	$schemaLinkPattern = '/<link rel="?schema.dc"?.+?>/is';
	$html = preg_replace($schemaLinkPattern, '', $html);
}
495
+
496
/**
 * Strip every literal alt="" attribute from the markup
 *
 * @param string $html Modified in place
 *
 * @return void
 */
protected function removeEmptyAltAtr(&$html)
{
	$emptyAlt = 'alt=""';
	$html = str_replace($emptyAlt, '', $html);
}
507
+
508
/**
 * Turn the broken href=".html" into an empty href
 *
 * @param string $html Modified in place
 *
 * @return void
 */
protected function removeRealUrlBrokenRootLink(&$html)
{
	$broken = 'href=".html"';
	$fixed = 'href=""';
	$html = str_replace($broken, $fixed, $html);
}
519
+
520
/**
 * Insert the configured header comment after the first <meta http-equiv ...> tag
 *
 * Nothing happens when no header comment is configured.
 *
 * @param string $html Modified in place
 *
 * @return void
 */
public function includeHeaderComment(&$html)
{
	if (empty($this->headerComment)) {
		return;
	}

	$appendComment = function ($matches) {
		$comment = '<!-- ' . $this->headerComment . '-->';
		return trim($matches[0] . $this->newline . $this->tab . $this->tab . $comment);
	};

	// limit 1: only the first matching meta tag receives the comment
	$html = preg_replace_callback('/<meta http-equiv(.*)>/Usi', $appendComment, $html, 1);
}
533 533
 }
Please login to merge, or discard this patch.
Classes/Manipulation/RemoveComments.php 1 patch
Indentation   +57 added lines, -57 removed lines patch added patch discarded remove patch
@@ -13,65 +13,65 @@
 block discarded – undo
13 13
 class RemoveComments implements ManipulationInterface
 {

 	/**
 	 * Patterns for white-listing comments inside content
 	 *
 	 * @var array
 	 */
 	protected $whiteListCommentsPatterns = [];

 	/**
 	 * Remove HTML comments that are not white-listed
 	 *
 	 * Style and script elements are matched together with the comments so
 	 * that their content is consumed as one unit and handed back unchanged.
 	 *
 	 * @param string $html          The original HTML
 	 * @param array  $configuration Configuration; 'keep.' holds the white-list patterns
 	 *
 	 * @return string the manipulated HTML
 	 */
 	public function manipulate($html, array $configuration = [])
 	{
 		if (isset($configuration['keep.'])) {
 			$this->whiteListCommentsPatterns = $configuration['keep.'];
 		}

 		// Replace each match where it was found. A global str_replace() per
 		// comment would also delete identical text inside script/style blocks.
 		$result = preg_replace_callback(
 			'/(?s)((<!--.*?-->)|(<[ \n\r]*style[^>]*>.*?<[ \n\r]*\/style[^>]*>)|(<[ \n\r]*script[^>]*>.*?<[ \n\r]*\/script[^>]*>))/im',
 			function ($match) {
 				return $this->keepComment($match[0]) ? $match[0] : '';
 			},
 			$html
 		);

 		// null signals a PCRE error; hand back the input untouched in that case
 		return ($result === null) ? $html : $result;
 	}

 	/**
 	 * Check if a comment is defined to be kept in a pattern whiteListOfComments
 	 *
 	 * Anything that is not a complete HTML comment (style/script elements) is
 	 * always kept.
 	 *
 	 * @param string $commentHtml
 	 *
 	 * @return boolean
 	 */
 	protected function keepComment($commentHtml)
 	{
 		// if not even a comment, skip this
 		if (!preg_match('/^\<\!\-\-(.*?)\-\-\>$/usi', $commentHtml)) {
 			return true;
 		}

 		// without a white list every comment is removed
 		if (empty($this->whiteListCommentsPatterns)) {
 			return false;
 		}

 		// patterns are matched against the comment text without its delimiters
 		$commentText = trim(str_replace(['<!--', '-->'], '', $commentHtml));
 		foreach ($this->whiteListCommentsPatterns as $pattern) {
 			if (preg_match($pattern, $commentText)) {
 				return true;
 			}
 		}
 		return false;
 	}
 }
Please login to merge, or discard this patch.