@@ -17,517 +17,517 @@ |
||
17 | 17 | class CleanHtmlService implements SingletonInterface |
18 | 18 | { |
19 | 19 | |
20 | - /** |
|
21 | - * Enable Debug comment in footer |
|
22 | - * |
|
23 | - * @var boolean |
|
24 | - */ |
|
25 | - protected $debugComment = false; |
|
26 | - |
|
27 | - /** |
|
28 | - * Format Type |
|
29 | - * |
|
30 | - * @var integer |
|
31 | - */ |
|
32 | - protected $formatType = 0; |
|
33 | - |
|
34 | - /** |
|
35 | - * Tab character |
|
36 | - * |
|
37 | - * @var string |
|
38 | - */ |
|
39 | - protected $tab = "\t"; |
|
40 | - |
|
41 | - /** |
|
42 | - * Newline character |
|
43 | - * |
|
44 | - * @var string |
|
45 | - */ |
|
46 | - protected $newline = "\n"; |
|
47 | - |
|
48 | - /** |
|
49 | - * Configured extra header comment |
|
50 | - * |
|
51 | - * @var string |
|
52 | - */ |
|
53 | - protected $headerComment = ''; |
|
54 | - |
|
55 | - /** |
|
56 | - * Empty space char |
|
57 | - * @var string |
|
58 | - */ |
|
59 | - protected $emptySpaceChar = ' '; |
|
60 | - |
|
/**
 * Set variables based on given config
 *
 * Only options that are present in $config are applied; every other
 * property keeps its current value. The newline sequence is re-derived
 * from the operating system on every call, even for an empty $config.
 *
 * Keys read: "formatHtml", "formatHtml." ("tabSize", "debugComment"),
 * "headerComment" and "dropEmptySpaceChar".
 *
 * @param array $config
 *
 * @return void
 */
public function setVariables(array $config)
{
    // Set newline based on OS
    $this->newline = Environment::isWindows() ? "\r\n" : "\n";

    if (empty($config)) {
        return;
    }

    // A missing or falsy value (0, '', '0') leaves the current format type untouched.
    if (!empty($config['formatHtml']) && is_numeric($config['formatHtml'])) {
        $this->formatType = (int)$config['formatHtml'];
    }

    // Sub-options live under the dotted key; tolerate it being absent or not an array.
    $formatOptions = [];
    if (isset($config['formatHtml.']) && is_array($config['formatHtml.'])) {
        $formatOptions = $config['formatHtml.'];
    }

    if (!empty($formatOptions['tabSize']) && is_numeric($formatOptions['tabSize'])) {
        // Indent with the configured number of spaces instead of a tab character.
        $this->tab = str_pad('', (int)$formatOptions['tabSize'], ' ');
    }

    if (isset($formatOptions['debugComment'])) {
        $this->debugComment = (bool)$formatOptions['debugComment'];
    }

    if (isset($config['headerComment'])) {
        $this->headerComment = $config['headerComment'];
    }

    if (!empty($config['dropEmptySpaceChar'])) {
        $this->emptySpaceChar = '';
    }
}
|
99 | - |
|
/**
 * Clean given HTML with formatter
 *
 * With a non-empty $config the HTML is returned untouched unless "enabled"
 * is set to a truthy value. With an empty $config the settings currently
 * stored on this instance are used as they are.
 *
 * Order of processing: header comment, enabled manipulations
 * (removeGenerator, removeComments, removeBlurScript), HTML formatting.
 *
 * @param string $html
 * @param array $config
 *
 * @return string
 */
public function clean($html, $config = [])
{
    if (!empty($config)) {
        // empty() also covers a missing "enabled" key without raising a warning.
        if (empty($config['enabled'])) {
            return $html;
        }

        $this->setVariables($config);
    }

    $availableManipulations = [
        'removeGenerator' => RemoveGenerator::class,
        'removeComments' => RemoveComments::class,
        'removeBlurScript' => RemoveBlurScript::class,
    ];

    $manipulations = [];
    foreach ($availableManipulations as $key => $className) {
        if (!empty($config[$key])) {
            $manipulations[$key] = GeneralUtility::makeInstance($className);
        }
    }

    // includeHeaderComment() is a no-op when no header comment is configured.
    $this->includeHeaderComment($html);

    foreach ($manipulations as $key => $manipulation) {
        /** @var ManipulationInterface $manipulation */
        // Per-manipulation options are stored under "<key>." in the config.
        $configuration = isset($config[$key . '.']) && is_array($config[$key . '.']) ? $config[$key . '.'] : [];
        $html = $manipulation->manipulate($html, $configuration);
    }

    if ($this->formatType > 0) {
        $html = $this->formatHtml($html);
    }

    return $html;
}
|
148 | - |
|
/**
 * Formats the (X)HTML code:
 *  - indents according to the hierarchy of the tags
 *  - removes empty spaces between tags
 *  - removes line breaks within tags (spares where necessary: pre, textarea, comments, ..)
 * The format type selects the amount of line breaks:
 *    0 => off
 *    1 => no line break at all (code in one line)
 *    2 => minimalistic line breaks (structure defining box-elements)
 *    3 => aesthetic line breaks (important box-elements)
 *    4 => logic line breaks (all box-elements)
 *    5 => max line breaks (all elements); this method applies the same rules as for 4
 *
 * Comments, pre, textarea, style and script blocks are swapped for
 * "<!-- ELEMENT n -->" markers while formatting and restored afterwards.
 *
 * @param string $html
 *
 * @return string
 */
protected function formatHtml($html)
{
    // Save original formated comments, pre, textarea, styles and java-scripts & replace them with markers
    preg_match_all(
        '/(?s)((<!--.*?-->)|(<[ \n\r]*pre[^>]*>.*?<[ \n\r]*\/pre[^>]*>)|(<[ \n\r]*textarea[^>]*>.*?<[ \n\r]*\/textarea[^>]*>)|(<[ \n\r]*style[^>]*>.*?<[ \n\r]*\/style[^>]*>)|(<[ \n\r]*script[^>]*>.*?<[ \n\r]*\/script[^>]*>))/im',
        $html,
        $matches
    );
    // do not format these block elements (empty list if the match itself failed)
    $noFormat = isset($matches[0]) ? $matches[0] : [];
    foreach ($noFormat as $i => $block) {
        $html = str_replace($block, "\n<!-- ELEMENT $i -->", $html);
    }

    // Puts the saved blocks back in place of their markers.
    $restoreBlocks = function ($markup) use ($noFormat) {
        foreach ($noFormat as $i => $block) {
            // remove white space after line ending
            $markup = str_replace("<!-- ELEMENT $i -->", $this->rTrimLines($block), $markup);
        }
        return $markup;
    };

    // define box elements for formatting
    $trueBoxElements = 'address|blockquote|center|dir|div|dl|fieldset|form|h1|h2|h3|h4|h5|h6|hr|isindex|menu|noframes|noscript|ol|p|pre|table|ul|article|aside|details|figcaption|figure|footer|header|hgroup|menu|nav|section';
    $functionalBoxElements = 'dd|dt|frameset|li|tbody|td|tfoot|th|thead|tr|colgroup';
    $usableBoxElements = 'applet|button|del|iframe|ins|map|object|script';
    $imagineBoxElements = 'html|body|head|meta|title|link|script|base|!--';
    $allBoxLikeElements = '(?>' . $trueBoxElements . '|' . $functionalBoxElements . '|' . $usableBoxElements . '|' . $imagineBoxElements . ')';
    $esteticBoxLikeElements = '(?>html|head|body|meta name|title|div|table|h1|h2|h3|h4|h5|h6|p|form|pre|center|!--)';
    $structureBoxLikeElements = '(?>html|head|body|div|!--)';

    // Pick the element set that triggers line breaks for the configured format type.
    $breakElements = null;
    if ($this->formatType == 2) {
        $breakElements = $structureBoxLikeElements; // minimalistic line break
    } elseif ($this->formatType == 3) {
        $breakElements = $esteticBoxLikeElements; // aesthetic line break
    } elseif ($this->formatType >= 4) {
        $breakElements = $allBoxLikeElements; // logical line break
    }

    // The patterns do not change per element, so build them once.
    // The opening pattern also matches self-closing tags ("<div ... />"),
    // so the current element needs no separate self-closing check.
    $openingPattern = '/<' . $breakElements . '(.*)>/Usi';
    $closingPattern = '/<\/' . $breakElements . '(.*)>/Usi';
    $selfClosingPattern = '/<' . $breakElements . '(.*) \/>/Usi';

    // split html into it's elements
    $htmlArrayTemp = preg_split(
        '/(<(?:[^<>]+(?:"[^"]*"|\'[^\']*\')?)+>)/',
        $html,
        -1,
        PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
    );

    if ($htmlArrayTemp === false) {
        // Splitting failed: only restore saved comments, styles and java-scripts
        return $restoreBlocks($html);
    }

    // Index 0 is an empty sentinel so that every real element has a predecessor.
    // Whitespace-only pieces are replaced by the configured empty space char.
    $htmlArray = [''];
    foreach ($htmlArrayTemp as $piece) {
        $htmlArray[] = trim($piece) !== '' ? $piece : $this->emptySpaceChar;
    }

    // Tags that never increase the indent (void elements, declarations, "< " in text).
    $noIndentPrefixes = [
        '< ', '<img', '<source', '<br', '<hr', '<input', '<link', '<meta', '<col ',
        '<frame', '<isindex', '<param', '<area', '<base', '<!', '<?xml',
    ];

    // rebuild html
    $html = '';
    $tabs = 0;
    $elementCount = count($htmlArray);
    // Start at 1: the sentinel at index 0 produces no output and has no predecessor.
    for ($x = 1; $x < $elementCount; $x++) {
        $current = $htmlArray[$x];
        $previous = $htmlArray[$x - 1];
        $previousIsComment = substr($previous, 0, 4) == '<!--';

        // check if the element should stand in a new line
        $newline = false;
        if (substr($previous, 0, 5) == '<?xml') {
            $newline = true;
        } elseif ($breakElements !== null && (
            // this element has a line break before itself
            preg_match($openingPattern, $current)
            // one element before is a element that has a line break after
            || preg_match($closingPattern, $previous)
            || $previousIsComment
            || preg_match($selfClosingPattern, $previous)
        )) {
            $newline = true;
        }

        // count down a tab
        if (substr($current, 0, 2) == '</') {
            $tabs--;
        }

        // add tabs and line breaks in front of the current tag
        if ($newline) {
            // $tabs can become negative on unbalanced markup; indent by zero then.
            $html .= $this->newline . str_repeat($this->tab, max(0, $tabs));
        }

        // remove white spaces and line breaks and add current tag to the html-string
        if (substr($previous, 0, 4) == '<pre'
            || substr($previous, 0, 9) == '<textarea'
            || $previousIsComment
        ) {
            // remove white space after line ending in PRE / TEXTAREA / comment
            $html .= $this->rTrimLines($current);
        } elseif (substr($current, 0, 9) == '<![CDATA[' || substr($current, 0, 5) == '<?xml') {
            // remove multiple white space in CDATA / XML
            $html .= $this->killWhiteSpace($current);
        } else {
            // remove all line breaks
            $html .= $this->killLineBreaks($current);
        }

        // count up a tab
        if (substr($current, 0, 1) == '<' && substr($current, 1, 1) != '/') {
            $increasesIndent = true;
            foreach ($noIndentPrefixes as $prefix) {
                if (strncmp($current, $prefix, strlen($prefix)) === 0) {
                    $increasesIndent = false;
                    break;
                }
            }
            if ($increasesIndent) {
                $tabs++;
            }
        }
    }

    // Remove empty lines
    if ($this->formatType > 1) {
        $this->removeEmptyLines($html);
    }

    // Restore saved comments, styles and java-scripts
    $html = $restoreBlocks($html);

    // include debug comment at the end
    if ($tabs != 0 && $this->debugComment === true) {
        $html .= '<!--' . $tabs . " open elements found-->\r\n";
    }

    return $html;
}
|
348 | - |
|
/**
 * Remove ALL line breaks and multiple white space
 *
 * Line breaks are removed without replacement; every run of two or more
 * white space characters is collapsed into a single space.
 *
 * @param string $html
 *
 * @return string
 */
protected function killLineBreaks($html)
{
    $html = $this->convNlOs($html);
    $html = str_replace($this->newline, '', $html);

    $collapsed = preg_replace('/\s\s+/u', ' ', $html);
    if ($collapsed === null) {
        // The /u modifier makes preg_replace() return null on malformed UTF-8.
        // Retry byte-wise so that the content is never silently dropped.
        $collapsed = preg_replace('/\s\s+/', ' ', $html);
    }

    return $collapsed === null ? $html : $collapsed;
}
|
363 | - |
|
/**
 * Remove multiple white space, keeps line breaks
 *
 * Every line is trimmed, runs of white space inside a line are collapsed
 * into a single space, and lines that are empty after trimming are dropped.
 *
 * @param string $html
 *
 * @return string
 */
protected function killWhiteSpace($html)
{
    $html = $this->convNlOs($html);

    $lines = [];
    // Build a new list instead of unsetting entries while looping: shrinking
    // the array during a count()-bounded loop left trailing lines unprocessed.
    foreach (explode($this->newline, $html) as $line) {
        $line = trim($line);
        // Strict comparison keeps a line that consists of "0" only.
        if ($line === '') {
            continue;
        }
        $lines[] = preg_replace('/\s\s+/', ' ', $line);
    }

    return implode($this->newline, $lines);
}
|
386 | - |
|
/**
 * Strip trailing white space from every line
 *
 * Line breaks are normalised first; leading and inner white space as well
 * as the line breaks themselves are preserved.
 *
 * @param string $html
 *
 * @return string
 */
protected function rTrimLines($html)
{
    $lines = explode($this->newline, $this->convNlOs($html));

    return implode($this->newline, array_map('rtrim', $lines));
}
|
404 | - |
|
/**
 * Normalise line breaks
 *
 * Replaces every "\r\n", "\n" and "\r" with the newline sequence
 * configured on this instance.
 *
 * @param string $html
 *
 * @return string
 */
protected function convNlOs($html)
{
    return preg_replace("(\r\n|\n|\r)", $this->newline, $html);
}
|
417 | - |
|
/**
 * Strip tabs and surrounding white space from every line
 *
 * All tab characters are removed, line breaks are converted for the
 * current operating system and each line is trimmed on both sides.
 * The result is written back into $html.
 *
 * @param string $html Html-Code
 *
 * @return void
 */
protected function trimLines(&$html)
{
    $withoutTabs = str_replace("\t", "", $html);

    // convert newlines according to the current OS
    $converted = Environment::isWindows()
        ? str_replace("\n", "\r\n", $withoutTabs)
        : str_replace("\r\n", "\n", $withoutTabs);

    $lines = array_map('trim', explode($this->newline, $converted));
    $html = implode($this->newline, $lines);
}
|
439 | - |
|
/**
 * Drop blank lines
 *
 * A line counts as blank when nothing is left after trimming it.
 * The result is written back into $html.
 *
 * @param string $html
 *
 * @return void
 */
protected function removeEmptyLines(&$html)
{
    $nonEmptyLines = array_filter(
        explode($this->newline, $html),
        static function ($line) {
            return trim($line) !== '';
        }
    );

    $html = implode($this->newline, $nonEmptyLines);
}
|
459 | - |
|
/**
 * Remove new lines where unnecessary
 * spares line breaks within: pre, textarea, ...
 *
 * The result is written back into $html. If the markup cannot be split,
 * $html is left unchanged.
 *
 * @param string $html
 *
 * @return void
 */
protected function removeNewLines(&$html)
{
    // possibly also: span, script, style
    $protectedTags = [
        'textarea',
        'pre'
    ];
    $pieces = preg_split(
        '#(<(' . implode('|', $protectedTags) . ').*>.*</\2>)#Uis',
        $html,
        -1,
        PREG_SPLIT_DELIM_CAPTURE
    );

    if ($pieces === false) {
        // PCRE failure (e.g. backtrack limit): keep the markup as it is.
        return;
    }

    // The pieces repeat in groups of three:
    //   0 => markup between protected elements, 1 => complete protected
    //   element, 2 => captured tag name (only needed for the back reference).
    $result = '';
    foreach ($pieces as $index => $piece) {
        switch ($index % 3) {
            case 0:
                $result .= $this->killLineBreaks($piece);
                break;
            case 1:
                $result .= $piece;
                break;
        }
    }

    $html = $result;
}
|
483 | - |
|
/**
 * Strip obsolete "schema.dc" link tags
 *
 * The result is written back into $html.
 *
 * @param string $html
 *
 * @return void
 */
protected function removeLinkSchema(&$html)
{
    $schemaLinkPattern = "/<link rel=\"?schema.dc\"?.+?>/is";
    $cleaned = preg_replace($schemaLinkPattern, "", $html);
    $html = $cleaned;
}
|
495 | - |
|
/**
 * Strip empty alt attributes
 *
 * Every occurrence of the literal text alt="" is removed.
 * The result is written back into $html.
 *
 * @param string $html
 *
 * @return void
 */
protected function removeEmptyAltAtr(&$html)
{
    $emptyAltAttribute = "alt=\"\"";
    $html = str_replace($emptyAltAttribute, "", $html);
}
|
507 | - |
|
/**
 * Repair broken root links in <a> tags
 *
 * Every href=".html" is turned into an empty href.
 * The result is written back into $html.
 *
 * @param string $html
 *
 * @return void
 */
protected function removeRealUrlBrokenRootLink(&$html)
{
    $brokenRootLink = 'href=".html"';
    $emptyLink = 'href=""';
    $html = str_replace($brokenRootLink, $emptyLink, $html);
}
|
519 | - |
|
/**
 * Insert the configured header comment into the HTML
 *
 * The comment is appended on a new line, indented by two tabs, after the
 * first <meta http-equiv ...> tag. Nothing happens when no header comment
 * is configured. The result is written back into $html.
 *
 * @param string $html
 *
 * @return void
 */
public function includeHeaderComment(&$html)
{
    if (empty($this->headerComment)) {
        return;
    }

    $commentLine = $this->newline . $this->tab . $this->tab . '<!-- ' . $this->headerComment . '-->';

    $html = preg_replace_callback(
        '/<meta http-equiv(.*)>/Usi',
        function (array $found) use ($commentLine) {
            return trim($found[0] . $commentLine);
        },
        $html,
        1
    );
}
|
20 | + /** |
|
21 | + * Enable Debug comment in footer |
|
22 | + * |
|
23 | + * @var boolean |
|
24 | + */ |
|
25 | + protected $debugComment = false; |
|
26 | + |
|
27 | + /** |
|
28 | + * Format Type |
|
29 | + * |
|
30 | + * @var integer |
|
31 | + */ |
|
32 | + protected $formatType = 0; |
|
33 | + |
|
34 | + /** |
|
35 | + * Tab character |
|
36 | + * |
|
37 | + * @var string |
|
38 | + */ |
|
39 | + protected $tab = "\t"; |
|
40 | + |
|
41 | + /** |
|
42 | + * Newline character |
|
43 | + * |
|
44 | + * @var string |
|
45 | + */ |
|
46 | + protected $newline = "\n"; |
|
47 | + |
|
48 | + /** |
|
49 | + * Configured extra header comment |
|
50 | + * |
|
51 | + * @var string |
|
52 | + */ |
|
53 | + protected $headerComment = ''; |
|
54 | + |
|
55 | + /** |
|
56 | + * Empty space char |
|
57 | + * @var string |
|
58 | + */ |
|
59 | + protected $emptySpaceChar = ' '; |
|
60 | + |
|
/**
 * Set variables based on given config
 *
 * Only options that are present in $config are applied; every other
 * property keeps its current value. The newline sequence is re-derived
 * from the operating system on every call, even for an empty $config.
 *
 * Keys read: "formatHtml", "formatHtml." ("tabSize", "debugComment"),
 * "headerComment" and "dropEmptySpaceChar".
 *
 * @param array $config
 *
 * @return void
 */
public function setVariables(array $config)
{
    // Set newline based on OS
    $this->newline = Environment::isWindows() ? "\r\n" : "\n";

    if (empty($config)) {
        return;
    }

    // A missing or falsy value (0, '', '0') leaves the current format type untouched.
    if (!empty($config['formatHtml']) && is_numeric($config['formatHtml'])) {
        $this->formatType = (int)$config['formatHtml'];
    }

    // Sub-options live under the dotted key; tolerate it being absent or not an array.
    $formatOptions = [];
    if (isset($config['formatHtml.']) && is_array($config['formatHtml.'])) {
        $formatOptions = $config['formatHtml.'];
    }

    if (!empty($formatOptions['tabSize']) && is_numeric($formatOptions['tabSize'])) {
        // Indent with the configured number of spaces instead of a tab character.
        $this->tab = str_pad('', (int)$formatOptions['tabSize'], ' ');
    }

    if (isset($formatOptions['debugComment'])) {
        $this->debugComment = (bool)$formatOptions['debugComment'];
    }

    if (isset($config['headerComment'])) {
        $this->headerComment = $config['headerComment'];
    }

    if (!empty($config['dropEmptySpaceChar'])) {
        $this->emptySpaceChar = '';
    }
}
|
99 | + |
|
/**
 * Clean given HTML with formatter
 *
 * With a non-empty $config the HTML is returned untouched unless "enabled"
 * is set to a truthy value. With an empty $config the settings currently
 * stored on this instance are used as they are.
 *
 * Order of processing: header comment, enabled manipulations
 * (removeGenerator, removeComments, removeBlurScript), HTML formatting.
 *
 * @param string $html
 * @param array $config
 *
 * @return string
 */
public function clean($html, $config = [])
{
    if (!empty($config)) {
        // empty() also covers a missing "enabled" key without raising a warning.
        if (empty($config['enabled'])) {
            return $html;
        }

        $this->setVariables($config);
    }

    $availableManipulations = [
        'removeGenerator' => RemoveGenerator::class,
        'removeComments' => RemoveComments::class,
        'removeBlurScript' => RemoveBlurScript::class,
    ];

    $manipulations = [];
    foreach ($availableManipulations as $key => $className) {
        if (!empty($config[$key])) {
            $manipulations[$key] = GeneralUtility::makeInstance($className);
        }
    }

    // includeHeaderComment() is a no-op when no header comment is configured.
    $this->includeHeaderComment($html);

    foreach ($manipulations as $key => $manipulation) {
        /** @var ManipulationInterface $manipulation */
        // Per-manipulation options are stored under "<key>." in the config.
        $configuration = isset($config[$key . '.']) && is_array($config[$key . '.']) ? $config[$key . '.'] : [];
        $html = $manipulation->manipulate($html, $configuration);
    }

    if ($this->formatType > 0) {
        $html = $this->formatHtml($html);
    }

    return $html;
}
|
148 | + |
|
/**
 * Formats the (X)HTML code:
 *  - indents according to the hierarchy of the tags
 *  - removes empty spaces between tags
 *  - removes line breaks within tags (spares where necessary: pre, textarea, comments, ..)
 * The format type selects the amount of line breaks:
 *    0 => off
 *    1 => no line break at all (code in one line)
 *    2 => minimalistic line breaks (structure defining box-elements)
 *    3 => aesthetic line breaks (important box-elements)
 *    4 => logic line breaks (all box-elements)
 *    5 => max line breaks (all elements); this method applies the same rules as for 4
 *
 * Comments, pre, textarea, style and script blocks are swapped for
 * "<!-- ELEMENT n -->" markers while formatting and restored afterwards.
 *
 * @param string $html
 *
 * @return string
 */
protected function formatHtml($html)
{
    // Save original formated comments, pre, textarea, styles and java-scripts & replace them with markers
    preg_match_all(
        '/(?s)((<!--.*?-->)|(<[ \n\r]*pre[^>]*>.*?<[ \n\r]*\/pre[^>]*>)|(<[ \n\r]*textarea[^>]*>.*?<[ \n\r]*\/textarea[^>]*>)|(<[ \n\r]*style[^>]*>.*?<[ \n\r]*\/style[^>]*>)|(<[ \n\r]*script[^>]*>.*?<[ \n\r]*\/script[^>]*>))/im',
        $html,
        $matches
    );
    // do not format these block elements (empty list if the match itself failed)
    $noFormat = isset($matches[0]) ? $matches[0] : [];
    foreach ($noFormat as $i => $block) {
        $html = str_replace($block, "\n<!-- ELEMENT $i -->", $html);
    }

    // Puts the saved blocks back in place of their markers.
    $restoreBlocks = function ($markup) use ($noFormat) {
        foreach ($noFormat as $i => $block) {
            // remove white space after line ending
            $markup = str_replace("<!-- ELEMENT $i -->", $this->rTrimLines($block), $markup);
        }
        return $markup;
    };

    // define box elements for formatting
    $trueBoxElements = 'address|blockquote|center|dir|div|dl|fieldset|form|h1|h2|h3|h4|h5|h6|hr|isindex|menu|noframes|noscript|ol|p|pre|table|ul|article|aside|details|figcaption|figure|footer|header|hgroup|menu|nav|section';
    $functionalBoxElements = 'dd|dt|frameset|li|tbody|td|tfoot|th|thead|tr|colgroup';
    $usableBoxElements = 'applet|button|del|iframe|ins|map|object|script';
    $imagineBoxElements = 'html|body|head|meta|title|link|script|base|!--';
    $allBoxLikeElements = '(?>' . $trueBoxElements . '|' . $functionalBoxElements . '|' . $usableBoxElements . '|' . $imagineBoxElements . ')';
    $esteticBoxLikeElements = '(?>html|head|body|meta name|title|div|table|h1|h2|h3|h4|h5|h6|p|form|pre|center|!--)';
    $structureBoxLikeElements = '(?>html|head|body|div|!--)';

    // Pick the element set that triggers line breaks for the configured format type.
    $breakElements = null;
    if ($this->formatType == 2) {
        $breakElements = $structureBoxLikeElements; // minimalistic line break
    } elseif ($this->formatType == 3) {
        $breakElements = $esteticBoxLikeElements; // aesthetic line break
    } elseif ($this->formatType >= 4) {
        $breakElements = $allBoxLikeElements; // logical line break
    }

    // The patterns do not change per element, so build them once.
    // The opening pattern also matches self-closing tags ("<div ... />"),
    // so the current element needs no separate self-closing check.
    $openingPattern = '/<' . $breakElements . '(.*)>/Usi';
    $closingPattern = '/<\/' . $breakElements . '(.*)>/Usi';
    $selfClosingPattern = '/<' . $breakElements . '(.*) \/>/Usi';

    // split html into it's elements
    $htmlArrayTemp = preg_split(
        '/(<(?:[^<>]+(?:"[^"]*"|\'[^\']*\')?)+>)/',
        $html,
        -1,
        PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
    );

    if ($htmlArrayTemp === false) {
        // Splitting failed: only restore saved comments, styles and java-scripts
        return $restoreBlocks($html);
    }

    // Index 0 is an empty sentinel so that every real element has a predecessor.
    // Whitespace-only pieces are replaced by the configured empty space char.
    $htmlArray = [''];
    foreach ($htmlArrayTemp as $piece) {
        $htmlArray[] = trim($piece) !== '' ? $piece : $this->emptySpaceChar;
    }

    // Tags that never increase the indent (void elements, declarations, "< " in text).
    $noIndentPrefixes = [
        '< ', '<img', '<source', '<br', '<hr', '<input', '<link', '<meta', '<col ',
        '<frame', '<isindex', '<param', '<area', '<base', '<!', '<?xml',
    ];

    // rebuild html
    $html = '';
    $tabs = 0;
    $elementCount = count($htmlArray);
    // Start at 1: the sentinel at index 0 produces no output and has no predecessor.
    for ($x = 1; $x < $elementCount; $x++) {
        $current = $htmlArray[$x];
        $previous = $htmlArray[$x - 1];
        $previousIsComment = substr($previous, 0, 4) == '<!--';

        // check if the element should stand in a new line
        $newline = false;
        if (substr($previous, 0, 5) == '<?xml') {
            $newline = true;
        } elseif ($breakElements !== null && (
            // this element has a line break before itself
            preg_match($openingPattern, $current)
            // one element before is a element that has a line break after
            || preg_match($closingPattern, $previous)
            || $previousIsComment
            || preg_match($selfClosingPattern, $previous)
        )) {
            $newline = true;
        }

        // count down a tab
        if (substr($current, 0, 2) == '</') {
            $tabs--;
        }

        // add tabs and line breaks in front of the current tag
        if ($newline) {
            // $tabs can become negative on unbalanced markup; indent by zero then.
            $html .= $this->newline . str_repeat($this->tab, max(0, $tabs));
        }

        // remove white spaces and line breaks and add current tag to the html-string
        if (substr($previous, 0, 4) == '<pre'
            || substr($previous, 0, 9) == '<textarea'
            || $previousIsComment
        ) {
            // remove white space after line ending in PRE / TEXTAREA / comment
            $html .= $this->rTrimLines($current);
        } elseif (substr($current, 0, 9) == '<![CDATA[' || substr($current, 0, 5) == '<?xml') {
            // remove multiple white space in CDATA / XML
            $html .= $this->killWhiteSpace($current);
        } else {
            // remove all line breaks
            $html .= $this->killLineBreaks($current);
        }

        // count up a tab
        if (substr($current, 0, 1) == '<' && substr($current, 1, 1) != '/') {
            $increasesIndent = true;
            foreach ($noIndentPrefixes as $prefix) {
                if (strncmp($current, $prefix, strlen($prefix)) === 0) {
                    $increasesIndent = false;
                    break;
                }
            }
            if ($increasesIndent) {
                $tabs++;
            }
        }
    }

    // Remove empty lines
    if ($this->formatType > 1) {
        $this->removeEmptyLines($html);
    }

    // Restore saved comments, styles and java-scripts
    $html = $restoreBlocks($html);

    // include debug comment at the end
    if ($tabs != 0 && $this->debugComment === true) {
        $html .= '<!--' . $tabs . " open elements found-->\r\n";
    }

    return $html;
}
|
348 | + |
|
/**
 * Remove ALL line breaks and collapse runs of white space into one space
 *
 * Newlines are first normalised via convNlOs() and then stripped; afterwards
 * every run of two or more white space characters becomes a single space.
 *
 * @param string $html
 *
 * @return string
 */
protected function killLineBreaks($html)
{
    $html = $this->convNlOs($html);
    $html = str_replace($this->newline, "", $html);

    $collapsed = preg_replace('/\s\s+/u', ' ', $html);
    if ($collapsed === null) {
        // With the "u" modifier PCRE returns null for input that is not
        // valid UTF-8, which would silently drop the whole fragment.
        // Retry in byte mode so the markup survives.
        $collapsed = preg_replace('/\s\s+/', ' ', $html);
    }

    // Last resort: keep the fragment with its white space untouched.
    return $collapsed === null ? $html : $collapsed;
}
|
363 | + |
|
/**
 * Remove multiple white space, keeps line breaks
 *
 * Each line is trimmed, lines that are empty after trimming are dropped and
 * runs of two or more white space characters inside a line become one space.
 *
 * @param string $html
 *
 * @return string
 */
protected function killWhiteSpace($html)
{
    $html = $this->convNlOs($html);

    $lines = [];
    // Iterate over a fixed list: the previous index loop re-evaluated
    // count() after unset(), so it stopped early and left trailing lines
    // unprocessed whenever an empty line had been removed.
    foreach (explode($this->newline, $html) as $line) {
        $line = trim($line);
        // Strict comparison so that a line consisting only of "0" is kept.
        if ($line === '') {
            continue;
        }
        $lines[] = preg_replace('/\s\s+/', ' ', $line);
    }

    return implode($this->newline, $lines);
}
|
386 | + |
|
/**
 * Strip trailing white space from every line
 *
 * Leading and inner white space as well as the line breaks themselves are
 * preserved; newlines are normalised through convNlOs() first.
 *
 * @param string $html
 *
 * @return string
 */
protected function rTrimLines($html)
{
    $lines = explode($this->newline, $this->convNlOs($html));

    return implode($this->newline, array_map('rtrim', $lines));
}
|
404 | + |
|
/**
 * Normalise all line endings to the configured newline sequence
 *
 * Replaces every "\r\n", "\n" and "\r" with $this->newline.
 *
 * @param string $html
 *
 * @return string
 */
protected function convNlOs($html)
{
    // The surrounding parentheses act as the regex delimiters here.
    return preg_replace("(\r\n|\n|\r)", $this->newline, $html);
}
|
417 | + |
|
/**
 * Remove tabs and white space before and after lines, normalise line breaks
 *
 * All tab characters are removed, line endings are converted to
 * $this->newline and every line is trimmed on both sides.
 *
 * @param string $html Html-Code (modified in place)
 *
 * @return void
 */
protected function trimLines(&$html)
{
    $html = str_replace("\t", "", $html);

    // Use the shared helper so that "\r\n", "\n" and lone "\r" are all
    // mapped to the same sequence the lines are split on below.
    $html = $this->convNlOs($html);

    $lines = array_map('trim', explode($this->newline, $html));
    $html = implode($this->newline, $lines);
}
|
439 | + |
|
/**
 * Drop every line that is empty or contains only white space
 *
 * The input is split on $this->newline and re-joined with the same sequence.
 *
 * @param string $html (modified in place)
 *
 * @return void
 */
protected function removeEmptyLines(&$html)
{
    $kept = array_filter(
        explode($this->newline, $html),
        function ($line) {
            return trim($line) !== '';
        }
    );

    $html = implode($this->newline, $kept);
}
|
459 | + |
|
/**
 * Remove new lines where unnecessary
 * spares line breaks within: pre, textarea, ...
 *
 * The markup is split around complete <textarea>/<pre> elements; everything
 * outside of them is passed through killLineBreaks(), the elements
 * themselves are copied verbatim.
 *
 * @param string $html (modified in place)
 *
 * @return void
 */
protected function removeNewLines(&$html)
{
    $splitArray = [
        'textarea',
        'pre'
    ]; // possibly also: span, script, style
    $pieces = preg_split('#(<(' . implode('|', $splitArray) . ').*>.*</\2>)#Uis', $html, -1, PREG_SPLIT_DELIM_CAPTURE);

    // preg_split() returns false on a PCRE failure (e.g. backtrack limit);
    // leave the document untouched instead of wiping it.
    if ($pieces === false) {
        return;
    }

    $html = "";
    // With two capture groups the result repeats in groups of three:
    // [outside text, protected element, tag name].
    foreach ($pieces as $i => $piece) {
        $slot = $i % 3;
        if ($slot === 2) {
            // captured tag name only, not part of the output
            continue;
        }
        $html .= ($slot === 1) ? $piece : $this->killLineBreaks($piece);
    }
}
|
483 | + |
|
/**
 * Strip <link rel="schema.dc" ...> tags from the markup
 *
 * Matching is case-insensitive and the quotes around the rel value are
 * optional.
 *
 * @param string $html (modified in place)
 *
 * @return void
 */
protected function removeLinkSchema(&$html)
{
    $pattern = '/<link rel="?schema.dc"?.+?>/is';
    $html = preg_replace($pattern, '', $html);
}
|
495 | + |
|
/**
 * Remove empty alt attributes
 *
 * Plain substring replacement: every occurrence of the literal text
 * alt="" is deleted, wherever it appears in the markup.
 *
 * @param string $html (modified in place)
 *
 * @return void
 */
protected function removeEmptyAltAtr(&$html)
{
    $emptyAlt = 'alt=""';
    $html = str_replace($emptyAlt, '', $html);
}
|
507 | + |
|
/**
 * Repair links whose target is the bare ".html" suffix
 *
 * Every occurrence of href=".html" is rewritten to href="".
 *
 * @param string $html (modified in place)
 *
 * @return void
 */
protected function removeRealUrlBrokenRootLink(&$html)
{
    $broken = 'href=".html"';
    $fixed = 'href=""';
    $html = str_replace($broken, $fixed, $html);
}
|
519 | + |
|
/**
 * Insert the configured header comment after the first http-equiv meta tag
 *
 * Does nothing when no header comment is configured. Otherwise the first
 * <meta http-equiv ...> tag is followed by a newline, two indentation units
 * and the comment wrapped in <!-- ... -->.
 *
 * @param string $html (modified in place)
 *
 * @return void
 */
public function includeHeaderComment(&$html)
{
    if (empty($this->headerComment)) {
        return;
    }

    $comment = $this->newline . $this->tab . $this->tab . '<!-- ' . $this->headerComment . '-->';

    $html = preg_replace_callback(
        '/<meta http-equiv(.*)>/Usi',
        function ($matches) use ($comment) {
            return trim($matches[0] . $comment);
        },
        $html,
        1 // only the first matching meta tag receives the comment
    );
}
|
533 | 533 | } |