Completed
Push — master ( 272e14...a9a202 )
by
unknown
71:27 queued 30:21
created

RteHtmlParser::init()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 1
nc 1
nop 2
dl 0
loc 3
rs 10
c 0
b 0
f 0
1
<?php
2
namespace TYPO3\CMS\Core\Html;
3
4
/*
5
 * This file is part of the TYPO3 CMS project.
6
 *
7
 * It is free software; you can redistribute it and/or modify it under
8
 * the terms of the GNU General Public License, either version 2
9
 * of the License, or any later version.
10
 *
11
 * For the full copyright and license information, please read the
12
 * LICENSE.txt file that was distributed with this source code.
13
 *
14
 * The TYPO3 project - inspiring people to share!
15
 */
16
17
use Psr\EventDispatcher\EventDispatcherInterface;
18
use Psr\Log\LoggerAwareInterface;
19
use Psr\Log\LoggerAwareTrait;
20
use TYPO3\CMS\Core\Html\Event\BrokenLinkAnalysisEvent;
21
use TYPO3\CMS\Core\LinkHandling\Exception\UnknownLinkHandlerException;
22
use TYPO3\CMS\Core\LinkHandling\LinkService;
23
use TYPO3\CMS\Core\Utility\GeneralUtility;
24
25
/**
26
 * Class for parsing HTML for the Rich Text Editor. (also called transformations)
27
 *
28
 * Concerning line breaks:
29
 * Regardless if LF (Unix-style) or CRLF (Windows) was put in, the HtmlParser works with LFs and migrates all
30
 * line breaks to LFs internally, however when all transformations are done, all LFs are transformed to CRLFs.
31
 * This means: RteHtmlParser always returns CRLFs to be maximum compatible with all formats.
32
 */
33
class RteHtmlParser extends HtmlParser implements LoggerAwareInterface
34
{
35
    use LoggerAwareTrait;
36
37
    /**
38
     * List of elements that are not wrapped into a "p" tag while doing the transformation.
39
     * @var string
40
     */
41
    protected $blockElementList = 'DIV,TABLE,BLOCKQUOTE,PRE,UL,OL,H1,H2,H3,H4,H5,H6,ADDRESS,DL,DD,HEADER,SECTION,FOOTER,NAV,ARTICLE,ASIDE';
42
43
    /**
44
     * List of all tags that are allowed by default
45
     * @var string
46
     */
47
    protected $defaultAllowedTagsList = 'b,i,u,a,img,br,div,center,pre,font,hr,sub,sup,p,strong,em,li,ul,ol,blockquote,strike,span,abbr,acronym,dfn';
48
49
    /**
50
     * Set to the TSconfig options coming from Page TSconfig
51
     *
52
     * @var array
53
     */
54
    protected $procOptions = [];
55
56
    /**
57
     * Run-away brake for recursive calls.
58
     *
59
     * @var int
60
     */
61
    protected $TS_transform_db_safecounter = 100;
62
63
    /**
64
     * Data caching for processing function
65
     *
66
     * @var array
67
     */
68
    protected $getKeepTags_cache = [];
69
70
    /**
71
     * Storage of the allowed CSS class names in the RTE
72
     *
73
     * @var array
74
     */
75
    protected $allowedClasses = [];
76
77
    /**
78
     * A list of HTML attributes for <p> tags. Because <p> tags are wrapped currently in a special handling,
79
     * they have a special place for configuration via 'proc.allowAttributes.'
80
     *
81
     * @var array
82
     */
83
    protected $allowedAttributesForParagraphTags = [
84
        'class',
85
        'align',
86
        'id',
87
        'title',
88
        'dir',
89
        'lang',
90
        'xml:lang',
91
        'itemscope',
92
        'itemtype',
93
        'itemprop'
94
    ];
95
96
    /**
97
     * Any tags that are allowed outside of <p> sections - usually similar to the block elements
98
     * plus some special tags like <hr> and <img> (if images are allowed).
99
     * Completely overrideable via 'proc.allowTagsOutside'
100
     *
101
     * @var array
102
     */
103
    protected $allowedTagsOutsideOfParagraphs = [
104
        'address',
105
        'article',
106
        'aside',
107
        'blockquote',
108
        'div',
109
        'footer',
110
        'header',
111
        'hr',
112
        'nav',
113
        'section'
114
    ];
115
116
    /**
117
     * @var EventDispatcherInterface
118
     */
119
    protected $eventDispatcher;
120
121
    /**
     * Keeps the given event dispatcher on the instance for later use.
     *
     * @param EventDispatcherInterface $eventDispatcher dispatcher stored in $this->eventDispatcher
     */
    public function __construct(EventDispatcherInterface $eventDispatcher)
    {
        $this->eventDispatcher = $eventDispatcher;
    }
125
126
    /**
127
     * Initialize, setting element reference and record PID
128
     *
129
     * @param string $elRef Element reference, eg "tt_content:bodytext"
130
     * @param int $recPid PID of the record (page id)
131
     * @deprecated will be removed in TYPO3 v11.0, as it serves no purpose anymore
132
     */
133
    public function init($elRef = '', $recPid = 0)
    {
        // Both arguments are ignored on purpose: the method only exists to emit
        // the deprecation notice for callers that still invoke it.
        $deprecationMessage = 'RteHtmlParser->init() is not needed anymore for RTE transformation, and will be removed in TYPO3 v11.0.';
        trigger_error($deprecationMessage, E_USER_DEPRECATED);
    }
137
138
    /**
139
     * Sanitize and streamline given options (usually from RichTextConfiguration results "proc.")
140
     * and set them to the respective properties.
141
     *
142
     * @param array $processingConfiguration
143
     */
144
    protected function setProcessingConfiguration(array $processingConfiguration): void
    {
        $this->procOptions = $processingConfiguration;
        $options = $this->procOptions;

        // Allowed CSS classes: the array notation ("allowedClasses.") wins over the comma list
        $this->allowedClasses = isset($options['allowedClasses.'])
            ? (array)$options['allowedClasses.']
            : GeneralUtility::trimExplode(',', $options['allowedClasses'] ?? '', true);

        // A non-empty "blockElementList" replaces the built-in list of block elements
        if (!empty($options['blockElementList'])) {
            $this->blockElementList = $options['blockElementList'];
        }

        // Attributes allowed on <p> tags
        if (isset($options['allowAttributes.'])) {
            $this->allowedAttributesForParagraphTags = $options['allowAttributes.'];
        }

        // Tags allowed outside of <p> tags; only evaluated when the plain option is set,
        // in which case the array notation ("allowTagsOutside.") takes precedence
        if (isset($options['allowTagsOutside'])) {
            $this->allowedTagsOutsideOfParagraphs = isset($options['allowTagsOutside.'])
                ? (array)$options['allowTagsOutside.']
                : GeneralUtility::trimExplode(',', strtolower($options['allowTagsOutside']), true);
        }
    }
171
172
    /**
173
     * Main entry point for transforming RTE content in the database so the Rich Text Editor can deal with
174
     * e.g. links.
175
     *
176
     * @param string $value
177
     * @param array $processingConfiguration
178
     * @return string
179
     */
180
    public function transformTextForRichTextEditor(string $value, array $processingConfiguration): string
    {
        $this->setProcessingConfiguration($processingConfiguration);
        $transformationModes = $this->resolveAppliedTransformationModes('rte');
        $value = $this->streamlineLineBreaksForProcessing($value);
        // Optional entry HTML cleaner
        $value = $this->runHtmlParserIfConfigured($value, 'entryHTMLparser_rte');

        foreach ($transformationModes as $mode) {
            // A registered user defined transformation replaces the built-in handling of this mode
            $customTransformation = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['t3lib/class.t3lib_parsehtml_proc.php']['transformation'][$mode] ?? null;
            if (!empty($customTransformation)) {
                $processor = GeneralUtility::makeInstance($customTransformation);
                $processor->pObj = $this;
                $value = $processor->transform_rte($value, $this);
                continue;
            }

            // Built-in transformations; any other mode is silently skipped
            switch ($mode) {
                case 'css_transform':
                    $value = $this->TS_transform_rte($value);
                    break;
                case 'detectbrokenlinks':
                    $value = $this->markBrokenLinks($value);
                    break;
                default:
                    // Do nothing
            }
        }

        // Optional exit HTML cleaner, followed by the final clean up of linebreaks
        $value = $this->runHtmlParserIfConfigured($value, 'exitHTMLparser_rte');
        return $this->streamlineLineBreaksAfterProcessing($value);
    }
214
215
    /**
216
     * Called to process HTML content before it is stored in the database.
217
     *
218
     * @param string $value
219
     * @param array $processingConfiguration
220
     * @return string
221
     */
222
    public function transformTextForPersistence(string $value, array $processingConfiguration): string
    {
        $this->setProcessingConfiguration($processingConfiguration);
        $transformationModes = $this->resolveAppliedTransformationModes('db');
        $value = $this->streamlineLineBreaksForProcessing($value);
        // Optional entry HTML cleaner
        $value = $this->runHtmlParserIfConfigured($value, 'entryHTMLparser_db');

        foreach ($transformationModes as $mode) {
            // A registered user defined transformation replaces the built-in handling of this mode
            $customTransformation = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['t3lib/class.t3lib_parsehtml_proc.php']['transformation'][$mode] ?? null;
            if (!empty($customTransformation)) {
                $processor = GeneralUtility::makeInstance($customTransformation);
                $processor->pObj = $this;
                $processor->transformationKey = $mode;
                $value = $processor->transform_db($value, $this);
                continue;
            }

            // Built-in transformations; any other mode is silently skipped
            switch ($mode) {
                case 'css_transform':
                    // Empty paragraphs become spacing paragraphs
                    $value = str_replace('<p></p>', '<p>&nbsp;</p>', $value);
                    // A trailing spacing paragraph is doubled so divideIntoLines() does not drop it
                    $value = preg_replace('/<p>&nbsp;<\/p>$/', '<p>&nbsp;</p><p>&nbsp;</p>', $value);
                    $value = $this->TS_transform_db($value);
                    break;
                case 'ts_links':
                    $value = $this->TS_links_db($value);
                    break;
                case 'detectbrokenlinks':
                    $value = $this->removeBrokenLinkMarkers($value);
                    break;
                default:
                    // Do nothing
            }
        }

        // Optional exit HTML cleaner, followed by the final clean up of linebreaks
        $value = $this->runHtmlParserIfConfigured($value, 'exitHTMLparser_db');
        return $this->streamlineLineBreaksAfterProcessing($value);
    }
264
265
    /**********************************************
266
     *
267
     * Main function
268
     *
269
     **********************************************/
270
    /**
271
     * Transform value for RTE based on specConf in the direction specified by $direction (rte/db)
272
     * This is the main function called from DataHandler and transfer data classes, but has been superseded
273
     * by the methods
274
     * - transformTextForRichTextEditor()
275
     * - transformTextForPersistence()
276
     * to avoid the $direction argument.
277
     * Please use the new methods for TYPO3 v10+ - only code
278
     *
279
     * @param string $value Input value
280
     * @param mixed $_ unused
0 ignored issues
show
Documentation Bug introduced by
Are you sure the doc-type for parameter $_ is correct as it would always require null to be passed?
Loading history...
281
     * @param string $direction Direction of the transformation. Two keywords are allowed; "db" or "rte". If "db" it means the transformation will clean up content coming from the Rich Text Editor and goes into the database. The other direction, "rte", is of course when content is coming from database and must be transformed to fit the RTE.
282
     * @param array $thisConfig Parsed TypoScript content configuring the RTE, probably coming from Page TSconfig.
283
     * @return string Output value
284
     * @deprecated will be removed in TYPO3 v11.0, use the transformText* methods instead.
285
     */
286
    public function RTE_transform($value, $_ = null, $direction = 'rte', $thisConfig = [])
    {
        trigger_error('RteHtmlParser->RTE_transform() will be removed in TYPO3 v11.0. Use the transformTextFor* methods in the same class instead', E_USER_DEPRECATED);

        // Anything but the two known directions leaves the value untouched
        if ($direction !== 'rte' && $direction !== 'db') {
            return $value;
        }

        // Only the "proc." sub-array of the configuration is handed on
        $processingConfiguration = $thisConfig['proc.'] ?? [];

        return $direction === 'rte'
            ? $this->transformTextForRichTextEditor($value, $processingConfiguration)
            : $this->transformTextForPersistence($value, $processingConfiguration);
    }
297
298
    /**
299
     * Ensures what transformation modes should be executed, and that they are only executed once.
300
     *
301
     * @param string $direction
302
     * @return array the resolved transformation modes
303
     */
304
    protected function resolveAppliedTransformationModes(string $direction): array
    {
        // Both options are optional: the processing configuration may be an empty array
        // (e.g. when no "proc." section was handed in), so never read the keys unguarded.
        $overruleMode = (string)($this->procOptions['overruleMode'] ?? '');

        // "overruleMode" (comma separated list) takes precedence over the single "mode"
        if ($overruleMode !== '') {
            $modes = GeneralUtility::trimExplode(',', $overruleMode);
        } else {
            $modes = [$this->procOptions['mode'] ?? ''];
        }

        $modeList = implode(',', $modes);

        // Replace the shortcut "default" with all custom modes.
        // Note: this is a plain substring replacement on the joined list.
        $modeList = str_replace('default', 'detectbrokenlinks,css_transform,ts_links', $modeList);

        // Drop empty entries and make the list unique, so every mode runs only once
        $modes = array_unique(GeneralUtility::trimExplode(',', $modeList, true));

        // Transformations towards the RTE run in reverse order
        if ($direction === 'rte') {
            $modes = array_reverse($modes);
        }

        return $modes;
    }
327
328
    /**
329
     * Runs the HTML parser if it is configured
330
     * Getting additional HTML cleaner configuration. These are applied either before or after the main transformation
331
     * is done and thus totally independent processing options you can set up.
332
     *
333
     * This is only possible via TSconfig (procOptions) currently.
334
     *
335
     * @param string $content
336
     * @param string $configurationDirective key looked up in the procOptions to see if the parser is enabled; the parser configuration is then fetched from the same key suffixed with a dot
337
     * @return string the processed content
338
     */
339
    protected function runHtmlParserIfConfigured($content, $configurationDirective)
    {
        // Nothing to do unless the directive itself is switched on
        if (empty($this->procOptions[$configurationDirective])) {
            return $content;
        }

        // The parser settings live under the same key with a trailing dot. They may be
        // missing even though the directive is enabled, so fall back to an empty
        // configuration instead of reading an undefined index.
        $parserConfiguration = $this->procOptions[$configurationDirective . '.'] ?? [];

        [$keepTags, $keepNonMatchedTags, $hscMode, $additionalConfiguration] = $this->HTMLparserConfig($parserConfiguration);

        return $this->HTMLcleaner($content, $keepTags, $keepNonMatchedTags, $hscMode, $additionalConfiguration);
    }
347
348
    /************************************
349
     *
350
     * Specific RTE TRANSFORMATION functions
351
     *
352
     *************************************/
353
354
    /**
355
     * Transformation handler: 'ts_links' / direction: "db"
356
     * Processing anchor tags, and resolves them correctly again via the LinkService syntax
357
     *
358
     * Splits content into <a> tag blocks and processes each tag, and allows hooks to actually render
359
     * the result.
360
     *
361
     * @param string $value Content input
362
     * @return string Content output
363
     */
364
    protected function TS_links_db($value)
    {
        // Odd indexes of the split result hold the <a>...</a> blocks, even indexes the content in between
        $blockSplit = $this->splitIntoBlock('A', $value);
        foreach ($blockSplit as $k => $v) {
            if ($k % 2 === 0) {
                continue;
            }

            [$tagAttributes] = $this->get_tag_attributes($this->getFirstTag($v), true);

            // Anchors would not have an href attribute; they are left as they are
            if (!isset($tagAttributes['href'])) {
                continue;
            }

            $linkService = GeneralUtility::makeInstance(LinkService::class);

            // Reset for every anchor: if resolve() throws, the catch block must neither hit an
            // undefined variable nor reuse the link information of a previously processed anchor.
            $linkInformation = [];

            // Store the link as <a> tag as default by TYPO3, with the link service syntax
            try {
                $linkInformation = $linkService->resolve($tagAttributes['href']);
                $tagAttributes['href'] = $linkService->asString($linkInformation);
            } catch (UnknownLinkHandlerException $e) {
                // Fall back to an "href" of the resolved information if present, else keep the original value
                $tagAttributes['href'] = $linkInformation['href'] ?? $tagAttributes['href'];
            }

            // Rebuild the anchor and process nested content recursively
            $blockSplit[$k] = '<a ' . GeneralUtility::implodeAttributes($tagAttributes, true) . '>'
                . $this->TS_links_db($this->removeFirstAndLastTag($blockSplit[$k])) . '</a>';
        }
        return implode('', $blockSplit);
    }
390
391
    /**
392
     * Transformation handler: 'css_transform' / direction: "db"
393
     * Cleaning (->db) for standard content elements (ts)
394
     *
395
     * @param string $value Content input
396
     * @return string Content output
397
     * @see TS_transform_rte()
398
     */
399
    protected function TS_transform_db($value)
    {
        // Run-away brake: every nesting level consumes one unit of the counter and gives it
        // back when it is done (forever loops should not occur, but an error could cause them).
        $this->TS_transform_db_safecounter--;
        if ($this->TS_transform_db_safecounter < 0) {
            // Give the unit back before bailing out. Without this, every aborted call would
            // permanently shrink the recursion budget for all later calls on this instance.
            $this->TS_transform_db_safecounter++;
            return $value;
        }

        // Split the content from RTE by the occurrence of the block elements;
        // odd indexes hold the blocks, even indexes the content in between
        $blockSplit = $this->splitIntoBlock($this->blockElementList, $value);

        // Avoid superfluous linebreaks by dropping blank trailing parts
        while (count($blockSplit) > 0 && trim(end($blockSplit)) === '') {
            array_pop($blockSplit);
        }

        // Traverse the blocks
        foreach ($blockSplit as $k => $v) {
            if ($k % 2) {
                // Inside block:
                $tag = $this->getFirstTag($v);
                $tagName = strtolower($this->getFirstTagName($v));
                // Process based on the tag:
                switch ($tagName) {
                    case 'blockquote':
                    case 'dd':
                    case 'div':
                    case 'header':
                    case 'section':
                    case 'footer':
                    case 'nav':
                    case 'article':
                    case 'aside':
                        // Containers: transform the inner content recursively and re-wrap it
                        $blockSplit[$k] = $tag . $this->TS_transform_db($this->removeFirstAndLastTag($blockSplit[$k])) . '</' . $tagName . '>';
                        break;
                    case 'pre':
                        // Preformatted content is kept untouched
                        break;
                    default:
                        // usually <hx> tags and <table> tags where no other block elements are within the tags
                        // Eliminate true linebreaks inside block element tags
                        $blockSplit[$k] = preg_replace('/[' . LF . ']+/', ' ', $blockSplit[$k]);
                }
            } else {
                // NON-block:
                if (trim($blockSplit[$k]) !== '') {
                    $blockSplit[$k] = str_replace('<hr/>', '<hr />', $blockSplit[$k]);
                    // Remove linebreaks preceding hr tags
                    $blockSplit[$k] = preg_replace('/[' . LF . ']+<(hr)(\\s[^>\\/]*)?[[:space:]]*\\/?>/', '<$1$2/>', $blockSplit[$k]);
                    // Remove linebreaks following hr tags
                    $blockSplit[$k] = preg_replace('/<(hr)(\\s[^>\\/]*)?[[:space:]]*\\/?>[' . LF . ']+/', '<$1$2/>', $blockSplit[$k]);
                    // Replace other linebreaks with space
                    $blockSplit[$k] = preg_replace('/[' . LF . ']+/', ' ', $blockSplit[$k]);
                    $blockSplit[$k] = $this->divideIntoLines($blockSplit[$k]);
                } else {
                    // Blank parts between blocks are dropped entirely
                    unset($blockSplit[$k]);
                }
            }
        }

        // This nesting level is done, release its unit of the run-away brake
        $this->TS_transform_db_safecounter++;
        return implode(LF, $blockSplit);
    }
460
461
    /**
462
     * Transformation handler: css_transform / direction: "rte"
463
     * Set (->rte) for standard content elements (ts)
464
     *
465
     * @param string $value Content input
466
     * @return string Content output
467
     * @see TS_transform_db()
468
     */
469
    protected function TS_transform_rte($value)
    {
        // Matches optional spaces followed by one linebreak at the very beginning
        $leadingLineBreakPattern = '/^[ ]*' . LF . '/';
        // Block tags whose inner content is transformed recursively
        $recursiveContainers = ['blockquote', 'dd', 'div', 'header', 'section', 'footer', 'nav', 'article', 'aside'];

        // Odd indexes hold the block elements, even indexes the content in between
        $blockSplit = $this->splitIntoBlock($this->blockElementList, $value);

        foreach ($blockSplit as $k => $v) {
            if ($k % 2) {
                // Inside one of the blocks
                $tag = $this->getFirstTag($v);
                $tagName = strtolower($this->getFirstTagName($v));
                if (in_array($tagName, $recursiveContainers, true)) {
                    $blockSplit[$k] = $tag . $this->TS_transform_rte($this->removeFirstAndLastTag($blockSplit[$k])) . '</' . $tagName . '>';
                }
                // Strip the linebreak that directly follows the block from the next part
                $blockSplit[$k + 1] = preg_replace($leadingLineBreakPattern, '', $blockSplit[$k + 1]);
                continue;
            }

            // NON-block; always read $blockSplit[$k] here, the preceding block may have changed it
            $nextFTN = $this->getFirstTagName($blockSplit[$k + 1] ?? '');
            $onlyLineBreaks = preg_match('/^[ ]*' . LF . '+[ ]*$/', $blockSplit[$k]) == 1;
            $followedByBlockOrLast = GeneralUtility::inList($this->blockElementList, $nextFTN) || !isset($blockSplit[$k + 1]);

            if ($followedByBlockOrLast) {
                $blockSplit[$k] = $onlyLineBreaks
                    // Only linebreaks: remove the leading linebreak
                    ? preg_replace($leadingLineBreakPattern, '', $blockSplit[$k])
                    // Otherwise: reduce the number of trailing linebreaks by 1
                    : preg_replace('/(' . LF . '*)' . LF . '[ ]*$/', '$1', $blockSplit[$k]);
            }

            // Blank parts are dropped, unless they consisted of linebreaks only
            if ((string)$blockSplit[$k] === '' && !$onlyLineBreaks) {
                unset($blockSplit[$k]);
            } else {
                $blockSplit[$k] = $this->setDivTags($blockSplit[$k]);
            }
        }

        return implode(LF, $blockSplit);
    }
519
520
    /***************************************************************
521
     *
522
     * Generic RTE transformation, analysis and helper functions
523
     *
524
     **************************************************************/
525
526
    /**
527
     * Function for cleaning content going into the database.
528
     * Content is cleaned eg. by removing unallowed HTML and ds-HSC content
529
     * It is basically calling HTMLcleaner from the parent class with some preset configuration specifically set up for cleaning content going from the RTE into the db
530
     *
531
     * @param string $content Content to clean up
532
     * @return string Clean content
533
     * @see getKeepTags()
534
     */
535
    protected function HTMLcleaner_db($content)
    {
        // Clean with the "db" tag configuration; third argument (keep non-matched tags) is off
        return $this->HTMLcleaner($content, $this->getKeepTags('db'), false);
    }
540
541
    /**
542
     * Creates an array of configuration for the HTMLcleaner function based on whether content
543
     * go TO or FROM the Rich Text Editor ($direction)
544
     *
545
     * @param string $direction The direction of the content being processed by the output configuration; "db" (content going into the database FROM the rte) or "rte" (content going into the form)
546
     * @return array Configuration array
547
     * @see HTMLcleaner_db()
548
     */
549
    protected function getKeepTags($direction = 'rte')
    {
        // Serve from the in-memory cache when an array was already built for this direction
        $cached = $this->getKeepTags_cache[$direction] ?? null;
        if (is_array($cached)) {
            return $cached;
        }

        // Additionally allowed tags: array notation wins over the comma list
        $hasAllowTagsArray = isset($this->procOptions['allowTags.']) && is_array($this->procOptions['allowTags.']);
        $additionalTags = $hasAllowTagsArray
            ? implode(',', $this->procOptions['allowTags.'])
            : ($this->procOptions['allowTags'] ?? '');

        // Default tags plus additional ones, with the tag names as array keys
        $keepTags = array_flip(
            GeneralUtility::trimExplode(',', $this->defaultAllowedTagsList . ',' . strtolower($additionalTags), true)
        );

        // Denied tags are removed again
        foreach (GeneralUtility::trimExplode(',', $this->procOptions['denyTags'] ?? '', true) as $deniedTag) {
            unset($keepTags[$deniedTag]);
        }

        // Direction specific options
        switch ($direction) {
            case 'db':
                // Span tags, if allowed, get a fixed attribute configuration
                if (isset($keepTags['span'])) {
                    $classFix = ['removeIfFalse' => 1];
                    if (!empty($this->allowedClasses)) {
                        $classFix['list'] = $this->allowedClasses;
                    }
                    $keepTags['span'] = [
                        'allowedAttribs' => 'id,class,style,title,lang,xml:lang,dir,itemscope,itemtype,itemprop',
                        'fixAttrib' => [
                            'class' => $classFix
                        ],
                        'rmTagIfNoAttrib' => 1
                    ];
                }
                // Further options from the processing options, with defaults for nesting and attribute-less tags
                $parserOptions = $this->procOptions['HTMLparser_db.'] ?? [];
                if (empty($parserOptions['globalNesting'])) {
                    $parserOptions['globalNesting'] = 'b,i,u,a,center,font,sub,sup,strong,em,strike,span';
                }
                if (empty($parserOptions['noAttrib'])) {
                    $parserOptions['noAttrib'] = 'b,i,u,br,center,hr,sub,sup,strong,em,li,ul,ol,blockquote,strike';
                }
                // Only the first element of the HTMLparserConfig() result is used as keepTags
                [$keepTags] = $this->HTMLparserConfig($parserOptions, $keepTags);
                break;
            case 'rte':
                // Only the first element of the HTMLparserConfig() result is used as keepTags
                [$keepTags] = $this->HTMLparserConfig($this->procOptions['HTMLparser_rte.'] ?? [], $keepTags);
                break;
        }

        // Cache (internally, in object memory) and return the result
        $this->getKeepTags_cache[$direction] = $keepTags;
        return $this->getKeepTags_cache[$direction];
    }
607
608
    /**
     * Breaks the given markup up along its <p> sections and returns them as lines separated by LF,
     * so that the HTML coming from the RTE ends up as 'human-readable' lines.
     * This is the direction INTO the database; setDivTags() does the opposite.
     *
     * When the value contains no <p> section, or the recursion brake has reached zero, nothing is
     * divided: the value is only run through sanitizeLineBreaksForContentOnly() and returned as a
     * string - also when $returnArray is set. The recursive call uses exactly that to distinguish
     * "further p nesting found" (array) from "plain paragraph content" (string).
     *
     * @param string $value Value to process.
     * @param int $count Recursion brake. Decremented on each recursion down to zero. Default is 5 (which equals the allowed nesting levels of p tags).
     * @param bool $returnArray If TRUE, an array with the lines is returned, otherwise a string of the processed input value.
     * @return string|array Processed input value.
     * @see setDivTags()
     */
    protected function divideIntoLines($value, $count = 5, $returnArray = false)
    {
        // The third argument makes splitIntoBlock() eliminate false end-tags
        $sections = $this->splitIntoBlock('p', $value, true);
        if ($count <= 0 || count($sections) <= 1) {
            // No <p> sections at all (or nesting limit hit): hand the content back undivided
            return $this->sanitizeLineBreaksForContentOnly($value);
        }

        $lines = [];
        foreach ($sections as $index => $section) {
            if ($index % 2 === 0) {
                // Even positions: content located outside of any <p> section.
                // Only the tags allowed outside of paragraphs survive here.
                $allowedTags = '<' . implode('><', $this->allowedTagsOutsideOfParagraphs) . '>';
                $outside = trim(strip_tags($section, $allowedTags));
                $outside = $this->sanitizeLineBreaksForContentOnly($outside);
                if ((string)$outside === '') {
                    // Nothing left over, so this position is dropped from the result
                    continue;
                }
                // Something is still in there: put <p> tags around its text part
                $plainText = strip_tags($outside);
                $lines[$index] = str_replace($plainText, '<p>' . $plainText . '</p>', $outside);
                continue;
            }

            // Odd positions: a complete <p> section. Look for further p nesting inside of it first.
            $nested = $this->divideIntoLines($this->removeFirstAndLastTag($section), $count - 1, true);
            if (is_array($nested)) {
                // Sub-nesting of p (considered 'an error'): the sub-lines become the content of this section
                $line = implode(LF, $nested);
            } else {
                // No sub-sections: a regular line, cleaned and wrapped in its validated <p> tag
                $line = $this->processContentWithinParagraph($nested, $section);
            }

            // A line holding nothing but a &nbsp; is turned into a pure blank line - unless it contains
            // an image, or a tag still carrying one of the listed attributes. Such attributes should have
            // been filtered before; if they are still there they have to be treated as possible content.
            $isBlankLine = trim(strip_tags($line)) === '&nbsp;'
                && !preg_match('/\\<(img)(\\s[^>]*)?\\/?>/si', $line)
                && !preg_match('/\\<([^>]*)?( align| class| style| id| title| dir| lang| xml:lang)([^>]*)?>/si', trim($line));
            $lines[$index] = $isBlankLine ? '' : $line;
        }

        if ($returnArray) {
            return $lines;
        }
        return implode(LF, $lines);
    }
664
665
    /**
     * Turns every LF separated line of the value into a <p></p> section.
     * Lines which contain an <hr> tag, or which already start and end with a matching
     * <div> or <p> tag, are not wrapped again.
     * For processing of content going FROM database TO RTE.
     *
     * @param string $value Value to convert
     * @return string Processed value.
     * @see divideIntoLines()
     */
    protected function setDivTags($value)
    {
        // Configuration for the HTMLcleaner, which processes each line on its way to the RTE
        $keepTags = $this->getKeepTags('rte');
        $lines = explode(LF, $value);
        foreach ($lines as $index => $line) {
            if (trim($line) === '') {
                // Blank lines are represented by a &nbsp;
                $line = '&nbsp;';
            } else {
                // Clean the line, keeping unknown tags (as they can be removed in the entryHTMLparser)
                $line = $this->HTMLcleaner($line, $keepTags, 'protect');
                // Convert double-encoded &nbsp; into regular &nbsp; - this could be reversed via the exitHTMLparser.
                // This was previously an option to disable called "dontConvAmpInNBSP_rte"
                $line = str_replace('&amp;nbsp;', '&nbsp;', $line);
            }

            // Lines containing an hr tag never get wrapped
            if (!preg_match('/<(hr)(\\s[^>\\/]*)?[[:space:]]*\\/?>/i', $line)) {
                $probe = strtolower(trim($line));
                $isWrappedInDiv = strpos($probe, '<div') === 0 && substr($probe, -6) === '</div>';
                $isWrappedInParagraph = strpos($probe, '<p') === 0 && substr($probe, -4) === '</p>';
                if (!$isWrappedInDiv && !$isWrappedInParagraph) {
                    // Neither div nor p tags around the line yet, so set the p tags
                    $line = '<p>' . $line . '</p>';
                }
            }

            $lines[$index] = $line;
        }
        return implode(LF, $lines);
    }
705
706
    /**
     * Used for transformation from RTE to DB
     *
     * Handles one single line that sits within a <p> tag when storing into the database:
     * the inner content is cleaned up via HTMLcleaner_db() and freed from line breaks,
     * the attributes of the <p> tag are reduced to the allowed ones (including a check of
     * the class names against the allowed classes), and the result is always returned
     * wrapped in a <p> tag again.
     *
     * @param string $content the content within the <p> tag
     * @param string $fullContentWithTag the whole <p> tag surrounded as well
     *
     * @return string the full <p> tag with cleaned content
     */
    protected function processContentWithinParagraph(string $content, string $fullContentWithTag)
    {
        $cleanedContent = $this->HTMLcleaner_db($content);
        // The opening <p> tag is the source of the attributes to validate
        $openingTag = $this->getFirstTag($fullContentWithTag);

        // Without a list of allowed attributes, the <p> tag keeps no attributes at all
        $attributes = [];
        if (!empty($this->allowedAttributesForParagraphTags)) {
            [$attributes] = $this->get_tag_attributes($openingTag);
            // Drop every attribute that is not explicitly allowed for paragraph tags
            $attributes = array_intersect_key($attributes, array_flip($this->allowedAttributesForParagraphTags));

            // The class value needs filtering if it is non-blank, a list of allowed classes
            // exists, and the value as a whole is not one of the allowed classes
            $classValue = $attributes['class'] ?? null;
            $classNeedsFiltering = $classValue !== null
                && trim($classValue) !== ''
                && !empty($this->allowedClasses)
                && !in_array($classValue, $this->allowedClasses, true);
            if ($classNeedsFiltering) {
                // Check the class names one by one and keep only the allowed ones
                $keptClasses = array_intersect(
                    GeneralUtility::trimExplode(' ', $classValue, true),
                    $this->allowedClasses
                );
                if (empty($keptClasses)) {
                    unset($attributes['class']);
                } else {
                    $attributes['class'] = implode(' ', $keptClasses);
                }
            }
        }

        // A single line must not contain any line break
        $cleanedContent = str_replace(LF, '', $cleanedContent);
        // rtrim() takes care of the trailing space of "p " if there are no attributes to compile
        return '<' . rtrim('p ' . $this->compileTagAttribs($attributes)) . '>' . $cleanedContent . '</p>';
    }
749
750
    /**
     * Puts every <hr> tag between two LFs (rewriting it in self-closing form), replaces
     * double LFs by single ones and strips an LF from the very start and the very end.
     * Used when transforming from RTE to DB.
     *
     * @param string $content
     * @return string the modified content
     */
    protected function sanitizeLineBreaksForContentOnly(string $content)
    {
        // Step 1: surround each <hr> with line breaks, keeping its attributes ($2)
        $rulersOnOwnLines = preg_replace(
            '/<(hr)(\\s[^>\\/]*)?[[:space:]]*\\/?>/i',
            LF . '<$1$2/>' . LF,
            $content
        );
        // Step 2: replace double line breaks (e.g. the ones step 1 may have produced)
        $withoutDoubleBreaks = str_replace(LF . LF, LF, $rulersOnOwnLines);
        // Step 3: no line break at the beginning or the end of the content
        return preg_replace('/(^' . LF . ')|(' . LF . '$)/i', '', $withoutDoubleBreaks);
    }
763
764
    /**
     * Called before any processing / transformation is made.
     * Strips every CR (char 13) from the content, so that internally only LFs (char 10)
     * have to be dealt with. CR has a very disturbing effect, hence the processing
     * relies on LF alone.
     *
     * Historical note: Previously it was possible to disable this functionality via disableUnifyLineBreaks.
     *
     * @param string $content the content to process
     * @return string the modified content
     */
    protected function streamlineLineBreaksForProcessing(string $content)
    {
        $withoutCarriageReturns = str_replace(CR, '', $content);
        return $withoutCarriageReturns;
    }
778
779
    /**
     * Called after any processing / transformation was made.
     * Right before the RTE parser returns the content, all line breaks are
     * unified to "CRLF"s again.
     *
     * Historical note: Previously it was possible to disable this functionality via disableUnifyLineBreaks.
     *
     * @param string $content the content to process
     * @return string the modified content
     */
    protected function streamlineLineBreaksAfterProcessing(string $content)
    {
        // Inner call: get rid of any CR that has entered in the meantime, so only LFs are left.
        // Outer call: turn each of these LFs into a CRLF.
        return str_replace(
            LF,
            CRLF,
            $this->streamlineLineBreaksForProcessing($content)
        );
    }
796
797
    /**
     * Content Transformation from DB to RTE
     *
     * Walks through all <a> tags that have a non-empty href attribute, resolves the href
     * via the LinkService and dispatches a BrokenLinkAnalysisEvent for it. If the event
     * reports the link as broken, the reason is put into a "data-rte-error" attribute of
     * the tag; the same attribute receives the exception message if no link handler is
     * known for the href. The content of each processed <a> tag is handled recursively.
     *
     * @param string $content
     * @return string the modified content
     */
    protected function markBrokenLinks(string $content): string
    {
        $segments = $this->splitIntoBlock('A', $content);
        $linkService = GeneralUtility::makeInstance(LinkService::class);
        foreach ($segments as $index => $segment) {
            // Only the odd positions hold <a> blocks, everything else stays as it is
            if ($index % 2 === 0) {
                continue;
            }
            $openingTag = $this->getFirstTag($segment);
            [$linkAttributes] = $this->get_tag_attributes($openingTag, true);
            if (empty($linkAttributes['href'])) {
                continue;
            }

            try {
                $linkDetails = $linkService->resolve($linkAttributes['href']);

                // Let the event listeners decide whether the link target is broken
                $analysis = new BrokenLinkAnalysisEvent($linkDetails['type'], $linkDetails);
                $this->eventDispatcher->dispatch($analysis);
                if ($analysis->isBrokenLink()) {
                    $linkAttributes['data-rte-error'] = $analysis->getReason();
                }
            } catch (UnknownLinkHandlerException $exception) {
                $linkAttributes['data-rte-error'] = $exception->getMessage();
            }

            // The block is always rewritten - also if the link is fine - to allow the nested calling
            $rebuiltOpeningTag = '<a ' . GeneralUtility::implodeAttributes($linkAttributes, true, true) . '>';
            $processedInnerContent = $this->markBrokenLinks($this->removeFirstAndLastTag($segment));
            $segments[$index] = $rebuiltOpeningTag . $processedInnerContent . '</a>';
        }
        return implode('', $segments);
    }
838
839
    /**
     * Content Transformation from RTE to DB
     *
     * Frees all <a> tags that have a non-empty href attribute from the error information
     * added to broken links: the "data-rte-error" attribute is removed, and the inline
     * style 'background-color: yellow; border:2px red solid; color: black;' is taken out
     * of the style attribute (which is dropped entirely if nothing else remains in it).
     * The content of each processed <a> tag is handled recursively.
     *
     * @param string $content the content to process
     * @return string the modified content
     */
    protected function removeBrokenLinkMarkers(string $content): string
    {
        $segments = $this->splitIntoBlock('A', $content);
        foreach ($segments as $index => $segment) {
            // Only the odd positions hold <a> blocks, everything else stays as it is
            if ($index % 2 === 0) {
                continue;
            }
            [$linkAttributes] = $this->get_tag_attributes($this->getFirstTag($segment), true);
            if (empty($linkAttributes['href'])) {
                continue;
            }

            // The markers are always removed again (regardless of the page was found or not),
            // so the database does not contain ugly stuff
            unset($linkAttributes['data-rte-error']);
            if (isset($linkAttributes['style'])) {
                $remainingStyle = trim(str_replace(
                    'background-color: yellow; border:2px red solid; color: black;',
                    '',
                    $linkAttributes['style']
                ));
                if (empty($remainingStyle)) {
                    unset($linkAttributes['style']);
                } else {
                    $linkAttributes['style'] = $remainingStyle;
                }
            }

            $rebuiltOpeningTag = '<a ' . GeneralUtility::implodeAttributes($linkAttributes, true, true) . '>';
            $processedInnerContent = $this->removeBrokenLinkMarkers($this->removeFirstAndLastTag($segment));
            $segments[$index] = $rebuiltOpeningTag . $processedInnerContent . '</a>';
        }
        return implode('', $segments);
    }
873
}
874