Completed
Push — master ( a48ec2...4c6d80 )
by
unknown
13:40
created

RteHtmlParser::HTMLcleaner_db()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 2
nc 1
nop 1
dl 0
loc 4
rs 10
c 0
b 0
f 0
1
<?php
2
3
/*
4
 * This file is part of the TYPO3 CMS project.
5
 *
6
 * It is free software; you can redistribute it and/or modify it under
7
 * the terms of the GNU General Public License, either version 2
8
 * of the License, or any later version.
9
 *
10
 * For the full copyright and license information, please read the
11
 * LICENSE.txt file that was distributed with this source code.
12
 *
13
 * The TYPO3 project - inspiring people to share!
14
 */
15
16
namespace TYPO3\CMS\Core\Html;
17
18
use Psr\EventDispatcher\EventDispatcherInterface;
19
use Psr\Log\LoggerAwareInterface;
20
use Psr\Log\LoggerAwareTrait;
21
use TYPO3\CMS\Core\Html\Event\BrokenLinkAnalysisEvent;
22
use TYPO3\CMS\Core\LinkHandling\Exception\UnknownLinkHandlerException;
23
use TYPO3\CMS\Core\LinkHandling\LinkService;
24
use TYPO3\CMS\Core\Utility\GeneralUtility;
25
26
/**
27
 * Class for parsing HTML for the Rich Text Editor. (also called transformations)
28
 *
29
 * Concerning line breaks:
30
 * Regardless if LF (Unix-style) or CRLF (Windows) was put in, the HtmlParser works with LFs and migrates all
31
 * line breaks to LFs internally, however when all transformations are done, all LFs are transformed to CRLFs.
32
 * This means: RteHtmlParser always returns CRLFs to be maximum compatible with all formats.
33
 */
34
class RteHtmlParser extends HtmlParser implements LoggerAwareInterface
35
{
36
    use LoggerAwareTrait;
37
38
    /**
     * Comma-separated list of elements that are not wrapped into a "p" tag while doing the transformation.
     * Replaced by the "blockElementList" processing option whenever that option is non-empty.
     *
     * @var string
     */
    protected $blockElementList = 'DIV,TABLE,BLOCKQUOTE,PRE,UL,OL,H1,H2,H3,H4,H5,H6,ADDRESS,DL,DD,HEADER,SECTION,FOOTER,NAV,ARTICLE,ASIDE';

    /**
     * Comma-separated list of all tags that are allowed by default.
     * getKeepTags() always starts from this list before adding and removing configured tags.
     *
     * @var string
     */
    protected $defaultAllowedTagsList = 'b,i,u,a,img,br,div,center,pre,font,hr,sub,sup,p,strong,em,li,ul,ol,blockquote,strike,span,abbr,acronym,dfn';

    /**
     * The processing options handed to setProcessingConfiguration()
     * (usually the TSconfig options coming from Page TSconfig).
     *
     * @var array
     */
    protected $procOptions = [];

    /**
     * Run-away brake for the recursive calls of TS_transform_db().
     *
     * @var int
     */
    protected $TS_transform_db_safecounter = 100;

    /**
     * Per-direction cache of the configuration arrays built by getKeepTags().
     *
     * @var array
     */
    protected $getKeepTags_cache = [];

    /**
     * Storage of the allowed CSS class names in the RTE,
     * filled from the "allowedClasses." / "allowedClasses" processing options.
     *
     * @var array
     */
    protected $allowedClasses = [];

    /**
     * A list of HTML attributes for <p> tags. Because <p> tags are currently wrapped in a special handling,
     * they have their own place for configuration: the "allowAttributes." processing option replaces this list.
     *
     * @var array
     */
    protected $allowedAttributesForParagraphTags = [
        'class',
        'align',
        'id',
        'title',
        'dir',
        'lang',
        'xml:lang',
        'itemscope',
        'itemtype',
        'itemprop',
    ];

    /**
     * Any tags that are allowed outside of <p> sections - usually similar to the block elements
     * plus some special tags like <hr>.
     * Completely overridable via the "allowTagsOutside" processing option.
     *
     * @var array
     */
    protected $allowedTagsOutsideOfParagraphs = [
        'address',
        'article',
        'aside',
        'blockquote',
        'div',
        'footer',
        'header',
        'hr',
        'nav',
        'section',
    ];

    /**
     * Event dispatcher injected through the constructor.
     *
     * @var EventDispatcherInterface
     */
    protected $eventDispatcher;
121
122
    /**
     * Keeps the given event dispatcher on the instance.
     *
     * @param EventDispatcherInterface $eventDispatcher
     */
    public function __construct(EventDispatcherInterface $eventDispatcher)
    {
        $this->eventDispatcher = $eventDispatcher;
    }
126
127
    /**
     * Sanitize and streamline the given options (usually the "proc." part of a RichTextConfiguration result)
     * and set them to the respective properties.
     *
     * Properties whose option is not part of the configuration keep their current value.
     *
     * @param array $processingConfiguration
     */
    protected function setProcessingConfiguration(array $processingConfiguration): void
    {
        $this->procOptions = $processingConfiguration;
        // getKeepTags() memoizes arrays derived from the processing options and the allowed classes.
        // Drop them so that a new configuration is never served results computed for an older one.
        $this->getKeepTags_cache = [];

        // The array notation ("allowedClasses.") wins over the comma-separated string
        $this->allowedClasses = isset($this->procOptions['allowedClasses.'])
            ? (array)$this->procOptions['allowedClasses.']
            : GeneralUtility::trimExplode(',', $this->procOptions['allowedClasses'] ?? '', true);

        // Dynamic configuration of blockElementList
        if (!empty($this->procOptions['blockElementList'])) {
            $this->blockElementList = $this->procOptions['blockElementList'];
        }

        // Define which attributes are allowed on <p> tags
        if (isset($this->procOptions['allowAttributes.'])) {
            $this->allowedAttributesForParagraphTags = (array)$this->procOptions['allowAttributes.'];
        }

        // Override tags which are allowed outside of <p> tags.
        // The array notation is only evaluated if the plain "allowTagsOutside" option is set as well.
        if (isset($this->procOptions['allowTagsOutside'])) {
            $this->allowedTagsOutsideOfParagraphs = isset($this->procOptions['allowTagsOutside.'])
                ? (array)$this->procOptions['allowTagsOutside.']
                : GeneralUtility::trimExplode(',', strtolower($this->procOptions['allowTagsOutside']), true);
        }
    }
160
161
    /**
     * Main entry point for transforming content from the database so the Rich Text Editor can deal with
     * e.g. links.
     *
     * Processing order: apply the configuration, normalize line breaks, run the optional
     * "entryHTMLparser_rte" cleaner, run every resolved transformation mode, run the optional
     * "exitHTMLparser_rte" cleaner and finally convert the line breaks for output.
     *
     * @param string $value
     * @param array $processingConfiguration
     * @return string
     */
    public function transformTextForRichTextEditor(string $value, array $processingConfiguration): string
    {
        $this->setProcessingConfiguration($processingConfiguration);
        $modes = $this->resolveAppliedTransformationModes('rte');
        $value = $this->streamlineLineBreaksForProcessing($value);
        // If an entry HTML cleaner was configured, pass the content through the HTMLcleaner
        $value = $this->runHtmlParserIfConfigured($value, 'entryHTMLparser_rte');
        // Traverse modes
        foreach ($modes as $cmd) {
            // Checking for user defined transformation:
            $customTransformation = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['t3lib/class.t3lib_parsehtml_proc.php']['transformation'][$cmd] ?? null;
            if (!empty($customTransformation)) {
                $_procObj = GeneralUtility::makeInstance($customTransformation);
                $_procObj->pObj = $this;
                // Tell the object which mode it is called for, exactly as transformTextForPersistence() does
                $_procObj->transformationKey = $cmd;
                $value = $_procObj->transform_rte($value, $this);
                continue;
            }
            // ... else use defaults:
            switch ($cmd) {
                case 'detectbrokenlinks':
                    $value = $this->markBrokenLinks($value);
                    break;
                case 'css_transform':
                    $value = $this->TS_transform_rte($value);
                    break;
                default:
                    // Do nothing
            }
        }
        // If an exit HTML cleaner was configured, pass the content through the HTMLcleaner
        $value = $this->runHtmlParserIfConfigured($value, 'exitHTMLparser_rte');
        // Final clean up of linebreaks
        return $this->streamlineLineBreaksAfterProcessing($value);
    }
203
204
    /**
     * Called to process HTML content before it is stored in the database.
     *
     * Processing order: apply the configuration, normalize line breaks, run the optional
     * "entryHTMLparser_db" cleaner, run every resolved transformation mode, run the optional
     * "exitHTMLparser_db" cleaner and finally convert the line breaks for output.
     *
     * @param string $value
     * @param array $processingConfiguration
     * @return string
     */
    public function transformTextForPersistence(string $value, array $processingConfiguration): string
    {
        $this->setProcessingConfiguration($processingConfiguration);
        $modes = $this->resolveAppliedTransformationModes('db');
        $value = $this->streamlineLineBreaksForProcessing($value);
        // An entry HTML cleaner, if configured, sees the content before any transformation
        $value = $this->runHtmlParserIfConfigured($value, 'entryHTMLparser_db');
        foreach ($modes as $cmd) {
            // A user defined transformation registered for this mode takes precedence over the defaults
            if (!empty($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['t3lib/class.t3lib_parsehtml_proc.php']['transformation'][$cmd])) {
                $_procObj = GeneralUtility::makeInstance($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['t3lib/class.t3lib_parsehtml_proc.php']['transformation'][$cmd]);
                $_procObj->pObj = $this;
                $_procObj->transformationKey = $cmd;
                $value = $_procObj->transform_db($value, $this);
                continue;
            }
            // Built-in transformations; unknown modes are silently skipped
            if ($cmd === 'detectbrokenlinks') {
                $value = $this->removeBrokenLinkMarkers($value);
            } elseif ($cmd === 'ts_links') {
                $value = $this->TS_links_db($value);
            } elseif ($cmd === 'css_transform') {
                // Transform empty paragraphs into spacing paragraphs
                $value = str_replace('<p></p>', '<p>&nbsp;</p>', $value);
                // Double any trailing spacing paragraph so that it does not get removed by divideIntoLines()
                $value = preg_replace('/<p>&nbsp;<\/p>$/', '<p>&nbsp;</p><p>&nbsp;</p>', $value);
                $value = $this->TS_transform_db($value);
            }
        }
        // An exit HTML cleaner, if configured, sees the fully transformed content
        $value = $this->runHtmlParserIfConfigured($value, 'exitHTMLparser_db');
        // Final clean up of linebreaks
        return $this->streamlineLineBreaksAfterProcessing($value);
    }
253
254
    /**
     * Determines which transformation modes should be executed, and makes sure each one is listed only once.
     *
     * A non-empty "overruleMode" option (comma-separated list) takes precedence over the "mode" option.
     * The shortcut "default" expands to "detectbrokenlinks,css_transform,ts_links".
     * A missing option is treated like an empty string, which yields an empty list of modes.
     *
     * @param string $direction "rte" reverses the order of the resolved modes, any other value keeps it
     * @return array the resolved transformation modes
     */
    protected function resolveAppliedTransformationModes(string $direction): array
    {
        // Setting modes / transformations to be called; both options are optional
        $modeList = (string)($this->procOptions['overruleMode'] ?? '');
        if ($modeList === '') {
            $modeList = (string)($this->procOptions['mode'] ?? '');
        }

        // Replace the shortcut "default" with all custom modes
        $modeList = str_replace('default', 'detectbrokenlinks,css_transform,ts_links', $modeList);

        // Split into trimmed, non-empty entries and make the list unique
        $modes = array_unique(GeneralUtility::trimExplode(',', $modeList, true));

        // Reverse order if direction is "rte"
        return $direction === 'rte' ? array_reverse($modes) : $modes;
    }
283
284
    /**
     * Runs the HTML parser if it is configured.
     *
     * Additional HTML cleaner configuration is applied either before or after the main transformation
     * and is thus a totally independent processing option you can set up.
     *
     * This is only possible via TSconfig (procOptions) currently.
     *
     * @param string $content
     * @param string $configurationDirective key looked up in the procOptions to check whether the parser is
     *                                       enabled; the parser configuration itself is read from the same
     *                                       key with a trailing dot
     * @return string the processed content, or the unchanged content if the directive is not enabled
     */
    protected function runHtmlParserIfConfigured($content, $configurationDirective)
    {
        if (empty($this->procOptions[$configurationDirective])) {
            return $content;
        }
        // The directive may be enabled without any sub-configuration; fall back to an empty configuration
        $parserConfiguration = (array)($this->procOptions[$configurationDirective . '.'] ?? []);
        [$keepTags, $keepNonMatchedTags, $hscMode, $additionalConfiguration] = $this->HTMLparserConfig($parserConfiguration);
        return $this->HTMLcleaner($content, $keepTags, $keepNonMatchedTags, $hscMode, $additionalConfiguration);
    }
303
304
    /************************************
305
     *
306
     * Specific RTE TRANSFORMATION functions
307
     *
308
     *************************************/
309
310
    /**
     * Transformation handler: 'ts_links' / direction: "db"
     * Processes anchor tags and rewrites their href via the LinkService syntax.
     *
     * Splits the content into <a> tag blocks, rebuilds each <a> tag that has an href attribute and
     * processes the content of the tag recursively. <a> tags without href are left untouched.
     *
     * @param string $value Content input
     * @return string Content output
     */
    protected function TS_links_db($value)
    {
        $blockSplit = $this->splitIntoBlock('A', $value);
        foreach ($blockSplit as $k => $v) {
            // Only the odd positions hold <a> blocks, everything else is content in between
            if ($k % 2 === 0) {
                continue;
            }
            [$tagAttributes] = $this->get_tag_attributes($this->getFirstTag($v), true);

            // Anchors would not have an href attribute
            if (!isset($tagAttributes['href'])) {
                continue;
            }
            $linkService = GeneralUtility::makeInstance(LinkService::class);
            // Reset for every tag: the catch branch below must never see the result of a previous link
            $linkInformation = [];
            // Store the link as <a> tag as default by TYPO3, with the link service syntax
            try {
                $linkInformation = $linkService->resolve($tagAttributes['href']);
                $tagAttributes['href'] = $linkService->asString($linkInformation);
            } catch (UnknownLinkHandlerException $e) {
                // Prefer the resolved href if resolving itself worked, otherwise keep the original one
                $tagAttributes['href'] = $linkInformation['href'] ?? $tagAttributes['href'];
            }

            $blockSplit[$k] = '<a ' . GeneralUtility::implodeAttributes($tagAttributes, true) . '>'
                . $this->TS_links_db($this->removeFirstAndLastTag($blockSplit[$k])) . '</a>';
        }
        return implode('', $blockSplit);
    }
346
347
    /**
     * Transformation handler: 'css_transform' / direction: "db"
     * Cleaning (->db) for standard content elements (ts)
     *
     * Splits the content by the elements of $blockElementList. Container blocks (blockquote, dd, div,
     * header, section, footer, nav, article, aside) are processed recursively, <pre> blocks are kept
     * as they are, and in all other blocks line breaks are replaced by spaces. Content between blocks
     * is handed to divideIntoLines(). The parts are joined with LF.
     *
     * The recursion depth is limited by $TS_transform_db_safecounter; once the limit is reached the
     * value is returned unprocessed.
     *
     * @param string $value Content input
     * @return string Content output
     * @see TS_transform_rte()
     */
    protected function TS_transform_db($value)
    {
        // Safety... so forever loops are avoided (they should not occur, but an error would potentially do this...)
        if ($this->TS_transform_db_safecounter <= 0) {
            return $value;
        }
        $this->TS_transform_db_safecounter--;
        try {
            // Split the content from RTE by the occurrence of these blocks:
            $blockSplit = $this->splitIntoBlock($this->blockElementList, $value);

            // Avoid superfluous linebreaks by transform_db after ending headListTag
            while (count($blockSplit) > 0 && trim(end($blockSplit)) === '') {
                array_pop($blockSplit);
            }

            // Traverse the blocks
            foreach ($blockSplit as $k => $v) {
                if ($k % 2) {
                    // Inside block:
                    $tag = $this->getFirstTag($v);
                    $tagName = strtolower($this->getFirstTagName($v));
                    // Process based on the tag:
                    switch ($tagName) {
                        case 'blockquote':
                        case 'dd':
                        case 'div':
                        case 'header':
                        case 'section':
                        case 'footer':
                        case 'nav':
                        case 'article':
                        case 'aside':
                            $blockSplit[$k] = $tag . $this->TS_transform_db($this->removeFirstAndLastTag($blockSplit[$k])) . '</' . $tagName . '>';
                            break;
                        case 'pre':
                            break;
                        default:
                            // usually <hx> tags and <table> tags where no other block elements are within the tags
                            // Eliminate true linebreaks inside block element tags
                            $blockSplit[$k] = preg_replace('/[' . LF . ']+/', ' ', $blockSplit[$k]);
                    }
                } elseif (trim($blockSplit[$k]) !== '') {
                    // NON-block:
                    $blockSplit[$k] = str_replace('<hr/>', '<hr />', $blockSplit[$k]);
                    // Remove linebreaks preceding hr tags
                    $blockSplit[$k] = preg_replace('/[' . LF . ']+<(hr)(\\s[^>\\/]*)?[[:space:]]*\\/?>/', '<$1$2/>', $blockSplit[$k]);
                    // Remove linebreaks following hr tags
                    $blockSplit[$k] = preg_replace('/<(hr)(\\s[^>\\/]*)?[[:space:]]*\\/?>[' . LF . ']+/', '<$1$2/>', $blockSplit[$k]);
                    // Replace other linebreaks with space
                    $blockSplit[$k] = preg_replace('/[' . LF . ']+/', ' ', $blockSplit[$k]);
                    $blockSplit[$k] = $this->divideIntoLines($blockSplit[$k]);
                } else {
                    // Blank content between two blocks is dropped
                    unset($blockSplit[$k]);
                }
            }
            return implode(LF, $blockSplit);
        } finally {
            // Always hand the recursion level back, so the brake does not wear down over time
            $this->TS_transform_db_safecounter++;
        }
    }
416
417
    /**
     * Transformation handler: css_transform / direction: "rte"
     * Set (->rte) for standard content elements (ts)
     *
     * Splits the content by the elements of $blockElementList. Container blocks (blockquote, dd, div,
     * header, section, footer, nav, article, aside) are processed recursively; content between blocks
     * is converted into <p> sections by setDivTags(). The parts are joined with LF.
     *
     * @param string $value Content input
     * @return string Content output
     * @see TS_transform_db()
     */
    protected function TS_transform_rte($value)
    {
        // Split the content from database by the occurrence of the block elements
        $blockSplit = $this->splitIntoBlock($this->blockElementList, $value);
        // Traverse the blocks
        foreach ($blockSplit as $k => $v) {
            if ($k % 2) {
                // Inside one of the blocks:
                $tag = $this->getFirstTag($v);
                $tagName = strtolower($this->getFirstTagName($v));
                // Based on tagname, we do transformations:
                switch ($tagName) {
                    case 'blockquote':
                    case 'dd':
                    case 'div':
                    case 'header':
                    case 'section':
                    case 'footer':
                    case 'nav':
                    case 'article':
                    case 'aside':
                        $blockSplit[$k] = $tag . $this->TS_transform_rte($this->removeFirstAndLastTag($blockSplit[$k])) . '</' . $tagName . '>';
                        break;
                }
                // Strip the line break that directly follows the block. Nothing to do if the block is
                // the last part - writing to the missing index would append an empty part.
                if (isset($blockSplit[$k + 1])) {
                    $blockSplit[$k + 1] = preg_replace('/^[ ]*' . LF . '/', '', $blockSplit[$k + 1]);
                }
            } else {
                // NON-block:
                $nextFTN = $this->getFirstTagName($blockSplit[$k + 1] ?? '');
                $onlyLineBreaks = preg_match('/^[ ]*' . LF . '+[ ]*$/', $blockSplit[$k]) === 1;
                // If the line is followed by a block or is the last line:
                if (GeneralUtility::inList($this->blockElementList, $nextFTN) || !isset($blockSplit[$k + 1])) {
                    if (!$onlyLineBreaks) {
                        // If the line contains more than just linebreaks, reduce the number of trailing linebreaks by 1
                        $blockSplit[$k] = preg_replace('/(' . LF . '*)' . LF . '[ ]*$/', '$1', $blockSplit[$k]);
                    } else {
                        // If the line contains only linebreaks, remove the leading linebreak
                        $blockSplit[$k] = preg_replace('/^[ ]*' . LF . '/', '', $blockSplit[$k]);
                    }
                }
                // If $blockSplit[$k] is blank then unset the line, unless the line only contained linebreaks
                if ((string)$blockSplit[$k] === '' && !$onlyLineBreaks) {
                    unset($blockSplit[$k]);
                } else {
                    $blockSplit[$k] = $this->setDivTags($blockSplit[$k]);
                }
            }
        }
        return implode(LF, $blockSplit);
    }
475
476
    /***************************************************************
477
     *
478
     * Generic RTE transformation, analysis and helper functions
479
     *
480
     **************************************************************/
481
482
    /**
     * Function for cleaning content going into the database.
     *
     * Thin wrapper around HTMLcleaner(): it passes the keep-tags configuration that getKeepTags()
     * builds for the "db" direction and FALSE as third argument (the argument that
     * runHtmlParserIfConfigured() fills from a variable named $keepNonMatchedTags).
     *
     * @param string $content Content to clean up
     * @return string Clean content
     * @see getKeepTags()
     */
    protected function HTMLcleaner_db($content)
    {
        return $this->HTMLcleaner($content, $this->getKeepTags('db'), false);
    }
496
497
    /**
     * Creates an array of configuration for the HTMLcleaner function based on whether content
     * goes TO or FROM the Rich Text Editor ($direction).
     *
     * The list of tags starts with $defaultAllowedTagsList, is extended by the "allowTags." /
     * "allowTags" option and reduced by the "denyTags" option. Tag names of both options are matched
     * case-insensitively. The result is cached per direction in $getKeepTags_cache.
     *
     * @param string $direction The direction of the content being processed by the output configuration; "db" (content going into the database FROM the rte) or "rte" (content going into the form)
     * @return array Configuration array
     * @see HTMLcleaner_db()
     */
    protected function getKeepTags($direction = 'rte')
    {
        if (isset($this->getKeepTags_cache[$direction]) && is_array($this->getKeepTags_cache[$direction])) {
            return $this->getKeepTags_cache[$direction];
        }

        // Construct default list of tags to keep; the array notation wins over the string notation
        if (isset($this->procOptions['allowTags.']) && is_array($this->procOptions['allowTags.'])) {
            $keepTags = implode(',', $this->procOptions['allowTags.']);
        } else {
            $keepTags = $this->procOptions['allowTags'] ?? '';
        }
        $keepTags = array_flip(GeneralUtility::trimExplode(',', $this->defaultAllowedTagsList . ',' . strtolower($keepTags), true));

        // For tags to deny, remove them from $keepTags array. The keys of $keepTags are lower case,
        // so the denied tag names have to be lower-cased as well to match.
        $denyTags = GeneralUtility::trimExplode(',', strtolower((string)($this->procOptions['denyTags'] ?? '')), true);
        foreach ($denyTags as $deniedTag) {
            unset($keepTags[$deniedTag]);
        }

        // Based on the direction of content, set further options:
        switch ($direction) {
            case 'rte':
                // Transforming keepTags array so it can be understood by the HTMLcleaner function.
                // This basically converts the format of the array from TypoScript (having dots) to plain multi-dimensional array.
                [$keepTags] = $this->HTMLparserConfig($this->procOptions['HTMLparser_rte.'] ?? [], $keepTags);
                break;
            case 'db':
                // Setting up span tags if they are allowed:
                if (isset($keepTags['span'])) {
                    $keepTags['span'] = [
                        'allowedAttribs' => 'id,class,style,title,lang,xml:lang,dir,itemscope,itemtype,itemprop',
                        'fixAttrib' => [
                            'class' => [
                                'removeIfFalse' => 1,
                            ],
                        ],
                        'rmTagIfNoAttrib' => 1,
                    ];
                    if (!empty($this->allowedClasses)) {
                        $keepTags['span']['fixAttrib']['class']['list'] = $this->allowedClasses;
                    }
                }
                // Setting further options, getting them from the processing options
                $TSc = $this->procOptions['HTMLparser_db.'] ?? [];
                if (empty($TSc['globalNesting'])) {
                    $TSc['globalNesting'] = 'b,i,u,a,center,font,sub,sup,strong,em,strike,span';
                }
                if (empty($TSc['noAttrib'])) {
                    $TSc['noAttrib'] = 'b,i,u,br,center,hr,sub,sup,strong,em,li,ul,ol,blockquote,strike';
                }
                // Transforming the array from TypoScript to regular array:
                [$keepTags] = $this->HTMLparserConfig($TSc, $keepTags);
                break;
        }

        // Caching (internally, in object memory) the result
        $this->getKeepTags_cache[$direction] = $keepTags;
        return $keepTags;
    }
563
564
    /**
     * Resolves the $value into parts based on <p>-sections. These are returned as lines separated by LF.
     * The point is to turn the HTML code returned from the RTE into ordinary lines so it is 'human-readable'.
     * The function ->setDivTags does the opposite.
     * This function processes content to go into the database.
     *
     * @param string $value Value to process.
     * @param int $count Recursion brake. Decremented on each recursion down to zero. Default is 5 (which equals the allowed nesting levels of p tags).
     * @param bool $returnArray If TRUE, an array with the lines is returned, otherwise a string of the processed input value.
     *                          If no <p> section is found or $count is used up, a string is returned in any case.
     * @return string|array Processed input value.
     * @see setDivTags()
     */
    protected function divideIntoLines($value, $count = 5, $returnArray = false)
    {
        // Setting the third param will eliminate false end-tags. Maybe this is a good thing to do...?
        $paragraphBlocks = $this->splitIntoBlock('p', $value, true);
        // Without any p section (or with the recursion brake used up) the content is returned plainly
        if (count($paragraphBlocks) <= 1 || $count <= 0) {
            return $this->sanitizeLineBreaksForContentOnly($value);
        }

        foreach ($paragraphBlocks as $k => $v) {
            if ($k % 2) {
                // Inside a <p> section:
                // fetch the 'sub-lines' - which explodes any further p nesting recursively
                $subLines = $this->divideIntoLines($this->removeFirstAndLastTag($v), $count - 1, true);
                // An array means nested p sections were found (this would be considered 'an error'); their
                // lines become the new content of THIS section. A string is a TRUE line without erroneous content.
                $paragraphBlocks[$k] = is_array($subLines)
                    ? implode(LF, $subLines)
                    : $this->processContentWithinParagraph($subLines, $paragraphBlocks[$k]);
                // If it turns out the line is just blank (containing a &nbsp; possibly) then just make it pure blank.
                // But, prevent filtering of lines that are blank in sense above, but whose tags contain attributes.
                // Those attributes should have been filtered before; if they are still there they must be considered as possible content.
                $isPureSpacer = trim(strip_tags($paragraphBlocks[$k])) === '&nbsp;'
                    && !preg_match('/\\<(img)(\\s[^>]*)?\\/?>/si', $paragraphBlocks[$k])
                    && !preg_match('/\\<([^>]*)?( align| class| style| id| title| dir| lang| xml:lang)([^>]*)?>/si', trim($paragraphBlocks[$k]));
                if ($isPureSpacer) {
                    $paragraphBlocks[$k] = '';
                }
                continue;
            }

            // Outside a paragraph: strip everything except the tags allowed outside of <p> sections
            $tagsToKeep = '<' . implode('><', $this->allowedTagsOutsideOfParagraphs) . '>';
            $outsideContent = trim(strip_tags($paragraphBlocks[$k], $tagsToKeep));
            $outsideContent = $this->sanitizeLineBreaksForContentOnly($outsideContent);
            if ((string)$outsideContent === '') {
                // Remove positions which are outside <p> tags and without content
                unset($paragraphBlocks[$k]);
                continue;
            }
            // If there is still something in there, add <p> tags around the text content
            $textOnly = strip_tags($outsideContent);
            $paragraphBlocks[$k] = str_replace($textOnly, '<p>' . $textOnly . '</p>', $outsideContent);
        }

        if ($returnArray) {
            return $paragraphBlocks;
        }
        return implode(LF, $paragraphBlocks);
    }
620
621
    /**
     * Converts all lines into <p></p>-sections (unless the line is already wrapped in p or div tags
     * or contains an hr tag).
     * For processing of content going FROM database TO RTE.
     *
     * @param string $value Value to convert
     * @return string Processed value.
     * @see divideIntoLines()
     */
    protected function setDivTags($value)
    {
        // Configuration for the HTMLcleaner function, which processes each line on its way to the RTE
        $keepTags = $this->getKeepTags('rte');
        $processedLines = [];
        foreach (explode(LF, $value) as $line) {
            if (trim($line) === '') {
                // A blank line becomes a spacer
                $line = '&nbsp;';
            } else {
                // Clean the line content using the 'protect' mode of the HTMLcleaner, then convert
                // double-encoded &nbsp; into regular &nbsp; (this could be reversed via the exitHTMLparser).
                // This was previously an option to disable called "dontConvAmpInNBSP_rte"
                $line = str_replace('&amp;nbsp;', '&nbsp;', $this->HTMLcleaner($line, $keepTags, 'protect'));
            }
            // Wrapping the line in <p> tags if not already wrapped and does not contain an hr tag
            if (!preg_match('/<(hr)(\\s[^>\\/]*)?[[:space:]]*\\/?>/i', $line)) {
                $testStr = strtolower(trim($line));
                $isWrappedInDiv = strpos($testStr, '<div') === 0 && substr($testStr, -6) === '</div>';
                $isWrappedInParagraph = strpos($testStr, '<p') === 0 && substr($testStr, -4) === '</p>';
                // Only set p-tags if there is not already div or p tags:
                if (!$isWrappedInDiv && !$isWrappedInParagraph) {
                    $line = '<p>' . $line . '</p>';
                }
            }
            $processedLines[] = $line;
        }
        return implode(LF, $processedLines);
    }
661
662
    /**
     * Used for transformation from RTE to DB
     *
     * Handles the content of a single <p> tag: the inner content is run through
     * HTMLcleaner_db() and stripped of LFs, the attributes of the surrounding tag are
     * reduced to $this->allowedAttributesForParagraphTags (all attributes are dropped if
     * that list is empty), and the "class" attribute is reduced to the entries of
     * $this->allowedClasses when such a list is configured. The result is always
     * wrapped in a freshly compiled <p> tag.
     *
     * @param string $content the content within the <p> tag
     * @param string $fullContentWithTag the whole <p> tag surrounded as well
     *
     * @return string the full <p> tag with cleaned content
     */
    protected function processContentWithinParagraph(string $content, string $fullContentWithTag)
    {
        $cleanedContent = $this->HTMLcleaner_db($content);
        // The opening tag of the full block carries the attributes to validate
        $openingTag = $this->getFirstTag($fullContentWithTag);

        // Without a configured list of allowed attributes, no attribute survives
        $paragraphAttributes = [];
        if (!empty($this->allowedAttributesForParagraphTags)) {
            [$parsedAttributes] = $this->get_tag_attributes($openingTag);
            // Drop every attribute that is not explicitly allowed
            $paragraphAttributes = array_intersect_key(
                $parsedAttributes,
                array_flip($this->allowedAttributesForParagraphTags)
            );

            // Class filtering is only needed if there is a non-blank class value, a list of allowed
            // classes exists, and the value as a whole is not already one of the allowed classes
            $needsClassFiltering = isset($paragraphAttributes['class'])
                && trim($paragraphAttributes['class']) !== ''
                && !empty($this->allowedClasses)
                && !in_array($paragraphAttributes['class'], $this->allowedClasses, true);
            if ($needsClassFiltering) {
                $keptClasses = array_intersect(
                    GeneralUtility::trimExplode(' ', $paragraphAttributes['class'], true),
                    $this->allowedClasses
                );
                if (empty($keptClasses)) {
                    unset($paragraphAttributes['class']);
                } else {
                    $paragraphAttributes['class'] = implode(' ', $keptClasses);
                }
            }
        }

        // The paragraph content must not contain any line break
        $singleLineContent = str_replace(LF, '', $cleanedContent);
        // rtrim() removes the trailing space of "p " when there are no attributes to add
        $compiledTagHead = rtrim('p ' . $this->compileTagAttribs($paragraphAttributes));
        return '<' . $compiledTagHead . '>' . $singleLineContent . '</p>';
    }
705
706
    /**
     * Normalizes line breaks around <hr> tags, used when transforming from RTE to DB.
     *
     * Every <hr> tag is rewritten in self-closing form and surrounded by LFs. Then each
     * occurrence of two consecutive LFs is replaced by one (a single replacement pass),
     * and an LF at the very start or end of the content is removed.
     *
     * @param string $content
     * @return string the modified content
     */
    protected function sanitizeLineBreaksForContentOnly(string $content)
    {
        // Put each <hr> on a line of its own, keeping its attributes ($2)
        $withIsolatedRulers = preg_replace(
            '/<(hr)(\\s[^>\\/]*)?[[:space:]]*\\/?>/i',
            LF . '<$1$2/>' . LF,
            $content
        );
        // Collapse LF pairs, e.g. those created between two adjacent <hr> tags
        $withoutDoubledBreaks = str_replace(LF . LF, LF, $withIsolatedRulers);
        // Strip a leading and/or trailing LF
        return preg_replace('/(^' . LF . ')|(' . LF . '$)/i', '', $withoutDoubledBreaks);
    }
719
720
    /**
     * Called before any processing / transformation is made.
     * Strips every CR (char 13) from the content so that only LFs (char 10)
     * are dealt with internally.
     *
     * Historical note: Previously it was possible to disable this functionality via disableUnifyLineBreaks.
     *
     * @param string $content the content to process
     * @return string the content without any CR
     */
    protected function streamlineLineBreaksForProcessing(string $content)
    {
        $contentWithoutCarriageReturns = str_replace(CR, '', $content);
        return $contentWithoutCarriageReturns;
    }
734
735
    /**
     * Called after any processing / transformation was made.
     * Just before the content is returned by the RTE parser, all line breaks
     * are unified to CRLF again.
     *
     * Historical note: Previously it was possible to disable this functionality via disableUnifyLineBreaks.
     *
     * @param string $content the content to process
     * @return string the content with every line break written as CRLF
     */
    protected function streamlineLineBreaksAfterProcessing(string $content)
    {
        // Drop any CR that entered in the meantime so that no CR gets doubled below
        $lfOnlyContent = $this->streamlineLineBreaksForProcessing($content);
        // Expand each remaining LF into CRLF
        return str_replace(LF, CRLF, $lfOnlyContent);
    }
752
753
    /**
     * Content Transformation from DB to RTE
     *
     * Resolves the href of every <a> tag through the LinkService and dispatches a
     * BrokenLinkAnalysisEvent for it. If the event reports the link as broken, or the
     * link handler is unknown, the reason is stored in a "data-rte-error" attribute
     * on the tag. The content of each link is processed recursively.
     *
     * @param string $content
     * @return string the modified content
     */
    protected function markBrokenLinks(string $content): string
    {
        $segments = $this->splitIntoBlock('A', $content);
        $linkService = GeneralUtility::makeInstance(LinkService::class);
        foreach ($segments as $index => $segment) {
            // Only odd positions are treated as <a> blocks; even positions are left untouched
            if ($index % 2 !== 0) {
                [$linkAttributes] = $this->get_tag_attributes($this->getFirstTag($segment), true);
                // Links without a href are kept exactly as they are
                if (!empty($linkAttributes['href'])) {
                    try {
                        $linkDetails = $linkService->resolve($linkAttributes['href']);

                        $analysisEvent = new BrokenLinkAnalysisEvent($linkDetails['type'], $linkDetails);
                        $this->eventDispatcher->dispatch($analysisEvent);
                        if ($analysisEvent->isBrokenLink()) {
                            $linkAttributes['data-rte-error'] = $analysisEvent->getReason();
                        }
                    } catch (UnknownLinkHandlerException $exception) {
                        $linkAttributes['data-rte-error'] = $exception->getMessage();
                    }

                    // The block is rebuilt in any case, so that nested content is processed even if the link is fine
                    $openingTag = '<a ' . GeneralUtility::implodeAttributes($linkAttributes, true, true) . '>';
                    $innerContent = $this->markBrokenLinks($this->removeFirstAndLastTag($segment));
                    $segments[$index] = $openingTag . $innerContent . '</a>';
                }
            }
        }
        return implode('', $segments);
    }
794
795
    /**
     * Content Transformation from RTE to DB
     *
     * Removes the error markers of broken links from all <a> tags having a href: the
     * "data-rte-error" attribute is dropped and the highlighting declaration
     * "background-color: yellow; border:2px red solid; color: black;" is taken out of the
     * "style" attribute (which is removed entirely if nothing else is left in it).
     * The content of each link is processed recursively.
     *
     * @param string $content the content to process
     * @return string the modified content
     */
    protected function removeBrokenLinkMarkers(string $content): string
    {
        $segments = $this->splitIntoBlock('A', $content);
        foreach ($segments as $index => $segment) {
            // Only odd positions are treated as <a> blocks; even positions are left untouched
            if ($index % 2 !== 0) {
                [$linkAttributes] = $this->get_tag_attributes($this->getFirstTag($segment), true);
                // Links without a href are kept exactly as they are
                if (!empty($linkAttributes['href'])) {
                    // The markers are always removed (no matter whether the link target exists),
                    // so that they never end up in the database
                    unset($linkAttributes['data-rte-error']);
                    if (isset($linkAttributes['style'])) {
                        $remainingStyle = trim(str_replace('background-color: yellow; border:2px red solid; color: black;', '', $linkAttributes['style']));
                        $linkAttributes['style'] = $remainingStyle;
                        if (empty($remainingStyle)) {
                            unset($linkAttributes['style']);
                        }
                    }
                    $openingTag = '<a ' . GeneralUtility::implodeAttributes($linkAttributes, true, true) . '>';
                    $innerContent = $this->removeBrokenLinkMarkers($this->removeFirstAndLastTag($segment));
                    $segments[$index] = $openingTag . $innerContent . '</a>';
                }
            }
        }
        return implode('', $segments);
    }
829
}
830