| Total Complexity | 116 |
| Total Lines | 794 |
| Duplicated Lines | 0 % |
| Changes | 0 | ||
Complex classes like RteHtmlParser often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use RteHtmlParser, and based on these observations, apply Extract Interface, too.
| 1 | <?php |
||
| 34 | class RteHtmlParser extends HtmlParser implements LoggerAwareInterface |
||
| 35 | { |
||
| 36 | use LoggerAwareTrait; |
||
| 37 | |||
| 38 | /** |
||
| 39 | * List of elements that are not wrapped into a "p" tag while doing the transformation. |
||
| 40 | * @var string |
||
| 41 | */ |
||
| 42 | protected $blockElementList = 'DIV,TABLE,BLOCKQUOTE,PRE,UL,OL,H1,H2,H3,H4,H5,H6,ADDRESS,DL,DD,HEADER,SECTION,FOOTER,NAV,ARTICLE,ASIDE'; |
||
| 43 | |||
| 44 | /** |
||
| 45 | * List of all tags that are allowed by default |
||
| 46 | * @var string |
||
| 47 | */ |
||
| 48 | protected $defaultAllowedTagsList = 'b,i,u,a,img,br,div,center,pre,font,hr,sub,sup,p,strong,em,li,ul,ol,blockquote,strike,span,abbr,acronym,dfn'; |
||
| 49 | |||
| 50 | /** |
||
| 51 | * Set to the TSconfig options coming from Page TSconfig |
||
| 52 | * |
||
| 53 | * @var array |
||
| 54 | */ |
||
| 55 | protected $procOptions = []; |
||
| 56 | |||
| 57 | /** |
||
| 58 | * Run-away brake for recursive calls. |
||
| 59 | * |
||
| 60 | * @var int |
||
| 61 | */ |
||
| 62 | protected $TS_transform_db_safecounter = 100; |
||
| 63 | |||
| 64 | /** |
||
| 65 | * Data caching for processing function |
||
| 66 | * |
||
| 67 | * @var array |
||
| 68 | */ |
||
| 69 | protected $getKeepTags_cache = []; |
||
| 70 | |||
| 71 | /** |
||
| 72 | * Storage of the allowed CSS class names in the RTE |
||
| 73 | * |
||
| 74 | * @var array |
||
| 75 | */ |
||
| 76 | protected $allowedClasses = []; |
||
| 77 | |||
| 78 | /** |
||
| 79 | * A list of HTML attributes for <p> tags. Because <p> tags are wrapped currently in a special handling, |
||
| 80 | * they have a special place for configuration via 'proc.keepPDIVattribs' |
||
| 81 | * |
||
| 82 | * @var array |
||
| 83 | */ |
||
| 84 | protected $allowedAttributesForParagraphTags = [ |
||
| 85 | 'class', |
||
| 86 | 'align', |
||
| 87 | 'id', |
||
| 88 | 'title', |
||
| 89 | 'dir', |
||
| 90 | 'lang', |
||
| 91 | 'xml:lang', |
||
| 92 | 'itemscope', |
||
| 93 | 'itemtype', |
||
| 94 | 'itemprop' |
||
| 95 | ]; |
||
| 96 | |||
| 97 | /** |
||
| 98 | * Any tags that are allowed outside of <p> sections - usually similar to the block elements |
||
| 99 | * plus some special tags like <hr> and <img> (if images are allowed). |
||
| 100 | * Completely overrideable via 'proc.allowTagsOutside' |
||
| 101 | * |
||
| 102 | * @var array |
||
| 103 | */ |
||
| 104 | protected $allowedTagsOutsideOfParagraphs = [ |
||
| 105 | 'address', |
||
| 106 | 'article', |
||
| 107 | 'aside', |
||
| 108 | 'blockquote', |
||
| 109 | 'div', |
||
| 110 | 'footer', |
||
| 111 | 'header', |
||
| 112 | 'hr', |
||
| 113 | 'nav', |
||
| 114 | 'section' |
||
| 115 | ]; |
||
| 116 | |||
| 117 | /** |
||
| 118 | * @var EventDispatcherInterface |
||
| 119 | */ |
||
| 120 | protected $eventDispatcher; |
||
| 121 | |||
/**
 * Stores the event dispatcher on the instance; it is used to dispatch the
 * BrokenLinkAnalysisEvent while marking broken links.
 *
 * @param EventDispatcherInterface $eventDispatcher
 */
public function __construct(EventDispatcherInterface $eventDispatcher)
{
    $this->eventDispatcher = $eventDispatcher;
}
||
| 126 | |||
/**
 * Sanitize and streamline the given options (usually the "proc." part of the
 * RichTextConfiguration result) and set them to the respective properties.
 *
 * The keep-tags cache is flushed as well: its entries are computed from the
 * processing options, so they would be stale if the same parser instance is
 * called again with a different configuration.
 *
 * @param array $processingConfiguration
 */
protected function setProcessingConfiguration(array $processingConfiguration): void
{
    $this->procOptions = $processingConfiguration;
    // getKeepTags() results depend on procOptions / allowedClasses, so drop them
    $this->getKeepTags_cache = [];

    // Allowed CSS classes: array notation ("allowedClasses.") wins over the comma separated list
    if (isset($this->procOptions['allowedClasses.'])) {
        $this->allowedClasses = (array)$this->procOptions['allowedClasses.'];
    } else {
        $this->allowedClasses = GeneralUtility::trimExplode(',', $this->procOptions['allowedClasses'] ?? '', true);
    }

    // Dynamic configuration of blockElementList
    if (!empty($this->procOptions['blockElementList'])) {
        $this->blockElementList = $this->procOptions['blockElementList'];
    }

    // Define which attributes are allowed on <p> tags
    if (isset($this->procOptions['allowAttributes.'])) {
        // Cast like the other array-notation options so the property always holds an array
        $this->allowedAttributesForParagraphTags = (array)$this->procOptions['allowAttributes.'];
    }

    // Override tags which are allowed outside of <p> tags.
    // The array notation is only evaluated when the plain option is set as well.
    if (isset($this->procOptions['allowTagsOutside'])) {
        if (!isset($this->procOptions['allowTagsOutside.'])) {
            $this->allowedTagsOutsideOfParagraphs = GeneralUtility::trimExplode(',', strtolower($this->procOptions['allowTagsOutside']), true);
        } else {
            $this->allowedTagsOutsideOfParagraphs = (array)$this->procOptions['allowTagsOutside.'];
        }
    }
}
||
| 160 | |||
/**
 * Main entry point for transforming content from the database so the Rich Text Editor
 * can deal with it (e.g. links).
 *
 * Steps: apply the processing configuration, strip CRs, run the optional
 * "entryHTMLparser_rte" cleaner, run every resolved transformation mode, run the
 * optional "exitHTMLparser_rte" cleaner and finally convert line breaks to CRLF.
 *
 * @param string $value Content coming from the database
 * @param array $processingConfiguration The "proc." options
 * @return string Content for the Rich Text Editor
 */
public function transformTextForRichTextEditor(string $value, array $processingConfiguration): string
{
    $this->setProcessingConfiguration($processingConfiguration);
    $modes = $this->resolveAppliedTransformationModes('rte');
    $value = $this->streamlineLineBreaksForProcessing($value);
    // If an entry HTML cleaner was configured, pass the content through the HTMLcleaner
    $value = $this->runHtmlParserIfConfigured($value, 'entryHTMLparser_rte');
    // Traverse modes
    foreach ($modes as $cmd) {
        // Checking for user defined transformation:
        $customTransformation = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['t3lib/class.t3lib_parsehtml_proc.php']['transformation'][$cmd] ?? null;
        if (!empty($customTransformation)) {
            $_procObj = GeneralUtility::makeInstance($customTransformation);
            $_procObj->pObj = $this;
            // Hand over the key as well, exactly like the "db" direction does, so a
            // handler registered for several keys can tell them apart in both directions
            $_procObj->transformationKey = $cmd;
            $value = $_procObj->transform_rte($value, $this);
            continue;
        }
        // ... else use defaults:
        switch ($cmd) {
            case 'detectbrokenlinks':
                $value = $this->markBrokenLinks($value);
                break;
            case 'css_transform':
                $value = $this->TS_transform_rte($value);
                break;
            default:
                // Do nothing
        }
    }
    // If an exit HTML cleaner was configured, pass the content through the HTMLcleaner
    $value = $this->runHtmlParserIfConfigured($value, 'exitHTMLparser_rte');
    // Final clean up of linebreaks
    return $this->streamlineLineBreaksAfterProcessing($value);
}
||
| 203 | |||
/**
 * Called to process HTML content before it is stored in the database.
 *
 * Steps: apply the processing configuration, strip CRs, run the optional
 * "entryHTMLparser_db" cleaner, run every resolved transformation mode, run the
 * optional "exitHTMLparser_db" cleaner and finally convert line breaks to CRLF.
 *
 * @param string $value Content coming from the Rich Text Editor
 * @param array $processingConfiguration The "proc." options
 * @return string Content to be persisted
 */
public function transformTextForPersistence(string $value, array $processingConfiguration): string
{
    $this->setProcessingConfiguration($processingConfiguration);
    $modes = $this->resolveAppliedTransformationModes('db');
    $value = $this->streamlineLineBreaksForProcessing($value);
    // If an entry HTML cleaner was configured, pass the content through the HTMLcleaner
    $value = $this->runHtmlParserIfConfigured($value, 'entryHTMLparser_db');
    // Traverse modes
    foreach ($modes as $cmd) {
        // Checking for user defined transformation:
        $customTransformation = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['t3lib/class.t3lib_parsehtml_proc.php']['transformation'][$cmd] ?? null;
        if (!empty($customTransformation)) {
            $_procObj = GeneralUtility::makeInstance($customTransformation);
            $_procObj->pObj = $this;
            $_procObj->transformationKey = $cmd;
            $value = $_procObj->transform_db($value, $this);
            continue;
        }
        // ... else use defaults:
        switch ($cmd) {
            case 'detectbrokenlinks':
                $value = $this->removeBrokenLinkMarkers($value);
                break;
            case 'ts_links':
                $value = $this->TS_links_db($value);
                break;
            case 'css_transform':
                // Transform empty paragraphs into spacing paragraphs. The filler has to be the
                // &nbsp; entity: that is what divideIntoLines() recognises as a blank line.
                $value = str_replace('<p></p>', '<p>&nbsp;</p>', $value);
                // Double any trailing spacing paragraph so that it does not get removed by divideIntoLines()
                $value = preg_replace('/<p>&nbsp;<\/p>$/', '<p>&nbsp;</p><p>&nbsp;</p>', $value);
                $value = $this->TS_transform_db($value);
                break;
            default:
                // Do nothing
        }
    }
    // If an exit HTML cleaner was configured, pass the content through the HTMLcleaner
    $value = $this->runHtmlParserIfConfigured($value, 'exitHTMLparser_db');
    // Final clean up of linebreaks
    return $this->streamlineLineBreaksAfterProcessing($value);
}
||
| 253 | |||
/**
 * Determines which transformation modes should be executed, and makes sure each of
 * them is listed only once.
 *
 * "overruleMode" (comma separated list) takes precedence over "mode". The shortcut
 * "default" expands to "detectbrokenlinks,css_transform,ts_links". Both options are
 * optional; if neither is set an empty list is returned.
 *
 * @param string $direction "rte" reverses the order of the modes, any other value keeps it
 * @return array the resolved transformation modes
 */
protected function resolveAppliedTransformationModes(string $direction): array
{
    // Setting modes / transformations to be called; guard against options that are not configured
    $overruleMode = (string)($this->procOptions['overruleMode'] ?? '');
    if ($overruleMode !== '') {
        $modes = GeneralUtility::trimExplode(',', $overruleMode);
    } else {
        $modes = [$this->procOptions['mode'] ?? ''];
    }

    // Replace the shortcut "default" with all custom modes
    $modeList = str_replace('default', 'detectbrokenlinks,css_transform,ts_links', implode(',', $modes));

    // Make list unique (empty entries are dropped by trimExplode)
    $modes = array_unique(GeneralUtility::trimExplode(',', $modeList, true));

    // Reverse order if direction is "rte"
    if ($direction === 'rte') {
        $modes = array_reverse($modes);
    }

    return $modes;
}
||
| 283 | |||
/**
 * Runs the HTML parser if it is configured.
 * These additional HTML cleaner configurations are applied either before or after the
 * main transformation and are thus independent processing options you can set up.
 *
 * This is only possible via TSconfig (procOptions) currently.
 *
 * @param string $content
 * @param string $configurationDirective key looked up in procOptions to see whether the cleaner is enabled;
 *                                       the cleaner configuration is then read from the same key suffixed with "."
 * @return string the processed content
 */
protected function runHtmlParserIfConfigured($content, $configurationDirective)
{
    if (empty($this->procOptions[$configurationDirective])) {
        return $content;
    }
    // The directive may be enabled without any sub configuration, so fall back to an empty array
    $parserConfiguration = $this->procOptions[$configurationDirective . '.'] ?? [];
    [$keepTags, $keepNonMatchedTags, $hscMode, $additionalConfiguration] = $this->HTMLparserConfig($parserConfiguration);
    return $this->HTMLcleaner($content, $keepTags, $keepNonMatchedTags, $hscMode, $additionalConfiguration);
}
||
| 303 | |||
| 304 | /************************************ |
||
| 305 | * |
||
| 306 | * Specific RTE TRANSFORMATION functions |
||
| 307 | * |
||
| 308 | *************************************/ |
||
| 309 | |||
/**
 * Transformation handler: 'ts_links' / direction: "db"
 * Processes anchor tags and rewrites their href via the LinkService syntax.
 *
 * Splits content into <a> tag blocks and processes each tag; the content inside an
 * anchor is processed recursively. Anchors without an href attribute are left untouched.
 *
 * @param string $value Content input
 * @return string Content output
 */
protected function TS_links_db($value)
{
    $blockSplit = $this->splitIntoBlock('A', $value);
    $linkService = GeneralUtility::makeInstance(LinkService::class);
    foreach ($blockSplit as $k => $v) {
        // Only odd positions are handled as <a> blocks
        if ($k % 2 === 0) {
            continue;
        }
        [$tagAttributes] = $this->get_tag_attributes($this->getFirstTag($v), true);

        // Anchors would not have an href attribute
        if (!isset($tagAttributes['href'])) {
            continue;
        }

        // Reset per anchor: the catch branch must never see the result of a previous anchor
        $linkInformation = [];
        // Store the link as <a> tag as default by TYPO3, with the link service syntax
        try {
            $linkInformation = $linkService->resolve($tagAttributes['href']);
            $tagAttributes['href'] = $linkService->asString($linkInformation);
        } catch (UnknownLinkHandlerException $e) {
            // Keep what was resolved for this anchor (if anything), otherwise the original href
            $tagAttributes['href'] = $linkInformation['href'] ?? $tagAttributes['href'];
        }

        $blockSplit[$k] = '<a ' . GeneralUtility::implodeAttributes($tagAttributes, true) . '>'
            . $this->TS_links_db($this->removeFirstAndLastTag($blockSplit[$k])) . '</a>';
    }
    return implode('', $blockSplit);
}
||
| 346 | |||
/**
 * Transformation handler: 'css_transform' / direction: "db"
 * Cleaning (->db) for standard content elements (ts)
 *
 * Splits the content by the configured block elements. Container-like blocks are
 * processed recursively, <pre> is kept as is, all other blocks get their line breaks
 * replaced by spaces. Content outside of blocks is passed to divideIntoLines().
 *
 * @param string $value Content input
 * @return string Content output
 * @see TS_transform_rte()
 */
protected function TS_transform_db($value)
{
    // Safety... so forever loops are avoided (they should not occur, but an error would potentially do this...)
    // The brake is checked before it is decremented, so bailing out leaves the counter
    // untouched and it is back at its initial value once the outermost call returns.
    if ($this->TS_transform_db_safecounter <= 0) {
        return $value;
    }
    $this->TS_transform_db_safecounter--;

    // Split the content from RTE by the occurrence of these blocks:
    $blockSplit = $this->splitIntoBlock($this->blockElementList, $value);

    // Avoid superfluous linebreaks by transform_db after ending headListTag
    while (count($blockSplit) > 0 && trim(end($blockSplit)) === '') {
        array_pop($blockSplit);
    }

    // Traverse the blocks
    foreach ($blockSplit as $k => $v) {
        if ($k % 2) {
            // Inside block:
            $tag = $this->getFirstTag($v);
            $tagName = strtolower($this->getFirstTagName($v));
            // Process based on the tag:
            switch ($tagName) {
                case 'blockquote':
                case 'dd':
                case 'div':
                case 'header':
                case 'section':
                case 'footer':
                case 'nav':
                case 'article':
                case 'aside':
                    // Container elements: transform their content recursively
                    $blockSplit[$k] = $tag . $this->TS_transform_db($this->removeFirstAndLastTag($blockSplit[$k])) . '</' . $tagName . '>';
                    break;
                case 'pre':
                    // Left as is
                    break;
                default:
                    // usually <hx> tags and <table> tags where no other block elements are within the tags
                    // Eliminate true linebreaks inside block element tags
                    $blockSplit[$k] = preg_replace('/[' . LF . ']+/', ' ', $blockSplit[$k]);
            }
        } elseif (trim($blockSplit[$k]) !== '') {
            // NON-block:
            $blockSplit[$k] = str_replace('<hr/>', '<hr />', $blockSplit[$k]);
            // Remove linebreaks preceding hr tags
            $blockSplit[$k] = preg_replace('/[' . LF . ']+<(hr)(\\s[^>\\/]*)?[[:space:]]*\\/?>/', '<$1$2/>', $blockSplit[$k]);
            // Remove linebreaks following hr tags
            $blockSplit[$k] = preg_replace('/<(hr)(\\s[^>\\/]*)?[[:space:]]*\\/?>[' . LF . ']+/', '<$1$2/>', $blockSplit[$k]);
            // Replace other linebreaks with space
            $blockSplit[$k] = preg_replace('/[' . LF . ']+/', ' ', $blockSplit[$k]);
            $blockSplit[$k] = $this->divideIntoLines($blockSplit[$k]);
        } else {
            // Blank NON-block part: drop it
            unset($blockSplit[$k]);
        }
    }
    $this->TS_transform_db_safecounter++;
    return implode(LF, $blockSplit);
}
||
| 416 | |||
/**
 * Transformation handler: css_transform / direction: "rte"
 * Set (->rte) for standard content elements (ts)
 *
 * Splits the content by the configured block elements. Container-like blocks are
 * processed recursively; content outside of blocks is passed to setDivTags().
 *
 * @param string $value Content input
 * @return string Content output
 * @see TS_transform_db()
 */
protected function TS_transform_rte($value)
{
    // Split the content from database by the occurrence of the block elements
    $blockSplit = $this->splitIntoBlock($this->blockElementList, $value);
    // Traverse the blocks
    foreach ($blockSplit as $k => $v) {
        if ($k % 2) {
            // Inside one of the blocks:
            $tag = $this->getFirstTag($v);
            $tagName = strtolower($this->getFirstTagName($v));
            // Based on tagname, we do transformations:
            switch ($tagName) {
                case 'blockquote':
                case 'dd':
                case 'div':
                case 'header':
                case 'section':
                case 'footer':
                case 'nav':
                case 'article':
                case 'aside':
                    $blockSplit[$k] = $tag . $this->TS_transform_rte($this->removeFirstAndLastTag($blockSplit[$k])) . '</' . $tagName . '>';
                    break;
            }
            // Strip the first linebreak of the part following the block; that part may not exist
            $blockSplit[$k + 1] = preg_replace('/^[ ]*' . LF . '/', '', $blockSplit[$k + 1] ?? '');
        } else {
            // NON-block:
            $nextFTN = $this->getFirstTagName($blockSplit[$k + 1] ?? '');
            $onlyLineBreaks = (preg_match('/^[ ]*' . LF . '+[ ]*$/', $blockSplit[$k]) == 1);
            // If the line is followed by a block or is the last line:
            if (GeneralUtility::inList($this->blockElementList, $nextFTN) || !isset($blockSplit[$k + 1])) {
                if (!$onlyLineBreaks) {
                    // If the line contains more than just linebreaks, reduce the number of trailing linebreaks by 1
                    $blockSplit[$k] = preg_replace('/(' . LF . '*)' . LF . '[ ]*$/', '$1', $blockSplit[$k]);
                } else {
                    // If the line contains only linebreaks, remove the leading linebreak
                    $blockSplit[$k] = preg_replace('/^[ ]*' . LF . '/', '', $blockSplit[$k]);
                }
            }
            // If $blockSplit[$k] is blank then unset the line, unless the line only contained linebreaks
            if ((string)$blockSplit[$k] === '' && !$onlyLineBreaks) {
                unset($blockSplit[$k]);
            } else {
                $blockSplit[$k] = $this->setDivTags($blockSplit[$k]);
            }
        }
    }
    return implode(LF, $blockSplit);
}
||
| 475 | |||
| 476 | /*************************************************************** |
||
| 477 | * |
||
| 478 | * Generic RTE transformation, analysis and helper functions |
||
| 479 | * |
||
| 480 | **************************************************************/ |
||
| 481 | |||
/**
 * Cleans content that is going into the database.
 * Delegates to HTMLcleaner() using the keep-tags configuration of the "db" direction
 * and with the third HTMLcleaner() argument set to false.
 *
 * @param string $content Content to clean up
 * @return string Clean content
 * @see getKeepTags()
 */
protected function HTMLcleaner_db($content)
{
    return $this->HTMLcleaner($content, $this->getKeepTags('db'), false);
}
||
| 496 | |||
/**
 * Creates the keep-tags configuration for the HTMLcleaner function based on whether
 * content goes TO or FROM the Rich Text Editor ($direction).
 *
 * The tag list is built from the default allowed tags plus "allowTags" minus "denyTags".
 * Deny entries are matched against the lower-cased form as well, because configured
 * allow tags are stored lower-cased. The result is cached per direction in
 * $this->getKeepTags_cache.
 *
 * @param string $direction The direction of the content being processed by the output configuration; "db" (content going into the database FROM the rte) or "rte" (content going into the form)
 * @return array Configuration array
 * @see HTMLcleaner_db()
 */
protected function getKeepTags($direction = 'rte')
{
    if (isset($this->getKeepTags_cache[$direction]) && is_array($this->getKeepTags_cache[$direction])) {
        return $this->getKeepTags_cache[$direction];
    }

    // Construct default list of tags to keep; the array notation wins over the string option
    if (isset($this->procOptions['allowTags.']) && is_array($this->procOptions['allowTags.'])) {
        $keepTags = implode(',', $this->procOptions['allowTags.']);
    } else {
        $keepTags = $this->procOptions['allowTags'] ?? '';
    }
    $keepTags = array_flip(GeneralUtility::trimExplode(',', $this->defaultAllowedTagsList . ',' . strtolower($keepTags), true));

    // For tags to deny, remove them from $keepTags array.
    // Both the configured and the lower-cased spelling are removed, so "denyTags = IMG" works as well.
    $denyTags = GeneralUtility::trimExplode(',', $this->procOptions['denyTags'] ?? '', true);
    foreach ($denyTags as $denyTag) {
        unset($keepTags[$denyTag], $keepTags[strtolower($denyTag)]);
    }

    // Based on the direction of content, set further options:
    switch ($direction) {
        case 'rte':
            // Transforming keepTags array so it can be understood by the HTMLcleaner function.
            // This basically converts the format of the array from TypoScript (having dots) to plain multi-dimensional array.
            [$keepTags] = $this->HTMLparserConfig($this->procOptions['HTMLparser_rte.'] ?? [], $keepTags);
            break;
        case 'db':
            // Setting up span tags if they are allowed:
            if (isset($keepTags['span'])) {
                $keepTags['span'] = [
                    'allowedAttribs' => 'id,class,style,title,lang,xml:lang,dir,itemscope,itemtype,itemprop',
                    'fixAttrib' => [
                        'class' => [
                            'removeIfFalse' => 1
                        ]
                    ],
                    'rmTagIfNoAttrib' => 1
                ];
                if (!empty($this->allowedClasses)) {
                    $keepTags['span']['fixAttrib']['class']['list'] = $this->allowedClasses;
                }
            }
            // Setting further options, getting them from the processing options
            $TSc = $this->procOptions['HTMLparser_db.'] ?? [];
            if (empty($TSc['globalNesting'])) {
                $TSc['globalNesting'] = 'b,i,u,a,center,font,sub,sup,strong,em,strike,span';
            }
            if (empty($TSc['noAttrib'])) {
                $TSc['noAttrib'] = 'b,i,u,br,center,hr,sub,sup,strong,em,li,ul,ol,blockquote,strike';
            }
            // Transforming the array from TypoScript to regular array:
            [$keepTags] = $this->HTMLparserConfig($TSc, $keepTags);
            break;
    }

    // Caching (internally, in object memory) the result
    $this->getKeepTags_cache[$direction] = $keepTags;
    return $keepTags;
}
||
| 563 | |||
/**
 * This resolves the $value into parts based on <p>-sections. These are returned as lines separated by LF.
 * The point is to resolve the HTML-code returned from RTE into ordinary lines so it's 'human-readable'.
 * The function ->setDivTags does the opposite.
 * This function processes content to go into the database.
 *
 * Note: if no <p> section is found (or the recursion brake is used up) the sanitized
 * content is returned as a string, even if $returnArray is TRUE. The recursive call
 * relies on this to tell "no nested <p>" apart from "nested <p> found".
 *
 * @param string $value Value to process.
 * @param int $count Recursion brake. Decremented on each recursion down to zero. Default is 5 (which equals the allowed nesting levels of p tags).
 * @param bool $returnArray If TRUE, an array with the lines is returned, otherwise a string of the processed input value.
 * @return string|array Processed input value.
 * @see setDivTags()
 */
protected function divideIntoLines($value, $count = 5, $returnArray = false)
{
    // Setting the third param will eliminate false end-tags. Maybe this is a good thing to do...?
    $paragraphBlocks = $this->splitIntoBlock('p', $value, true);
    // Returns plainly the content if there was no p sections in it
    if (count($paragraphBlocks) <= 1 || $count <= 0) {
        return $this->sanitizeLineBreaksForContentOnly($value);
    }

    // Traverse the split sections
    foreach ($paragraphBlocks as $k => $v) {
        if ($k % 2) {
            // Inside a <p> section
            $v = $this->removeFirstAndLastTag($v);
            // Fetching 'sub-lines' - which will explode any further p nesting recursively
            $subLines = $this->divideIntoLines($v, $count - 1, true);
            if (is_array($subLines)) {
                // So, if there happened to be sub-nesting of p, this is written directly as the new content of THIS section. (This would be considered 'an error')
                $paragraphBlocks[$k] = implode(LF, $subLines);
            } else {
                //... but if NO subsection was found, we process it as a TRUE line without erroneous content:
                $paragraphBlocks[$k] = $this->processContentWithinParagraph($subLines, $paragraphBlocks[$k]);
            }
            // If it turns out the line is just blank (containing a &nbsp; possibly) then just make it pure blank.
            // The comparison has to use the entity: trim() would already have removed a plain space.
            // But, prevent filtering of lines that are blank in sense above, but whose tags contain attributes.
            // Those attributes should have been filtered before; if they are still there they must be considered as possible content.
            if (trim(strip_tags($paragraphBlocks[$k])) === '&nbsp;' && !preg_match('/\\<(img)(\\s[^>]*)?\\/?>/si', $paragraphBlocks[$k]) && !preg_match('/\\<([^>]*)?( align| class| style| id| title| dir| lang| xml:lang)([^>]*)?>/si', trim($paragraphBlocks[$k]))) {
                $paragraphBlocks[$k] = '';
            }
        } else {
            // Outside a paragraph, if there is still something in there, just add a <p> tag
            // Remove positions which are outside <p> tags and without content
            $paragraphBlocks[$k] = trim(strip_tags($paragraphBlocks[$k], '<' . implode('><', $this->allowedTagsOutsideOfParagraphs) . '>'));
            $paragraphBlocks[$k] = $this->sanitizeLineBreaksForContentOnly($paragraphBlocks[$k]);
            if ((string)$paragraphBlocks[$k] === '') {
                unset($paragraphBlocks[$k]);
            } else {
                // add <p> tags around the text content (tags allowed outside of paragraphs stay outside)
                $textOnly = strip_tags($paragraphBlocks[$k]);
                $paragraphBlocks[$k] = str_replace($textOnly, '<p>' . $textOnly . '</p>', $paragraphBlocks[$k]);
            }
        }
    }
    return $returnArray ? $paragraphBlocks : implode(LF, $paragraphBlocks);
}
||
| 620 | |||
| 621 | /** |
||
| 622 | * Converts all lines into <p></p>-sections (unless the line has a p - tag already) |
||
| 623 | * For processing of content going FROM database TO RTE. |
||
| 624 | * |
||
| 625 | * @param string $value Value to convert |
||
| 626 | * @return string Processed value. |
||
| 627 | * @see divideIntoLines() |
||
| 628 | */ |
||
| 629 | protected function setDivTags($value) |
||
| 660 | } |
||
| 661 | |||
| 662 | /** |
||
| 663 | * Used for transformation from RTE to DB |
||
| 664 | * |
||
| 665 | * Works on a single line within a <p> tag when storing into the database |
||
| 666 | * This always adds <p> tags and validates the arguments, |
||
| 667 | * additionally the content is cleaned up via the HTMLcleaner. |
||
| 668 | * |
||
| 669 | * @param string $content the content within the <p> tag |
||
| 670 | * @param string $fullContentWithTag the whole <p> tag surrounded as well |
||
| 671 | * |
||
| 672 | * @return string the full <p> tag with cleaned content |
||
| 673 | */ |
||
| 674 | protected function processContentWithinParagraph(string $content, string $fullContentWithTag) |
||
| 704 | } |
||
| 705 | |||
| 706 | /** |
||
| 707 | * Wrap <hr> tags with LFs, and also remove double LFs, used when transforming from RTE to DB |
||
| 708 | * |
||
| 709 | * @param string $content |
||
| 710 | * @return string the modified content |
||
| 711 | */ |
||
| 712 | protected function sanitizeLineBreaksForContentOnly(string $content) |
||
| 718 | } |
||
| 719 | |||
/**
 * Called before any processing / transformation is made.
 * Drops every CR from the content so that only LFs are dealt with internally.
 *
 * Historical note: Previously it was possible to disable this functionality via disableUnifyLineBreaks.
 *
 * @param string $content the content to process
 * @return string the content without any CR
 */
protected function streamlineLineBreaksForProcessing(string $content)
{
    $contentWithoutCarriageReturns = str_replace(CR, '', $content);

    return $contentWithoutCarriageReturns;
}
||
| 734 | |||
/**
 * Called after any processing / transformation was made.
 * Just before the content is returned by the RTE parser, all line breaks
 * get unified to be "CRLF"s again.
 *
 * Historical note: Previously it was possible to disable this functionality via disableUnifyLineBreaks.
 *
 * @param string $content the content to process
 * @return string the modified content
 */
protected function streamlineLineBreaksAfterProcessing(string $content)
{
    // First strip any CR that has entered in the meantime, then turn every LF into CRLF
    return str_replace(LF, CRLF, $this->streamlineLineBreaksForProcessing($content));
}
||
| 752 | |||
/**
 * Content Transformation from DB to RTE
 * Resolves the href of every <a> tag and dispatches a BrokenLinkAnalysisEvent for it.
 * If the event reports the link as broken, or the link cannot be resolved because no
 * handler is known, the reason is added as "data-rte-error" attribute.
 * Content inside an anchor is processed recursively.
 *
 * @param string $content
 * @return string the modified content
 */
protected function markBrokenLinks(string $content): string
{
    $linkService = GeneralUtility::makeInstance(LinkService::class);
    $segments = $this->splitIntoBlock('A', $content);

    foreach ($segments as $index => $segment) {
        // Only odd positions are handled as <a> blocks
        $isAnchorBlock = $index % 2 !== 0;
        if (!$isAnchorBlock) {
            continue;
        }

        [$attributes] = $this->get_tag_attributes($this->getFirstTag($segment), true);
        if (empty($attributes['href'])) {
            continue;
        }

        try {
            $linkDetails = $linkService->resolve($attributes['href']);

            $analysisEvent = new BrokenLinkAnalysisEvent($linkDetails['type'], $linkDetails);
            $this->eventDispatcher->dispatch($analysisEvent);
            if ($analysisEvent->isBrokenLink()) {
                $attributes['data-rte-error'] = $analysisEvent->getReason();
            }
        } catch (UnknownLinkHandlerException $e) {
            $attributes['data-rte-error'] = $e->getMessage();
        }

        // Always rewrite the block to allow the nested calling even if a page is found
        $openingTag = '<a ' . GeneralUtility::implodeAttributes($attributes, true, true) . '>';
        $innerContent = $this->markBrokenLinks($this->removeFirstAndLastTag($segments[$index]));
        $segments[$index] = $openingTag . $innerContent . '</a>';
    }

    return implode('', $segments);
}
||
| 794 | |||
| 795 | /** |
||
| 796 | * Content Transformation from RTE to DB |
||
| 797 | * Removes link information error attributes from <a> tags that are added to broken links |
||
| 798 | * |
||
| 799 | * @param string $content the content to process |
||
| 800 | * @return string the modified content |
||
| 801 | */ |
||
| 802 | protected function removeBrokenLinkMarkers(string $content): string |
||
| 828 | } |
||
| 829 | } |
||
| 830 |