Passed
Pull Request — main (#3338)
by Rafael
03:25
created

Typo3PageContentExtractor::getIndexableContent()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 10
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 5
CRAP Score 1

Importance

Changes 0
Metric Value
eloc 4
dl 0
loc 10
ccs 5
cts 5
cp 1
rs 10
c 0
b 0
f 0
cc 1
nc 1
nop 0
crap 1
1
<?php
2
3
/*
4
 * This file is part of the TYPO3 CMS project.
5
 *
6
 * It is free software; you can redistribute it and/or modify it under
7
 * the terms of the GNU General Public License, either version 2
8
 * of the License, or any later version.
9
 *
10
 * For the full copyright and license information, please read the
11
 * LICENSE.txt file that was distributed with this source code.
12
 *
13
 * The TYPO3 project - inspiring people to share!
14
 */
15
16
namespace ApacheSolrForTypo3\Solr;
17
18
use ApacheSolrForTypo3\Solr\System\Logging\SolrLogManager;
19
use DOMDocument;
20
use DOMXPath;
21
use function libxml_use_internal_errors;
22
use TYPO3\CMS\Core\Utility\GeneralUtility;
23
24
/**
25
 * Content extraction class for TYPO3 pages.
26
 *
27
 * @author Ingo Renner <[email protected]>
28
 */
29
class Typo3PageContentExtractor extends HtmlContentExtractor
{
    /**
     * Logger used to report pages that contain no TYPO3SEARCH markers.
     *
     * Stays null until the first warning has to be written.
     *
     * @var SolrLogManager|null
     */
    protected ?SolrLogManager $logger = null;

    /**
     * Shortcut method to retrieve the raw content marked for indexing.
     *
     * @return string Content marked for indexing.
     */
    public function getContentMarkedForIndexing(): string
    {
        return $this->extractContentMarkedForIndexing($this->content);
    }

    /**
     * Extracts the markup wrapped with TYPO3SEARCH_begin and TYPO3SEARCH_end
     * markers.
     *
     * All marked sections are concatenated, passed through
     * excludeContentByClass() and returned. If the result is empty and the
     * corresponding logging option is enabled, a warning is logged.
     *
     * @param string $html HTML markup with TYPO3SEARCH markers for content that should be indexed
     * @return string HTML markup found between TYPO3SEARCH markers
     */
    protected function extractContentMarkedForIndexing(string $html): string
    {
        $indexableContents = [];
        preg_match_all(
            '/<!--\s*?TYPO3SEARCH_begin\s*?-->.*?<!--\s*?TYPO3SEARCH_end\s*?-->/mis',
            $html,
            $indexableContents
        );
        // Defensive: never hand a missing match set to implode().
        $indexableContent = implode('', $indexableContents[0] ?? []);

        $indexableContent = $this->excludeContentByClass($indexableContent);
        if (empty($indexableContent) && $this->getConfiguration()->getLoggingIndexingMissingTypo3SearchMarkers()) {
            // Call log() on the freshly created, non-null instance instead of
            // on the nullable property.
            $logger = GeneralUtility::makeInstance(SolrLogManager::class, /** @scrutinizer ignore-type */ __CLASS__);
            $this->logger = $logger;
            $logger->log(SolrLogManager::WARNING, 'No TYPO3SEARCH markers found.');
        }

        return $indexableContent;
    }

    /**
     * Exclude some html parts by class inside content wrapped with TYPO3SEARCH_begin and TYPO3SEARCH_end
     * markers.
     *
     * The content is returned untouched when it is blank, when no exclude
     * classes are configured or when none of the configured class names
     * occurs in the content. Otherwise the markup is parsed, every element
     * whose class attribute contains one of the configured names is removed
     * and the remaining markup is serialised again (without XML preamble,
     * doctype, newlines and the html/body wrapper tags).
     *
     * @param string $indexableContent HTML markup
     * @return string HTML
     */
    public function excludeContentByClass(string $indexableContent): string
    {
        if (empty(trim($indexableContent))) {
            return $indexableContent;
        }

        $excludeClasses = $this->getConfiguration()->getIndexQueuePagesExcludeContentByClassArray();
        if (count($excludeClasses) === 0) {
            return $indexableContent;
        }

        $isInContent = Util::containsOneOfTheStrings($indexableContent, $excludeClasses);
        if (!$isInContent) {
            return $indexableContent;
        }

        $doc = new DOMDocument('1.0', 'UTF-8');
        // Collect parser warnings internally while loading, then drop them and
        // restore the previous setting so the libxml error buffer does not
        // grow and the global state is left as it was found.
        $previousErrorHandling = libxml_use_internal_errors(true);
        try {
            $doc->loadHTML('<?xml version="1.0" encoding="UTF-8"?>' . PHP_EOL . $indexableContent);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previousErrorHandling);
        }

        $xpath = new DOMXPath($doc);
        foreach ($excludeClasses as $excludePart) {
            $excludePart = (string)$excludePart;
            if ($excludePart === '') {
                // contains(@class, '') is true for every element and would
                // wipe the whole document.
                continue;
            }

            $elements = $xpath->query('//*[contains(@class,' . $this->buildXPathStringLiteral($excludePart) . ')]');
            if ($elements === false || $elements->length === 0) {
                continue;
            }

            foreach ($elements as $element) {
                if ($element->parentNode !== null) {
                    $element->parentNode->removeChild($element);
                }
            }
        }

        // Serialise via the parent of the document element; fall back to the
        // document itself when no document element is left.
        $root = $doc->documentElement;
        $html = (string)$doc->saveHTML($root !== null ? $root->parentNode : $doc);
        // remove XML-Preamble, newlines and doctype
        $html = preg_replace('/(<\?xml[^>]+\?>|\r?\n|<!DOCTYPE.+?>)/imS', '', $html) ?? $html;

        return str_replace(['<html>', '</html>', '<body>', '</body>'], '', $html);
    }

    /**
     * Wraps a value as an XPath 1.0 string literal.
     *
     * XPath 1.0 has no escape sequence for quotes, so the quote style is
     * chosen depending on the value; values containing both quote types are
     * assembled with concat().
     *
     * @param string $value Raw value to embed into an XPath expression
     * @return string XPath expression evaluating to the given value
     */
    private function buildXPathStringLiteral(string $value): string
    {
        if (strpos($value, "'") === false) {
            return "'" . $value . "'";
        }

        if (strpos($value, '"') === false) {
            return '"' . $value . '"';
        }

        // At least one single quote is present, so concat() receives three or
        // more arguments.
        return "concat('" . implode("', \"'\", '", explode("'", $value)) . "')";
    }

    /**
     * Returns the cleaned indexable content from the page's HTML markup.
     *
     * The content is cleaned from HTML tags and control chars Solr could
     * stumble on.
     *
     * @return string Indexable, cleaned content ready for indexing.
     */
    public function getIndexableContent(): string
    {
        // @extensionScannerIgnoreLine
        $content = $this->extractContentMarkedForIndexing($this->content);

        // clean content
        $content = trim(self::cleanContent($content));

        // Reduce multiple spaces to one space. preg_replace() yields null when
        // the subject is not valid UTF-8; retry byte-wise in that case so the
        // declared string return type always holds.
        return preg_replace('!\s+!u', ' ', $content)
            ?? preg_replace('!\s+!', ' ', $content)
            ?? $content;
    }

    /**
     * Retrieves the page's title by checking the indexedDocTitle, altPageTitle,
     * and regular page title - in that order.
     *
     * Returns an empty string when $GLOBALS['TSFE'] is not available or no
     * title is set.
     *
     * @return string the page's title
     */
    public function getPageTitle(): string
    {
        $page = $GLOBALS['TSFE'] ?? null;
        if (!is_object($page)) {
            return '';
        }

        if (!empty($page->indexedDocTitle)) {
            $pageTitle = $page->indexedDocTitle;
        } elseif (!empty($page->altPageTitle)) {
            $pageTitle = $page->altPageTitle;
        } else {
            $pageTitle = $page->page['title'] ?? '';
        }

        return (string)$pageTitle;
    }

    /**
     * Retrieves the page's body
     *
     * Returns everything from the first case-insensitive occurrence of
     * "<body" to the end of the content, or an empty string if the content
     * has no body tag.
     *
     * @return string the page's body
     */
    public function getPageBody(): string
    {
        $body = stristr($this->content, '<body');

        return $body === false ? '' : $body;
    }
}
165