Completed
Push — master ( 4904bb...48f77b )
by Timo
11s
created

Typo3PageContentExtractor::excludeContentByClass()   C

Complexity

Conditions 7
Paths 6

Size

Total Lines 37
Code Lines 23

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 22
CRAP Score 7.004

Importance

Changes 0
Metric Value
c 0
b 0
f 0
dl 0
loc 37
ccs 22
cts 23
cp 0.9565
rs 6.7272
cc 7
eloc 23
nc 6
nop 1
crap 7.004
1
<?php
2
namespace ApacheSolrForTypo3\Solr;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2010-2015 Ingo Renner <[email protected]>
8
 *  All rights reserved
9
 *
10
 *  This script is part of the TYPO3 project. The TYPO3 project is
11
 *  free software; you can redistribute it and/or modify
12
 *  it under the terms of the GNU General Public License as published by
13
 *  the Free Software Foundation; either version 2 of the License, or
14
 *  (at your option) any later version.
15
 *
16
 *  The GNU General Public License can be found at
17
 *  http://www.gnu.org/copyleft/gpl.html.
18
 *
19
 *  This script is distributed in the hope that it will be useful,
20
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
21
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
22
 *  GNU General Public License for more details.
23
 *
24
 *  This copyright notice MUST APPEAR in all copies of the script!
25
 ***************************************************************/
26
27
use TYPO3\CMS\Core\Utility\GeneralUtility;
28
29
/**
 * Content extraction class for TYPO3 pages.
 *
 * Works on the page markup held in $this->content: it extracts the parts
 * wrapped in TYPO3SEARCH_begin / TYPO3SEARCH_end comment markers, optionally
 * drops elements carrying configured CSS classes, and offers a cleaned,
 * tag-free variant of that content as well as the page title and body.
 *
 * @author Ingo Renner <[email protected]>
 */
class Typo3PageContentExtractor extends HtmlContentExtractor
{

    /**
     * Shortcut method to retrieve the raw content marked for indexing.
     *
     * @return string Content marked for indexing.
     */
    public function getContentMarkedForIndexing()
    {
        return $this->extractContentMarkedForIndexing($this->content);
    }

    /**
     * Extracts the markup wrapped with TYPO3SEARCH_begin and TYPO3SEARCH_end
     * markers.
     *
     * All marked sections are concatenated (markers included) and then passed
     * through excludeContentByClass(). When the result is empty and the
     * corresponding logging option is enabled, a devLog entry is written.
     *
     * @param string $html HTML markup with TYPO3SEARCH markers for content that should be indexed
     * @return string HTML markup found between TYPO3SEARCH markers
     */
    protected function extractContentMarkedForIndexing($html)
    {
        preg_match_all(
            '/<!--\s*?TYPO3SEARCH_begin\s*?-->.*?<!--\s*?TYPO3SEARCH_end\s*?-->/mis',
            $html,
            $indexableContents
        );

        // implode() expects the glue first; the legacy (pieces, glue) order
        // was deprecated in PHP 7.4 and removed in PHP 8.0. The guard covers
        // a match run that left no full-match set behind.
        $indexableContent = empty($indexableContents[0]) ? '' : implode('', $indexableContents[0]);

        $indexableContent = $this->excludeContentByClass($indexableContent);
        if (empty($indexableContent) && $this->getConfiguration()->getLoggingIndexingMissingTypo3SearchMarkers()) {
            GeneralUtility::devLog('No TYPO3SEARCH markers found.', 'solr', 2);
        }

        return $indexableContent;
    }

    /**
     * Exclude some html parts by class inside content wrapped with TYPO3SEARCH_begin and TYPO3SEARCH_end
     * markers.
     *
     * The content is returned entity-decoded and otherwise untouched when it
     * is blank, when no exclude classes are configured, or when none of the
     * configured class strings occurs in the content. Otherwise the content
     * is parsed into a DOM, every element whose class attribute contains one
     * of the configured strings (substring match) is removed, and the
     * document is serialized back to markup.
     *
     * NOTE(review): the DOM path returns the serialized markup without
     * html_entity_decode(), unlike the early-return paths - confirm callers
     * are fine with that difference.
     *
     * @param string $indexableContent HTML markup
     * @return string HTML
     */
    public function excludeContentByClass($indexableContent)
    {
        if (empty(trim($indexableContent))) {
            return html_entity_decode($indexableContent);
        }

        $excludeClasses = $this->getConfiguration()->getIndexQueuePagesExcludeContentByClassArray();
        if (count($excludeClasses) === 0) {
            return html_entity_decode($indexableContent);
        }

        // cheap string check first, so the DOM is only built when needed
        $isInContent = Util::containsOneOfTheStrings($indexableContent, $excludeClasses);
        if (!$isInContent) {
            return html_entity_decode($indexableContent);
        }

        // Collect libxml errors internally while parsing, but remember the
        // previous setting so the process-wide state is restored afterwards.
        $previousErrorHandling = libxml_use_internal_errors(true);
        try {
            $doc = new \DOMDocument('1.0', 'UTF-8');
            // the XML preamble makes the parser treat the markup as UTF-8
            $doc->loadHTML('<?xml version="1.0" encoding="UTF-8"?>' . PHP_EOL . $indexableContent);

            $xpath = new \DOMXPath($doc);
            foreach ($excludeClasses as $excludePart) {
                $elements = $xpath->query(
                    '//*[contains(@class,' . $this->quoteXPathLiteral($excludePart) . ')]'
                );
                // query() yields false for an expression it cannot evaluate
                if ($elements === false || $elements->length === 0) {
                    continue;
                }

                foreach ($elements as $element) {
                    $element->parentNode->removeChild($element);
                }
            }

            $html = $doc->saveHTML($doc->documentElement->parentNode);
        } finally {
            // drop the buffered parse errors and hand back the previous mode
            libxml_clear_errors();
            libxml_use_internal_errors($previousErrorHandling);
        }

        // remove XML-Preamble, newlines and doctype
        $html = preg_replace('/(<\?xml[^>]+\?>|\r?\n|<!DOCTYPE.+?>)/imS', '', $html);
        // remove the html/body wrapper tags from the serialized markup
        $html = str_replace(['<html>', '</html>', '<body>', '</body>'], '', $html);

        return $html;
    }

    /**
     * Turns a string into an XPath 1.0 string literal.
     *
     * XPath 1.0 has no escape sequence inside string literals, so the value
     * is wrapped in whichever quote character it does not contain; a value
     * containing both quote characters is assembled with concat().
     *
     * @param string $value Raw value to embed into an XPath expression
     * @return string XPath expression evaluating to exactly $value
     */
    private function quoteXPathLiteral($value)
    {
        $value = (string)$value;

        if (strpos($value, "'") === false) {
            return "'" . $value . "'";
        }

        if (strpos($value, '"') === false) {
            return '"' . $value . '"';
        }

        // e.g. a'b"c becomes concat('a',"'",'b"c')
        return "concat('" . implode("',\"'\",'", explode("'", $value)) . "')";
    }

    /**
     * Returns the cleaned indexable content from the page's HTML markup.
     *
     * The content is cleaned from HTML tags and control chars Solr could
     * stumble on.
     *
     * @return string Indexable, cleaned content ready for indexing.
     */
    public function getIndexableContent()
    {
        $content = $this->extractContentMarkedForIndexing($this->content);

        // clean content
        $content = self::cleanContent($content);
        $content = html_entity_decode($content, ENT_QUOTES, 'UTF-8');
        $content = strip_tags($content); // after entity decoding we might have tags again

        return trim($content);
    }

    /**
     * Retrieves the page's title by checking the indexedDocTitle, altPageTitle,
     * and regular page title - in that order.
     *
     * The values are read from the object stored in $GLOBALS['TSFE'].
     *
     * @return string the page's title
     */
    public function getPageTitle()
    {
        $page = $GLOBALS['TSFE'];

        if ($page->indexedDocTitle) {
            return $page->indexedDocTitle;
        }

        if ($page->altPageTitle) {
            return $page->altPageTitle;
        }

        return $page->page['title'];
    }

    /**
     * Retrieves the page's body
     *
     * Returns everything from the first case-insensitive occurrence of
     * "<body" to the end of the page content.
     *
     * @return string|false the page's body, or false when the content has no "<body"
     */
    public function getPageBody()
    {
        return stristr($this->content, '<body');
    }
}
167