Completed
Branch master (33af51)
by Timo
04:12
created

Typo3PageContentExtractor::excludeContentByClass()   C

Complexity

Conditions 7
Paths 6

Size

Total Lines 37
Code Lines 23

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 21
CRAP Score 7.0322

Importance

Changes 0
Metric Value
dl 0
loc 37
ccs 21
cts 23
cp 0.913
rs 6.7272
c 0
b 0
f 0
cc 7
eloc 23
nc 6
nop 1
crap 7.0322
1
<?php
2
namespace ApacheSolrForTypo3\Solr;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2010-2015 Ingo Renner <[email protected]>
8
 *  All rights reserved
9
 *
10
 *  This script is part of the TYPO3 project. The TYPO3 project is
11
 *  free software; you can redistribute it and/or modify
12
 *  it under the terms of the GNU General Public License as published by
13
 *  the Free Software Foundation; either version 2 of the License, or
14
 *  (at your option) any later version.
15
 *
16
 *  The GNU General Public License can be found at
17
 *  http://www.gnu.org/copyleft/gpl.html.
18
 *
19
 *  This script is distributed in the hope that it will be useful,
20
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
21
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
22
 *  GNU General Public License for more details.
23
 *
24
 *  This copyright notice MUST APPEAR in all copies of the script!
25
 ***************************************************************/
26
27
use TYPO3\CMS\Core\Utility\GeneralUtility;
28
29
/**
 * Content extraction class for TYPO3 pages.
 *
 * Operates on the page markup held in $this->content and narrows it down to
 * the sections wrapped in TYPO3SEARCH_begin / TYPO3SEARCH_end comment markers,
 * optionally dropping elements that carry configured CSS classes.
 *
 * @author Ingo Renner <[email protected]>
 */
class Typo3PageContentExtractor extends HtmlContentExtractor
{

    /**
     * Shortcut method to retrieve the raw content marked for indexing.
     *
     * @return string Content marked for indexing.
     */
    public function getContentMarkedForIndexing()
    {
        return $this->extractContentMarkedForIndexing($this->content);
    }

    /**
     * Extracts the markup wrapped with TYPO3SEARCH_begin and TYPO3SEARCH_end
     * markers.
     *
     * All marked sections are concatenated (markers included) and then passed
     * through excludeContentByClass(). When the result is empty and the
     * corresponding logging option is enabled, a devLog entry is written.
     *
     * @param string $html HTML markup with TYPO3SEARCH markers for content that should be indexed
     * @return string HTML markup found between TYPO3SEARCH markers
     */
    protected function extractContentMarkedForIndexing($html)
    {
        preg_match_all(
            '/<!--\s*?TYPO3SEARCH_begin\s*?-->.*?<!--\s*?TYPO3SEARCH_end\s*?-->/mis',
            $html,
            $indexableContents
        );

        // Index 0 holds the full matches; fall back to an empty list so a
        // failed match run cannot feed a non-array into implode().
        $markedSections = empty($indexableContents[0]) ? array() : $indexableContents[0];

        // Separator first: the legacy (array, separator) argument order is
        // deprecated since PHP 7.4 and rejected by PHP 8.
        $indexableContent = implode('', $markedSections);

        $indexableContent = $this->excludeContentByClass($indexableContent);
        if (empty($indexableContent) && $this->getConfiguration()->getLoggingIndexingMissingTypo3SearchMarkers()) {
            GeneralUtility::devLog('No TYPO3SEARCH markers found.', 'solr', 2);
        }

        return $indexableContent;
    }

    /**
     * Exclude some html parts by class inside content wrapped with TYPO3SEARCH_begin and TYPO3SEARCH_end
     * markers.
     *
     * If the content is blank, no exclude classes are configured, or none of
     * the configured strings occurs in the content, the content is returned
     * after html_entity_decode(). Otherwise the markup is parsed with
     * DOMDocument, matching elements are removed and the document is
     * serialized again (without XML preamble, doctype, newlines and the
     * html/body wrapper tags).
     *
     * NOTE(review): the early-return paths decode HTML entities while the DOM
     * path returns the serializer output unchanged; kept as-is because callers
     * may rely on either form.
     *
     * @param string $indexableContent HTML markup
     * @return string HTML
     */
    public function excludeContentByClass($indexableContent)
    {
        if (trim($indexableContent) === '') {
            return html_entity_decode($indexableContent);
        }

        $excludeClasses = $this->getConfiguration()->getIndexQueuePagesExcludeContentByClassArray();
        if (count($excludeClasses) === 0) {
            return html_entity_decode($indexableContent);
        }

        // Cheap substring pre-check so the DOM is only built when needed.
        $isInContent = Util::containsOneOfTheStrings($indexableContent, $excludeClasses);
        if (!$isInContent) {
            return html_entity_decode($indexableContent);
        }

        $doc = new \DOMDocument('1.0', 'UTF-8');

        // Collect parser complaints about real-world HTML internally instead
        // of emitting warnings, then discard them and restore the previous
        // global libxml setting so other code in the request is not affected.
        $previousUseInternalErrors = libxml_use_internal_errors(true);
        // The XML preamble makes libxml treat the fragment as UTF-8.
        $doc->loadHTML('<?xml version="1.0" encoding="UTF-8"?>' . PHP_EOL . $indexableContent);
        libxml_clear_errors();
        libxml_use_internal_errors($previousUseInternalErrors);

        $this->removeElementsByClass($doc, $excludeClasses);

        $html = $doc->saveHTML($doc->documentElement->parentNode);
        // remove XML-Preamble, newlines and doctype
        $html = preg_replace('/(<\?xml[^>]+\?>|\r?\n|<!DOCTYPE.+?>)/imS', '', $html);
        $html = str_replace(array('<html>', '</html>', '<body>', '</body>'), '', $html);

        return $html;
    }

    /**
     * Removes all elements from the document whose class attribute contains
     * one of the given strings.
     *
     * NOTE(review): XPath contains() is a substring test, so "foo" also
     * matches class="foobar"; left unchanged for backward compatibility.
     *
     * @param \DOMDocument $doc Parsed document, modified in place
     * @param array $excludeClasses Class name strings to look for
     * @return void
     */
    private function removeElementsByClass(\DOMDocument $doc, $excludeClasses)
    {
        $xpath = new \DOMXPath($doc);

        foreach ($excludeClasses as $excludePart) {
            $elements = $xpath->query("//*[contains(@class,'" . $excludePart . "')]");
            if ($elements === false) {
                // Malformed expression, e.g. a class name containing a quote.
                continue;
            }

            foreach ($elements as $element) {
                if ($element->parentNode !== null) {
                    $element->parentNode->removeChild($element);
                }
            }
        }
    }

    /**
     * Returns the cleaned indexable content from the page's HTML markup.
     *
     * The content is cleaned from HTML tags and control chars Solr could
     * stumble on.
     *
     * @return string Indexable, cleaned content ready for indexing.
     */
    public function getIndexableContent()
    {
        $content = $this->extractContentMarkedForIndexing($this->content);

        // clean content
        $content = self::cleanContent($content);
        $content = html_entity_decode($content, ENT_QUOTES, 'UTF-8');
        $content = strip_tags($content); // after entity decoding we might have tags again
        $content = trim($content);

        return $content;
    }

    /**
     * Retrieves the page's title by checking the indexedDocTitle, altPageTitle,
     * and regular page title - in that order.
     *
     * The values are read from the global TSFE object; the first non-empty
     * (truthy) candidate wins.
     *
     * @return string the page's title
     */
    public function getPageTitle()
    {
        $page = $GLOBALS['TSFE'];

        if ($page->indexedDocTitle) {
            return $page->indexedDocTitle;
        }

        if ($page->altPageTitle) {
            return $page->altPageTitle;
        }

        return $page->page['title'];
    }

    /**
     * Retrieves the page's body
     *
     * Returns everything from the first case-insensitive occurrence of
     * "<body" up to the end of the page content.
     *
     * @return string|false the page's body, or false if the content has no "<body" occurrence
     */
    public function getPageBody()
    {
        $pageContent = $this->content;

        return stristr($pageContent, '<body');
    }
}
168