Passed
Push — master ( 4e40a2...cc3f84 )
by Timo
24:09
created

HtmlContentExtractor::cleanContent()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 19
Code Lines 10

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 11
CRAP Score 1

Importance

Changes 0
Metric Value
eloc 10
dl 0
loc 19
ccs 11
cts 11
cp 1
rs 9.9332
c 0
b 0
f 0
cc 1
nc 1
nop 1
crap 1
1
<?php
2
namespace ApacheSolrForTypo3\Solr;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2011-2015 Ingo Renner <[email protected]>
8
 *  All rights reserved
9
 *
10
 *  This script is part of the TYPO3 project. The TYPO3 project is
11
 *  free software; you can redistribute it and/or modify
12
 *  it under the terms of the GNU General Public License as published by
13
 *  the Free Software Foundation; either version 3 of the License, or
14
 *  (at your option) any later version.
15
 *
16
 *  The GNU General Public License can be found at
17
 *  http://www.gnu.org/copyleft/gpl.html.
18
 *
19
 *  This script is distributed in the hope that it will be useful,
20
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
21
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
22
 *  GNU General Public License for more details.
23
 *
24
 *  This copyright notice MUST APPEAR in all copies of the script!
25
 ***************************************************************/
26
use ApacheSolrForTypo3\Solr\System\Configuration\TypoScriptConfiguration;
27
28
/**
29
 * A content extractor to get clean, indexable content from HTML markup.
30
 *
31
 * @author Ingo Renner <[email protected]>
32
 */
33
class HtmlContentExtractor
{

    /**
     * Unicode ranges which should get stripped before sending a document to solr.
     * This is necessary if a document (PDF, etc.) contains unicode characters which
     * are valid in the font being used in the document but are not available in the
     * font being used for displaying results.
     *
     * This is often the case if PDFs are being indexed where special fonts are used
     * for displaying bullets, etc. Usually those bullets reside in one of the unicode
     * "Private Use Zones" or the "Private Use Area" (plane 15 + 16)
     *
     * Each entry is a [start, end] pair of uppercase hexadecimal code points.
     *
     * @see http://en.wikipedia.org/wiki/Unicode_block
     * @var array
     */
    protected static $stripUnicodeRanges = [
        // Replacement Character (U+FFFD) @see http://en.wikipedia.org/wiki/Specials_%28Unicode_block%29
        ['FFFD', 'FFFD'],
        // Private Use Area (part of Plane 0)
        ['E000', 'F8FF'],
        // Supplementary Private Use Area (Plane 15)
        ['F0000', 'FFFFF'],
        // Supplementary Private Use Area (Plane 16)
        ['100000', '10FFFF'],
    ];

    /**
     * The raw HTML markup content to extract clean content from.
     *
     * @var string
     */
    protected $content;

    /**
     * Mapping of HTML tags to Solr document fields.
     *
     * @var array
     */
    protected $tagToFieldMapping = [
        'h1' => 'tagsH1',
        'h2' => 'tagsH2H3',
        'h3' => 'tagsH2H3',
        'h4' => 'tagsH4H5H6',
        'h5' => 'tagsH4H5H6',
        'h6' => 'tagsH4H5H6',
        'u' => 'tagsInline',
        'b' => 'tagsInline',
        'strong' => 'tagsInline',
        'i' => 'tagsInline',
        'em' => 'tagsInline',
        'a' => 'tagsA',
    ];

    /**
     * Lazily resolved configuration, see getConfiguration().
     *
     * @var TypoScriptConfiguration
     */
    private $configuration;

    /**
     * Constructor.
     *
     * @param string $content Content HTML markup
     */
    public function __construct($content)
    {
        $this->content = $content;
    }

    /**
     * Returns the configuration, fetching it through
     * Util::getSolrConfiguration() on first access unless one was injected
     * with setConfiguration() before.
     *
     * @return TypoScriptConfiguration|array
     */
    protected function getConfiguration()
    {
        if ($this->configuration === null) {
            $this->configuration = Util::getSolrConfiguration();
        }

        return $this->configuration;
    }

    /**
     * Injects the configuration to use instead of the lazily fetched one.
     *
     * @param TypoScriptConfiguration $configuration
     */
    public function setConfiguration(TypoScriptConfiguration $configuration)
    {
        $this->configuration = $configuration;
    }

    /**
     * Returns the cleaned indexable content from the page's HTML markup.
     *
     * The content is cleaned from HTML tags and control chars Solr could
     * stumble on.
     *
     * @return string Indexable, cleaned content ready for indexing.
     */
    public function getIndexableContent()
    {
        $content = self::cleanContent($this->content);
        $content = html_entity_decode($content, ENT_QUOTES, 'UTF-8');
        // after entity decoding we might have tags again
        $content = strip_tags($content);
        $content = trim($content);

        return $content;
    }

    /**
     * Strips html tags, and tab, new-line, carriage-return, &nbsp; whitespace
     * characters.
     *
     * The order of the steps matters: script/style blocks are removed while
     * their tags are still present, tags are padded with spaces before they
     * are stripped, and whitespace is collapsed last.
     *
     * @param string $content String to clean
     * @return string String cleaned from tags and special whitespace characters
     */
    public static function cleanContent($content)
    {
        $content = self::stripControlCharacters($content);

        // remove Javascript
        $content = preg_replace('@<script[^>]*>.*?<\/script>@msi', '', $content);

        // remove internal CSS styles
        $content = preg_replace('@<style[^>]*>.*?<\/style>@msi', '', $content);

        // prevents concatenated words when stripping tags afterwards
        $content = str_replace(['<', '>'], [' <', '> '], $content);
        // late static binding on purpose: subclasses may override stripTags()
        $content = static::stripTags($content);

        $content = str_replace(["\t", "\n", "\r", '&nbsp;'], ' ', $content);
        $content = self::stripUnicodeRanges($content);
        $content = preg_replace('/\s{2,}/', ' ', $content);
        $content = trim($content);

        return $content;
    }

    /**
     * Strips html tags, but keeps single < and > characters.
     *
     * @param string $content
     * @return mixed
     */
    protected static function stripTags($content)
    {
        // A "<" that is followed by another "<" (or the end of the string)
        // before any ">" does not open a tag; mask it so strip_tags() keeps it.
        $content = preg_replace('@<([^>]+(<|\z))@msi', '##lt##$1', $content);
        $content = strip_tags($content);

        // unescape < that are not used to open a tag
        return str_replace('##lt##', '<', $content);
    }

    /**
     * Strips control characters that cause Jetty/Solr to fail.
     *
     * Each matched character is replaced by a single space. Tab, line feed
     * and carriage return are not part of the pattern and stay untouched.
     *
     * @param string $content the content to sanitize
     * @return string the sanitized content
     * @see http://w3.org/International/questions/qa-forms-utf-8.html
     */
    public static function stripControlCharacters($content)
    {
        // Printable utf-8 does not include any of these chars below x7F
        return preg_replace('@[\x00-\x08\x0B\x0C\x0E-\x1F]@', ' ', $content);
    }

    /**
     * Strips unusable unicode ranges
     *
     * Applies stripUnicodeRange() once for every entry of
     * self::$stripUnicodeRanges.
     *
     * @param string $content Content to sanitize
     * @return string Sanitized content
     */
    public static function stripUnicodeRanges($content)
    {
        foreach (self::$stripUnicodeRanges as $range) {
            $content = self::stripUnicodeRange($content, $range[0], $range[1]);
        }

        return $content;
    }

    /**
     * Strips a UTF-8 character range
     *
     * The pattern uses the "u" modifier, so preg_replace() returns null when
     * the content is not valid UTF-8 (or another PCRE error occurs). In that
     * case the content is returned unchanged instead of being discarded.
     *
     * @param string $content Content to sanitize
     * @param string $start Unicode range start character as uppercase hexadecimal string
     * @param string $end Unicode range end character as uppercase hexadecimal string
     * @return string Sanitized content
     */
    public static function stripUnicodeRange($content, $start, $end)
    {
        $stripped = preg_replace(
            '/[\x{' . $start . '}-\x{' . $end . '}]/u',
            '',
            $content
        );

        // null signals a PCRE failure; keep the input rather than losing it
        return $stripped === null ? $content : $stripped;
    }

    /**
     * Shortcut method to retrieve the raw content marked for indexing.
     *
     * @return string Content marked for indexing.
     */
    public function getContentMarkedForIndexing()
    {
        return $this->content;
    }

    /**
     * Extracts HTML tag content from tags in the content marked for indexing.
     *
     * Tag names are matched case-insensitively. Content of several tags that
     * map to the same field is joined with a single space.
     *
     * @return array A mapping of Solr document field names to content found in defined tags.
     */
    public function getTagContent()
    {
        $result = [];
        $matches = [];
        $content = $this->getContentMarkedForIndexing();
        $mappedTags = array_keys($this->tagToFieldMapping);
        // the regular expression below is case-insensitive, so the lookup
        // of the matched tag name has to be case-insensitive as well
        $fieldByTag = array_change_key_case($this->tagToFieldMapping, CASE_LOWER);

        // strip all ignored tags
        $content = strip_tags(
            $content,
            '<' . implode('><', $mappedTags) . '>'
        );

        preg_match_all(
            '@<(' . implode('|', $mappedTags) . ')[^>]*>(.*)</\1>@Ui',
            $content,
            $matches
        );

        // We don't want to index links auto-generated by the url filter.
        $autoLinkPattern = '@(?:http://|https://|ftp://|mailto:|smb://|afp://|file://|gopher://|news://|ssl://|sslv2://|sslv3://|tls://|tcp://|udp://|www\.)[a-zA-Z0-9]+@';

        foreach ($matches[1] as $key => $tag) {
            $tag = strtolower($tag);
            $tagContent = $matches[2][$key];

            if ($tag === 'a' && preg_match($autoLinkPattern, $tagContent)) {
                continue;
            }

            $fieldName = $fieldByTag[$tag];
            if (isset($result[$fieldName]) && $result[$fieldName] !== '') {
                $result[$fieldName] .= ' ' . $tagContent;
            } else {
                // first (non-empty) content for this field: no separator
                $result[$fieldName] = $tagContent;
            }
        }

        return $result;
    }
}
271