Passed
Push — master ( 42d2d3...8f9ec7 )
by Timo
69:32 queued 48:37
created

HtmlContentExtractor   A

Complexity

Total Complexity 17

Size/Duplication

Total Lines 237
Duplicated Lines 0 %

Coupling/Cohesion

Components 2
Dependencies 1

Test Coverage

Coverage 90.16%

Importance

Changes 0
Metric Value
wmc 17
lcom 2
cbo 1
dl 0
loc 237
ccs 55
cts 61
cp 0.9016
rs 10
c 0
b 0
f 0

11 Methods

Rating   Name   Duplication   Size   Complexity  
A __construct() 0 4 1
A setConfiguration() 0 4 1
A getIndexableContent() 0 10 1
A getConfiguration() 0 8 2
A stripUnicodeRange() 0 5 1
A cleanContent() 0 19 1
A stripTags() 0 7 1
A stripControlCharacters() 0 5 1
A stripUnicodeRanges() 0 8 2
A getContentMarkedForIndexing() 0 4 1
B getTagContent() 0 32 5
1
<?php
2
namespace ApacheSolrForTypo3\Solr;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2011-2015 Ingo Renner <[email protected]>
8
 *  All rights reserved
9
 *
10
 *  This script is part of the TYPO3 project. The TYPO3 project is
11
 *  free software; you can redistribute it and/or modify
12
 *  it under the terms of the GNU General Public License as published by
13
 *  the Free Software Foundation; either version 2 of the License, or
14
 *  (at your option) any later version.
15
 *
16
 *  The GNU General Public License can be found at
17
 *  http://www.gnu.org/copyleft/gpl.html.
18
 *
19
 *  This script is distributed in the hope that it will be useful,
20
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
21
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
22
 *  GNU General Public License for more details.
23
 *
24
 *  This copyright notice MUST APPEAR in all copies of the script!
25
 ***************************************************************/
26
use ApacheSolrForTypo3\Solr\System\Configuration\TypoScriptConfiguration;
27
28
/**
 * A content extractor to get clean, indexable content from HTML markup.
 *
 * Besides the plain text of a document it can also collect the content of
 * selected HTML tags (headlines, inline emphasis, links) grouped by the Solr
 * document field they are mapped to.
 *
 * @author Ingo Renner <[email protected]>
 */
class HtmlContentExtractor
{

    /**
     * Unicode ranges which should get stripped before sending a document to solr.
     * This is necessary if a document (PDF, etc.) contains unicode characters which
     * are valid in the font being used in the document but are not available in the
     * font being used for displaying results.
     *
     * This is often the case if PDFs are being indexed where special fonts are used
     * for displaying bullets, etc. Usually those bullets reside in one of the unicode
     * "Private Use Zones" or the "Private Use Area" (plane 15 + 16)
     *
     * Each entry is a pair of [start, end] code points given as uppercase
     * hexadecimal strings.
     *
     * @see http://en.wikipedia.org/wiki/Unicode_block
     * @var array
     */
    protected static $stripUnicodeRanges = [
        // Replacement Character (U+FFFD) @see http://en.wikipedia.org/wiki/Specials_%28Unicode_block%29
        ['FFFD', 'FFFD'],
        // Private Use Area (part of Plane 0)
        ['E000', 'F8FF'],
        // Supplementary Private Use Area (Plane 15)
        ['F0000', 'FFFFF'],
        // Supplementary Private Use Area (Plane 16)
        ['100000', '10FFFF'],
    ];

    /**
     * The raw HTML markup content to extract clean content from.
     *
     * @var string
     */
    protected $content;

    /**
     * Mapping of (lowercase) HTML tag names to Solr document fields.
     *
     * @var array
     */
    protected $tagToFieldMapping = [
        'h1' => 'tagsH1',
        'h2' => 'tagsH2H3',
        'h3' => 'tagsH2H3',
        'h4' => 'tagsH4H5H6',
        'h5' => 'tagsH4H5H6',
        'h6' => 'tagsH4H5H6',
        'u' => 'tagsInline',
        'b' => 'tagsInline',
        'strong' => 'tagsInline',
        'i' => 'tagsInline',
        'em' => 'tagsInline',
        'a' => 'tagsA',
    ];

    /**
     * Lazily resolved configuration, see getConfiguration().
     *
     * @var TypoScriptConfiguration
     */
    private $configuration;

    /**
     * Constructor.
     *
     * @param string $content Content HTML markup
     */
    public function __construct($content)
    {
        $this->content = $content;
    }

    /**
     * Returns the configuration, fetching it through
     * Util::getSolrConfiguration() on first access when none was injected
     * via setConfiguration().
     *
     * @return TypoScriptConfiguration|array
     */
    protected function getConfiguration()
    {
        if ($this->configuration === null) {
            $this->configuration = Util::getSolrConfiguration();
        }

        return $this->configuration;
    }

    /**
     * Injects the configuration to use instead of the lazily resolved one.
     *
     * @param TypoScriptConfiguration $configuration
     */
    public function setConfiguration(TypoScriptConfiguration $configuration)
    {
        $this->configuration = $configuration;
    }

    /**
     * Returns the cleaned indexable content from the page's HTML markup.
     *
     * The content is cleaned from HTML tags and control chars Solr could
     * stumble on.
     *
     * @return string Indexable, cleaned content ready for indexing.
     */
    public function getIndexableContent()
    {
        $content = self::cleanContent($this->content);
        $content = html_entity_decode($content, ENT_QUOTES, 'UTF-8');
        // after entity decoding we might have tags again
        $content = strip_tags($content);

        return trim($content);
    }

    /**
     * Strips control characters, script and style elements, html tags,
     * tab, new-line, carriage-return and &nbsp; whitespace characters as well
     * as the unicode ranges listed in self::$stripUnicodeRanges.
     *
     * @param string $content String to clean
     * @return string String cleaned from tags and special whitespace characters
     */
    public static function cleanContent($content)
    {
        $content = self::stripControlCharacters($content);

        // remove Javascript
        $content = preg_replace('@<script[^>]*>.*?<\/script>@msi', '', $content);

        // remove internal CSS styles
        $content = preg_replace('@<style[^>]*>.*?<\/style>@msi', '', $content);

        // prevents concatenated words when stripping tags afterwards
        $content = str_replace(['<', '>'], [' <', '> '], $content);
        // late static binding on purpose: subclasses may override stripTags()
        $content = static::stripTags($content);

        $content = str_replace(["\t", "\n", "\r", '&nbsp;'], ' ', $content);
        $content = self::stripUnicodeRanges($content);

        return trim($content);
    }

    /**
     * Strips html tags, but keeps single < and > characters.
     *
     * A "<" that is not closed by a ">" before the next "<" or the end of the
     * input is temporarily replaced by a placeholder so that strip_tags()
     * does not treat it as the start of a tag.
     *
     * @param string $content
     * @return string
     */
    protected static function stripTags($content)
    {
        $content = preg_replace('@<([^>]+(<|\z))@msi', '##lt##$1', $content);
        $content = strip_tags($content);

        // unescape < that are not used to open a tag
        return str_replace('##lt##', '<', $content);
    }

    /**
     * Strips control characters that cause Jetty/Solr to fail.
     *
     * Every matched character is replaced by a single space; tab, line feed
     * and carriage return are not part of the matched set.
     *
     * @param string $content the content to sanitize
     * @return string the sanitized content
     * @see http://w3.org/International/questions/qa-forms-utf-8.html
     */
    public static function stripControlCharacters($content)
    {
        // Printable utf-8 does not include any of these chars below x7F
        return preg_replace('@[\x00-\x08\x0B\x0C\x0E-\x1F]@', ' ', $content);
    }

    /**
     * Strips unusable unicode ranges
     *
     * Applies stripUnicodeRange() for every range configured in
     * self::$stripUnicodeRanges.
     *
     * @param string $content Content to sanitize
     * @return string Sanitized content
     */
    public static function stripUnicodeRanges($content)
    {
        foreach (self::$stripUnicodeRanges as $range) {
            $content = self::stripUnicodeRange($content, $range[0], $range[1]);
        }

        return $content;
    }

    /**
     * Strips a UTF-8 character range
     *
     * @param string $content Content to sanitize
     * @param string $start Unicode range start character as uppercase hexadecimal string
     * @param string $end Unicode range end character as uppercase hexadecimal string
     * @return string Sanitized content
     */
    public static function stripUnicodeRange($content, $start, $end)
    {
        return preg_replace(
            '/[\x{' . $start . '}-\x{' . $end . '}]/u',
            '',
            $content
        );
    }

    /**
     * Shortcut method to retrieve the raw content marked for indexing.
     *
     * @return string Content marked for indexing.
     */
    public function getContentMarkedForIndexing()
    {
        return $this->content;
    }

    /**
     * Extracts HTML tag content from tags in the content marked for indexing.
     *
     * Only tags listed in $this->tagToFieldMapping are considered. The content
     * of several tags mapped to the same field is joined by a single space.
     * Tag names are matched case-insensitively. Links whose text looks like a
     * URL are skipped.
     *
     * @return array A mapping of Solr document field names to content found in defined tags.
     */
    public function getTagContent()
    {
        $result = [];
        $matches = [];
        $content = $this->getContentMarkedForIndexing();
        $tags = array_keys($this->tagToFieldMapping);

        // strip all ignored tags
        $content = strip_tags(
            $content,
            '<' . implode('><', $tags) . '>'
        );

        preg_match_all(
            '@<(' . implode('|', $tags) . ')[^>]*>(.*)</\1>@Ui',
            $content,
            $matches
        );

        // We don't want to index links auto-generated by the url filter.
        $pattern = '@(?:http://|https://|ftp://|mailto:|smb://|afp://|file://|gopher://|news://|ssl://|sslv2://|sslv3://|tls://|tcp://|udp://|www\.)[a-zA-Z0-9]+@';

        foreach ($matches[1] as $key => $tag) {
            // the regular expression is case-insensitive, the mapping keys
            // are lowercase
            $tag = strtolower($tag);
            $tagContent = $matches[2][$key];

            if ($tag === 'a' && preg_match($pattern, $tagContent)) {
                continue;
            }

            $fieldName = $this->tagToFieldMapping[$tag];
            if (!isset($result[$fieldName])) {
                // initialize explicitly, appending to an undefined key
                // raises a notice/warning
                $result[$fieldName] = '';
            }

            $separator = ($result[$fieldName] === '') ? '' : ' ';
            $result[$fieldName] .= $separator . $tagContent;
        }

        return $result;
    }
}
270