HtmlContentExtractor::getIndexableContent() - Code Metrics - Inspection of "[TASK] Improve index administration template (#174..." - TYPO3-Solr/ext-solr - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( 42d2d3...8f9ec7 )

by Timo

created 2017-12-11 13:42 UTC

HtmlContentExtractor::getIndexableContent() A

↳ Parent: HtmlContentExtractor

Complexity

Conditions	1
Paths	1

Size

Total Lines	10
Code Lines	6

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	0
CRAP Score	2

Importance

Changes

Metric	Value
dl	0
loc	10
ccs	0
cts	6
cp	0
rs	9.4285
c	0
b	0
f	0
cc	1
eloc	6
nc	1
nop	0
crap	2

<?php
namespace ApacheSolrForTypo3\Solr;

/***************************************************************
 *  Copyright notice
 *
 *  (c) 2011-2015 Ingo Renner <[email protected]>
 *  All rights reserved
 *
 *  This script is part of the TYPO3 project. The TYPO3 project is
 *  free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  The GNU General Public License can be found at
 *  http://www.gnu.org/copyleft/gpl.html.
 *
 *  This script is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  This copyright notice MUST APPEAR in all copies of the script!
 ***************************************************************/
use ApacheSolrForTypo3\Solr\System\Configuration\TypoScriptConfiguration;

/**
 * A content extractor to get clean, indexable content from HTML markup.
 *
 * The extractor is constructed with raw HTML markup and offers two views on
 * it: the plain text without any markup (getIndexableContent()) and the
 * content of selected tags grouped by Solr field name (getTagContent()).
 *
 * @author Ingo Renner <[email protected]>
 */
class HtmlContentExtractor
{

    /**
     * Unicode ranges which should get stripped before sending a document to solr.
     * This is necessary if a document (PDF, etc.) contains unicode characters which
     * are valid in the font being used in the document but are not available in the
     * font being used for displaying results.
     *
     * This is often the case if PDFs are being indexed where special fonts are used
     * for displaying bullets, etc. Usually those bullets reside in one of the unicode
     * "Private Use Zones" or the "Private Use Area" (plane 15 + 16)
     *
     * Each entry is a [start, end] pair of hexadecimal code points.
     *
     * @see http://en.wikipedia.org/wiki/Unicode_block
     * @var array
     */
    protected static $stripUnicodeRanges = [
        ['FFFD', 'FFFD'],
        // Replacement Character (�) @see http://en.wikipedia.org/wiki/Specials_%28Unicode_block%29
        ['E000', 'F8FF'],
        // Private Use Area (part of Plane 0)
        ['F0000', 'FFFFF'],
        // Supplementary Private Use Area (Plane 15)
        ['100000', '10FFFF'],
        // Supplementary Private Use Area (Plane 16)
    ];
    /**
     * The raw HTML markup content to extract clean content from.
     *
     * @var string
     */
    protected $content;
    /**
     * Mapping of (lowercase) HTML tag names to Solr document fields.
     *
     * @var array
     */
    protected $tagToFieldMapping = [
        'h1' => 'tagsH1',
        'h2' => 'tagsH2H3',
        'h3' => 'tagsH2H3',
        'h4' => 'tagsH4H5H6',
        'h5' => 'tagsH4H5H6',
        'h6' => 'tagsH4H5H6',
        'u' => 'tagsInline',
        'b' => 'tagsInline',
        'strong' => 'tagsInline',
        'i' => 'tagsInline',
        'em' => 'tagsInline',
        'a' => 'tagsA',
    ];

    /**
     * Configuration, either injected through setConfiguration() or loaded
     * on first access in getConfiguration().
     *
     * @var TypoScriptConfiguration
     */
    private $configuration;

    /**
     * Constructor.
     *
     * @param string $content Content HTML markup
     */
    public function __construct($content)
    {
        $this->content = $content;
    }

    /**
     * Returns the configuration, fetching it from Util::getSolrConfiguration()
     * the first time it is needed if none has been injected.
     *
     * @return TypoScriptConfiguration|array
     */
    protected function getConfiguration()
    {
        // Strict comparison: only "nothing set yet" triggers the lookup, an
        // empty (array) configuration is kept instead of being re-fetched.
        if ($this->configuration === null) {
            $this->configuration = Util::getSolrConfiguration();
        }

        return $this->configuration;
    }

    /**
     * Injects the configuration to use instead of the lazily loaded one.
     *
     * @param TypoScriptConfiguration $configuration
     */
    public function setConfiguration(TypoScriptConfiguration $configuration)
    {
        $this->configuration = $configuration;
    }

    /**
     * Returns the cleaned indexable content from the page's HTML markup.
     *
     * The content is cleaned from HTML tags and control chars Solr could
     * stumble on.
     *
     * @return string Indexable, cleaned content ready for indexing.
     */
    public function getIndexableContent()
    {
        $content = self::cleanContent($this->content);
        $content = html_entity_decode($content, ENT_QUOTES, 'UTF-8');
        // after entity decoding we might have tags again
        $content = strip_tags($content);
        $content = trim($content);

        return $content;
    }

    /**
     * Strips html tags, and tab, new-line, carriage-return, &nbsp; whitespace
     * characters.
     *
     * Script and style elements are removed including their bodies; the
     * whitespace characters listed above are replaced by a single space each
     * and the result is trimmed.
     *
     * @param string $content String to clean
     * @return string String cleaned from tags and special whitespace characters
     */
    public static function cleanContent($content)
    {
        $content = self::stripControlCharacters($content);
        // remove Javascript
        $content = preg_replace('@<script[^>]*>.*?<\/script>@msi', '', $content);

        // remove internal CSS styles
        $content = preg_replace('@<style[^>]*>.*?<\/style>@msi', '', $content);

        // prevents concatenated words when stripping tags afterwards
        $content = str_replace(['<', '>'], [' <', '> '], $content);
        $content = static::stripTags($content);

        $content = str_replace(["\t", "\n", "\r", '&nbsp;'], ' ', $content);
        $content = self::stripUnicodeRanges($content);
        $content = trim($content);

        return $content;
    }

    /**
     * Strips html tags, but keeps single < and > characters.
     *
     * A "<" that is never closed by a ">" (it is followed by another "<" or
     * by the end of the string) is masked with a placeholder before
     * strip_tags() runs and restored afterwards.
     *
     * @param string $content
     * @return string Content without HTML tags
     */
    protected static function stripTags($content)
    {
        $content = preg_replace('@<([^>]+(<|\z))@msi', '##lt##$1', $content);
        $content = strip_tags($content);
        // unescape < that are not used to open a tag
        return str_replace('##lt##', '<', $content);
    }

    /**
     * Strips control characters that cause Jetty/Solr to fail.
     *
     * Each matched control character is replaced by a space. Tab, line feed
     * and carriage return are not part of the character class and are kept.
     *
     * @param string $content the content to sanitize
     * @return string the sanitized content
     * @see http://w3.org/International/questions/qa-forms-utf-8.html
     */
    public static function stripControlCharacters($content)
    {
        // Printable utf-8 does not include any of these chars below x7F
        return preg_replace('@[\x00-\x08\x0B\x0C\x0E-\x1F]@', ' ', $content);
    }

    /**
     * Strips unusable unicode ranges
     *
     * Applies stripUnicodeRange() for every range listed in
     * self::$stripUnicodeRanges.
     *
     * @param string $content Content to sanitize
     * @return string Sanitized content
     */
    public static function stripUnicodeRanges($content)
    {
        foreach (self::$stripUnicodeRanges as $range) {
            $content = self::stripUnicodeRange($content, $range[0], $range[1]);
        }

        return $content;
    }

    /**
     * Strips a UTF-8 character range
     *
     * The pattern uses the "u" modifier, so the content is treated as UTF-8.
     *
     * @param string $content Content to sanitize
     * @param string $start Unicode range start character as uppercase hexadecimal string
     * @param string $end Unicode range end character as uppercase hexadecimal string
     * @return string Sanitized content
     */
    public static function stripUnicodeRange($content, $start, $end)
    {
        return preg_replace('/[\x{' . $start . '}-\x{' . $end . '}]/u', '',
            $content);
    }

    /**
     * Shortcut method to retrieve the raw content marked for indexing.
     *
     * @return string Content marked for indexing.
     */
    public function getContentMarkedForIndexing()
    {
        return $this->content;
    }

    /**
     * Extracts HTML tag content from tags in the content marked for indexing.
     *
     * Only tags listed in $tagToFieldMapping are considered. Tag names are
     * matched case-insensitively, contents of several tags mapped to the same
     * field are joined with a single space.
     *
     * @return array A mapping of Solr document field names to content found in defined tags.
     */
    public function getTagContent()
    {
        $result = [];
        $matches = [];
        $content = $this->getContentMarkedForIndexing();
        $tagNames = array_keys($this->tagToFieldMapping);

        // strip all ignored tags
        $content = strip_tags(
            $content,
            '<' . implode('><', $tagNames) . '>'
        );

        preg_match_all(
            '@<(' . implode('|', $tagNames) . ')[^>]*>(.*)</\1>@Ui',
            $content,
            $matches
        );

        // We don't want to index links auto-generated by the url filter.
        $pattern = '@(?:http://|https://|ftp://|mailto:|smb://|afp://|file://|gopher://|news://|ssl://|sslv2://|sslv3://|tls://|tcp://|udp://|www\.)[a-zA-Z0-9]+@';

        foreach ($matches[1] as $key => $tag) {
            // The expression above is case-insensitive, the mapping keys are
            // lowercase: normalize so that e.g. <H1> or <A> are handled too.
            $tag = strtolower($tag);
            if (!isset($this->tagToFieldMapping[$tag])) {
                continue;
            }

            $tagContent = $matches[2][$key];
            if ($tag === 'a' && preg_match($pattern, $tagContent)) {
                continue;
            }

            $fieldName = $this->tagToFieldMapping[$tag];
            if (isset($result[$fieldName]) && $result[$fieldName] !== '') {
                $result[$fieldName] .= ' ' . $tagContent;
            } else {
                // first (non-empty) content for this field, no separator needed
                $result[$fieldName] = $tagContent;
            }
        }

        return $result;
    }
}


1		<?php
2		namespace ApacheSolrForTypo3\Solr;
3
4		/***************************************************************
5		* Copyright notice
6		*
7		* (c) 2011-2015 Ingo Renner <[email protected]>
8		* All rights reserved
9		*
10		* This script is part of the TYPO3 project. The TYPO3 project is
11		* free software; you can redistribute it and/or modify
12		* it under the terms of the GNU General Public License as published by
13		* the Free Software Foundation; either version 2 of the License, or
14		* (at your option) any later version.
15		*
16		* The GNU General Public License can be found at
17		* http://www.gnu.org/copyleft/gpl.html.
18		*
19		* This script is distributed in the hope that it will be useful,
20		* but WITHOUT ANY WARRANTY; without even the implied warranty of
21		* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22		* GNU General Public License for more details.
23		*
24		* This copyright notice MUST APPEAR in all copies of the script!
25		***************************************************************/
26		use ApacheSolrForTypo3\Solr\System\Configuration\TypoScriptConfiguration;
27
28		/**
29		* A content extractor to get clean, indexable content from HTML markup.
30		*
31		* @author Ingo Renner <[email protected]>
32		*/
33		class HtmlContentExtractor
34		{
35
36		/**
37		* Unicode ranges which should get stripped before sending a document to solr.
38		* This is necessary if a document (PDF, etc.) contains unicode characters which
39		* are valid in the font being used in the document but are not available in the
40		* font being used for displaying results.
41		*
42		* This is often the case if PDFs are being indexed where special fonts are used
43		* for displaying bullets, etc. Usually those bullets reside in one of the unicode
44		* "Private Use Zones" or the "Private Use Area" (plane 15 + 16)
45		*
46		* @see http://en.wikipedia.org/wiki/Unicode_block
47		* @var array
48		*/
49		protected static $stripUnicodeRanges = [
50		['FFFD', 'FFFD'],
51		// Replacement Character (�) @see http://en.wikipedia.org/wiki/Specials_%28Unicode_block%29
52		['E000', 'F8FF'],
53		// Private Use Area (part of Plane 0)
54		['F0000', 'FFFFF'],
55		// Supplementary Private Use Area (Plane 15)
56		['100000', '10FFFF'],
57		// Supplementary Private Use Area (Plane 16)
58		];
59		/**
60		* The raw HTML markup content to extract clean content from.
61		*
62		* @var string
63		*/
64		protected $content;
65		/**
66		* Mapping of HTML tags to Solr document fields.
67		*
68		* @var array
69		*/
70		protected $tagToFieldMapping = [
71		'h1' => 'tagsH1',
72		'h2' => 'tagsH2H3',
73		'h3' => 'tagsH2H3',
74		'h4' => 'tagsH4H5H6',
75		'h5' => 'tagsH4H5H6',
76		'h6' => 'tagsH4H5H6',
77		'u' => 'tagsInline',
78		'b' => 'tagsInline',
79		'strong' => 'tagsInline',
80		'i' => 'tagsInline',
81		'em' => 'tagsInline',
82		'a' => 'tagsA',
83		];
84
85		/**
86		* @var TypoScriptConfiguration
87		*/
88		private $configuration;
89
90		/**
91		* Constructor.
92		*
93		* @param string $content Content HTML markup
94		*/
95	73	public function __construct($content)
96		{
97	73	$this->content = $content;
98	73	}
99
100		/**
101		* @return TypoScriptConfiguration|array
102		*/
103	72	protected function getConfiguration()
104		{
105	72	if ($this->configuration == null) {
106	62	$this->configuration = Util::getSolrConfiguration();
107		}
108
109	72	return $this->configuration;
110		}
111
112		/**
113		* @param TypoScriptConfiguration $configuration
114		*/
115	10	public function setConfiguration(TypoScriptConfiguration $configuration)
116		{
117	10	$this->configuration = $configuration;
118	10	}
119
120		/**
121		* Returns the cleaned indexable content from the page's HTML markup.
122		*
123		* The content is cleaned from HTML tags and control chars Solr could
124		* stumble on.
125		*
126		* @return string Indexable, cleaned content ready for indexing.
127		*/
128		public function getIndexableContent()
129		{
130		$content = self::cleanContent($this->content);
131		$content = html_entity_decode($content, ENT_QUOTES, 'UTF-8');
132		// after entity decoding we might have tags again
133		$content = strip_tags($content);
134		$content = trim($content);
135
136		return $content;
137		}
138
139		/**
140		* Strips html tags, and tab, new-line, carriage-return, &nbsp; whitespace
141		* characters.
142		*
143		* @param string $content String to clean
144		* @return string String cleaned from tags and special whitespace characters
145		*/
146	70	public static function cleanContent($content)
147		{
148	70	$content = self::stripControlCharacters($content);
149		// remove Javascript
150	70	$content = preg_replace('@<script[^>]*>.*?<\/script>@msi', '', $content);
151
152		// remove internal CSS styles
153	70	$content = preg_replace('@<style[^>]*>.*?<\/style>@msi', '', $content);
154
155		// prevents concatenated words when stripping tags afterwards
156	70	$content = str_replace(['<', '>'], [' <', '> '], $content);
157	70	$content = static::stripTags($content);
158
159	70	$content = str_replace(["\t", "\n", "\r", '&nbsp;'], ' ', $content);
160	70	$content = self::stripUnicodeRanges($content);
161	70	$content = trim($content);
162
163	70	return $content;
164		}
165
166		/**
167		* Strips html tags, but keeps single < and > characters.
168		*
169		* @param string $content
170		* @return mixed
171		*/
172	70	protected static function stripTags($content)
173		{
174	70	$content = preg_replace('@<([^>]+(<|\z))@msi', '##lt##$1', $content);
175	70	$content = strip_tags($content);
176		// unescape < that are not used to open a tag
177	70	return str_replace('##lt##', '<', $content);
178		}
179
180		/**
181		* Strips control characters that cause Jetty/Solr to fail.
182		*
183		* @param string $content the content to sanitize
184		* @return string the sanitized content
185		* @see http://w3.org/International/questions/qa-forms-utf-8.html
186		*/
187	70	public static function stripControlCharacters($content)
188		{
189		// Printable utf-8 does not include any of these chars below x7F
190	70	return preg_replace('@[\x00-\x08\x0B\x0C\x0E-\x1F]@', ' ', $content);
191		}
192
193		/**
194		* Strips unusable unicode ranges
195		*
196		* @param string $content Content to sanitize
197		* @return string Sanitized content
198		*/
199	70	public static function stripUnicodeRanges($content)
200		{
201	70	foreach (self::$stripUnicodeRanges as $range) {
202	70	$content = self::stripUnicodeRange($content, $range[0], $range[1]);
203		}
204
205	70	return $content;
206		}
207
208		/**
209		* Strips a UTF-8 character range
210		*
211		* @param string $content Content to sanitize
212		* @param string $start Unicode range start character as uppercase hexadecimal string
213		* @param string $end Unicode range end character as uppercase hexadecimal string
214		* @return string Sanitized content
215		*/
216	70	public static function stripUnicodeRange($content, $start, $end)
217		{
218	70	return preg_replace('/[\x{' . $start . '}-\x{' . $end . '}]/u', '',
219	70	$content);
220		}
221
222		/**
223		* Shortcut method to retrieve the raw content marked for indexing.
224		*
225		* @return string Content marked for indexing.
226		*/
227	1	public function getContentMarkedForIndexing()
228		{
229	1	return $this->content;
230		}
231
232		/**
233		* Extracts HTML tag content from tags in the content marked for indexing.
234		*
235		* @return array A mapping of Solr document field names to content found in defined tags.
236		*/
237	63	public function getTagContent()
238		{
239	63	$result = [];
240	63	$matches = [];
241	63	$content = $this->getContentMarkedForIndexing();
242
243		// strip all ignored tags
244	63	$content = strip_tags(
245	63	$content,
246	63	'<' . implode('><', array_keys($this->tagToFieldMapping)) . '>'
247		);
248
249	63	preg_match_all(
250	63	'@<(' . implode('|',
251	63	array_keys($this->tagToFieldMapping)) . ')[^>]*>(.*)</\1>@Ui',
252	63	$content,
253	63	$matches
254		);
255
256	63	foreach ($matches[1] as $key => $tag) {
257		// We don't want to index links auto-generated by the url filter.
258	1	$pattern = '@(?:http://|https://|ftp://|mailto:|smb://|afp://|file://|gopher://|news://|ssl://|sslv2://|sslv3://|tls://|tcp://|udp://|www\.)[a-zA-Z0-9]+@';
259	1	if ($tag != 'a' || !preg_match($pattern, $matches[2][$key])) {
260	1	$fieldName = $this->tagToFieldMapping[$tag];
261	1	$hasContentForFieldName = empty($result[$fieldName]);
262	1	$separator = ($hasContentForFieldName) ? '' : ' ';
263	1	$result[$fieldName] .= $separator . $matches[2][$key];
264		}
265		}
266
267	63	return $result;
268		}
269		}
270

TYPO3-Solr / ext-solr

Push — master ( 42d2d3...8f9ec7 )

HtmlContentExtractor::getIndexableContent() A

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like