HtmlContentExtractor::cleanContent() - Code Metrics - Inspection of "[BUGFIX] Fix outdated link in README.md (#2066)" - timohund/ext-solr - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( 4e40a2...cc3f84 )

by Timo

created 2018-07-31 07:42 UTC

HtmlContentExtractor::cleanContent() A

↳ Parent: HtmlContentExtractor

Complexity

Conditions	1
Paths	1

Size

Total Lines	19
Code Lines	10

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	11
CRAP Score	1

Importance

Changes

Metric	Value
eloc	10
dl	0
loc	19
ccs	11
cts	11
cp	1
rs	9.9332
c	0
b	0
f	0
cc	1
nc	1
nop	1
crap	1

<?php
namespace ApacheSolrForTypo3\Solr;

/***************************************************************
 *  Copyright notice
 *
 *  (c) 2011-2015 Ingo Renner <[email protected]>
 *  All rights reserved
 *
 *  This script is part of the TYPO3 project. The TYPO3 project is
 *  free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  The GNU General Public License can be found at
 *  http://www.gnu.org/copyleft/gpl.html.
 *
 *  This script is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  This copyright notice MUST APPEAR in all copies of the script!
 ***************************************************************/
use ApacheSolrForTypo3\Solr\System\Configuration\TypoScriptConfiguration;

/**
 * A content extractor to get clean, indexable content from HTML markup.
 *
 * @author Ingo Renner <[email protected]>
 */
class HtmlContentExtractor
{

    /**
     * Unicode ranges which should get stripped before sending a document to solr.
     * This is necessary if a document (PDF, etc.) contains unicode characters which
     * are valid in the font being used in the document but are not available in the
     * font being used for displaying results.
     *
     * This is often the case if PDFs are being indexed where special fonts are used
     * for displaying bullets, etc. Usually those bullets reside in one of the unicode
     * "Private Use Zones" or the "Private Use Area" (plane 15 + 16)
     *
     * Each entry is a [start, end] pair of code points written as uppercase
     * hexadecimal strings; both bounds are inclusive.
     *
     * @see http://en.wikipedia.org/wiki/Unicode_block
     * @var array
     */
    protected static $stripUnicodeRanges = [
        // Replacement Character (U+FFFD), see http://en.wikipedia.org/wiki/Specials_%28Unicode_block%29
        ['FFFD', 'FFFD'],
        // Private Use Area (part of Plane 0)
        ['E000', 'F8FF'],
        // Supplementary Private Use Area (Plane 15)
        ['F0000', 'FFFFF'],
        // Supplementary Private Use Area (Plane 16)
        ['100000', '10FFFF'],
    ];

    /**
     * The raw HTML markup content to extract clean content from.
     *
     * @var string
     */
    protected $content;

    /**
     * Mapping of HTML tags to Solr document fields.
     *
     * Keys are lowercase tag names; several tags may share one field.
     *
     * @var array
     */
    protected $tagToFieldMapping = [
        'h1' => 'tagsH1',
        'h2' => 'tagsH2H3',
        'h3' => 'tagsH2H3',
        'h4' => 'tagsH4H5H6',
        'h5' => 'tagsH4H5H6',
        'h6' => 'tagsH4H5H6',
        'u' => 'tagsInline',
        'b' => 'tagsInline',
        'strong' => 'tagsInline',
        'i' => 'tagsInline',
        'em' => 'tagsInline',
        'a' => 'tagsA',
    ];

    /**
     * Lazily resolved configuration, see getConfiguration().
     *
     * @var TypoScriptConfiguration
     */
    private $configuration;

    /**
     * Constructor.
     *
     * @param string $content Content HTML markup
     */
    public function __construct($content)
    {
        $this->content = $content;
    }

    /**
     * Returns the configuration, fetching it via Util::getSolrConfiguration()
     * on first use unless one was injected through setConfiguration().
     *
     * @return TypoScriptConfiguration|array
     */
    protected function getConfiguration()
    {
        // strict check: only a configuration that was never set triggers the lookup
        if ($this->configuration === null) {
            $this->configuration = Util::getSolrConfiguration();
        }

        return $this->configuration;
    }

    /**
     * Injects the configuration to use instead of the lazily resolved one.
     *
     * @param TypoScriptConfiguration $configuration
     */
    public function setConfiguration(TypoScriptConfiguration $configuration)
    {
        $this->configuration = $configuration;
    }

    /**
     * Returns the cleaned indexable content from the page's HTML markup.
     *
     * The content is cleaned from HTML tags and control chars Solr could
     * stumble on.
     *
     * @return string Indexable, cleaned content ready for indexing.
     */
    public function getIndexableContent()
    {
        $content = self::cleanContent($this->content);
        $content = html_entity_decode($content, ENT_QUOTES, 'UTF-8');
        // after entity decoding we might have tags again
        $content = strip_tags($content);
        $content = trim($content);

        return $content;
    }

    /**
     * Strips html tags, and tab, new-line, carriage-return, &nbsp; whitespace
     * characters.
     *
     * Script and style elements are removed together with their content,
     * control characters and the configured unicode ranges are stripped and
     * runs of whitespace are collapsed into a single space.
     *
     * @param string $content String to clean
     * @return string String cleaned from tags and special whitespace characters
     */
    public static function cleanContent($content)
    {
        $content = self::stripControlCharacters($content);
        // remove Javascript
        $content = preg_replace('@<script[^>]*>.*?<\/script>@msi', '', $content);

        // remove internal CSS styles
        $content = preg_replace('@<style[^>]*>.*?<\/style>@msi', '', $content);

        // prevents concatenated words when stripping tags afterwards
        $content = str_replace(['<', '>'], [' <', '> '], $content);
        // late static binding on purpose: stripTags() is protected and may be overridden
        $content = static::stripTags($content);

        $content = str_replace(["\t", "\n", "\r", '&nbsp;'], ' ', $content);
        $content = self::stripUnicodeRanges($content);
        $content = preg_replace('/\s{2,}/', ' ', $content);
        $content = trim($content);

        return $content;
    }

    /**
     * Strips html tags, but keeps single < and > characters.
     *
     * A "<" that is followed by another "<" or by the end of the string
     * before any ">" is temporarily masked so that strip_tags() does not
     * treat it as the start of a tag.
     *
     * @param string $content
     * @return string Content without tags
     */
    protected static function stripTags($content)
    {
        $content = preg_replace('@<([^>]+(<|\z))@msi', '##lt##$1', $content);
        $content = strip_tags($content);
        // unescape < that are not used to open a tag
        return str_replace('##lt##', '<', $content);
    }

    /**
     * Strips control characters that cause Jetty/Solr to fail.
     *
     * Each matched character is replaced by a single space; tab, line feed
     * and carriage return are not part of the pattern and stay untouched.
     *
     * @param string $content the content to sanitize
     * @return string the sanitized content
     * @see http://w3.org/International/questions/qa-forms-utf-8.html
     */
    public static function stripControlCharacters($content)
    {
        // Printable utf-8 does not include any of these chars below x7F
        return preg_replace('@[\x00-\x08\x0B\x0C\x0E-\x1F]@', ' ', $content);
    }

    /**
     * Strips unusable unicode ranges
     *
     * Applies stripUnicodeRange() once for every entry of $stripUnicodeRanges.
     *
     * @param string $content Content to sanitize
     * @return string Sanitized content
     */
    public static function stripUnicodeRanges($content)
    {
        foreach (self::$stripUnicodeRanges as $range) {
            $content = self::stripUnicodeRange($content, $range[0], $range[1]);
        }

        return $content;
    }

    /**
     * Strips a UTF-8 character range
     *
     * @param string $content Content to sanitize
     * @param string $start Unicode range start character as uppercase hexadecimal string
     * @param string $end Unicode range end character as uppercase hexadecimal string
     * @return string Sanitized content
     */
    public static function stripUnicodeRange($content, $start, $end)
    {
        return preg_replace('/[\x{' . $start . '}-\x{' . $end . '}]/u', '',
            $content);
    }

    /**
     * Shortcut method to retrieve the raw content marked for indexing.
     *
     * @return string Content marked for indexing.
     */
    public function getContentMarkedForIndexing()
    {
        return $this->content;
    }

    /**
     * Extracts HTML tag content from tags in the content marked for indexing.
     *
     * The content of all tags sharing a Solr field is joined with a single
     * space. Links whose text looks like a URL are skipped.
     *
     * @return array A mapping of Solr document field names to content found in defined tags.
     */
    public function getTagContent()
    {
        $result = [];
        $matches = [];
        $content = $this->getContentMarkedForIndexing();
        $tagNames = array_keys($this->tagToFieldMapping);

        // strip all ignored tags
        $content = strip_tags(
            $content,
            '<' . implode('><', $tagNames) . '>'
        );

        preg_match_all(
            '@<(' . implode('|', $tagNames) . ')[^>]*>(.*)</\1>@Ui',
            $content,
            $matches
        );

        // We don't want to index links auto-generated by the url filter.
        $autoLinkPattern = '@(?:http://|https://|ftp://|mailto:|smb://|afp://|file://|gopher://|news://|ssl://|sslv2://|sslv3://|tls://|tcp://|udp://|www\.)[a-zA-Z0-9]+@';

        foreach ($matches[1] as $key => $matchedTag) {
            // the pattern matches case-insensitively, the mapping keys are lowercase
            $tag = strtolower($matchedTag);
            $tagContent = $matches[2][$key];

            if ($tag === 'a' && preg_match($autoLinkPattern, $tagContent)) {
                continue;
            }

            $fieldName = $this->tagToFieldMapping[$tag];
            // explicit comparison instead of empty(): a field holding "0" has content
            if (!isset($result[$fieldName]) || $result[$fieldName] === '') {
                $result[$fieldName] = $tagContent;
            } else {
                $result[$fieldName] .= ' ' . $tagContent;
            }
        }

        return $result;
    }
}


1		<?php
2		namespace ApacheSolrForTypo3\Solr;
3
4		/***************************************************************
5		* Copyright notice
6		*
7		* (c) 2011-2015 Ingo Renner <[email protected]>
8		* All rights reserved
9		*
10		* This script is part of the TYPO3 project. The TYPO3 project is
11		* free software; you can redistribute it and/or modify
12		* it under the terms of the GNU General Public License as published by
13		* the Free Software Foundation; either version 3 of the License, or
14		* (at your option) any later version.
15		*
16		* The GNU General Public License can be found at
17		* http://www.gnu.org/copyleft/gpl.html.
18		*
19		* This script is distributed in the hope that it will be useful,
20		* but WITHOUT ANY WARRANTY; without even the implied warranty of
21		* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22		* GNU General Public License for more details.
23		*
24		* This copyright notice MUST APPEAR in all copies of the script!
25		***************************************************************/
26		use ApacheSolrForTypo3\Solr\System\Configuration\TypoScriptConfiguration;
27
28		/**
29		* A content extractor to get clean, indexable content from HTML markup.
30		*
31		* @author Ingo Renner <[email protected]>
32		*/
33		class HtmlContentExtractor
34		{
35
36		/**
37		* Unicode ranges which should get stripped before sending a document to solr.
38		* This is necessary if a document (PDF, etc.) contains unicode characters which
39		* are valid in the font being used in the document but are not available in the
40		* font being used for displaying results.
41		*
42		* This is often the case if PDFs are being indexed where special fonts are used
43		* for displaying bullets, etc. Usually those bullets reside in one of the unicode
44		* "Private Use Zones" or the "Private Use Area" (plane 15 + 16)
45		*
46		* @see http://en.wikipedia.org/wiki/Unicode_block
47		* @var array
48		*/
49		protected static $stripUnicodeRanges = [
50		['FFFD', 'FFFD'],
51		// Replacement Character (�) @see http://en.wikipedia.org/wiki/Specials_%28Unicode_block%29
52		['E000', 'F8FF'],
53		// Private Use Area (part of Plane 0)
54		['F0000', 'FFFFF'],
55		// Supplementary Private Use Area (Plane 15)
		0 ignored issues – show Unused Code Comprehensibility introduced 2018-02-07 08:34 UTC by Report Bug Copy Issue Report `36%` of this comment could be valid code. Did you maybe forget this after debugging? Sometimes obsolete code just ends up commented out instead of removed. In this case it is better to remove the code once you have checked you do not need it. The code might also have been commented out for debugging purposes. In this case it is vital that someone uncomments it again or your project may behave in very unexpected ways in production. This check looks for comments that seem to be mostly valid code and reports them. Loading history...
56		['100000', '10FFFF'],
57		// Supplementary Private Use Area (Plane 16)
		0 ignored issues – show Unused Code Comprehensibility introduced 2018-02-07 08:34 UTC by Report Bug Copy Issue Report `36%` of this comment could be valid code. Did you maybe forget this after debugging? Sometimes obsolete code just ends up commented out instead of removed. In this case it is better to remove the code once you have checked you do not need it. The code might also have been commented out for debugging purposes. In this case it is vital that someone uncomments it again or your project may behave in very unexpected ways in production. This check looks for comments that seem to be mostly valid code and reports them. Loading history...
58		];
59		/**
60		* The raw HTML markup content to extract clean content from.
61		*
62		* @var string
63		*/
64		protected $content;
65		/**
66		* Mapping of HTML tags to Solr document fields.
67		*
68		* @var array
69		*/
70		protected $tagToFieldMapping = [
71		'h1' => 'tagsH1',
72		'h2' => 'tagsH2H3',
73		'h3' => 'tagsH2H3',
74		'h4' => 'tagsH4H5H6',
75		'h5' => 'tagsH4H5H6',
76		'h6' => 'tagsH4H5H6',
77		'u' => 'tagsInline',
78		'b' => 'tagsInline',
79		'strong' => 'tagsInline',
80		'i' => 'tagsInline',
81		'em' => 'tagsInline',
82		'a' => 'tagsA',
83		];
84
85		/**
86		* @var TypoScriptConfiguration
87		*/
88		private $configuration;
89
90		/**
91		* Constructor.
92		*
93		* @param string $content Content HTML markup
94		*/
95	81	public function __construct($content)
96		{
97	81	$this->content = $content;
98	81	}
99
100		/**
101		* @return TypoScriptConfiguration\|array
102		*/
103	76	protected function getConfiguration()
104		{
105	76	if ($this->configuration == null) {
106	66	$this->configuration = Util::getSolrConfiguration();
107		}
108
109	76	return $this->configuration;
110		}
111
112		/**
113		* @param TypoScriptConfiguration $configuration
114		*/
115	10	public function setConfiguration(TypoScriptConfiguration $configuration)
116		{
117	10	$this->configuration = $configuration;
118	10	}
119
120		/**
121		* Returns the cleaned indexable content from the page's HTML markup.
122		*
123		* The content is cleaned from HTML tags and control chars Solr could
124		* stumble on.
125		*
126		* @return string Indexable, cleaned content ready for indexing.
127		*/
128	4	public function getIndexableContent()
129		{
130	4	$content = self::cleanContent($this->content);
131	4	$content = html_entity_decode($content, ENT_QUOTES, 'UTF-8');
132		// after entity decoding we might have tags again
133	4	$content = strip_tags($content);
134	4	$content = trim($content);
135
136	4	return $content;
137		}
138
139		/**
140		* Strips html tags, and tab, new-line, carriage-return, &nbsp; whitespace
141		* characters.
142		*
143		* @param string $content String to clean
144		* @return string String cleaned from tags and special whitespace characters
145		*/
146	78	public static function cleanContent($content)
147		{
148	78	$content = self::stripControlCharacters($content);
149		// remove Javascript
150	78	$content = preg_replace('@<script[^>]*>.*?<\/script>@msi', '', $content);
151
152		// remove internal CSS styles
153	78	$content = preg_replace('@<style[^>]*>.*?<\/style>@msi', '', $content);
154
155		// prevents concatenated words when stripping tags afterwards
156	78	$content = str_replace(['<', '>'], [' <', '> '], $content);
157	78	$content = static::stripTags($content);
158
159	78	$content = str_replace(["\t", "\n", "\r", '&nbsp;'], ' ', $content);
160	78	$content = self::stripUnicodeRanges($content);
161	78	$content = preg_replace('/\s{2,}/', ' ', $content);
162	78	$content = trim($content);
163
164	78	return $content;
165		}
166
167		/**
168		* Strips html tags, but keeps single < and > characters.
169		*
170		* @param string $content
171		* @return mixed
172		*/
173	78	protected static function stripTags($content)
174		{
175	78	$content = preg_replace('@<([^>]+(<|\z))@msi', '##lt##$1', $content);
176	78	$content = strip_tags($content);
177		// unescape < that are not used to open a tag
178	78	return str_replace('##lt##', '<', $content);
179		}
180
181		/**
182		* Strips control characters that cause Jetty/Solr to fail.
183		*
184		* @param string $content the content to sanitize
185		* @return string the sanitized content
186		* @see http://w3.org/International/questions/qa-forms-utf-8.html
187		*/
188	78	public static function stripControlCharacters($content)
189		{
190		// Printable utf-8 does not include any of these chars below x7F
191	78	return preg_replace('@[\x00-\x08\x0B\x0C\x0E-\x1F]@', ' ', $content);
192		}
193
194		/**
195		* Strips unusable unicode ranges
196		*
197		* @param string $content Content to sanitize
198		* @return string Sanitized content
199		*/
200	78	public static function stripUnicodeRanges($content)
201		{
202	78	foreach (self::$stripUnicodeRanges as $range) {
203	78	$content = self::stripUnicodeRange($content, $range[0], $range[1]);
204		}
205
206	78	return $content;
207		}
208
209		/**
210		* Strips a UTF-8 character range
211		*
212		* @param string $content Content to sanitize
213		* @param string $start Unicode range start character as uppercase hexadecimal string
214		* @param string $end Unicode range end character as uppercase hexadecimal string
215		* @return string Sanitized content
216		*/
217	78	public static function stripUnicodeRange($content, $start, $end)
218		{
219	78	return preg_replace('/[\x{' . $start . '}-\x{' . $end . '}]/u', '',
220	78	$content);
221		}
222
223		/**
224		* Shortcut method to retrieve the raw content marked for indexing.
225		*
226		* @return string Content marked for indexing.
227		*/
228	1	public function getContentMarkedForIndexing()
229		{
230	1	return $this->content;
231		}
232
233		/**
234		* Extracts HTML tag content from tags in the content marked for indexing.
235		*
236		* @return array A mapping of Solr document field names to content found in defined tags.
237		*/
238	67	public function getTagContent()
239		{
240	67	$result = [];
241	67	$matches = [];
242	67	$content = $this->getContentMarkedForIndexing();
243
244		// strip all ignored tags
245	67	$content = strip_tags(
246	67	$content,
247	67	'<' . implode('><', array_keys($this->tagToFieldMapping)) . '>'
248		);
249
250	67	preg_match_all(
251	67	'@<(' . implode('|',
252	67	array_keys($this->tagToFieldMapping)) . ')[^>]*>(.*)</\1>@Ui',
253	67	$content,
254	67	$matches
255		);
256
257	67	foreach ($matches[1] as $key => $tag) {
258		// We don't want to index links auto-generated by the url filter.
259	1	$pattern = '@(?:http://|https://|ftp://|mailto:|smb://|afp://|file://|gopher://|news://|ssl://|sslv2://|sslv3://|tls://|tcp://|udp://|www\.)[a-zA-Z0-9]+@';
260	1	if ($tag != 'a' || !preg_match($pattern, $matches[2][$key])) {
261	1	$fieldName = $this->tagToFieldMapping[$tag];
262	1	$hasContentForFieldName = empty($result[$fieldName]);
263	1	$separator = ($hasContentForFieldName) ? '' : ' ';
264	1	$result[$fieldName] .= $separator . $matches[2][$key];
265		}
266		}
267
268	67	return $result;
269		}
270		}
271

timohund / ext-solr

Push — master ( 4e40a2...cc3f84 )

HtmlContentExtractor::cleanContent() A

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like