HtmlContentExtractor - Code Metrics - Inspection of "[FEATURE] Danish dictionary compound word token fi..." - TYPO3-Solr/ext-solr - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( 498335...52b5ef )

by Rafael

created 2021-07-05 07:36 UTC

HtmlContentExtractor A

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	223
Duplicated Lines	0 %

Test Coverage

Coverage

73.68%

Importance

Changes

Metric	Value
wmc	16
eloc	66
c	0
b	0
f	0
dl	0
loc	223
ccs	42
cts	57
cp	0.7368
rs	10

10 Methods

Rating	Name	Size	Complexity
A	__construct()	4	1
A	stripUnicodeRange()	4	1
A	stripControlCharacters()	4	1
A	stripUnicodeRanges()	7	2
A	cleanContent()	20	1
A	getConfiguration()	7	2
A	getIndexableContent()	7	1
A	getContentMarkedForIndexing()	4	1
A	setConfiguration()	3	1
A	getTagContent()	31	5

<?php
namespace ApacheSolrForTypo3\Solr;

/***************************************************************
 *  Copyright notice
 *
 *  (c) 2011-2015 Ingo Renner <[email protected]>
 *  All rights reserved
 *
 *  This script is part of the TYPO3 project. The TYPO3 project is
 *  free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  The GNU General Public License can be found at
 *  http://www.gnu.org/copyleft/gpl.html.
 *
 *  This script is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  This copyright notice MUST APPEAR in all copies of the script!
 ***************************************************************/
use ApacheSolrForTypo3\Solr\System\Configuration\TypoScriptConfiguration;

/**
 * A content extractor to get clean, indexable content from HTML markup.
 *
 * @author Ingo Renner <[email protected]>
 */
class HtmlContentExtractor
{

    /**
     * Unicode ranges which should get stripped before sending a document to solr.
     * This is necessary if a document (PDF, etc.) contains unicode characters which
     * are valid in the font being used in the document but are not available in the
     * font being used for displaying results.
     *
     * This is often the case if PDFs are being indexed where special fonts are used
     * for displaying bullets, etc. Usually those bullets reside in one of the unicode
     * "Private Use Zones" or the "Private Use Area" (plane 15 + 16)
     *
     * Each entry is a [start, end] pair of uppercase hexadecimal code points;
     * both bounds are inclusive.
     *
     * @see http://en.wikipedia.org/wiki/Unicode_block
     * @var array
     */
    protected static $stripUnicodeRanges = [
        ['FFFD', 'FFFD'],
        // Replacement Character (�) @see http://en.wikipedia.org/wiki/Specials_%28Unicode_block%29
        ['E000', 'F8FF'],
        // Private Use Area (part of Plane 0)
        ['F0000', 'FFFFF'],
        // Supplementary Private Use Area (Plane 15)
        ['100000', '10FFFF'],
        // Supplementary Private Use Area (Plane 16)
    ];

    /**
     * The raw HTML markup content to extract clean content from.
     *
     * @var string
     */
    protected $content;

    /**
     * Mapping of (lowercase) HTML tag names to Solr document fields.
     *
     * Several tags may share one field; their contents are then joined with
     * a single space by getTagContent().
     *
     * @var array
     */
    protected $tagToFieldMapping = [
        'h1' => 'tagsH1',
        'h2' => 'tagsH2H3',
        'h3' => 'tagsH2H3',
        'h4' => 'tagsH4H5H6',
        'h5' => 'tagsH4H5H6',
        'h6' => 'tagsH4H5H6',
        'u' => 'tagsInline',
        'b' => 'tagsInline',
        'strong' => 'tagsInline',
        'i' => 'tagsInline',
        'em' => 'tagsInline',
        'a' => 'tagsA',
    ];

    /**
     * Lazily resolved Solr configuration, see getConfiguration().
     *
     * @var TypoScriptConfiguration
     */
    private $configuration;

    /**
     * Constructor.
     *
     * @param string $content Content HTML markup
     */
    public function __construct($content)
    {
        // @extensionScannerIgnoreLine
        $this->content = $content;
    }

    /**
     * Returns the configuration, fetching it from Util::getSolrConfiguration()
     * on first access unless one was injected through setConfiguration().
     *
     * @return TypoScriptConfiguration|array
     */
    protected function getConfiguration()
    {
        if ($this->configuration === null) {
            $this->configuration = Util::getSolrConfiguration();
        }

        return $this->configuration;
    }

    /**
     * Injects the configuration to use instead of the lazily resolved one.
     *
     * @param TypoScriptConfiguration $configuration
     */
    public function setConfiguration(TypoScriptConfiguration $configuration)
    {
        $this->configuration = $configuration;
    }

    /**
     * Returns the cleaned indexable content from the page's HTML markup.
     *
     * The content is cleaned from HTML tags and control chars Solr could
     * stumble on.
     *
     * @return string Indexable, cleaned content ready for indexing.
     */
    public function getIndexableContent()
    {
        // @extensionScannerIgnoreLine
        $content = self::cleanContent($this->content);

        return trim($content);
    }

    /**
     * Strips html tags, and tab, new-line, carriage-return, &nbsp; whitespace
     * characters.
     *
     * Processing order matters: script/style blocks are removed together with
     * their content before strip_tags() runs (which would only drop the tags
     * themselves), and entities are decoded only after the tags are gone.
     *
     * @param string $content String to clean
     * @return string String cleaned from tags and special whitespace characters
     */
    public static function cleanContent($content)
    {
        $content = self::stripControlCharacters($content);

        // remove Javascript
        $content = preg_replace('@<script[^>]*>.*?<\/script>@msi', '', $content);

        // remove internal CSS styles
        $content = preg_replace('@<style[^>]*>.*?<\/style>@msi', '', $content);

        // prevents concatenated words when stripping tags afterwards
        $content = str_replace(['<', '>'], [' <', '> '], $content);
        $content = str_replace(["\t", "\n", "\r", '&nbsp;'], ' ', $content);
        $content = strip_tags($content);
        $content = html_entity_decode($content, ENT_QUOTES, 'UTF-8');

        $content = self::stripUnicodeRanges($content);

        // collapse the whitespace runs introduced by the replacements above
        $content = preg_replace('/\s{2,}/u', ' ', $content);

        return trim($content);
    }

    /**
     * Strips control characters that cause Jetty/Solr to fail.
     *
     * Every matched character is replaced by a single space. Tab (x09),
     * line feed (x0A) and carriage return (x0D) are left untouched.
     *
     * @param string $content the content to sanitize
     * @return string the sanitized content
     * @see http://w3.org/International/questions/qa-forms-utf-8.html
     */
    public static function stripControlCharacters($content)
    {
        // Printable utf-8 does not include any of these chars below x7F
        return preg_replace('@[\x00-\x08\x0B\x0C\x0E-\x1F]@', ' ', $content);
    }

    /**
     * Strips unusable unicode ranges, i.e. every range listed in
     * self::$stripUnicodeRanges.
     *
     * @param string $content Content to sanitize
     * @return string Sanitized content
     */
    public static function stripUnicodeRanges($content)
    {
        foreach (self::$stripUnicodeRanges as $range) {
            $content = self::stripUnicodeRange($content, $range[0], $range[1]);
        }

        return $content;
    }

    /**
     * Strips a UTF-8 character range
     *
     * @param string $content Content to sanitize
     * @param string $start Unicode range start character as uppercase hexadecimal string
     * @param string $end Unicode range end character as uppercase hexadecimal string
     * @return string Sanitized content
     */
    public static function stripUnicodeRange($content, $start, $end)
    {
        $pattern = '/[\x{' . $start . '}-\x{' . $end . '}]/u';

        return preg_replace($pattern, '', $content);
    }

    /**
     * Shortcut method to retrieve the raw content marked for indexing.
     *
     * @return string Content marked for indexing.
     */
    public function getContentMarkedForIndexing()
    {
        // @extensionScannerIgnoreLine
        return $this->content;
    }

    /**
     * Extracts HTML tag content from tags in the content marked for indexing.
     *
     * Only tags listed in $tagToFieldMapping are considered. Tag names are
     * matched case-insensitively and normalized to lowercase before the field
     * lookup. Contents of tags mapped to the same field are joined with a
     * single space. Links whose text looks like a URL are skipped.
     *
     * @return array A mapping of Solr document field names to content found in defined tags.
     */
    public function getTagContent()
    {
        $result = [];
        $matches = [];
        $indexedTags = array_keys($this->tagToFieldMapping);

        // strip all ignored tags
        $content = strip_tags(
            $this->getContentMarkedForIndexing(),
            '<' . implode('><', $indexedTags) . '>'
        );

        preg_match_all(
            '@<(' . implode('|', $indexedTags) . ')[^>]*>(.*)</\1>@Ui',
            $content,
            $matches
        );

        // We don't want to index links auto-generated by the url filter.
        $autoLinkPattern = '@(?:http://|https://|ftp://|mailto:|smb://|afp://|file://|gopher://|news://|ssl://|sslv2://|sslv3://|tls://|tcp://|udp://|www\.)[a-zA-Z0-9]+@';

        foreach ($matches[1] as $key => $matchedTag) {
            // The pattern above is case-insensitive, so "<H1>" is captured as
            // "H1"; normalize before comparing and before the mapping lookup.
            $tag = strtolower($matchedTag);
            $tagContent = $matches[2][$key];

            if ($tag === 'a' && preg_match($autoLinkPattern, $tagContent)) {
                continue;
            }

            if (!isset($this->tagToFieldMapping[$tag])) {
                continue;
            }

            $fieldName = $this->tagToFieldMapping[$tag];

            // Start from an explicit empty string instead of appending to an
            // undefined key, and compare against '' so that content such as
            // "0" still counts as existing content that needs a separator.
            $existing = isset($result[$fieldName]) ? $result[$fieldName] : '';
            $separator = ($existing === '') ? '' : ' ';
            $result[$fieldName] = $existing . $separator . $tagContent;
        }

        return $result;
    }
}


1		<?php
2		namespace ApacheSolrForTypo3\Solr;
3
4		/***************************************************************
5		* Copyright notice
6		*
7		* (c) 2011-2015 Ingo Renner <[email protected]>
8		* All rights reserved
9		*
10		* This script is part of the TYPO3 project. The TYPO3 project is
11		* free software; you can redistribute it and/or modify
12		* it under the terms of the GNU General Public License as published by
13		* the Free Software Foundation; either version 3 of the License, or
14		* (at your option) any later version.
15		*
16		* The GNU General Public License can be found at
17		* http://www.gnu.org/copyleft/gpl.html.
18		*
19		* This script is distributed in the hope that it will be useful,
20		* but WITHOUT ANY WARRANTY; without even the implied warranty of
21		* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22		* GNU General Public License for more details.
23		*
24		* This copyright notice MUST APPEAR in all copies of the script!
25		***************************************************************/
26		use ApacheSolrForTypo3\Solr\System\Configuration\TypoScriptConfiguration;
27
28		/**
29		* A content extractor to get clean, indexable content from HTML markup.
30		*
31		* @author Ingo Renner <[email protected]>
32		*/
33		class HtmlContentExtractor
34		{
35
36		/**
37		* Unicode ranges which should get stripped before sending a document to solr.
38		* This is necessary if a document (PDF, etc.) contains unicode characters which
39		* are valid in the font being used in the document but are not available in the
40		* font being used for displaying results.
41		*
42		* This is often the case if PDFs are being indexed where special fonts are used
43		* for displaying bullets, etc. Usually those bullets reside in one of the unicode
44		* "Private Use Zones" or the "Private Use Area" (plane 15 + 16)
45		*
46		* @see http://en.wikipedia.org/wiki/Unicode_block
47		* @var array
48		*/
49		protected static $stripUnicodeRanges = [
50		['FFFD', 'FFFD'],
51		// Replacement Character (�) @see http://en.wikipedia.org/wiki/Specials_%28Unicode_block%29
52		['E000', 'F8FF'],
53		// Private Use Area (part of Plane 0)
54		['F0000', 'FFFFF'],
55		// Supplementary Private Use Area (Plane 15)
56		['100000', '10FFFF'],
57		// Supplementary Private Use Area (Plane 16)
58		];
59		/**
60		* The raw HTML markup content to extract clean content from.
61		*
62		* @var string
63		*/
64		protected $content;
65		/**
66		* Mapping of HTML tags to Solr document fields.
67		*
68		* @var array
69		*/
70		protected $tagToFieldMapping = [
71		'h1' => 'tagsH1',
72		'h2' => 'tagsH2H3',
73		'h3' => 'tagsH2H3',
74		'h4' => 'tagsH4H5H6',
75		'h5' => 'tagsH4H5H6',
76		'h6' => 'tagsH4H5H6',
77		'u' => 'tagsInline',
78		'b' => 'tagsInline',
79		'strong' => 'tagsInline',
80		'i' => 'tagsInline',
81		'em' => 'tagsInline',
82		'a' => 'tagsA',
83		];
84
85		/**
86		* @var TypoScriptConfiguration
87		*/
88		private $configuration;
89
90		/**
91		* Constructor.
92		*
93		* @param string $content Content HTML markup
94		*/
95	70	public function __construct($content)
96		{
97		// @extensionScannerIgnoreLine
98	70	$this->content = $content;
99	70	}
100
101		/**
102		* @return TypoScriptConfiguration|array
103		*/
104	70	protected function getConfiguration()
105		{
106	70	if ($this->configuration == null) {
107	70	$this->configuration = Util::getSolrConfiguration();
108		}
109
110	70	return $this->configuration;
111		}
112
113		/**
114		* @param TypoScriptConfiguration $configuration
115		*/
116		public function setConfiguration(TypoScriptConfiguration $configuration)
117		{
118		$this->configuration = $configuration;
119		}
120
121		/**
122		* Returns the cleaned indexable content from the page's HTML markup.
123		*
124		* The content is cleaned from HTML tags and control chars Solr could
125		* stumble on.
126		*
127		* @return string Indexable, cleaned content ready for indexing.
128		*/
129		public function getIndexableContent()
130		{
131		// @extensionScannerIgnoreLine
132		$content = self::cleanContent($this->content);
133		$content = trim($content);
134
135		return $content;
136		}
137
138		/**
139		* Strips html tags, and tab, new-line, carriage-return, &nbsp; whitespace
140		* characters.
141		*
142		* @param string $content String to clean
143		* @return string String cleaned from tags and special whitespace characters
144		*/
145	70	public static function cleanContent($content)
146		{
147	70	$content = self::stripControlCharacters($content);
148		// remove Javascript
149	70	$content = preg_replace('@<script[^>]*>.*?<\/script>@msi', '', $content);
150
151		// remove internal CSS styles
152	70	$content = preg_replace('@<style[^>]*>.*?<\/style>@msi', '', $content);
153
154		// prevents concatenated words when stripping tags afterwards
155	70	$content = str_replace(['<', '>'], [' <', '> '], $content);
156	70	$content = str_replace(["\t", "\n", "\r", '&nbsp;'], ' ', $content);
157	70	$content = strip_tags($content);
158	70	$content = html_entity_decode($content, ENT_QUOTES, 'UTF-8');
159
160	70	$content = self::stripUnicodeRanges($content);
161	70	$content = preg_replace('/\s{2,}/u', ' ', $content);
162	70	$content = trim($content);
163
164	70	return $content;
165		}
166
167		/**
168		* Strips control characters that cause Jetty/Solr to fail.
169		*
170		* @param string $content the content to sanitize
171		* @return string the sanitized content
172		* @see http://w3.org/International/questions/qa-forms-utf-8.html
173		*/
174	70	public static function stripControlCharacters($content)
175		{
176		// Printable utf-8 does not include any of these chars below x7F
177	70	return preg_replace('@[\x00-\x08\x0B\x0C\x0E-\x1F]@', ' ', $content);
178		}
179
180		/**
181		* Strips unusable unicode ranges
182		*
183		* @param string $content Content to sanitize
184		* @return string Sanitized content
185		*/
186	70	public static function stripUnicodeRanges($content)
187		{
188	70	foreach (self::$stripUnicodeRanges as $range) {
189	70	$content = self::stripUnicodeRange($content, $range[0], $range[1]);
190		}
191
192	70	return $content;
193		}
194
195		/**
196		* Strips a UTF-8 character range
197		*
198		* @param string $content Content to sanitize
199		* @param string $start Unicode range start character as uppercase hexadecimal string
200		* @param string $end Unicode range end character as uppercase hexadecimal string
201		* @return string Sanitized content
202		*/
203	70	public static function stripUnicodeRange($content, $start, $end)
204		{
205	70	return preg_replace('/[\x{' . $start . '}-\x{' . $end . '}]/u', '',
206	70	$content);
207		}
208
209		/**
210		* Shortcut method to retrieve the raw content marked for indexing.
211		*
212		* @return string Content marked for indexing.
213		*/
214		public function getContentMarkedForIndexing()
215		{
216		// @extensionScannerIgnoreLine
217		return $this->content;
218		}
219
220		/**
221		* Extracts HTML tag content from tags in the content marked for indexing.
222		*
223		* @return array A mapping of Solr document field names to content found in defined tags.
224		*/
225	70	public function getTagContent()
226		{
227	70	$result = [];
228	70	$matches = [];
229	70	$content = $this->getContentMarkedForIndexing();
230
231		// strip all ignored tags
232	70	$content = strip_tags(
233	70	$content,
234	70	'<' . implode('><', array_keys($this->tagToFieldMapping)) . '>'
235		);
236
237	70	preg_match_all(
238	70	'@<(' . implode('|',
239	70	array_keys($this->tagToFieldMapping)) . ')[^>]*>(.*)</\1>@Ui',
240	70	$content,
241	70	$matches
242		);
243
244	70	foreach ($matches[1] as $key => $tag) {
245		// We don't want to index links auto-generated by the url filter.
246		$pattern = '@(?:http://|https://|ftp://|mailto:|smb://|afp://|file://|gopher://|news://|ssl://|sslv2://|sslv3://|tls://|tcp://|udp://|www\.)[a-zA-Z0-9]+@';
247		if ($tag != 'a' || !preg_match($pattern, $matches[2][$key])) {
248		$fieldName = $this->tagToFieldMapping[$tag];
249		$hasContentForFieldName = empty($result[$fieldName]);
250		$separator = ($hasContentForFieldName) ? '' : ' ';
251		$result[$fieldName] .= $separator . $matches[2][$key];
252		}
253		}
254
255	70	return $result;
256		}
257		}
258

TYPO3-Solr / ext-solr

Push — master ( 498335...52b5ef )

HtmlContentExtractor A

Complexity

Size/Duplication

Test Coverage

Importance

10 Methods

Duplication Side-by-Side

Filter issues like