Lexer::get_word() - Code Metrics - Inspection of "[BUGFIX] Show folder meta information next to path" - TYPO3/TYPO3.CMS - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Branch — master (6c65a4)

by Christian

created 2018-01-13 14:30 UTC

Lexer::get_word() A

↳ Parent: Lexer

Complexity

Conditions	3
Paths	3

Size

Total Lines	15
Code Lines	8

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	3
eloc	8
nc	3
nop	2
dl	0
loc	15
rs	9.4285
c	0
b	0
f	0

<?php
namespace TYPO3\CMS\IndexedSearch;

/*
 * This file is part of the TYPO3 CMS project.
 *
 * It is free software; you can redistribute it and/or modify it under
 * the terms of the GNU General Public License, either version 2
 * of the License, or any later version.
 *
 * For the full copyright and license information, please read the
 * LICENSE.txt file that was distributed with this source code.
 *
 * The TYPO3 project - inspiring people to share!
 */

/**
 * Lexer class for indexed_search
 *
 * A lexer splits a UTF-8 text into words. Western text is split on
 * non-letter characters; CJK sequences are split into overlapping pairs of
 * characters (see addWords()).
 */
class Lexer
{
    /**
     * Debugging option: if TRUE, split2Words() fills $debugString.
     *
     * @var bool
     */
    public $debug = false;

    /**
     * If $debug is set, this is filled with HTML output highlighting
     * search / non-search words (for backend display).
     *
     * @var string
     */
    public $debugString = '';

    /**
     * Charset class object
     *
     * @var \TYPO3\CMS\Core\Charset\CharsetConverter
     */
    public $csObj;

    /**
     * Configuration of the lexer:
     *
     * - printjoins: code points that may join letters inside a word
     * - casesensitive: set, if case sensitive indexing is wanted
     * - removeChars: code points stripped from non-CJK words before they are stored
     *
     * @var array
     */
    public $lexerConf = [
        // Characters: . - _ : / '
        'printjoins' => [46, 45, 95, 58, 47, 39],
        'casesensitive' => false,
        // Character: -
        'removeChars' => [45]
    ];

    /**
     * Constructor: Initializes the charset class
     */
    public function __construct()
    {
        $this->csObj = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance(\TYPO3\CMS\Core\Charset\CharsetConverter::class);
    }

    /**
     * Splitting string into words.
     * Used for indexing, can also be used to find words in query.
     *
     * The input is lowercased first unless $lexerConf['casesensitive'] is set.
     *
     * @param string $wordString String with UTF-8 content to process.
     * @return array Array of words in utf-8
     */
    public function split2Words($wordString)
    {
        // Reset debug string:
        $this->debugString = '';
        // Lowercasing happens before any byte offsets are computed, so the
        // offsets below always refer to the string that is actually sliced.
        if (!$this->lexerConf['casesensitive']) {
            $wordString = mb_strtolower((string)$wordString, 'utf-8');
        }
        $pos = 0;
        $words = [];
        while (true) {
            $found = $this->get_word($wordString, $pos);
            // get_word() returns FALSE (not an array) when the input is exhausted.
            if ($found === false || !$found[1]) {
                break;
            }
            list($start, $len) = $found;
            $this->addWords($words, $wordString, $start, $len);
            if ($this->debug) {
                // Skipped (non-word) bytes are rendered red, the word itself plain.
                $this->debugString .= '<span style="color:red">' . htmlspecialchars(substr($wordString, $pos, ($start - $pos))) . '</span>' . htmlspecialchars(substr($wordString, $start, $len));
            }
            $pos = $start + $len;
        }
        return $words;
    }

    /**********************************
     *
     * Helper functions
     *
     ********************************/
    /**
     * Add word to word-array
     * This function should be used to make sure CJK sequences are split up in the right way
     *
     * Whether the CJK algorithm is used is decided by the type of the first
     * character of the word only.
     *
     * @param array $words Array of accumulated words (reference, appended to)
     * @param string $wordString Complete input string from where to extract word (reference, not modified)
     * @param int $start Start position (byte offset) of word in input string
     * @param int $len The length (in bytes) of the word string from start position
     */
    public function addWords(&$words, &$wordString, $start, $len)
    {
        // Get word out of string:
        $theWord = (string)substr($wordString, $start, $len);
        // Get the first char's unicode number and find its type:
        $bc = 0;
        $cp = $this->utf8_ord($theWord, $bc);
        list($cType) = $this->charType($cp);
        /*
        DESCRIPTION OF (CJK) ALGORITHM

        Continuous letters and numbers make up words. Spaces and symbols
        separate letters and numbers into words. This is sufficient for
        all western text.

        CJK doesn't use spaces or separators to separate words, so the only
        way to really find out what constitutes a word would be to have a
        dictionary and advanced heuristics. Instead, we form pairs from
        consecutive characters, in such a way that searches will find only
        characters that appear more-or-less the right sequence. For example:

        ABCDE => AB BC CD DE

        This works okay since both the index and the search query is split
        in the same manner, and since the set of characters is huge so the
        extra matches are not significant.

        (Hint taken from ZOPEs chinese user group)

        [Kasper: As far as I can see this will only work well with or-searches!]
         */
        if ($cType === 'cjk') {
            // Find total string length (in characters, not bytes):
            $strlen = mb_strlen($theWord, 'utf-8');
            // Traverse string length and add words as pairs of two chars.
            // A single character is added on its own; otherwise the last
            // character only appears as the second half of the final pair.
            for ($a = 0; $a < $strlen; $a++) {
                if ($strlen == 1 || $a < $strlen - 1) {
                    $words[] = mb_substr($theWord, $a, 2, 'utf-8');
                }
            }
        } else {
            // Normal "single-byte" chars:
            // Remove chars:
            foreach ($this->lexerConf['removeChars'] as $skipJoin) {
                $theWord = str_replace($this->csObj->UnumberToChar($skipJoin), '', $theWord);
            }
            // Add word:
            $words[] = $theWord;
        }
    }

    /**
     * Get the first word in a given utf-8 string (initial non-letters will be skipped)
     *
     * @param string $str Input string (reference, not modified)
     * @param int $pos Starting position (byte offset) in input string
     * @return array|false Array with 0: start, 1: len (both in bytes) - or FALSE if no word has been found
     */
    public function get_word(&$str, $pos = 0)
    {
        $len = 0;
        // If return is TRUE, a word was found starting at this position, so returning position and length:
        if ($this->utf8_is_letter($str, $len, $pos)) {
            return [$pos, $len];
        }
        // If the return value was FALSE it means a sequence of non-word chars were found (or blank string) - so we will start another search for the word:
        $pos += $len;
        // Check end of string before looking for a word. isset() is used
        // because reading an offset beyond the end of a string raises an
        // "Uninitialized string offset" notice/warning.
        if (!isset($str[$pos])) {
            return false;
        }
        $this->utf8_is_letter($str, $len, $pos);
        return [$pos, $len];
    }

    /**
     * See if a character is a letter (or a string of letters or non-letters).
     *
     * Starting at $pos the method consumes either a run of word characters
     * (letters, digits, CJK, optionally joined by print-join characters) or a
     * run of non-word characters, and reports the byte length of that run in
     * $len. A switch between CJK and alpha/num characters ends a word as well.
     *
     * @param string $str Input string (reference, not modified)
     * @param int $len Byte-length of character sequence (reference, return value)
     * @param int $pos Starting position (byte offset) in input string
     * @return bool TRUE if the run is a word, FALSE if it is a run of non-word chars or the string ended at $pos
     */
    public function utf8_is_letter(&$str, &$len, $pos = 0)
    {
        $len = 0;
        // Byte length of the most recently decoded character. It is added to
        // $len one iteration late, so that $len never includes the character
        // currently being judged.
        $bc = 0;
        $cp = 0;
        // Length of the word up to (not including) the first of a run of print-join chars
        $printJoinLgd = 0;
        $cType = ($cType_prev = false);
        // TRUE while we assume the run is a word; cleared if the first char is a non-letter
        $letter = true;
        if (!isset($str[$pos])) {
            // Return FALSE on end-of-string at this stage
            return false;
        }
        while (true) {
            // If characters has been obtained we will know whether the string starts as a sequence of letters or not:
            if ($len) {
                if ($letter) {
                    // We are in a sequence of words
                    if (
                        !$cType
                        || ($cType_prev === 'cjk' && ($cType === 'num' || $cType === 'alpha'))
                        || ($cType === 'cjk' && ($cType_prev === 'num' || $cType_prev === 'alpha'))
                    ) {
                        // Check if the non-letter char is NOT a print-join char because then it signifies the end of the word.
                        // (Loose comparison kept: configured values may be numeric strings.)
                        if (!in_array($cp, $this->lexerConf['printjoins'])) {
                            // If a printjoin start length has been recorded, set that back now so the length is right (filtering out multiple end chars)
                            if ($printJoinLgd) {
                                $len = $printJoinLgd;
                            }
                            return true;
                        }
                        // If a printJoin char is found, record the length if it has not been recorded already:
                        if (!$printJoinLgd) {
                            $printJoinLgd = $len;
                        }
                    } else {
                        // When a true letter is found, reset printJoinLgd counter:
                        $printJoinLgd = 0;
                    }
                } elseif ($cType) {
                    // end of non-word reached
                    return false;
                }
            }
            // add byte-length of last found character
            $len += $bc;
            if (!isset($str[$pos])) {
                // End of string. Trailing print-join chars must not become
                // part of the word, exactly as when a non-letter follows
                // them in the middle of a string ("web." => "web").
                if ($letter && $printJoinLgd) {
                    $len = $printJoinLgd;
                }
                // Return status of string till now
                return $letter;
            }
            // Get next chars unicode number:
            $cp = $this->utf8_ord($str, $bc, $pos);
            $pos += $bc;
            // Determine the type:
            $cType_prev = $cType;
            list($cType) = $this->charType($cp);
            if ($cType) {
                continue;
            }
            // Setting letter to FALSE if the first char was not a letter!
            if (!$len) {
                $letter = false;
            }
        }
    }

    /**
     * Determine the type of character
     *
     * @param int $cp Unicode number to evaluate
     * @return array|null Type of char; index-0: the main type: num, alpha or cjk (Chinese / Japanese / Korean). NULL if the char is none of these.
     */
    public function charType($cp)
    {
        // Numeric? (0-9)
        if ($cp >= 48 && $cp <= 57) {
            return ['num'];
        }
        // LOOKING for Alpha chars (Latin, Cyrillic, Greek, Hebrew and Arabic):
        if (
            // A-Z, a-z
            ($cp >= 65 && $cp <= 90)
            || ($cp >= 97 && $cp <= 122)
            // Latin-1 letters, without the multiplication (215) and division (247) signs
            || ($cp >= 192 && $cp <= 255 && $cp != 215 && $cp != 247)
            // Extended Latin
            || ($cp >= 256 && $cp < 640)
            // Greek
            || ($cp == 902 || ($cp >= 904 && $cp < 1024))
            // Cyrillic
            || (($cp >= 1024 && $cp < 1154) || ($cp >= 1162 && $cp < 1328))
            // Hebrew
            || (($cp >= 1424 && $cp < 1456) || ($cp >= 1488 && $cp < 1523))
            // Arabic
            || (($cp >= 1569 && $cp <= 1624) || ($cp >= 1646 && $cp <= 1747))
            // Additional extended Latin / Greek
            || ($cp >= 7680 && $cp < 8192)
        ) {
            return ['alpha'];
        }
        // Looking for CJK (Chinese / Japanese / Korean)
        // Ranges are not certain - deducted from the translation tables in typo3/sysext/core/Resources/Private/Charsets/csconvtbl/
        // Verified with http://www.unicode.org/charts/ (16/2) - may still not be complete.
        if (
            ($cp >= 12352 && $cp <= 12543)
            || ($cp >= 12592 && $cp <= 12687)
            || ($cp >= 13312 && $cp <= 19903)
            || ($cp >= 19968 && $cp <= 40879)
            || ($cp >= 44032 && $cp <= 55215)
            || ($cp >= 131072 && $cp <= 195103)
        ) {
            return ['cjk'];
        }
        // Neither digit, letter nor CJK (whitespace, punctuation, symbols, ...).
        // Callers destructure the result with list(), which yields NULL here.
        return null;
    }

    /**
     * Converts a UTF-8 multibyte character to a UNICODE codepoint
     *
     * The input is not validated. Bytes missing from a truncated sequence
     * are treated as 0, and the lead byte 0xFF (which can never start a
     * sequence) is consumed as a single byte with code point 0.
     *
     * @param string $str UTF-8 multibyte character string (reference, not modified)
     * @param int $len The length of the character in bytes (reference, return value)
     * @param int $pos Starting position (byte offset) in input string
     * @param bool $hex If set, then a hex. number is returned
     * @return int|string UNICODE codepoint as integer, or as string "x" + hex digits if $hex is set
     */
    public function utf8_ord(&$str, &$len, $pos = 0, $hex = false)
    {
        // A position past the end yields 0 (as ord('') would) without
        // triggering an "Uninitialized string offset" notice.
        $ord = isset($str[$pos]) ? ord($str[$pos]) : 0;
        $len = 1;
        if ($ord > 128) {
            // Count the leading 1-bits of the lead byte; one less than that
            // is the number of extra (continuation) bytes.
            for ($bc = -1, $mbs = $ord; $mbs & 128; $mbs = $mbs << 1) {
                $bc++;
            }
            if ($bc > 6) {
                // Lead byte 0xFF: there are no payload bits and the mask
                // below would need a negative shift (ArithmeticError).
                // Consume it as one undecodable, non-letter byte.
                $ord = 0;
            } else {
                $len += $bc;
                // mask utf-8 lead-in bytes
                $ord = $ord & ((1 << (6 - $bc)) - 1);
                // "bring in" data bytes
                for ($i = $pos + 1; $bc; $bc--, $i++) {
                    $dataByte = isset($str[$i]) ? ord($str[$i]) : 0;
                    $ord = ($ord << 6) | ($dataByte & 63);
                }
            }
        }
        return $hex ? 'x' . dechex($ord) : $ord;
    }
}


1			<?php
2			namespace TYPO3\CMS\IndexedSearch;
3
4			/*
5			* This file is part of the TYPO3 CMS project.
6			*
7			* It is free software; you can redistribute it and/or modify it under
8			* the terms of the GNU General Public License, either version 2
9			* of the License, or any later version.
10			*
11			* For the full copyright and license information, please read the
12			* LICENSE.txt file that was distributed with this source code.
13			*
14			* The TYPO3 project - inspiring people to share!
15			*/
16
17			/**
18			* Lexer class for indexed_search
19			* A lexer splits the text into words
20			*/
21			class Lexer
22			{
23			/**
24			* Debugging options:
25			*
26			* @var bool
27			*/
28			public $debug = false;
29
30			/**
31			* If set, the debugString is filled with HTML output highlighting search / non-search words (for backend display)
32			*
33			* @var string
34			*/
35			public $debugString = '';
36
37			/**
38			* Charset class object
39			*
40			* @var \TYPO3\CMS\Core\Charset\CharsetConverter
41			*/
42			public $csObj;
43
44			/**
45			* Configuration of the lexer:
46			*
47			* @var array
48			*/
49			public $lexerConf = [
50			//Characters: . - _ : / '
51			'printjoins' => [46, 45, 95, 58, 47, 39],
52			'casesensitive' => false,
53			// Set, if case sensitive indexing is wanted.
54			'removeChars' => [45]
55			];
56
57			/**
58			* Constructor: Initializes the charset class
59			*/
60			public function __construct()
61			{
62			$this->csObj = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance(\TYPO3\CMS\Core\Charset\CharsetConverter::class);
63			}
64
65			/**
66			* Splitting string into words.
67			* Used for indexing, can also be used to find words in query.
68			*
69			* @param string String with UTF-8 content to process.
70			* @return array Array of words in utf-8
71			*/
72			public function split2Words($wordString)
73			{
74			// Reset debug string:
75			$this->debugString = '';
76			// Then convert the string to lowercase:
77			if (!$this->lexerConf['casesensitive']) {
78			$wordString = mb_strtolower($wordString, 'utf-8');
79			}
80			// Now, splitting words:
81			$len = 0;
			Issue (Unused Code, introduced 2017-12-06 15:56 UTC): The assignment to `$len` is dead and can be removed.
82			$start = 0;
			Issue (Unused Code, introduced 2017-12-06 15:56 UTC): The assignment to `$start` is dead and can be removed.
83			$pos = 0;
84			$words = [];
85			$this->debugString = '';
86			while (1) {
87			list($start, $len) = $this->get_word($wordString, $pos);
88			if ($len) {
89			$this->addWords($words, $wordString, $start, $len);
90			if ($this->debug) {
91			$this->debugString .= '<span style="color:red">' . htmlspecialchars(substr($wordString, $pos, ($start - $pos))) . '</span>' . htmlspecialchars(substr($wordString, $start, $len));
92			}
93			$pos = $start + $len;
94			} else {
95			break;
96			}
97			}
98			return $words;
99			}
100
101			/**********************************
102			*
103			* Helper functions
104			*
105			********************************/
106			/**
107			* Add word to word-array
108			* This function should be used to make sure CJK sequences are split up in the right way
109			*
110			* @param array $words Array of accumulated words
111			* @param string $wordString Complete Input string from where to extract word
112			* @param int $start Start position of word in input string
113			* @param int $len The Length of the word string from start position
114			*/
115			public function addWords(&$words, &$wordString, $start, $len)
116			{
117			// Get word out of string:
118			$theWord = substr($wordString, $start, $len);
119			// Get next chars unicode number and find type:
120			$bc = 0;
121			$cp = $this->utf8_ord($theWord, $bc);
122			list($cType) = $this->charType($cp);
123			// If string is a CJK sequence we follow this algorithm:
124			/*
125			DESCRIPTION OF (CJK) ALGORITHMContinuous letters and numbers make up words. Spaces and symbols
126			separate letters and numbers into words. This is sufficient for
127			all western text.CJK doesn't use spaces or separators to separate words, so the only
128			way to really find out what constitutes a word would be to have a
129			dictionary and advanced heuristics. Instead, we form pairs from
130			consecutive characters, in such a way that searches will find only
131			characters that appear more-or-less the right sequence. For example:ABCDE => AB BC CD DEThis works okay since both the index and the search query is split
132			in the same manner, and since the set of characters is huge so the
133			extra matches are not significant.(Hint taken from ZOPEs chinese user group)[Kasper: As far as I can see this will only work well with or-searches!]
134			*/
135			if ($cType === 'cjk') {
136			// Find total string length:
137			$strlen = mb_strlen($theWord, 'utf-8');
138			// Traverse string length and add words as pairs of two chars:
139			for ($a = 0; $a < $strlen; $a++) {
140			if ($strlen == 1 || $a < $strlen - 1) {
141			$words[] = mb_substr($theWord, $a, 2, 'utf-8');
142			}
143			}
144			} else {
145			// Normal "single-byte" chars:
146			// Remove chars:
147			foreach ($this->lexerConf['removeChars'] as $skipJoin) {
148			$theWord = str_replace($this->csObj->UnumberToChar($skipJoin), '', $theWord);
149			}
150			// Add word:
151			$words[] = $theWord;
152			}
153			}
154
155			/**
156			* Get the first word in a given utf-8 string (initial non-letters will be skipped)
157			*
158			* @param string $str Input string (reference)
159			* @param int $pos Starting position in input string
160			* @return array 0: start, 1: len or FALSE if no word has been found
161			*/
162			public function get_word(&$str, $pos = 0)
163			{
164			$len = 0;
165			// If return is TRUE, a word was found starting at this position, so returning position and length:
166			if ($this->utf8_is_letter($str, $len, $pos)) {
167			return [$pos, $len];
168			}
169			// If the return value was FALSE it means a sequence of non-word chars were found (or blank string) - so we will start another search for the word:
170			$pos += $len;
171			if ($str[$pos] == '') {
172			// Check end of string before looking for word of course.
173			return false;
			Issue (Bug / Best Practice, introduced 2017-12-07 22:28 UTC): The expression `return false` returns the type `false`, which is incompatible with the documented return type `array`.
174			}
175			$this->utf8_is_letter($str, $len, $pos);
176			return [$pos, $len];
177			}
178
179			/**
180			* See if a character is a letter (or a string of letters or non-letters).
181			*
182			* @param string $str Input string (reference)
183			* @param int $len Byte-length of character sequence (reference, return value)
184			* @param int $pos Starting position in input string
185			* @return bool letter (or word) found
186			*/
187			public function utf8_is_letter(&$str, &$len, $pos = 0)
188			{
189			$len = 0;
190			$bc = 0;
191			$cp = 0;
192			$printJoinLgd = 0;
193			$cType = ($cType_prev = false);
194			// Letter type
195			$letter = true;
196			// looking for a letter?
197			if ($str[$pos] == '') {
198			// Return FALSE on end-of-string at this stage
199			return false;
200			}
201			while (1) {
202			// If characters has been obtained we will know whether the string starts as a sequence of letters or not:
203			if ($len) {
204			if ($letter) {
205			// We are in a sequence of words
206			if (
207			!$cType
208			|| $cType_prev === 'cjk' && ($cType === 'num' || $cType === 'alpha')
209			|| $cType === 'cjk' && ($cType_prev === 'num' || $cType_prev === 'alpha')
210			) {
211			// Check if the non-letter char is NOT a print-join char because then it signifies the end of the word.
212			if (!in_array($cp, $this->lexerConf['printjoins'])) {
213			// If a printjoin start length has been recorded, set that back now so the length is right (filtering out multiple end chars)
214			if ($printJoinLgd) {
215			$len = $printJoinLgd;
216			}
217			return true;
218			}
219			// If a printJoin char is found, record the length if it has not been recorded already:
220			if (!$printJoinLgd) {
221			$printJoinLgd = $len;
222			}
223			} else {
224			// When a true letter is found, reset printJoinLgd counter:
225			$printJoinLgd = 0;
226			}
227			} elseif (!$letter && $cType) {
228			// end of non-word reached
229			return false;
230			}
231			}
232			$len += $bc;
233			// add byte-length of last found character
234			if ($str[$pos] == '') {
235			// End of string; return status of string till now
236			return $letter;
237			}
238			// Get next chars unicode number:
239			$cp = $this->utf8_ord($str, $bc, $pos);
240			$pos += $bc;
241			// Determine the type:
242			$cType_prev = $cType;
243			list($cType) = $this->charType($cp);
244			if ($cType) {
245			continue;
246			}
247			// Setting letter to FALSE if the first char was not a letter!
248			if (!$len) {
249			$letter = false;
250			}
251			}
252			return false;
253			}
254
255			/**
256			* Determine the type of character
257			*
258			* @param int $cp Unicode number to evaluate
259			* @return array Type of char; index-0: the main type: num, alpha or CJK (Chinese / Japanese / Korean)
260			*/
261			public function charType($cp)
262			{
263			// Numeric?
264			if ($cp >= 48 && $cp <= 57) {
265			return ['num'];
266			}
267			// LOOKING for Alpha chars (Latin, Cyrillic, Greek, Hebrew and Arabic):
268			if ($cp >= 65 && $cp <= 90 || $cp >= 97 && $cp <= 122 || $cp >= 192 && $cp <= 255 && $cp != 215 && $cp != 247 || $cp >= 256 && $cp < 640 || ($cp == 902 || $cp >= 904 && $cp < 1024) || ($cp >= 1024 && $cp < 1154 || $cp >= 1162 && $cp < 1328) || ($cp >= 1424 && $cp < 1456 || $cp >= 1488 && $cp < 1523) || ($cp >= 1569 && $cp <= 1624 || $cp >= 1646 && $cp <= 1747) || $cp >= 7680 && $cp < 8192) {
269			return ['alpha'];
270			}
271			// Looking for CJK (Chinese / Japanese / Korean)
272			// Ranges are not certain - deducted from the translation tables in typo3/sysext/core/Resources/Private/Charsets/csconvtbl/
273			// Verified with http://www.unicode.org/charts/ (16/2) - may still not be complete.
274			if ($cp >= 12352 && $cp <= 12543 || $cp >= 12592 && $cp <= 12687 || $cp >= 13312 && $cp <= 19903 || $cp >= 19968 && $cp <= 40879 || $cp >= 44032 && $cp <= 55215 || $cp >= 131072 && $cp <= 195103) {
275			return ['cjk'];
276			}
277			}
278
279			/**
280			* Converts a UTF-8 multibyte character to a UNICODE codepoint
281			*
282			* @param string $str UTF-8 multibyte character string (reference)
283			* @param int $len The length of the character (reference, return value)
284			* @param int $pos Starting position in input string
285			* @param bool $hex If set, then a hex. number is returned
286			* @return int UNICODE codepoint
287			*/
288			public function utf8_ord(&$str, &$len, $pos = 0, $hex = false)
289			{
290			$ord = ord($str[$pos]);
291			$len = 1;
292			if ($ord > 128) {
293			for ($bc = -1, $mbs = $ord; $mbs & 128; $mbs = $mbs << 1) {
294			// calculate number of extra bytes
295			$bc++;
296			}
297			$len += $bc;
298			$ord = $ord & (1 << 6 - $bc) - 1;
299			// mask utf-8 lead-in bytes
300			// "bring in" data bytes
301			for ($i = $pos + 1; $bc; $bc--, $i++) {
302			$ord = $ord << 6 | ord($str[$i]) & 63;
303			}
304			}
305			return $hex ? 'x' . dechex($ord) : $ord;
			Issue (Bug / Best Practice, introduced 2017-12-07 22:28 UTC): The expression `return $hex ? 'x' . dechex($ord) : $ord` could also return the type `string`, which is incompatible with the documented return type `integer`.
306			}
307			}
308

TYPO3 / TYPO3.CMS

Branch — master (6c65a4)

Lexer::get_word() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like