| Total Complexity | 82 |
| Total Lines | 285 |
| Duplicated Lines | 0 % |
| Changes | 0 | ||
Complex classes like Lexer often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use Lexer, and based on these observations, apply Extract Interface, too.
| 1 | <?php |
||
| 21 | class Lexer |
||
| 22 | { |
||
| 23 | /** |
||
| 24 | * Debugging options: |
||
| 25 | * |
||
| 26 | * @var bool |
||
| 27 | */ |
||
| 28 | public $debug = false; |
||
| 29 | |||
| 30 | /** |
||
| 31 | * If set, the debugString is filled with HTML output highlighting search / non-search words (for backend display) |
||
| 32 | * |
||
| 33 | * @var string |
||
| 34 | */ |
||
| 35 | public $debugString = ''; |
||
| 36 | |||
| 37 | /** |
||
| 38 | * Charset class object |
||
| 39 | * |
||
| 40 | * @var \TYPO3\CMS\Core\Charset\CharsetConverter |
||
| 41 | */ |
||
| 42 | public $csObj; |
||
| 43 | |||
| 44 | /** |
||
| 45 | * Configuration of the lexer: |
||
| 46 | * |
||
| 47 | * @var array |
||
| 48 | */ |
||
| 49 | public $lexerConf = [ |
||
| 50 | //Characters: . - _ : / ' |
||
| 51 | 'printjoins' => [46, 45, 95, 58, 47, 39], |
||
| 52 | 'casesensitive' => false, |
||
| 53 | // Set, if case sensitive indexing is wanted. |
||
| 54 | 'removeChars' => [45] |
||
| 55 | ]; |
||
| 56 | |||
/**
 * Constructor: stores a charset converter instance in $this->csObj.
 *
 * The instance is requested from GeneralUtility::makeInstance().
 */
public function __construct()
{
    $converterClass = \TYPO3\CMS\Core\Charset\CharsetConverter::class;
    $this->csObj = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance($converterClass);
}
||
| 64 | |||
/**
 * Splits a string into words.
 * Used for indexing; can also be used to find the words of a query.
 *
 * Unless $this->lexerConf['casesensitive'] is set, the input is lowercased
 * first. When $this->debug is enabled, $this->debugString is rebuilt as
 * HTML in which the text skipped before each word is wrapped in a red span
 * and followed by the word itself.
 *
 * @param string $wordString String with UTF-8 content to process.
 * @return array Array of words in UTF-8
 */
public function split2Words($wordString)
{
    // Start every run with an empty debug string.
    $this->debugString = '';

    if (!$this->lexerConf['casesensitive']) {
        $wordString = mb_strtolower($wordString, 'utf-8');
    }

    $words = [];
    // Byte offset from which the next word is searched.
    $offset = 0;
    for (;;) {
        list($wordStart, $wordLength) = $this->get_word($wordString, $offset);
        if (!$wordLength) {
            // No further word found: done.
            break;
        }

        $this->addWords($words, $wordString, $wordStart, $wordLength);

        if ($this->debug) {
            $skipped = substr($wordString, $offset, $wordStart - $offset);
            $matched = substr($wordString, $wordStart, $wordLength);
            $this->debugString .= '<span style="color:red">' . htmlspecialchars($skipped) . '</span>' . htmlspecialchars($matched);
        }

        // Continue right behind the word just handled.
        $offset = $wordStart + $wordLength;
    }

    return $words;
}
||
| 100 | |||
| 101 | /********************************** |
||
| 102 | * |
||
| 103 | * Helper functions |
||
| 104 | * |
||
| 105 | ********************************/ |
||
/**
 * Adds a word to the word array.
 * This function should be used to make sure CJK sequences are split up in the right way.
 *
 * @param array $words Array of accumulated words (reference, appended to)
 * @param string $wordString Complete input string from which to extract the word (reference)
 * @param int $start Start position of the word in the input string
 * @param int $len Length of the word from the start position
 */
public function addWords(&$words, &$wordString, $start, $len)
{
    // Cut the word out of the input string ($start / $len are used with substr()).
    $theWord = substr($wordString, $start, $len);

    // Look at the first character of the word and determine its type.
    $bc = 0;
    $cp = $this->utf8_ord($theWord, $bc);
    list($cType) = $this->charType($cp);

    if ($cType !== 'cjk') {
        // Normal "single-byte" chars: strip the configured characters, then add the word.
        $removals = [];
        foreach ($this->lexerConf['removeChars'] as $skipJoin) {
            $removals[] = $this->csObj->UnumberToChar($skipJoin);
        }
        $words[] = str_replace($removals, '', $theWord);
        return;
    }

    /*
     * DESCRIPTION OF (CJK) ALGORITHM
     *
     * Continuous letters and numbers make up words. Spaces and symbols
     * separate letters and numbers into words. This is sufficient for
     * all western text.
     *
     * CJK doesn't use spaces or separators to separate words, so the only
     * way to really find out what constitutes a word would be to have a
     * dictionary and advanced heuristics. Instead, we form pairs from
     * consecutive characters, in such a way that searches will find only
     * characters that appear in more-or-less the right sequence. For example:
     *
     *     ABCDE => AB BC CD DE
     *
     * This works okay since both the index and the search query are split
     * in the same manner, and since the set of characters is huge so the
     * extra matches are not significant.
     *
     * (Hint taken from ZOPEs chinese user group)
     *
     * [Kasper: As far as I can see this will only work well with or-searches!]
     */
    $charCount = mb_strlen($theWord, 'utf-8');

    if ($charCount == 1) {
        // A lone character cannot form a pair; it is added on its own.
        $words[] = mb_substr($theWord, 0, 2, 'utf-8');
        return;
    }

    // Add every pair of neighbouring characters as a word.
    for ($i = 0; $i < $charCount - 1; $i++) {
        $words[] = mb_substr($theWord, $i, 2, 'utf-8');
    }
}
||
| 154 | |||
| 155 | /** |
||
| 156 | * Get the first word in a given utf-8 string (initial non-letters will be skipped) |
||
| 157 | * |
||
| 158 | * @param string $str Input string (reference) |
||
| 159 | * @param int $pos Starting position in input string |
||
| 160 | * @return array 0: start, 1: len or FALSE if no word has been found |
||
| 161 | */ |
||
| 162 | public function get_word(&$str, $pos = 0) |
||
| 177 | } |
||
| 178 | |||
| 179 | /** |
||
| 180 | * See if a character is a letter (or a string of letters or non-letters). |
||
| 181 | * |
||
| 182 | * @param string $str Input string (reference) |
||
| 183 | * @param int $len Byte-length of character sequence (reference, return value) |
||
| 184 | * @param int $pos Starting position in input string |
||
| 185 | * @return bool letter (or word) found |
||
| 186 | */ |
||
| 187 | public function utf8_is_letter(&$str, &$len, $pos = 0) |
||
| 253 | } |
||
| 254 | |||
| 255 | /** |
||
| 256 | * Determine the type of character |
||
| 257 | * |
||
| 258 | * @param int $cp Unicode number to evaluate |
||
| 259 | * @return array Type of char; index-0: the main type: num, alpha or CJK (Chinese / Japanese / Korean) |
||
| 260 | */ |
||
| 261 | public function charType($cp) |
||
| 276 | } |
||
| 277 | } |
||
| 278 | |||
| 279 | /** |
||
| 280 | * Converts a UTF-8 multibyte character to a UNICODE codepoint |
||
| 281 | * |
||
| 282 | * @param string $str UTF-8 multibyte character string (reference) |
||
| 283 | * @param int $len The length of the character (reference, return value) |
||
| 284 | * @param int $pos Starting position in input string |
||
| 285 | * @param bool $hex If set, then a hex. number is returned |
||
| 286 | * @return int UNICODE codepoint |
||
| 287 | */ |
||
| 288 | public function utf8_ord(&$str, &$len, $pos = 0, $hex = false) |
||
| 306 | } |
||
| 307 | } |
||
| 308 |