Total Complexity | 82 |
Total Lines | 285 |
Duplicated Lines | 0 % |
Changes | 0 |
Complex classes like Lexer often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use Lexer, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
21 | class Lexer |
||
22 | { |
||
23 | /** |
||
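24 | * Debugging switch: if set, split2Words() fills $debugString with HTML showing the words found and the text skipped between them. |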
||
25 | * |
||
26 | * @var bool |
||
27 | */ |
||
28 | public $debug = false; |
||
29 | |||
30 | /** |
||
31 | * HTML debug output of the last split2Words() call (for backend display): skipped non-search text is wrapped in red spans, search words are left plain. Only filled while $debug is set. |
||
32 | * |
||
33 | * @var string |
||
34 | */ |
||
35 | public $debugString = ''; |
||
36 | |||
37 | /** |
||
38 | * Charset class object |
||
39 | * |
||
40 | * @var \TYPO3\CMS\Core\Charset\CharsetConverter |
||
41 | */ |
||
42 | public $csObj; |
||
43 | |||
44 | /** |
||
45 | * Configuration of the lexer: |
||
46 | * |
||
47 | * @var array |
||
48 | */ |
||
49 | public $lexerConf = [ |
||
50 | //Characters: . - _ : / ' |
||
51 | 'printjoins' => [46, 45, 95, 58, 47, 39], |
||
52 | 'casesensitive' => false, |
||
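53 | // 'casesensitive' (above): set, if case sensitive indexing is wanted. 'removeChars' (below): Unicode numbers of characters stripped from non-CJK words (45 = "-"). |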
||
54 | 'removeChars' => [45] |
||
55 | ]; |
||
56 | |||
/**
 * Constructor: creates the charset converter and stores it in $csObj.
 */
public function __construct()
{
    $converterClass = \TYPO3\CMS\Core\Charset\CharsetConverter::class;
    $this->csObj = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance($converterClass);
}
||
64 | |||
/**
 * Splits a string into words.
 * Used for indexing, can also be used to find words in a query.
 *
 * Unless $lexerConf['casesensitive'] is set, the input is lowercased first.
 * Words are then located one after another with get_word() and collected
 * through addWords(). While $debug is set, $debugString receives an HTML
 * trace: the text skipped before each word in a red span, the word itself plain.
 *
 * @param string $wordString String with UTF-8 content to process.
 * @return array Array of words in utf-8
 */
public function split2Words($wordString)
{
    // Every call starts with an empty debug trace.
    $this->debugString = '';
    if (!$this->lexerConf['casesensitive']) {
        $wordString = mb_strtolower($wordString, 'utf-8');
    }
    $words = [];
    // Byte offset from which the next word is searched.
    $pos = 0;
    for (;;) {
        list($wordStart, $wordLen) = $this->get_word($wordString, $pos);
        if (!$wordLen) {
            // No further word found: done.
            break;
        }
        $this->addWords($words, $wordString, $wordStart, $wordLen);
        if ($this->debug) {
            // Text between the previous word and this one, then the word itself.
            $skippedText = substr($wordString, $pos, ($wordStart - $pos));
            $wordText = substr($wordString, $wordStart, $wordLen);
            $this->debugString .= '<span style="color:red">' . htmlspecialchars($skippedText) . '</span>' . htmlspecialchars($wordText);
        }
        // Continue right behind the word just handled.
        $pos = $wordStart + $wordLen;
    }
    return $words;
}
||
100 | |||
101 | /********************************** |
||
102 | * |
||
103 | * Helper functions |
||
104 | * |
||
105 | ********************************/ |
||
/**
 * Adds the word found at the given range of the input string to the word array.
 * This function should be used to make sure CJK sequences are split up in the right way.
 *
 * The character type at the start of the word decides how it is stored:
 * CJK sequences are stored as overlapping two-character pairs, everything
 * else is stored as one word after the characters configured in
 * $lexerConf['removeChars'] have been removed.
 *
 * @param array $words Array of accumulated words (reference, appended to)
 * @param string $wordString Complete input string from where to extract the word (reference)
 * @param int $start Start position (byte offset) of the word in the input string
 * @param int $len Length (in bytes) of the word from the start position
 */
public function addWords(&$words, &$wordString, $start, $len)
{
    // Cut the word out of the input string:
    $theWord = substr($wordString, $start, $len);
    // Unicode number of the character at the start of the word, and its type:
    $bc = 0;
    $cp = $this->utf8_ord($theWord, $bc);
    list($cType) = $this->charType($cp);

    /*
     * DESCRIPTION OF (CJK) ALGORITHM
     *
     * Continuous letters and numbers make up words. Spaces and symbols
     * separate letters and numbers into words. This is sufficient for
     * all western text.
     *
     * CJK doesn't use spaces or separators to separate words, so the only
     * way to really find out what constitutes a word would be to have a
     * dictionary and advanced heuristics. Instead, we form pairs from
     * consecutive characters, in such a way that searches will find only
     * characters that appear in more-or-less the right sequence. For example:
     *
     *     ABCDE => AB BC CD DE
     *
     * This works okay since both the index and the search query are split
     * in the same manner, and since the set of characters is huge so the
     * extra matches are not significant.
     *
     * (Hint taken from ZOPEs chinese user group)
     *
     * [Kasper: As far as I can see this will only work well with or-searches!]
     */
    if ($cType === 'cjk') {
        $charCount = mb_strlen($theWord, 'utf-8');
        if ($charCount == 1) {
            // A lone character has no partner; it is stored on its own.
            $words[] = mb_substr($theWord, 0, 2, 'utf-8');
            return;
        }
        // One pair per character that still has a successor.
        for ($i = 0; $i < $charCount - 1; $i++) {
            $words[] = mb_substr($theWord, $i, 2, 'utf-8');
        }
        return;
    }

    // Normal "single-byte" chars: strip the configured characters, in configured order.
    $charsToRemove = [];
    foreach ($this->lexerConf['removeChars'] as $skipJoin) {
        $charsToRemove[] = $this->csObj->UnumberToChar($skipJoin);
    }
    $words[] = str_replace($charsToRemove, '', $theWord);
}
||
154 | |||
155 | /** |
||
156 | * Get the first word in a given utf-8 string (initial non-letters will be skipped) |
||
157 | * |
||
158 | * @param string $str Input string (reference) |
||
159 | * @param int $pos Starting position in input string |
||
160 | * @return array 0: start, 1: len or FALSE if no word has been found |
||
161 | */ |
||
162 | public function get_word(&$str, $pos = 0) |
||
177 | } |
||
178 | |||
179 | /** |
||
180 | * See if a character is a letter (or a string of letters or non-letters). |
||
181 | * |
||
182 | * @param string $str Input string (reference) |
||
183 | * @param int $len Byte-length of character sequence (reference, return value) |
||
184 | * @param int $pos Starting position in input string |
||
185 | * @return bool letter (or word) found |
||
186 | */ |
||
187 | public function utf8_is_letter(&$str, &$len, $pos = 0) |
||
253 | } |
||
254 | |||
255 | /** |
||
256 | * Determine the type of character |
||
257 | * |
||
258 | * @param int $cp Unicode number to evaluate |
||
259 | * @return array Type of char; index-0: the main type: num, alpha or CJK (Chinese / Japanese / Korean) |
||
260 | */ |
||
261 | public function charType($cp) |
||
276 | } |
||
277 | } |
||
278 | |||
279 | /** |
||
280 | * Converts a UTF-8 multibyte character to a UNICODE codepoint |
||
281 | * |
||
282 | * @param string $str UTF-8 multibyte character string (reference) |
||
283 | * @param int $len The length of the character (reference, return value) |
||
284 | * @param int $pos Starting position in input string |
||
285 | * @param bool $hex If set, then a hex. number is returned |
||
286 | * @return int UNICODE codepoint |
||
287 | */ |
||
288 | public function utf8_ord(&$str, &$len, $pos = 0, $hex = false) |
||
306 | } |
||
307 | } |
||
308 |