Conditions | 24 |
Paths | 17 |
Total Lines | 150 |
Lines | 0 |
Ratio | 0 % |
Changes | 0 |
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.
Commonly applied refactorings include:
- Extract Method

If many parameters/temporary variables are present:
- Replace Temp with Query
- Introduce Parameter Object
- Preserve Whole Object
1 | <?php |
||
/**
 * Decodes a UTF-8 encoded byte string into a list of Unicode code points.
 *
 * The decoder is a small state machine: a lead octet announces how many
 * continuation octets follow, and each continuation octet contributes six
 * bits to the code point being assembled.
 *
 * Behaviour in non-strict mode (the default): octets that are not a legal
 * lead octet are skipped, an unexpected octet inside a multi-octet sequence
 * is skipped without resetting the pending sequence, and code points that
 * fail validation (overlong forms, surrogates, values above 0x10FFFF,
 * 5/6-octet forms) are still appended to the result.
 *
 * Behaviour in strict mode: any of the conditions above, as well as input
 * that ends in the middle of a multi-octet sequence, raises an
 * E_USER_WARNING via trigger_error() and makes the method return false.
 *
 * The code point U+FEFF (byte order mark) is never written to the result,
 * regardless of where it occurs in the input.
 *
 * @param string $str    Byte string to decode.
 * @param bool   $strict Whether malformed input aborts decoding.
 *
 * @return array|false List of integer code points, or false when $strict is
 *                     true and the input is not well-formed.
 */
public static function fromUtf8($str, $strict = false)
{
    $mState = 0; // number of continuation octets still expected
    $mUcs4 = 0;  // code point assembled so far
    $mBytes = 1; // total number of octets announced by the current lead octet

    $out = array();
    $len = strlen($str);

    for ($i = 0; $i < $len; $i++) {
        // Square brackets: the curly-brace offset syntax no longer parses on PHP 8.
        $in = ord($str[$i]);

        if ($mState === 0) {
            // Not inside a sequence: expect US-ASCII or a lead octet.
            if (0 === (0x80 & $in)) {
                // US-ASCII, pass straight through.
                $out[] = $in;
                $mBytes = 1;
            } elseif (0xC0 === (0xE0 & $in)) {
                // Lead octet of a 2-octet sequence.
                $mUcs4 = ($in & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;
            } elseif (0xE0 === (0xF0 & $in)) {
                // Lead octet of a 3-octet sequence.
                $mUcs4 = ($in & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;
            } elseif (0xF0 === (0xF8 & $in)) {
                // Lead octet of a 4-octet sequence.
                $mUcs4 = ($in & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;
            } elseif (0xF8 === (0xFC & $in)) {
                // Lead octet of a 5-octet sequence. Always illegal (either
                // overlong or beyond 0x10FFFF), but instead of trying to
                // resynchronize the sequence is consumed and rejected by
                // the validation that runs once it is complete.
                $mUcs4 = ($in & 0x03) << 24;
                $mState = 4;
                $mBytes = 5;
            } elseif (0xFC === (0xFE & $in)) {
                // Lead octet of a 6-octet sequence; handled like the
                // 5-octet case.
                $mUcs4 = ($in & 1) << 30;
                $mState = 5;
                $mBytes = 6;
            } elseif ($strict) {
                // Neither US-ASCII nor a legal lead octet.
                trigger_error(
                    'utf8_to_unicode: Illegal sequence identifier ' .
                    'in UTF-8 at byte ' . $i,
                    E_USER_WARNING
                );

                return false;
            }
        } elseif (0x80 === (0xC0 & $in)) {
            // Legal continuation octet: merge its six payload bits.
            $shift = ($mState - 1) * 6;
            $mUcs4 |= ($in & 0x0000003F) << $shift;

            if (0 === --$mState) {
                // Sequence complete; $mUcs4 holds the decoded code point.
                $illegal =
                    // From Unicode 3.1, non-shortest form is illegal
                    ((2 === $mBytes) && ($mUcs4 < 0x0080)) ||
                    ((3 === $mBytes) && ($mUcs4 < 0x0800)) ||
                    ((4 === $mBytes) && ($mUcs4 < 0x10000)) ||
                    (4 < $mBytes) ||
                    // From Unicode 3.2, surrogate characters are illegal
                    (($mUcs4 & 0xFFFFF800) === 0xD800) ||
                    // Codepoints outside the Unicode range are illegal
                    ($mUcs4 > 0x10FFFF);

                if ($illegal && $strict) {
                    trigger_error(
                        'utf8_to_unicode: Illegal sequence or codepoint ' .
                        'in UTF-8 at byte ' . $i,
                        E_USER_WARNING
                    );

                    return false;
                }

                if (0xFEFF !== $mUcs4) {
                    // BOM is legal but we don't want to output it
                    $out[] = $mUcs4;
                }

                // Reset the decoder for the next character.
                $mState = 0;
                $mUcs4 = 0;
                $mBytes = 1;
            }
        } elseif ($strict) {
            // A sequence was pending but this octet is not a continuation.
            trigger_error(
                'utf8_to_unicode: Incomplete multi-octet ' .
                ' sequence in UTF-8 at byte ' . $i,
                E_USER_WARNING
            );

            return false;
        }
    }

    if ($strict && $mState !== 0) {
        // The input ended while continuation octets were still expected;
        // without this check a truncated string would pass as valid.
        trigger_error(
            'utf8_to_unicode: Incomplete multi-octet ' .
            ' sequence in UTF-8 at byte ' . $len,
            E_USER_WARNING
        );

        return false;
    }

    return $out;
}
||
186 | |||
278 |