| Conditions | 24 |
| Paths | 17 |
| Total Lines | 150 |
| Lines | 0 |
| Ratio | 0 % |
| Changes | 0 | ||
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a good sign that you should extract the commented part into a new method, and use the comment as a starting point when coming up with a good name for this new method.
Commonly applied refactorings include:
If many parameters/temporary variables are present:
| 1 | <?php |
||
/**
 * Decodes a UTF-8 byte string into a list of Unicode code points.
 *
 * The string is scanned one octet at a time by a small state machine.
 * US-ASCII octets are emitted directly; multi-octet sequences are
 * accumulated and emitted once their last continuation octet has been read.
 * A decoded U+FEFF (byte order mark) is never emitted.
 *
 * Error handling depends on $strict:
 *  - strict: an illegal lead octet, an ill-formed sequence (non-shortest
 *    form, more than 4 octets, surrogate, above U+10FFFF) or a
 *    non-continuation octet inside a sequence raises an E_USER_WARNING via
 *    trigger_error() and the method returns false.
 *  - non-strict (default): illegal lead octets and stray octets inside a
 *    sequence are skipped silently (the decoder state is left untouched),
 *    and ill-formed sequences are emitted as whatever value was decoded.
 *
 * In both modes a sequence that is cut off by the end of the string is
 * dropped without any error, because no check runs after the loop.
 *
 * @param string $str    UTF-8 encoded byte string.
 * @param bool   $strict Whether malformed input aborts decoding.
 *
 * @return array|false List of integer code points, or false when $strict is
 *                     enabled and malformed input was found.
 */
public static function fromUtf8($str, $strict = false)
{
    $mState = 0; // number of continuation octets still expected
    $mUcs4  = 0; // code point accumulated so far
    $mBytes = 1; // total number of octets in the current sequence

    $out = array();
    $len = strlen($str);

    for ($i = 0; $i < $len; $i++) {
        // Square-bracket offsets: the `$str{$i}` form was deprecated in
        // PHP 7.4 and is a fatal error since PHP 8.0.
        $in = ord($str[$i]);

        if ($mState === 0) {
            // Expecting either a US-ASCII octet or the lead octet of a
            // multi-octet sequence.
            if (0 === (0x80 & $in)) {
                // US-ASCII, pass straight through.
                $out[] = $in;
                $mBytes = 1;
            } elseif (0xC0 === (0xE0 & $in)) {
                // Lead octet of a 2 octet sequence.
                $mUcs4 = ($in & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;
            } elseif (0xE0 === (0xF0 & $in)) {
                // Lead octet of a 3 octet sequence.
                $mUcs4 = ($in & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;
            } elseif (0xF0 === (0xF8 & $in)) {
                // Lead octet of a 4 octet sequence.
                $mUcs4 = ($in & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;
            } elseif (0xF8 === (0xFC & $in)) {
                // Lead octet of a 5 octet sequence. This is always illegal
                // (either not the shortest form or outside 0-0x10FFFF), but
                // instead of resynchronizing here we consume the sequence
                // and let the end-of-sequence validation reject it.
                $mUcs4 = ($in & 0x03) << 24;
                $mState = 4;
                $mBytes = 5;
            } elseif (0xFC === (0xFE & $in)) {
                // Lead octet of a 6 octet sequence; handled like the
                // 5 octet case.
                $mUcs4 = ($in & 1) << 30;
                $mState = 5;
                $mBytes = 6;
            } elseif ($strict) {
                // Neither US-ASCII nor a legal lead octet.
                trigger_error(
                    'utf8_to_unicode: Illegal sequence identifier ' .
                    'in UTF-8 at byte ' . $i,
                    E_USER_WARNING
                );

                return false;
            }
            // Non-strict: the illegal octet is skipped silently.
        } elseif (0x80 === (0xC0 & $in)) {
            // Legal continuation octet: merge its 6 payload bits at the
            // position given by the number of octets still outstanding.
            $shift = ($mState - 1) * 6;
            $mUcs4 |= ($in & 0x0000003F) << $shift;

            if (0 === --$mState) {
                // End of the sequence: $mUcs4 holds the final code point.
                $illegal =
                    // From Unicode 3.1, non-shortest form is illegal.
                    ((2 === $mBytes) && ($mUcs4 < 0x0080)) ||
                    ((3 === $mBytes) && ($mUcs4 < 0x0800)) ||
                    ((4 === $mBytes) && ($mUcs4 < 0x10000)) ||
                    (4 < $mBytes) ||
                    // From Unicode 3.2, surrogate characters are illegal.
                    (($mUcs4 & 0xFFFFF800) === 0xD800) ||
                    // Code points outside the Unicode range are illegal.
                    ($mUcs4 > 0x10FFFF);

                if ($illegal && $strict) {
                    trigger_error(
                        'utf8_to_unicode: Illegal sequence or codepoint ' .
                        'in UTF-8 at byte ' . $i,
                        E_USER_WARNING
                    );

                    return false;
                }

                if (0xFEFF !== $mUcs4) {
                    // BOM is legal but we don't want to output it.
                    $out[] = $mUcs4;
                }

                // Reset the decoder for the next character.
                $mState = 0;
                $mUcs4 = 0;
                $mBytes = 1;
            }
        } elseif ($strict) {
            // A non-continuation octet arrived while a multi-octet
            // sequence was still incomplete.
            trigger_error(
                'utf8_to_unicode: Incomplete multi-octet ' .
                ' sequence in UTF-8 at byte ' . $i,
                E_USER_WARNING
            );

            return false;
        }
        // Non-strict: the stray octet is skipped and the pending sequence
        // keeps waiting for its continuation octets.
    }

    return $out;
}
||
| 186 | |||
| 278 |