@@ -38,7 +38,8 @@ discard block |
||
| 38 | 38 | |
| 39 | 39 | // We need some of these for further analysis below. |
| 40 | 40 | $derived_normalization_props = array(); |
| 41 | -foreach (file($unicode_data_url . '/DerivedNormalizationProps.txt') as $line) { |
|
| 41 | +foreach (file($unicode_data_url . '/DerivedNormalizationProps.txt') as $line) |
|
| 42 | +{ |
|
| 42 | 43 | $line = substr($line, 0, strcspn($line, '#')); |
| 43 | 44 | |
| 44 | 45 | if (strpos($line, ';') === false) |
@@ -46,17 +47,22 @@ discard block |
||
| 46 | 47 | |
| 47 | 48 | $fields = explode(';', $line); |
| 48 | 49 | |
| 49 | - foreach ($fields as $key => $value) { |
|
| 50 | + foreach ($fields as $key => $value) |
|
| 51 | + { |
|
| 50 | 52 | $fields[$key] = trim($value); |
| 51 | 53 | } |
| 52 | 54 | |
| 53 | - if (!isset($derived_normalization_props[$fields[1]])) { |
|
| 55 | + if (!isset($derived_normalization_props[$fields[1]])) |
|
| 56 | + { |
|
| 54 | 57 | $derived_normalization_props[$fields[1]] = array(); |
| 55 | 58 | } |
| 56 | 59 | |
| 57 | - if (strpos($fields[0], '..') === false) { |
|
| 60 | + if (strpos($fields[0], '..') === false) |
|
| 61 | + { |
|
| 58 | 62 | $entities = array('&#x' . $fields[0] . ';'); |
| 59 | - } else { |
|
| 63 | + } |
|
| 64 | + else |
|
| 65 | + { |
|
| 60 | 66 | $entities = array(); |
| 61 | 67 | |
| 62 | 68 | list($start, $end) = explode('..', $fields[0]); |
@@ -65,27 +71,35 @@ discard block |
||
| 65 | 71 | $ord_e = hexdec($end); |
| 66 | 72 | |
| 67 | 73 | $ord = $ord_s; |
| 68 | - while ($ord <= $ord_e) { |
|
| 74 | + while ($ord <= $ord_e) |
|
| 75 | + { |
|
| 69 | 76 | $entities[] = '&#x' . strtoupper(sprintf('%04s', dechex($ord++))) . ';'; |
| 70 | 77 | } |
| 71 | 78 | } |
| 72 | 79 | |
| 73 | 80 | $value = ''; |
| 74 | - if (!isset($fields[2])) { |
|
| 81 | + if (!isset($fields[2])) |
|
| 82 | + { |
|
| 75 | 83 | $value = 'SAME'; |
| 76 | - } elseif (in_array($fields[1], array('FC_NFKC', 'NFKC_CF'))) { |
|
| 84 | + } |
|
| 85 | + elseif (in_array($fields[1], array('FC_NFKC', 'NFKC_CF'))) |
|
| 86 | + { |
|
| 77 | 87 | $value = trim($fields[2]) !== '' ? '&#x' . str_replace(' ', '; &#x', trim($fields[2])) . ';' : ''; |
| 78 | - } else { |
|
| 88 | + } |
|
| 89 | + else |
|
| 90 | + { |
|
| 79 | 91 | $value = $fields[2]; |
| 80 | 92 | } |
| 81 | 93 | |
| 82 | - foreach ($entities as $entity) { |
|
| 94 | + foreach ($entities as $entity) |
|
| 95 | + { |
|
| 83 | 96 | $derived_normalization_props[$fields[1]][$entity] = $value === 'SAME' ? $entity : $value; |
| 84 | 97 | } |
| 85 | 98 | } |
| 86 | 99 | |
| 87 | 100 | // Go through all the characters in the Unicode database. |
| 88 | -foreach (file($unicode_data_url . '/UnicodeData.txt') as $line) { |
|
| 101 | +foreach (file($unicode_data_url . '/UnicodeData.txt') as $line) |
|
| 102 | +{ |
|
| 89 | 103 | $fields = explode(';', $line); |
| 90 | 104 | |
| 91 | 105 | if (!empty($fields[3])) |
@@ -106,12 +120,14 @@ discard block |
||
| 106 | 120 | $full_decomposition_maps['&#x' . $fields[0] . ';'] = '&#x' . str_replace(' ', '; &#x', trim(strip_tags($fields[5]))) . ';'; |
| 107 | 121 | |
| 108 | 122 | // Just the canonical decompositions. |
| 109 | - if (strpos($fields[5], '<') === false) { |
|
| 123 | + if (strpos($fields[5], '<') === false) |
|
| 124 | + { |
|
| 110 | 125 | $utf8_arrays['utf8_normalize_d_maps']['&#x' . $fields[0] . ';'] = '&#x' . str_replace(' ', '; &#x', trim($fields[5])) . ';'; |
| 111 | 126 | } |
| 112 | 127 | } |
| 113 | 128 | |
| 114 | -foreach (file($unicode_data_url . '/CaseFolding.txt') as $line) { |
|
| 129 | +foreach (file($unicode_data_url . '/CaseFolding.txt') as $line) |
|
| 130 | +{ |
|
| 115 | 131 | $line = substr($line, 0, strcspn($line, '#')); |
| 116 | 132 | |
| 117 | 133 | if (strpos($line, ';') === false) |
@@ -119,12 +135,14 @@ discard block |
||
| 119 | 135 | |
| 120 | 136 | $fields = explode(';', $line); |
| 121 | 137 | |
| 122 | - foreach ($fields as $key => $value) { |
|
| 138 | + foreach ($fields as $key => $value) |
|
| 139 | + { |
|
| 123 | 140 | $fields[$key] = trim($value); |
| 124 | 141 | } |
| 125 | 142 | |
| 126 | 143 | // Full casefolding. |
| 127 | - if (in_array($fields[1], array('C', 'F'))) { |
|
| 144 | + if (in_array($fields[1], array('C', 'F'))) |
|
| 145 | + { |
|
| 128 | 146 | $utf8_arrays['utf8_casefold_maps']['&#x' . $fields[0] . ';'] = '&#x' . str_replace(' ', '; &#x', trim($fields[2])) . ';'; |
| 129 | 147 | } |
| 130 | 148 | |
@@ -138,13 +156,17 @@ discard block |
||
| 138 | 156 | // This is necessary because some characters decompose to other characters that |
| 139 | 157 | // themselves decompose further. |
| 140 | 158 | $changed = true; |
| 141 | -while ($changed) { |
|
| 159 | +while ($changed) |
|
| 160 | +{ |
|
| 142 | 161 | $temp = array(); |
| 143 | - foreach ($full_decomposition_maps as $composed => $decomposed) { |
|
| 162 | + foreach ($full_decomposition_maps as $composed => $decomposed) |
|
| 163 | + { |
|
| 144 | 164 | $parts = strpos($decomposed, ' ') !== false ? explode(' ', $decomposed) : (array) $decomposed; |
| 145 | 165 | |
| 146 | - foreach ($parts as $partnum => $hex) { |
|
| 147 | - if (isset($full_decomposition_maps[$hex])) { |
|
| 166 | + foreach ($parts as $partnum => $hex) |
|
| 167 | + { |
|
| 168 | + if (isset($full_decomposition_maps[$hex])) |
|
| 169 | + { |
|
| 148 | 170 | $parts[$partnum] = $full_decomposition_maps[$hex]; |
| 149 | 171 | } |
| 150 | 172 | } |
@@ -163,17 +185,22 @@ discard block |
||
| 163 | 185 | // Same as above, but using only canonical decompositions. |
| 164 | 186 | $changed = true; |
| 165 | 187 | $iteration = 0; |
| 166 | -while ($changed) { |
|
| 188 | +while ($changed) |
|
| 189 | +{ |
|
| 167 | 190 | $temp = array(); |
| 168 | - foreach ($utf8_arrays['utf8_normalize_d_maps'] as $composed => $decomposed) { |
|
| 169 | - if ($iteration === 0 && !in_array($composed, $derived_normalization_props['Full_Composition_Exclusion'])) { |
|
| 191 | + foreach ($utf8_arrays['utf8_normalize_d_maps'] as $composed => $decomposed) |
|
| 192 | + { |
|
| 193 | + if ($iteration === 0 && !in_array($composed, $derived_normalization_props['Full_Composition_Exclusion'])) |
|
| 194 | + { |
|
| 170 | 195 | $utf8_arrays['utf8_compose_maps'][$decomposed] = $composed; |
| 171 | 196 | } |
| 172 | 197 | |
| 173 | 198 | $parts = strpos($decomposed, ' ') !== false ? explode(' ', $decomposed) : (array) $decomposed; |
| 174 | 199 | |
| 175 | - foreach ($parts as $partnum => $hex) { |
|
| 176 | - if (isset($utf8_arrays['utf8_normalize_d_maps'][$hex])) { |
|
| 200 | + foreach ($parts as $partnum => $hex) |
|
| 201 | + { |
|
| 202 | + if (isset($utf8_arrays['utf8_normalize_d_maps'][$hex])) |
|
| 203 | + { |
|
| 177 | 204 | $parts[$partnum] = $utf8_arrays['utf8_normalize_d_maps'][$hex]; |
| 178 | 205 | } |
| 179 | 206 | } |
@@ -193,7 +220,8 @@ discard block |
||
| 193 | 220 | $utf8_arrays['utf8_normalize_kd_maps'] = array_diff_assoc($full_decomposition_maps, $utf8_arrays['utf8_normalize_d_maps']); |
| 194 | 221 | |
| 195 | 222 | // Some characters have the 'Default_Ignorable_Code_Point' property. |
| 196 | -foreach (file($unicode_data_url . '/DerivedCoreProperties.txt') as $line) { |
|
| 223 | +foreach (file($unicode_data_url . '/DerivedCoreProperties.txt') as $line) |
|
| 224 | +{ |
|
| 197 | 225 | if (strpos($line, 'Default_Ignorable_Code_Point') === false) |
| 198 | 226 | continue; |
| 199 | 227 | |
@@ -204,13 +232,17 @@ discard block |
||
| 204 | 232 | |
| 205 | 233 | $fields = explode(';', $line); |
| 206 | 234 | |
| 207 | - foreach ($fields as $key => $value) { |
|
| 235 | + foreach ($fields as $key => $value) |
|
| 236 | + { |
|
| 208 | 237 | $fields[$key] = trim($value); |
| 209 | 238 | } |
| 210 | 239 | |
| 211 | - if (strpos($fields[0], '..') === false) { |
|
| 240 | + if (strpos($fields[0], '..') === false) |
|
| 241 | + { |
|
| 212 | 242 | $utf8_arrays['utf8_default_ignorables'][] = '&#x' . $fields[0] . ';'; |
| 213 | - } else { |
|
| 243 | + } |
|
| 244 | + else |
|
| 245 | + { |
|
| 214 | 246 | $entities = array(); |
| 215 | 247 | |
| 216 | 248 | list($start, $end) = explode('..', $fields[0]); |
@@ -219,7 +251,8 @@ discard block |
||
| 219 | 251 | $ord_e = hexdec($end); |
| 220 | 252 | |
| 221 | 253 | $ord = $ord_s; |
| 222 | - while ($ord <= $ord_e) { |
|
| 254 | + while ($ord <= $ord_e) |
|
| 255 | + { |
|
| 223 | 256 | $utf8_arrays['utf8_default_ignorables'][] = '&#x' . strtoupper(sprintf('%04s', dechex($ord++))) . ';'; |
| 224 | 257 | } |
| 225 | 258 | } |
@@ -228,35 +261,43 @@ discard block |
||
| 228 | 261 | // Now update the file. |
| 229 | 262 | $subs_charset_contents = file_get_contents($sourcedir . '/Subs-Charset.php'); |
| 230 | 263 | |
| 231 | -foreach ($utf8_arrays as $func_name => $arr) { |
|
| 264 | +foreach ($utf8_arrays as $func_name => $arr) |
|
| 265 | +{ |
|
| 232 | 266 | $func_text = 'function ' . $func_name . '()' . "\n" . '{'; |
| 233 | 267 | |
| 234 | 268 | $func_regex = '/' . preg_quote($func_text, '/') . '[^}]*}/'; |
| 235 | 269 | |
| 236 | 270 | $func_text .= "\n\t" . 'return array(' . "\n"; |
| 237 | 271 | |
| 238 | - foreach ($arr as $key => $value) { |
|
| 272 | + foreach ($arr as $key => $value) |
|
| 273 | + { |
|
| 239 | 274 | $func_text .= "\t\t"; |
| 240 | 275 | |
| 241 | - if ($func_name !== 'utf8_default_ignorables') { |
|
| 276 | + if ($func_name !== 'utf8_default_ignorables') |
|
| 277 | + { |
|
| 242 | 278 | $func_text .= '"'; |
| 243 | 279 | |
| 244 | 280 | $key = mb_decode_numericentity(str_replace(' ', '', $key), array(0,0x10FFFF,0,0xFFFFFF), 'UTF-8'); |
| 245 | 281 | |
| 246 | - foreach (unpack('C*', $key) as $byte_value) { |
|
| 282 | + foreach (unpack('C*', $key) as $byte_value) |
|
| 283 | + { |
|
| 247 | 284 | $func_text .= '\\x' . strtoupper(dechex($byte_value)); |
| 248 | 285 | } |
| 249 | 286 | |
| 250 | 287 | $func_text .= '" => '; |
| 251 | 288 | } |
| 252 | 289 | |
| 253 | - if ($func_name == 'utf8_combining_classes') { |
|
| 290 | + if ($func_name == 'utf8_combining_classes') |
|
| 291 | + { |
|
| 254 | 292 | $func_text .= $value; |
| 255 | - } else { |
|
| 293 | + } |
|
| 294 | + else |
|
| 295 | + { |
|
| 256 | 296 | $func_text .= '"'; |
| 257 | 297 | |
| 258 | 298 | $value = mb_decode_numericentity(str_replace(' ', '', $value), array(0,0x10FFFF,0,0xFFFFFF), 'UTF-8'); |
| 259 | - foreach (unpack('C*', $value) as $byte_value) { |
|
| 299 | + foreach (unpack('C*', $value) as $byte_value) |
|
| 300 | + { |
|
| 260 | 301 | $func_text .= '\\x' . strtoupper(dechex($byte_value)); |
| 261 | 302 | } |
| 262 | 303 | |