|
1
|
|
|
<?php |
|
2
|
|
|
|
|
3
|
|
|
/* |
|
4
|
|
|
* This file is part of the Symfony package. |
|
5
|
|
|
* |
|
6
|
|
|
* (c) Fabien Potencier <[email protected]> |
|
7
|
|
|
* |
|
8
|
|
|
* For the full copyright and license information, please view the LICENSE |
|
9
|
|
|
* file that was distributed with this source code. |
|
10
|
|
|
*/ |
|
11
|
|
|
|
|
12
|
|
|
namespace Symfony\Polyfill\Intl\Normalizer; |
|
13
|
|
|
|
|
14
|
|
|
/** |
|
15
|
|
|
* Normalizer is a PHP fallback implementation of the Normalizer class provided by the intl extension. |
|
16
|
|
|
* |
|
17
|
|
|
* It has been validated with Unicode 6.3 Normalization Conformance Test. |
|
18
|
|
|
* See http://www.unicode.org/reports/tr15/ for detailed info about Unicode normalizations. |
|
19
|
|
|
* |
|
20
|
|
|
* @author Nicolas Grekas <[email protected]> |
|
21
|
|
|
* |
|
22
|
|
|
* @internal |
|
23
|
|
|
*/ |
|
24
|
|
|
class Normalizer
{
    const FORM_D = \Normalizer::FORM_D;
    const FORM_KD = \Normalizer::FORM_KD;
    const FORM_C = \Normalizer::FORM_C;
    const FORM_KC = \Normalizer::FORM_KC;
    const NFD = \Normalizer::NFD;
    const NFKD = \Normalizer::NFKD;
    const NFC = \Normalizer::NFC;
    const NFKC = \Normalizer::NFKC;

    // Lazily loaded lookup tables; null means "not loaded yet" (see normalize()).
    // Loaded from the 'canonicalComposition' data file.
    private static $C;
    // Loaded from the 'canonicalDecomposition' data file.
    private static $D;
    // Loaded from the 'compatibilityDecomposition' data file.
    private static $KD;
    // Loaded from the 'combiningClass' data file.
    private static $cC;
    // Maps a multi-byte UTF-8 lead byte masked with "\xF0" to the byte length of its sequence.
    private static $ulenMask = array("\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4);
    // Byte mask handed to strspn() to measure runs of ASCII characters.
    private static $ASCII = "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F";

    /**
     * Tells whether a string is already in the given normalization form.
     *
     * @param string $s    The string to check (cast to string)
     * @param int    $form One of the NFD, NFKD, NFC or NFKC constants
     *
     * @return bool False for any other form, and for strings that normalize() rejects or changes
     */
    public static function isNormalized($s, $form = self::NFC)
    {
        // The comparison is deliberately loose, matching the loose switch in normalize().
        if (!\in_array($form, array(self::NFD, self::NFKD, self::NFC, self::NFKC))) {
            return false;
        }
        $s = (string) $s;
        // No byte outside the ASCII mask: nothing can change under any form.
        if (!isset($s[strspn($s, self::$ASCII)])) {
            return true;
        }
        // NFC shortcut: valid UTF-8 made only of code points up to U+02FF is reported as normalized.
        if (self::NFC == $form && preg_match('//u', $s) && !preg_match('/[^\x00-\x{2FF}]/u', $s)) {
            return true;
        }

        return self::normalize($s, $form) === $s;
    }

    /**
     * Normalizes a UTF-8 string to the requested form.
     *
     * @param string $s    The string to normalize (cast to string)
     * @param int    $form One of the NFD, NFKD, NFC or NFKC constants; \Normalizer::NONE,
     *                     when that constant is defined, returns the string untouched
     *
     * @return string|false The normalized string, or false when $s is not valid UTF-8
     *                      or $form is not recognized
     */
    public static function normalize($s, $form = self::NFC)
    {
        $s = (string) $s;
        // An empty pattern with the "u" modifier only matches valid UTF-8 subjects.
        if (!preg_match('//u', $s)) {
            return false;
        }

        // $C: recompose after decomposing; $K: use compatibility mappings as well.
        switch ($form) {
            case self::NFC:
                $C = true;
                $K = false;
                break;
            case self::NFD:
                $C = false;
                $K = false;
                break;
            case self::NFKC:
                $C = true;
                $K = true;
                break;
            case self::NFKD:
                $C = false;
                $K = true;
                break;
            default:
                if (\defined('Normalizer::NONE') && \Normalizer::NONE == $form) {
                    return $s;
                }

                return false;
        }

        // recompose() reads $s[0], so the empty string must never reach it.
        if ('' === $s) {
            return '';
        }

        if ($K && null === self::$KD) {
            self::$KD = self::getData('compatibilityDecomposition');
        }

        if (null === self::$D) {
            self::$D = self::getData('canonicalDecomposition');
            self::$cC = self::getData('combiningClass');
        }

        // When mbstring overloads the string functions, switch to a byte-wise encoding
        // for the duration of the call: decompose()/recompose() index strings by byte.
        $mbEncoding = (2 /* MB_OVERLOAD_STRING */ & (int) ini_get('mbstring.func_overload')) ? mb_internal_encoding() : null;
        if (null !== $mbEncoding) {
            mb_internal_encoding('8bit');
        }

        $r = self::decompose($s, $K);

        if ($C) {
            if (null === self::$C) {
                self::$C = self::getData('canonicalComposition');
            }

            $r = self::recompose($r);
        }
        if (null !== $mbEncoding) {
            mb_internal_encoding($mbEncoding);
        }

        return $r;
    }

    /**
     * Composes a decomposed UTF-8 string, using the composition table for
     * general characters and arithmetic for Hangul jamo.
     *
     * @param string $s A non-empty string as returned by decompose()
     *
     * @return string
     */
    private static function recompose($s)
    {
        $ASCII = self::$ASCII;
        $compMap = self::$C;
        $combClass = self::$cC;
        $ulenMask = self::$ulenMask;

        // $result: finished output; $lastUchr: current starter, still open for composition;
        // $tail: combining chars that follow the starter and did not compose with it.
        $result = $tail = '';

        $i = $s[0] < "\x80" ? 1 : $ulenMask[$s[0] & "\xF0"];
        $len = \strlen($s);

        $lastUchr = substr($s, 0, $i);
        // A leading combining char gets the sentinel class 256, which blocks composition
        // onto it (assumes table classes stay below 256 - TODO confirm against the data file).
        $lastUcls = isset($combClass[$lastUchr]) ? 256 : 0;

        while ($i < $len) {
            if ($s[$i] < "\x80") {
                // ASCII chars

                if ($tail) {
                    $lastUchr .= $tail;
                    $tail = '';
                }

                // Copy the whole ASCII run at once, keeping its last byte as the new starter.
                if ($j = strspn($s, $ASCII, $i + 1)) {
                    $lastUchr .= substr($s, $i, $j);
                    $i += $j;
                }

                $result .= $lastUchr;
                $lastUchr = $s[$i];
                $lastUcls = 0;
                ++$i;
                continue;
            }

            $ulen = $ulenMask[$s[$i] & "\xF0"];
            $uchr = substr($s, $i, $ulen);

            // Anything but <leading consonant U+1100..U+1112, vowel U+1161..U+1175>
            // goes through the table.
            if ($lastUchr < "\xE1\x84\x80" || "\xE1\x84\x92" < $lastUchr
                || $uchr < "\xE1\x85\xA1" || "\xE1\x85\xB5" < $uchr
                || $lastUcls) {
                // Table lookup and combining chars composition

                $ucls = isset($combClass[$uchr]) ? $combClass[$uchr] : 0;

                if (isset($compMap[$lastUchr.$uchr]) && (!$lastUcls || $lastUcls < $ucls)) {
                    $lastUchr = $compMap[$lastUchr.$uchr];
                } elseif ($lastUcls = $ucls) {
                    // Non-composing combining char: remember its class and park it in the tail.
                    $tail .= $uchr;
                } else {
                    // New starter: flush the previous one together with its tail.
                    if ($tail) {
                        $lastUchr .= $tail;
                        $tail = '';
                    }

                    $result .= $lastUchr;
                    $lastUchr = $uchr;
                }
            } else {
                // Hangul chars

                $L = \ord($lastUchr[2]) - 0x80;
                $V = \ord($uchr[2]) - 0xA1;
                $T = 0;

                $uchr = substr($s, $i + $ulen, 3);

                // Trailing consonants are U+11A8..U+11C2. U+11A7 is only the base used to
                // compute the index: accepting it here would yield $T = 0 while still
                // consuming the character, i.e. silently dropping it from the output.
                if ("\xE1\x86\xA8" <= $uchr && $uchr <= "\xE1\x87\x82") {
                    $T = \ord($uchr[2]) - 0xA7;
                    // U+11C0..U+11C2 have a smaller last byte (0x80..0x82): wrap around.
                    if (0 > $T) {
                        $T += 0x40;
                    }
                    $ulen += 3;
                }

                // Syllable code point, then its 3-byte UTF-8 encoding.
                $L = 0xAC00 + ($L * 21 + $V) * 28 + $T;
                $lastUchr = \chr(0xE0 | ($L >> 12)).\chr(0x80 | (($L >> 6) & 0x3F)).\chr(0x80 | ($L & 0x3F));
            }

            $i += $ulen;
        }

        return $result.$lastUchr.$tail;
    }

    /**
     * Decomposes a UTF-8 string and reorders runs of combining chars by combining class.
     *
     * @param string $s The valid UTF-8 string to decompose
     * @param bool   $c Whether compatibility mappings take precedence over canonical ones
     *
     * @return string
     */
    private static function decompose($s, $c)
    {
        $result = '';

        $ASCII = self::$ASCII;
        $decompMap = self::$D;
        $combClass = self::$cC;
        $ulenMask = self::$ulenMask;
        // Always defined, so the isset() probe below never reads an undefined variable.
        $compatMap = $c ? self::$KD : array();

        // From here on $c is reused as the buffer of pending combining chars,
        // keyed by combining class.
        $c = array();
        $i = 0;
        $len = \strlen($s);

        while ($i < $len) {
            if ($s[$i] < "\x80") {
                // ASCII chars

                if ($c) {
                    ksort($c);
                    $result .= implode('', $c);
                    $c = array();
                }

                $j = 1 + strspn($s, $ASCII, $i + 1);
                $result .= substr($s, $i, $j);
                $i += $j;
                continue;
            }

            $ulen = $ulenMask[$s[$i] & "\xF0"];
            $uchr = substr($s, $i, $ulen);
            $i += $ulen;

            // Precomposed Hangul syllables (U+AC00..U+D7A3) are decomposed arithmetically.
            if ($uchr < "\xEA\xB0\x80" || "\xED\x9E\xA3" < $uchr) {
                // Table lookup

                $j = isset($compatMap[$uchr]) ? $compatMap[$uchr] : (isset($decompMap[$uchr]) ? $decompMap[$uchr] : $uchr);

                if ($uchr !== $j) {
                    $uchr = $j;

                    $j = \strlen($uchr);
                    $ulen = $uchr[0] < "\x80" ? 1 : $ulenMask[$uchr[0] & "\xF0"];

                    if ($ulen != $j) {
                        // Put trailing chars in $s

                        // Only the first char of the mapping is handled now; the remaining
                        // bytes are written back just before offset $i, to be read next.
                        $j -= $ulen;
                        $i -= $j;

                        if (0 > $i) {
                            // Not enough room before the cursor: pad the front of $s.
                            $s = str_repeat(' ', -$i).$s;
                            $len -= $i;
                            $i = 0;
                        }

                        while ($j--) {
                            $s[$i + $j] = $uchr[$ulen + $j];
                        }

                        $uchr = substr($uchr, 0, $ulen);
                    }
                }
                if (isset($combClass[$uchr])) {
                    // Combining chars, for sorting

                    if (!isset($c[$combClass[$uchr]])) {
                        $c[$combClass[$uchr]] = '';
                    }
                    $c[$combClass[$uchr]] .= $uchr;
                    continue;
                }
            } else {
                // Hangul chars

                // Decode the 3-byte sequence into the syllable index (code point - 0xAC00).
                $uchr = unpack('C*', $uchr);
                $j = (($uchr[1] - 224) << 12) + (($uchr[2] - 128) << 6) + $uchr[3] - 0xAC80;

                // 588 = 21 vowels * 28 trailing slots.
                $uchr = "\xE1\x84".\chr(0x80 + (int) ($j / 588))
                       ."\xE1\x85".\chr(0xA1 + (int) (($j % 588) / 28));

                if ($j %= 28) {
                    // Trailing consonants with index 25 and up live in the next 64-byte page.
                    $uchr .= $j < 25
                        ? ("\xE1\x86".\chr(0xA7 + $j))
                        : ("\xE1\x87".\chr(0x67 + $j));
                }
            }
            // A char without a combining class ends the pending run: emit it sorted by class.
            if ($c) {
                ksort($c);
                $result .= implode('', $c);
                $c = array();
            }

            $result .= $uchr;
        }

        if ($c) {
            ksort($c);
            $result .= implode('', $c);
        }

        return $result;
    }

    /**
     * Loads a lookup table from the Resources/unidata directory next to this file.
     *
     * @param string $file The data file name, without directory nor ".php" extension
     *
     * @return mixed Whatever the data file returns (presumably an array - verify against
     *               the data files), or false when the file does not exist
     */
    private static function getData($file)
    {
        if (file_exists($file = __DIR__.'/Resources/unidata/'.$file.'.php')) {
            return require $file;
        }

        return false;
    }
}
|
309
|
|
|
|
This class constant has been deprecated. The supplier of the class has supplied an explanatory message.
The explanatory message should give you some clue as to whether and when the constant will be removed from the class and what other constant to use instead.