1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
/** |
4
|
|
|
* Pinyin.php. |
5
|
|
|
* |
6
|
|
|
* For the full copyright and license information, please view the LICENSE |
7
|
|
|
* file that was distributed with this source code. |
8
|
|
|
* |
9
|
|
|
* @author overtrue <[email protected]> |
10
|
|
|
* @copyright 2015 overtrue <[email protected]> |
11
|
|
|
* |
12
|
|
|
* @link https://github.com/overtrue/pinyin |
13
|
|
|
* @link http://overtrue.me |
14
|
|
|
*/ |
15
|
|
|
|
16
|
|
|
namespace Overtrue\Pinyin; |
17
|
|
|
|
18
|
|
|
/** |
19
|
|
|
* Chinese to pinyin translator. |
20
|
|
|
* |
21
|
|
|
* @example |
22
|
|
|
* <pre> |
23
|
|
|
* echo \Overtrue\Pinyin\Pinyin::trans('带着希望去旅行,比到达终点更美好'), "\n"; |
24
|
|
|
* //output: "dài zhe xī wàng qù lǔ xíng bǐ dào dá zhōng diǎn gèng měi hǎo" |
25
|
|
|
* </pre> |
26
|
|
|
*/ |
27
|
|
|
class Pinyin
{
    /**
     * Dictionary, grouped by the first character of each word:
     * array(firstChar => array(word => " pinyin ")).
     *
     * @var array
     */
    protected static $dictionary = [];

    /**
     * Default settings; per-call settings passed to parse() override these.
     *
     * @var array
     */
    protected static $settings = [
        'delimiter' => ' ',
        'accent' => true,
        'only_chinese' => false,
        'uppercase' => false,
        'charset' => 'UTF-8', // GB2312,UTF-8
    ];

    /**
     * Internal charset used by this package.
     *
     * @var string
     */
    protected static $internalCharset = 'UTF-8';

    /**
     * The instance.
     *
     * @var \Overtrue\Pinyin\Pinyin
     */
    private static $_instance;

    /**
     * Constructor.
     *
     * Loads the bundled dictionary, but only when no words have been
     * registered yet (words added through appends() before the first use
     * therefore suppress loading of the bundled file).
     *
     * @throws \RuntimeException when the bundled dictionary cannot be loaded.
     */
    private function __construct()
    {
        if (empty(static::$dictionary)) {
            static::appends(self::loadBundledDictionary());
        }
    }

    /**
     * Read and decode the JSON dictionary shipped in "../data/dict.php".
     *
     * @return array word => pinyin pairs.
     *
     * @throws \RuntimeException when the file is unreadable or does not decode to an array.
     */
    private static function loadBundledDictionary()
    {
        $path = dirname(__DIR__).'/data/dict.php';
        $contents = is_readable($path) ? file_get_contents($path) : false;
        $list = ($contents === false) ? null : json_decode($contents, true);

        // Fail with a message naming the file instead of handing a non-array
        // to appends() and dying on its "array" type hint.
        if (!is_array($list)) {
            throw new \RuntimeException(sprintf('Unable to load the pinyin dictionary from "%s".', $path));
        }

        return $list;
    }

    /**
     * Disable clone.
     */
    private function __clone()
    {
    }

    /**
     * Get class instance, creating it on first use.
     *
     * @return \Overtrue\Pinyin\Pinyin
     */
    public static function getInstance()
    {
        if (self::$_instance === null) {
            self::$_instance = new static();
        }

        return self::$_instance;
    }

    /**
     * Set a single global setting.
     *
     * @param string $key
     * @param mixed  $value
     */
    public static function set($key, $value)
    {
        static::$settings[$key] = $value;
    }

    /**
     * Merge the given values into the global settings.
     *
     * @param array $settings settings.
     */
    public static function settings(array $settings = [])
    {
        static::$settings = array_merge(static::$settings, $settings);
    }

    /**
     * Chinese to pinyin.
     *
     * @param string $string   source string.
     * @param array  $settings settings for this call only.
     *
     * @return string
     */
    public static function trans($string, array $settings = [])
    {
        $parsed = self::parse($string, $settings);

        return $parsed['pinyin'];
    }

    /**
     * Get first letters of string.
     *
     * Always runs with 'accent' => false and 'only_chinese' => true,
     * whatever the caller passes for those two keys.
     *
     * @param string $string   source string.
     * @param array  $settings settings for this call only.
     *
     * @return string
     */
    public static function letter($string, array $settings = [])
    {
        $settings = array_merge($settings, ['accent' => false, 'only_chinese' => true]);

        $parsed = self::parse($string, $settings);

        return $parsed['letter'];
    }

    /**
     * Parse the string to pinyin.
     *
     * Overtrue\Pinyin\Pinyin::parse('带着梦想旅行');
     *
     * @param string $string
     * @param array  $settings
     *
     * @return array with keys 'src' (the untouched input), 'pinyin' and 'letter'.
     */
    public static function parse($string, array $settings = [])
    {
        $instance = static::getInstance();
        $raw = $string;

        $settings = array_merge(self::$settings, $settings);

        // convert the input to the internal charset when it differs
        if (!empty($settings['charset']) && $settings['charset'] != static::$internalCharset) {
            $string = iconv($settings['charset'], static::$internalCharset, $string);
        }

        // remove non-Chinese char.
        if ($settings['only_chinese']) {
            $string = $instance->justChinese($string);
        }

        // pinyin with tone numbers, e.g. "zhong1 guo2"
        $source = $instance->string2pinyin($string);

        // turn tone numbers into accents, or drop them
        if ($settings['accent']) {
            $pinyin = $instance->addAccents($source);
        } else {
            $pinyin = $instance->removeTone($source);
        }

        // add delimiter
        $delimitedPinyin = $instance->delimit($pinyin, $settings['delimiter']);

        // stripslashes() removes the escapes inserted by prepare()
        return [
            'src' => $raw,
            'pinyin' => stripslashes($delimitedPinyin),
            'letter' => stripslashes($instance->getFirstLetters($source, $settings)),
        ];
    }

    /**
     * Add custom words.
     *
     * @param array $appends word => pinyin pairs (tone as trailing digit, e.g. "zhong1").
     */
    public static function appends(array $appends)
    {
        $list = static::formatWords($appends);

        foreach ($list as $key => $value) {
            // index by first character so lookups only touch relevant words
            $firstChar = mb_substr($key, 0, 1, static::$internalCharset);
            self::$dictionary[$firstChar][$key] = $value;
        }
    }

    /**
     * Get first letters from pinyin.
     *
     * Words that do not start with an ASCII letter are skipped.
     *
     * @param string $pinyin   space separated pinyin.
     * @param array  $settings uses 'uppercase' and 'delimiter'.
     *
     * @return string
     */
    protected function getFirstLetters($pinyin, $settings)
    {
        $letterCase = $settings['uppercase'] ? 'strtoupper' : 'strtolower';

        $letters = [];

        foreach (explode(' ', $pinyin) as $word) {
            if ($word === '') {
                continue;
            }

            // square brackets: the "{0}" offset syntax no longer parses on PHP 8
            $first = $word[0];
            $ord = ord(strtolower($first));

            // 97..122 is "a".."z"
            if ($ord >= 97 && $ord <= 122) {
                $letters[] = $letterCase($first);
            }
        }

        return implode($settings['delimiter'], $letters);
    }

    /**
     * Replace string to pinyin.
     *
     * @param string $string
     *
     * @return string pinyin with tone numbers, words separated by single spaces.
     */
    protected function string2pinyin($string)
    {
        $preparedString = $this->prepare($string);
        $count = mb_strlen($preparedString, static::$internalCharset);
        $dictionary = [];

        // Collect only the word groups whose first character occurs in the input.
        for ($i = 0; $i < $count; ++$i) {
            $char = mb_substr($preparedString, $i, 1, static::$internalCharset);

            if (isset(self::$dictionary[$char])) {
                // Groups of different first characters never share a key, so
                // the union equals a merge, without re-copying the whole map
                // on every step and without renumbering integer-like keys.
                $dictionary += self::$dictionary[$char];
            }
        }

        $pinyin = strtr($preparedString, $dictionary);

        // Dictionary values are padded with a space on both sides, so
        // adjacent replacements leave runs of spaces behind.
        return trim(preg_replace('/ {2,}/', ' ', $pinyin));
    }

    /**
     * Format user's words.
     *
     * @param array $words word => pinyin pairs.
     *
     * @return array the same keys with normalised pinyin values.
     */
    public static function formatWords($words)
    {
        foreach ($words as $word => $pinyin) {
            $words[$word] = static::formatDictPinyin($pinyin);
        }

        return $words;
    }

    /**
     * Format pinyin to lowercase.
     *
     * The result is the trimmed input padded with one space on each side,
     * with every "letters + optional colon + digit" syllable lowercased.
     *
     * @param string $pinyin pinyin string.
     *
     * @return string
     */
    protected static function formatDictPinyin($pinyin)
    {
        $pinyin = trim($pinyin);

        return preg_replace_callback('/[a-z]{1,}:?\d{1}\s?/i', function ($matches) {
            return strtolower($matches[0]);
        }, " {$pinyin} ");
    }

    /**
     * Check if the string has Chinese characters.
     *
     * @param string $string string to check.
     *
     * @return int result of preg_match().
     */
    protected function containChinese($string)
    {
        return preg_match('/\p{Han}+/u', $string);
    }

    /**
     * Remove the non-Chinese characters.
     *
     * @param string $string source string.
     *
     * @return string
     */
    public function justChinese($string)
    {
        return preg_replace('/[^\p{Han}]/u', '', $string);
    }

    /**
     * Prepare the string.
     *
     * Puts a backslash between a letter and a digit that follows it, so that
     * such input is not matched as a tone number by addAccents()/removeTone().
     * parse() strips the backslash again.
     *
     * @param string $string source string.
     *
     * @return string
     */
    protected function prepare($string)
    {
        $pattern = [
            // NOTE: the "A-z" range also covers the punctuation between "Z" and "a".
            '/([A-z])(\d)/' => '$1\\\\\2', // test4 => test\\4
        ];

        return preg_replace(array_keys($pattern), $pattern, $string);
    }

    /**
     * Add delimiter.
     *
     * @param string $string    whitespace separated string.
     * @param string $delimiter replacement for every run of whitespace.
     *
     * @return string
     */
    protected function delimit($string, $delimiter = '')
    {
        return preg_replace('/\s+/', strval($delimiter), trim($string));
    }

    /**
     * Remove tone.
     *
     * @param string $string string with tone.
     *
     * @return string
     */
    protected function removeTone($string)
    {
        $replacement = [
            '/u:/' => 'u',
            '/([a-z])[1-5]/i' => '\\1',
        ];

        return preg_replace(array_keys($replacement), $replacement, $string);
    }

    /**
     * Credits for these 2 functions go to Bouke Versteegh, who shared these
     * at http://stackoverflow.com/questions/1598856/convert-numbered-to-accentuated-pinyin.
     *
     * @param string $string The pinyin string with tone numbers, i.e. "ni3 hao3"
     *
     * @return string The formatted string with tone marks, i.e. "nǐ hǎo"
     */
    protected function addAccents($string)
    {
        // find words with a number behind them, and replace with callback fn.
        $accented = preg_replace_callback(
            '~([a-zA-ZüÜ]+\:?)([1-5])~',
            [$this, 'addAccentsCallback'],
            $string
        );

        // any "u:" left without a tone number still becomes "ü"
        return str_replace('u:', 'ü', $accented);
    }

    /**
     * Helper callback.
     *
     * @param array $match array(full match, syllable, tone number).
     *
     * @return string the syllable with its tone mark.
     */
    protected function addAccentsCallback($match)
    {
        static $accentmap = null;

        if ($accentmap === null) {
            // where to place the accent marks
            $stars = 'a* e* i* o* u* ü* ü* '.
                     'A* E* I* O* U* Ü* '.
                     'a*i a*o e*i ia* ia*o ie* io* iu* '.
                     'A*I A*O E*I IA* IA*O IE* IO* IU* '.
                     'o*u ua* ua*i ue* ui* uo* üe* '.
                     'O*U UA* UA*I UE* UI* UO* ÜE*';
            $nostars = 'a e i o u u: ü '.
                       'A E I O U Ü '.
                       'ai ao ei ia iao ie io iu '.
                       'AI AO EI IA IAO IE IO IU '.
                       'ou ua uai ue ui uo üe '.
                       'OU UA UAI UE UI UO ÜE';

            // build an array like array('a' => 'a*') and store statically
            $accentmap = array_combine(explode(' ', $nostars), explode(' ', $stars));
        }

        $vowels = ['a*', 'e*', 'i*', 'o*', 'u*', 'ü*', 'A*', 'E*', 'I*', 'O*', 'U*', 'Ü*'];

        // replacements for $vowels, indexed by tone number
        $pinyin = [
            1 => ['ā', 'ē', 'ī', 'ō', 'ū', 'ǖ', 'Ā', 'Ē', 'Ī', 'Ō', 'Ū', 'Ǖ'],
            2 => ['á', 'é', 'í', 'ó', 'ú', 'ǘ', 'Á', 'É', 'Í', 'Ó', 'Ú', 'Ǘ'],
            3 => ['ǎ', 'ě', 'ǐ', 'ǒ', 'ǔ', 'ǚ', 'Ǎ', 'Ě', 'Ǐ', 'Ǒ', 'Ǔ', 'Ǚ'],
            4 => ['à', 'è', 'ì', 'ò', 'ù', 'ǜ', 'À', 'È', 'Ì', 'Ò', 'Ù', 'Ǜ'],
            5 => ['a', 'e', 'i', 'o', 'u', 'ü', 'A', 'E', 'I', 'O', 'U', 'Ü'],
        ];

        list(, $word, $tone) = $match;

        // add star to vowelcluster
        $word = strtr($word, $accentmap);

        // replace starred letter with accented
        return str_replace($vowels, $pinyin[$tone], $word);
    }
}//end class
430
|
|
|
|
431
|
|
|
|
This check marks implicit conversions of arrays to boolean values in a comparison. While in PHP an empty array is considered to be equal (but not identical) to false, this is not always apparent.
Consider making the comparison explicit by using `empty(...)` or `!empty(...)` instead.