1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
/* |
4
|
|
|
* This file is part of the overtrue/pinyin. |
5
|
|
|
* |
6
|
|
|
* (c) 2016 overtrue <[email protected]> |
7
|
|
|
*/ |
8
|
|
|
|
9
|
|
|
namespace Overtrue\Pinyin; |
10
|
|
|
|
11
|
|
|
use InvalidArgumentException; |
12
|
|
|
|
13
|
1 |
|
// Global-scope counterparts of the Pinyin::NONE / Pinyin::ASCII /
// Pinyin::UNICODE option values. Each definition is guarded so that a
// constant which already exists does not trigger an "already defined"
// diagnostic; an existing value is left untouched either way.
defined('PINYIN_NONE') || define('PINYIN_NONE', 'none');
defined('PINYIN_ASCII') || define('PINYIN_ASCII', 'ascii');
defined('PINYIN_UNICODE') || define('PINYIN_UNICODE', 'unicode');
16
|
|
|
|
17
|
|
|
/** |
18
|
|
|
* Chinese to pinyin translator. |
19
|
|
|
* |
20
|
|
|
* @author overtrue <[email protected]> |
21
|
|
|
* @copyright 2015 overtrue <[email protected]> |
22
|
|
|
* |
23
|
|
|
* @link https://github.com/overtrue/pinyin |
24
|
|
|
* @link http://overtrue.me |
25
|
|
|
*/ |
26
|
|
|
class Pinyin
{
    const NONE = 'none';
    const ASCII = 'ascii';
    const UNICODE = 'unicode';

    /**
     * Dict loader.
     *
     * @var \Overtrue\Pinyin\DictLoaderInterface|null
     */
    protected $loader;

    /**
     * Punctuations map: full-width / CJK punctuation => ASCII replacement.
     *
     * Used by sentence() both to whitelist these characters and to
     * translate them in the final output.
     *
     * @var array
     */
    protected $punctuations = array(
        '，' => ',',
        '。' => '.',
        '！' => '!',
        '？' => '?',
        '：' => ':',
        '“' => '"',
        '”' => '"',
        '‘' => "'",
        '’' => "'",
    );

    /**
     * Constructor.
     *
     * @param \Overtrue\Pinyin\DictLoaderInterface|null $loader Optional dictionary loader;
     *                                                          when omitted, getLoader()
     *                                                          creates a FileDictLoader.
     */
    public function __construct(DictLoaderInterface $loader = null)
    {
        $this->loader = $loader;
    }

    /**
     * Convert string to pinyin.
     *
     * @param string $string
     * @param string $option One of self::NONE (no tones), self::ASCII (tone
     *                       number appended) or self::UNICODE (tone marks kept).
     *
     * @return array List of pinyin words, re-indexed from 0.
     */
    public function convert($string, $option = self::NONE)
    {
        return $this->splitWords($this->romanize($string), $option);
    }

    /**
     * Convert string (person name) to pinyin.
     *
     * Same as convert(), except that a leading surname is first translated
     * through the loader's surname dictionary.
     *
     * @param string $stringName
     * @param string $option     One of self::NONE, self::ASCII or self::UNICODE.
     *
     * @return array List of pinyin words, re-indexed from 0.
     */
    public function convertName($stringName, $option = self::NONE)
    {
        return $this->splitWords($this->romanize($stringName, true), $option);
    }

    /**
     * Return a pinyin permalink from string.
     *
     * @param string $string
     * @param string $delimiter One of '_', '-', '.' or ''.
     *
     * @return string Tone-less pinyin words joined by $delimiter.
     *
     * @throws InvalidArgumentException When $delimiter is not one of the allowed values.
     */
    public function permlink($string, $delimiter = '-')
    {
        if (!in_array($delimiter, array('_', '-', '.', ''), true)) {
            throw new InvalidArgumentException("Delimiter must be one of: '_', '-', '', '.'.");
        }

        return implode($delimiter, $this->convert($string, self::NONE));
    }

    /**
     * Return first letters.
     *
     * @param string $string
     * @param string $delimiter
     *
     * @return string First character of every tone-less pinyin word, joined by $delimiter.
     */
    public function abbr($string, $delimiter = '')
    {
        return implode($delimiter, array_map(function ($pinyin) {
            return $pinyin[0];
        }, $this->convert($string, self::NONE)));
    }

    /**
     * Chinese to pinyin sentence.
     *
     * Unlike convert(), digits, whitespace, underscores and the punctuation
     * listed in $punctuations survive; mapped punctuation is replaced by its
     * ASCII equivalent.
     *
     * @param string $sentence
     * @param bool   $withTone Keep the tone marks when true; strip them otherwise.
     *
     * @return string
     */
    public function sentence($sentence, $withTone = false)
    {
        // Whitelist both the source marks and their ASCII replacements.
        $marks = array_keys($this->punctuations);
        $punctuationsRegex = preg_quote(implode(array_merge($marks, $this->punctuations)), '/');
        $regex = '/[^üāēīōūǖáéíóúǘǎěǐǒǔǚàèìòùǜa-z0-9'.$punctuationsRegex.'\s_]+/iu';

        $pinyin = preg_replace($regex, '', $this->romanize($sentence));

        // Tabs become spaces, then the resulting double spaces are collapsed
        // (single pass, in this order).
        $punctuations = array_merge($this->punctuations, array("\t" => ' ', '  ' => ' '));
        $pinyin = trim(str_replace(array_keys($punctuations), $punctuations, $pinyin));

        return $withTone ? $pinyin : $this->format($pinyin, false);
    }

    /**
     * Loader setter.
     *
     * @param \Overtrue\Pinyin\DictLoaderInterface $loader
     *
     * @return $this
     */
    public function setLoader(DictLoaderInterface $loader)
    {
        $this->loader = $loader;

        return $this;
    }

    /**
     * Return dict loader.
     *
     * When no loader has been injected, a FileDictLoader pointing at the
     * bundled "../data/" directory is created once and reused on later calls.
     *
     * @return \Overtrue\Pinyin\DictLoaderInterface
     */
    public function getLoader()
    {
        if (!$this->loader) {
            $this->loader = new FileDictLoader(__DIR__.'/../data/');
        }

        return $this->loader;
    }

    /**
     * Preprocess.
     *
     * Prefixes every run of ASCII letters, digits, "_" and "-" with a tab,
     * then drops every character that is not Han, punctuation, separator,
     * mark, number, letter or tab.
     *
     * @param string $string
     *
     * @return string
     */
    protected function prepare($string)
    {
        $string = preg_replace_callback('~[a-z0-9_-]+~i', function ($matches) {
            return "\t".$matches[0];
        }, $string);

        return preg_replace("~[^\p{Han}\p{P}\p{Z}\p{M}\p{N}\p{L}\t]~u", '', $string);
    }

    /**
     * Convert Chinese to pinyin.
     *
     * @param string $string
     * @param bool   $isName Translate a leading surname with the surname dictionary first.
     *
     * @return string The prepared string with every dictionary key replaced.
     */
    protected function romanize($string, $isName = false)
    {
        $string = $this->prepare($string);

        $dictLoader = $this->getLoader();

        if ($isName) {
            $string = $this->convertSurname($string, $dictLoader);
        }

        // Apply every dictionary the loader hands to the callback, in order.
        $dictLoader->map(function ($dictionary) use (&$string) {
            $string = strtr($string, $dictionary);
        });

        return $string;
    }

    /**
     * Convert Chinese Surname to pinyin.
     *
     * Only a surname located at the very start of the string is replaced.
     *
     * @param string                               $string
     * @param \Overtrue\Pinyin\DictLoaderInterface $dictLoader
     *
     * @return string
     */
    protected function convertSurname($string, $dictLoader)
    {
        $dictLoader->mapSurname(function ($dictionary) use (&$string) {
            foreach ($dictionary as $surname => $pinyin) {
                if (strpos($string, $surname) === 0) {
                    // Use an explicit encoding for both calls so the offset is
                    // counted in UTF-8 characters whatever mbstring's internal
                    // encoding is. The explicit length keeps this working on
                    // PHP versions that treat a null length as 0.
                    $string = $pinyin.mb_substr(
                        $string,
                        mb_strlen($surname, 'UTF-8'),
                        mb_strlen($string, 'UTF-8'),
                        'UTF-8'
                    );

                    break;
                }
            }
        });

        return $string;
    }

    /**
     * Split a romanized string into pinyin words and apply the output option.
     *
     * @param string $pinyin Romanized string as returned by romanize().
     * @param string $option One of self::NONE, self::ASCII or self::UNICODE.
     *
     * @return array List of pinyin words, re-indexed from 0.
     */
    protected function splitWords($pinyin, $option)
    {
        // Anything that is not a (toned) pinyin letter acts as a separator;
        // array_filter() drops the empty fragments.
        $split = array_filter(preg_split('/[^üāēīōūǖáéíóúǘǎěǐǒǔǚàèìòùǜa-z]+/iu', $pinyin));

        if ($option !== self::UNICODE) {
            foreach ($split as $index => $word) {
                $split[$index] = $this->format($word, $option === self::ASCII);
            }
        }

        return array_values($split);
    }

    /**
     * Format.
     *
     * Replaces toned vowels by their plain ASCII letters ("ü" with a tone
     * becomes "v", "üe" with a tone becomes "ue").
     *
     * @param string $pinyin
     * @param bool   $tone   Append the tone number of each replaced vowel to the end of the string.
     *
     * @return string
     */
    protected function format($pinyin, $tone = false)
    {
        // The two-character "üe" sequences come first so they are handled
        // before the single-vowel entries.
        $replacements = array(
            'üē' => array('ue', 1), 'üé' => array('ue', 2), 'üě' => array('ue', 3), 'üè' => array('ue', 4),
            'ā' => array('a', 1), 'ē' => array('e', 1), 'ī' => array('i', 1), 'ō' => array('o', 1), 'ū' => array('u', 1), 'ǖ' => array('v', 1),
            'á' => array('a', 2), 'é' => array('e', 2), 'í' => array('i', 2), 'ó' => array('o', 2), 'ú' => array('u', 2), 'ǘ' => array('v', 2),
            'ǎ' => array('a', 3), 'ě' => array('e', 3), 'ǐ' => array('i', 3), 'ǒ' => array('o', 3), 'ǔ' => array('u', 3), 'ǚ' => array('v', 3),
            'à' => array('a', 4), 'è' => array('e', 4), 'ì' => array('i', 4), 'ò' => array('o', 4), 'ù' => array('u', 4), 'ǜ' => array('v', 4),
        );

        foreach ($replacements as $unicode => $replacement) {
            if (false !== strpos($pinyin, $unicode)) {
                $pinyin = str_replace($unicode, $replacement[0], $pinyin).($tone ? $replacement[1] : '');
            }
        }

        return $pinyin;
    }
}
279
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.