Pinyin::setLoader()   A
last analyzed

Complexity

Conditions 6
Paths 5

Size

Total Lines 19
Code Lines 13

Duplication

Lines 0
Ratio 0 %

Importance

Changes 3
Bugs 1 Features 0
Metric Value
cc 6
eloc 13
c 3
b 1
f 0
nc 5
nop 1
dl 0
loc 19
rs 9.2222
1
<?php
2
/**
3
 * This file is part of the mucts/pinyin.
4
 *
5
 * This source file is subject to the MIT license that is bundled
6
 * with this source code in the file LICENSE.
7
 *
8
 * @version 1.0
9
 * @author herry<[email protected]>
10
 * @copyright © 2020 MuCTS.com All Rights Reserved.
11
 */
12
13
namespace MuCTS\Pinyin;
14
15
use Exception;
16
use MuCTS\Pinyin\Exceptions\InvalidArgumentException;
17
use MuCTS\Pinyin\Interfaces\DictLoader;
18
use MuCTS\Pinyin\Loaders\File;
19
use MuCTS\Pinyin\Loaders\GeneratorFile;
20
use MuCTS\Pinyin\Loaders\MemoryFile;
21
use MuCTS\Support\Arr;
22
23
/**
 * Converts Chinese text to pinyin using a pluggable dictionary loader.
 *
 * Behaviour is tuned with the bit-flag constants declared on this class;
 * flags may be combined with the bitwise OR operator.
 */
class Pinyin
{
    /** @var int Default option value; none of the flag bits declared below are set in it. */
    public const DEFAULT = 4096;
    /** @var int Unicode tone marks: měi hǎo */
    public const TONE = 2;
    /** @var int No tone: mei hao */
    public const NO_TONE = 4;
    /** @var int Numeric tone suffix: mei3 hao3 */
    public const ASCII_TONE = 8;
    /** @var int Translate a person name (surname dictionary is applied first) */
    public const NAME = 16;
    /** @var int Keep digits */
    public const KEEP_NUMBER = 32;
    /** @var int Keep English letters */
    public const KEEP_ENGLISH = 64;
    /** @var int Use "v" in place of "yu", e.g. 吕 "lyu" becomes "lv" */
    public const UMLAUT_V = 128;
    /** @var int Keep punctuation */
    public const KEEP_PUNCTUATION = 256;

    /**
     * Dict loader: either a ready instance or the name of a class that is
     * instantiated lazily by getLoader().
     *
     * @var DictLoader|string
     */
    private $loader;

    /** @var string Directory handed to the loader constructor. */
    private string $path;

    /** @var string[] Short loader names mapped to loader class names. */
    protected array $alias = [
        'file' => File::class,
        'generator' => GeneratorFile::class,
        'memory' => MemoryFile::class
    ];

    /**
     * Punctuations map: source punctuation => ASCII replacement.
     *
     * Keys and values together also form the set of punctuation characters
     * that prepare() keeps when KEEP_PUNCTUATION is set.
     *
     * @var array
     */
    protected array $punctuations = [
        ',' => ',',
        '。' => '.',
        '!' => '!',
        '?' => '?',
        ':' => ':',
        '“' => '"',
        '”' => '"',
        '‘' => "'",
        '’' => "'",
        '_' => '_',
    ];

    /**
     * Constructor.
     *
     * @param string|null $loader Loader alias or class name; null selects the default loader.
     * @param string|null $path   Data directory; null selects the default directory.
     *
     * @throws Exception|InvalidArgumentException
     */
    public function __construct(?string $loader = null, ?string $path = null)
    {
        $this->setLoader($loader);
        $this->setPath($path);
    }

    /**
     * Convert string to pinyin.
     *
     * @param string $string
     * @param int $option Bitwise combination of the class flag constants.
     *
     * @return array List of pinyin words.
     */
    public function convert(string $string, int $option = self::DEFAULT): array
    {
        return $this->splitWords($this->romanize($string, $option), $option);
    }

    /**
     * Convert string (person name) to pinyin.
     *
     * The NAME flag is always forced on, whatever $option contains.
     *
     * @param string $string
     * @param int $option
     *
     * @return array
     */
    public function name(string $string, int $option = self::NAME): array
    {
        $option |= self::NAME;

        return $this->splitWords($this->romanize($string, $option), $option);
    }

    /**
     * Return a pinyin permalink from string.
     *
     * Digits and English letters are always kept. When an int is passed as
     * $delimiter it is treated as $option and the delimiter falls back to '-'.
     *
     * @param string $string
     * @param string|int $delimiter One of '_', '-', '.', ''.
     * @param int $option
     *
     * @return string
     *
     * @throws InvalidArgumentException When the delimiter is not one of the allowed values.
     */
    public function permalink(string $string, $delimiter = '-', int $option = self::DEFAULT): string
    {
        if (is_int($delimiter)) {
            $option = $delimiter;
            $delimiter = '-';
        }

        if (!in_array($delimiter, ['_', '-', '.', ''], true)) {
            throw new InvalidArgumentException("Delimiter must be one of: '_', '-', '', '.'.");
        }

        $words = $this->convert($string, $option | self::KEEP_NUMBER | self::KEEP_ENGLISH);

        return implode($delimiter, $words);
    }

    /**
     * Return first letters.
     *
     * Numeric words are kept whole; every other word contributes its first
     * character. When an int is passed as $delimiter it is treated as $option
     * and the delimiter falls back to ''.
     *
     * @param string $string
     * @param string|int $delimiter
     * @param int $option
     *
     * @return string
     */
    public function abbr(string $string, $delimiter = '', int $option = self::DEFAULT): string
    {
        if (is_int($delimiter)) {
            $option = $delimiter;
            $delimiter = '';
        }

        $initials = array_map(static function ($word) {
            return is_numeric($word) ? $word : mb_substr($word, 0, 1);
        }, $this->convert($string, $option));

        return implode($delimiter, $initials);
    }

    /**
     * Chinese phrase to pinyin.
     *
     * When an int is passed as $delimiter it is treated as $option and the
     * delimiter falls back to a single space.
     *
     * @param string $string
     * @param string|int $delimiter
     * @param int $option
     *
     * @return string
     */
    public function phrase(string $string, $delimiter = ' ', int $option = self::DEFAULT): string
    {
        if (is_int($delimiter)) {
            $option = $delimiter;
            $delimiter = ' ';
        }

        return implode($delimiter, $this->convert($string, $option));
    }

    /**
     * Chinese to pinyin sentence.
     *
     * Punctuation, English letters and digits are always kept. When an int is
     * passed as $delimiter it is treated as $option and the delimiter falls
     * back to a single space.
     *
     * @param string|int $string
     * @param string|int $delimiter
     * @param int $option
     *
     * @return string
     */
    public function sentence($string, $delimiter = ' ', $option = self::NO_TONE)
    {
        if (is_int($delimiter)) {
            $option = $delimiter;
            $delimiter = ' ';
        }

        $flags = $option | self::KEEP_PUNCTUATION | self::KEEP_ENGLISH | self::KEEP_NUMBER;

        return implode($delimiter, $this->convert($string, $flags));
    }

    /**
     * Determine if it's an alias
     *
     * @param string $name
     * @return bool
     */
    protected function isAlias(string $name): bool
    {
        return Arr::exists($this->alias, $name);
    }

    /**
     * Loader setter.
     *
     * Accepts, in order of precedence: null (default File loader), a
     * DictLoader instance, a registered alias, or the name of an existing
     * class implementing DictLoader. Class names are stored as strings and
     * only instantiated by getLoader().
     *
     * @param DictLoader|string|null $loader
     *
     * @return $this
     * @throws Exception|InvalidArgumentException When $loader is none of the accepted forms.
     */
    public function setLoader($loader = null)
    {
        if (null === $loader) {
            $this->loader = File::class;

            return $this;
        }

        if ($loader instanceof DictLoader) {
            $this->loader = $loader;

            return $this;
        }

        // Only strings can be aliases or class names. Guarding here keeps
        // arrays and foreign objects from hitting isAlias()'s string type
        // declaration and surfacing as a TypeError.
        if (is_string($loader)) {
            if ($this->isAlias($loader)) {
                $this->loader = Arr::get($this->alias, $loader);

                return $this;
            }

            if (class_exists($loader) && is_subclass_of($loader, DictLoader::class, true)) {
                $this->loader = $loader;

                return $this;
            }
        }

        throw new InvalidArgumentException('This\'s not valid dict loader class.');
    }

    /**
     * Return dict loader.
     *
     * A loader stored as a class name is instantiated on first use with the
     * configured data path, and the instance is cached for later calls.
     *
     * @return DictLoader
     */
    public function getLoader(): DictLoader
    {
        if (!($this->loader instanceof DictLoader)) {
            $loaderClass = $this->loader;
            $this->loader = new $loaderClass($this->path);
        }

        return $this->loader;
    }

    /**
     * Set data path
     *
     * @param string|null $path Directory to use; null falls back to the "data"
     *                          directory next to this file's parent directory.
     * @return $this
     *
     * @throws InvalidArgumentException When the resolved path is not a directory.
     */
    public function setPath(?string $path): self
    {
        $path = $path ?? dirname(__DIR__) . '/data/';

        if (!is_dir($path)) {
            throw new InvalidArgumentException(sprintf('\'%s\' is not valid data path.', $path));
        }

        $this->path = $path;

        return $this;
    }

    /**
     * Convert Chinese to pinyin.
     *
     * The string is pre-processed, optionally run through the surname
     * dictionary (NAME flag), then translated with every dictionary the
     * loader provides.
     *
     * @param string $string
     * @param int $option
     *
     * @return string
     */
    protected function romanize(string $string, int $option = self::DEFAULT): string
    {
        $string = $this->prepare($string, $option);
        $dictLoader = $this->getLoader();

        if ($this->hasOption($option, self::NAME)) {
            $string = $this->convertSurname($string, $dictLoader);
        }

        $dictLoader->map(function ($dictionary) use (&$string) {
            $string = strtr($string, $dictionary);
        });

        return $string;
    }

    /**
     * Convert Chinese Surname to pinyin.
     *
     * Only a surname located at the very start of the string is replaced, and
     * only the first matching entry of each dictionary is applied.
     *
     * @param string $string
     * @param DictLoader $dictLoader
     *
     * @return string
     */
    protected function convertSurname(string $string, DictLoader $dictLoader): string
    {
        $dictLoader->mapSurname(function ($dictionary) use (&$string) {
            foreach ($dictionary as $surname => $pinyin) {
                $surname = (string) $surname;

                if (0 === strpos($string, $surname)) {
                    // Keep everything after the surname, counted in characters.
                    $rest = mb_substr($string, mb_strlen($surname, 'UTF-8'), null, 'UTF-8');
                    $string = $pinyin . $rest;

                    break;
                }
            }
        });

        return $string;
    }

    /**
     * Split pinyin string to words.
     *
     * Unless the TONE flag is set, every word is passed through formatTone().
     *
     * @param string $pinyin
     * @param int $option
     *
     * @return array
     *
     * @throws InvalidArgumentException When the string cannot be split.
     */
    protected function splitWords(string $pinyin, int $option): array
    {
        // PREG_SPLIT_NO_EMPTY drops only empty fragments. A bare array_filter()
        // would also discard the word "0", losing a kept digit zero.
        $words = preg_split('/\s+/i', $pinyin, -1, PREG_SPLIT_NO_EMPTY);

        if (!is_array($words)) {
            throw new InvalidArgumentException(sprintf('\'%s\' is not valid pinyin.', $pinyin));
        }

        if (!$this->hasOption($option, self::TONE)) {
            foreach ($words as $index => $word) {
                $words[$index] = $this->formatTone($word, $option);
            }
        }

        return array_values($words);
    }

    /**
     * Tell whether every bit of $check is set in $option.
     *
     * @param int $option
     * @param int $check
     *
     * @return bool
     */
    public function hasOption(int $option, int $check): bool
    {
        return ($option & $check) === $check;
    }

    /**
     * Pre-process.
     *
     * Prefixes every run of ASCII letters, digits, "_" and "-" with a tab so it
     * later splits into its own word, normalises punctuation when
     * KEEP_PUNCTUATION is set, and finally strips every character that is not
     * explicitly allowed by the option flags.
     *
     * @param string $string
     * @param int $option
     *
     * @return string
     *
     * @throws InvalidArgumentException When the string cannot be processed, e.g. invalid UTF-8.
     */
    protected function prepare(string $string, int $option = self::DEFAULT): string
    {
        $string = preg_replace_callback('~[a-z0-9_-]+~i', static function ($matches) {
            return "\t" . $matches[0];
        }, $string);

        if (null === $string) {
            throw new InvalidArgumentException('The given string could not be pre-processed.');
        }

        // Character-class fragments that survive the final clean-up.
        $regex = ['\p{Han}', '\p{Z}', '\p{M}', "\t"];

        if ($this->hasOption($option, self::KEEP_NUMBER)) {
            $regex[] = '0-9';
        }

        if ($this->hasOption($option, self::KEEP_ENGLISH)) {
            $regex[] = 'a-zA-Z';
        }

        if ($this->hasOption($option, self::KEEP_PUNCTUATION)) {
            // Besides the punctuation map, collapse double spaces to one.
            $punctuations = array_merge($this->punctuations, ['  ' => ' ']);
            $string = trim(str_replace(array_keys($punctuations), $punctuations, $string));

            $kept = array_merge(array_keys($this->punctuations), $this->punctuations);
            $regex[] = preg_quote(implode($kept), '~');
        }

        $cleaned = preg_replace(sprintf('~[^%s]~u', implode($regex)), '', $string);

        // With the "u" modifier preg_replace() yields null for malformed UTF-8;
        // report that as an argument error instead of a return-type TypeError.
        if (null === $cleaned) {
            throw new InvalidArgumentException('The given string is not valid UTF-8.');
        }

        return $cleaned;
    }

    /**
     * Format.
     *
     * Replaces tone-marked vowels with their plain form. With ASCII_TONE the
     * tone number is appended to the end of the word; with UMLAUT_V a "yu"
     * replacement becomes "v".
     *
     * @param string $pinyin
     * @param int $option
     *
     * @return string
     */
    protected function formatTone(string $pinyin, int $option = self::NO_TONE): string
    {
        // Two-character sequences come first so they win over single vowels.
        $replacements = [
            'üē' => ['ue', 1], 'üé' => ['ue', 2], 'üě' => ['ue', 3], 'üè' => ['ue', 4],
            'ā' => ['a', 1], 'ē' => ['e', 1], 'ī' => ['i', 1], 'ō' => ['o', 1], 'ū' => ['u', 1], 'ǖ' => ['yu', 1],
            'á' => ['a', 2], 'é' => ['e', 2], 'í' => ['i', 2], 'ó' => ['o', 2], 'ú' => ['u', 2], 'ǘ' => ['yu', 2],
            'ǎ' => ['a', 3], 'ě' => ['e', 3], 'ǐ' => ['i', 3], 'ǒ' => ['o', 3], 'ǔ' => ['u', 3], 'ǚ' => ['yu', 3],
            'à' => ['a', 4], 'è' => ['e', 4], 'ì' => ['i', 4], 'ò' => ['o', 4], 'ù' => ['u', 4], 'ǜ' => ['yu', 4],
        ];

        $useV = $this->hasOption($option, self::UMLAUT_V);
        $appendTone = $this->hasOption($option, self::ASCII_TONE);

        foreach ($replacements as $unicode => $replacement) {
            if (false === strpos($pinyin, $unicode)) {
                continue;
            }

            [$plain, $tone] = $replacement;

            if ($useV && 'yu' === $plain) {
                $plain = 'v';
            }

            $pinyin = str_replace($unicode, $plain, $pinyin) . ($appendTone ? $tone : '');
        }

        return $pinyin;
    }
}
420