Passed
Push — master ( 6e7810...23e828 )
by jxlwqq
02:00
created

ChineseTypesetting::removeSpace()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 17
Code Lines 10

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 2
eloc 10
nc 2
nop 1
dl 0
loc 17
rs 9.9332
c 0
b 0
f 0
1
<?php
2
/**
3
 * Created by PhpStorm.
4
 * User: jxlwqq
5
 * Date: 2018/7/26
6
 * Time: 09:33.
7
 */
8
9
namespace Jxlwqq\ChineseTypesetting;
10
11
class ChineseTypesetting
12
{
13
    /**
     * Character-class ranges covering CJK (Chinese, Japanese, Korean) text.
     *
     * The value is a fragment to be embedded inside a regex character class;
     * the \x{...} escapes require the pattern to use the `u` modifier.
     *
     * @link http://unicode-table.com/en/
     *
     * @var string
     */
    private $cjk = ''.
    '\x{2e80}-\x{2eff}'. // CJK Radicals Supplement
    '\x{2f00}-\x{2fdf}'. // Kangxi Radicals
    '\x{3040}-\x{309f}'. // Hiragana
    '\x{30a0}-\x{30ff}'. // Katakana
    '\x{3100}-\x{312f}'. // Bopomofo
    '\x{3200}-\x{32ff}'. // Enclosed CJK Letters and Months
    '\x{3400}-\x{4dbf}'. // CJK Unified Ideographs Extension A
    '\x{4e00}-\x{9fff}'. // CJK Unified Ideographs
    '\x{f900}-\x{faff}'; // CJK Compatibility Ideographs

    /**
     * Character-class ranges for "letters and numbers": Latin letters, Greek
     * letters (as used in mathematics, science and engineering) and Arabic
     * digits. Like $cjk, this is a fragment for use inside a character class.
     *
     * @link https://en.wikipedia.org/wiki/Greek_letters_used_in_mathematics,_science,_and_engineering
     *
     * @var string
     */
    private $ln = ''.
    'A-Za-z'.
    'Α-Ωα-ω'.
    '0-9';

    /**
     * Full-width punctuation marks that are kept as-is in Chinese text.
     *
     * These must be the full-width code points (e.g. U+FF01, U+FF0C), not
     * their ASCII look-alikes; otherwise the methods that embed this list in
     * a character class would also act on ordinary ASCII punctuation.
     *
     * @var string
     */
    private $fullwidthPunctuation = '!?。,;:、“”‘’『』「」〖〗【】《》()';

    /**
     * Regex alternation matching one "space": ASCII whitespace, the HTML
     * entity &nbsp;, or the ideographic space U+3000.
     *
     * The ideographic space is written as its UTF-8 bytes so that it stays
     * visible in the source and survives character-width normalisation.
     *
     * @var string
     */
    private $space = "\\s|&nbsp;|\xE3\x80\x80";
58
59
    /**
     * Correct typesetting using all public correctors, or only the named ones.
     *
     * When $methods is empty, every public method of this object is collected
     * via reflection. Each selected method is called with the text as its only
     * argument and its result is fed to the next one.
     *
     * Ordering and filtering rules:
     *  - removeEmptyTag already covers removeEmptyParagraph, so the latter is
     *    dropped when both are requested;
     *  - insertSpace always runs last;
     *  - this method itself, magic methods, unknown names and non-string
     *    entries are skipped.
     *
     * @param string $text    Text (plain or HTML) to correct.
     * @param array  $methods Names of the correctors to apply; empty means all.
     *
     * @throws \ReflectionException
     *
     * @return mixed The corrected text, as returned by the last corrector.
     */
    public function correct($text, array $methods = [])
    {
        if (empty($methods)) {
            $class = new \ReflectionClass($this);
            foreach ($class->getMethods(\ReflectionMethod::IS_PUBLIC) as $methodObj) {
                $methods[] = $methodObj->name;
            }
        }

        // Only strings can name a method; dropping everything else up front
        // keeps array_unique() and the string functions below warning-free.
        $methods = array_unique(array_filter($methods, 'is_string'));

        // removeEmptyTag subsumes removeEmptyParagraph.
        if (in_array('removeEmptyTag', $methods, true)) {
            $methods = array_diff($methods, ['removeEmptyParagraph']);
        }

        // insertSpace must be the last corrector to run.
        if (in_array('insertSpace', $methods, true)) {
            $methods = array_diff($methods, ['insertSpace']);
            $methods[] = 'insertSpace';
        }

        foreach ($methods as $method) {
            // PHP method names are case-insensitive, so the self-call guard
            // has to be case-insensitive as well.
            if (strcasecmp($method, __FUNCTION__) === 0) {
                continue;
            }
            // Magic methods (__construct, __toString, ...) are not correctors.
            if (strncmp($method, '__', 2) === 0 || !method_exists($this, $method)) {
                continue;
            }
            $text = $this->$method($text);
        }

        return $text;
    }
103
104
    /**
     * Fix incorrect punctuation in Chinese text.
     *
     * Steps, in order:
     *  1. normalise ellipses to a single "……";
     *  2. turn a half-width punctuation mark that directly follows a CJK
     *     character (or one of ")", "》", "”") into its full-width form;
     *  3. collapse a run of the same full-width punctuation mark to one.
     *
     * Based on
     * @link https://github.com/ricoa/copywriting-correct/blob/master/src/Correctors/CharacterCorrector.php
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function fixPunctuation($text)
    {
        // Three or more full stops/periods, or a lone "…", become "……";
        // the second pass collapses any doubled result back to one "……".
        $text = preg_replace('/([。\.]){3,}|(…){1}/iu', '……', $text);
        $text = preg_replace('/(……){2,}/iu', '……', $text);

        // Half-width mark => full-width replacement. The values must be the
        // full-width code points; mapping ASCII to ASCII would be a no-op.
        $replace = [
            '!' => '!',
            '?' => '?',
            '.' => '。',
            ',' => ',',
            '(' => '(',
            ')' => ')',
            ':' => ':',
            ';' => ';',
        ];

        $text = preg_replace_callback(
            '/(['.$this->cjk.')》”])([!?\.,\(\):;])/iu',
            function ($matches) use ($replace) {
                return $matches[1].$replace[$matches[2]];
            },
            $text
        );

        // Repeated full-width punctuation: keep only the first occurrence.
        $text = preg_replace('/(['.$this->fullwidthPunctuation.'])\1{1,}/iu', '\1', $text);

        return $text;
    }
141
142
    /**
     * Limited full-width to half-width conversion.
     *
     * Converts full-width digits, Latin letters, the ideographic space and a
     * fixed set of symbols to their ASCII equivalents. Full-width punctuation
     * used in Chinese prose (e.g. ",", "!", "?") is deliberately not
     * converted.
     *
     * @link https://github.com/mzlogin/chinese-copywriting-guidelines#全角和半角
     *
     * @param string $text
     *
     * @return string
     */
    public function full2Half($text)
    {
        // Keys are the full-width code points (U+FF10.., U+FF21.., U+FF41..);
        // they must not be replaced by their ASCII look-alikes.
        $map = [
            // Digits.
            '0' => '0', '1' => '1', '2' => '2', '3' => '3', '4' => '4',
            '5' => '5', '6' => '6', '7' => '7', '8' => '8', '9' => '9',
            // Upper-case Latin letters.
            'A' => 'A', 'B' => 'B', 'C' => 'C', 'D' => 'D', 'E' => 'E',
            'F' => 'F', 'G' => 'G', 'H' => 'H', 'I' => 'I', 'J' => 'J',
            'K' => 'K', 'L' => 'L', 'M' => 'M', 'N' => 'N', 'O' => 'O',
            'P' => 'P', 'Q' => 'Q', 'R' => 'R', 'S' => 'S', 'T' => 'T',
            'U' => 'U', 'V' => 'V', 'W' => 'W', 'X' => 'X', 'Y' => 'Y',
            'Z' => 'Z',
            // Lower-case Latin letters.
            'a' => 'a', 'b' => 'b', 'c' => 'c', 'd' => 'd', 'e' => 'e',
            'f' => 'f', 'g' => 'g', 'h' => 'h', 'i' => 'i', 'j' => 'j',
            'k' => 'k', 'l' => 'l', 'm' => 'm', 'n' => 'n', 'o' => 'o',
            'p' => 'p', 'q' => 'q', 'r' => 'r', 's' => 's', 't' => 't',
            'u' => 'u', 'v' => 'v', 'w' => 'w', 'x' => 'x', 'y' => 'y',
            'z' => 'z',
            // Ideographic space (U+3000), written as bytes so it is visible.
            "\xE3\x80\x80" => ' ',
            // Symbols.
            '-' => '-', '/' => '/', '%' => '%', '#' => '#', '@' => '@',
            '&' => '&', '<' => '<', '>' => '>', '[' => '[', ']' => ']',
            '{' => '{', '}' => '}', '\' => '\\', '|' => '|', '+' => '+',
            '=' => '=', '_' => '_', '^' => '^', '`' => '`',
            // Full-width macron (U+FFE3) is treated as a tilde.
            "\xEF\xBF\xA3" => '~',
        ];

        return strtr($text, $map);
    }
175
176
    /**
     * Insert a space between CJK characters and adjacent Latin letters, Greek
     * letters (as used in mathematics, science and engineering) or digits.
     *
     * The rules are applied strictly in the order listed below; later rules
     * rely on the output of earlier ones.
     *
     * Based on
     * @link https://github.com/Rakume/pangu.php/blob/master/pangu.php
     *
     * @link https://github.com/mzlogin/chinese-copywriting-guidelines#空格
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function insertSpace($text)
    {
        // Reusable pattern fragments.
        $cjk = '(['.$this->cjk.'])';
        $ln = $this->ln;

        // Rules that run before the bracket handling: [pattern, replacement].
        $before = [
            // CJK followed by a quote, and a quote followed by CJK.
            [$cjk.'(["\'])', '$1 $2'],
            ['(["\'])'.$cjk, '$1 $2'],
            // Strip whitespace just inside a quoted span.
            ['(["\']+)(\s*)(.+?)(\s*)(["\']+)', '$1$3$5'],
            // Hash tags next to CJK.
            [$cjk.'(#(\S+))', '$1 $2'],
            ['((\S+)#)'.$cjk, '$1 $3'],
            // CJK / letter-or-digit / operator, in both directions.
            [$cjk.'(['.$ln.'])([\+\-\*\/=&\\|<>])', '$1 $2 $3'],
            ['([\+\-\*\/=&\\|<>])(['.$ln.'])'.$cjk, '$1 $2 $3'],
        ];

        // Rules that run after the bracket handling.
        $after = [
            // Strip whitespace just inside a bracketed span.
            ['([<\[\{\(]+)(\s*)(.+?)(\s*)([>\]\}\)]+)', '$1$3$5'],
            // CJK next to a letter, digit or symbol, in both directions.
            [$cjk.'(['.$ln.'`@&%\=\$\^\*\-\+\\/|\\\])', '$1 $2'],
            ['(['.$ln.'`~!%&=;\|\,\.\:\?\$\^\*\-\+\/\\\])'.$cjk, '$1 $2'],
        ];

        foreach ($before as $rule) {
            $text = preg_replace('/'.$rule[0].'/iu', $rule[1], $text);
        }

        // Brackets: first try a whole bracketed span enclosed by CJK on both
        // sides. Only when that changes nothing are the one-sided rules used.
        $spaced = preg_replace('/'.$cjk.'([<\[\{\(]+(.*?)[>\]\}\)]+)'.$cjk.'/iu', '$1 $2 $4', $text);
        $unchanged = ($spaced === $text);
        $text = $spaced;
        if ($unchanged) {
            $text = preg_replace('/'.$cjk.'([<>\[\]\{\}\(\)])'.'/iu', '$1 $2', $text);
            $text = preg_replace('/'.'([<>\[\]\{\}\(\)])'.$cjk.'/iu', '$1 $2', $text);
        }

        foreach ($after as $rule) {
            $text = preg_replace('/'.$rule[0].'/iu', $rule[1], $text);
        }

        return $text;
    }
265
266
    /**
     * Remove a space that sits directly before or after a full-width
     * punctuation mark; no spacing is needed around full-width punctuation.
     *
     * One space token (see $space) is removed on each side of a mark.
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function removeSpace($text)
    {
        $punctuation = '(['.$this->fullwidthPunctuation.'])';

        // $space is an alternation ("\s|&nbsp;|..."), so it must be wrapped
        // in a group. Inside a character class it would instead match the
        // single characters "&", "n", "b", "s", "p", ";" and "|", deleting
        // letters that happen to touch a punctuation mark.
        $space = '(?:'.$this->space.')';

        $patterns = [
            // Mark followed by a space.
            '/'.$punctuation.$space.'/u',
            // Space followed by a mark.
            '/'.$space.$punctuation.'/u',
        ];

        // Patterns are applied one after another; both keep only the mark.
        return preg_replace($patterns, '$1', $text);
    }
291
    /**
     * Rewrite English proper nouns with their canonical capitalisation.
     *
     * The dictionary is loaded from data/dict.php, extended with $extend and
     * reduced by $ignore. Every case-insensitive occurrence of a noun that is
     * not glued to another letter or a dot, and that is not inside an HTML
     * tag, is replaced by the dictionary spelling.
     *
     * @param string $text
     * @param array  $extend Additional nouns (canonical spelling).
     * @param array  $ignore Nouns to leave untouched (exact dictionary spelling).
     *
     * @return null|string|string[]
     */
    public function properNoun($text, array $extend = [], array $ignore = [])
    {
        $dict = include __DIR__.'/../data/dict.php';
        if (!is_array($dict)) {
            // A missing or malformed dictionary file must not break the merge.
            $dict = [];
        }
        if (!empty($extend)) {
            $dict = array_merge($dict, $extend);
        }
        if (!empty($ignore)) {
            $dict = array_diff($dict, $ignore);
        }

        foreach ($dict as $noun) {
            if (!is_string($noun) || $noun === '') {
                continue;
            }

            // Quote the noun: names such as "C++", "Node.js" or "A/B" contain
            // regex metacharacters or the pattern delimiter.
            // The trailing lookahead skips matches inside an HTML tag.
            $pattern = '/(?<!\.|[a-z])'.preg_quote($noun, '/').'(?!\.|[a-z])(?!([^<]+)?>)/i';

            // A callback returns the noun verbatim, so "$" or "\" in a noun
            // are not interpreted as back-references.
            $text = preg_replace_callback($pattern, function () use ($noun) {
                return $noun;
            }, $text);
        }

        return $text;
    }
318
319
    /**
     * Remove class attributes from HTML tags.
     *
     * Matches the attribute name case-insensitively, with optional whitespace
     * around "=", and with a double- or single-quoted value that may be
     * empty. The whitespace preceding the attribute is removed with it.
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function removeClass($text)
    {
        return preg_replace('#\s+class\s*=\s*(?:"[^"]*"|\'[^\']*\')#i', '', $text);
    }
331
332
    /**
     * Remove id attributes from HTML tags.
     *
     * Matches the attribute name case-insensitively, with optional whitespace
     * around "=", and with a double- or single-quoted value that may be
     * empty. The whitespace preceding the attribute is removed with it.
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function removeId($text)
    {
        return preg_replace('#\s+id\s*=\s*(?:"[^"]*"|\'[^\']*\')#i', '', $text);
    }
344
345
    /**
     * Remove inline style attributes from HTML tags.
     *
     * Matches the attribute name case-insensitively, with optional whitespace
     * around "=", and with a double- or single-quoted value that may be
     * empty. The whitespace preceding the attribute is removed with it.
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function removeStyle($text)
    {
        return preg_replace('#\s+style\s*=\s*(?:"[^"]*"|\'[^\']*\')#i', '', $text);
    }
357
358
    /**
     * Remove empty paragraph tags.
     *
     * A paragraph is empty when it contains nothing but space tokens (see
     * $space): whitespace, &nbsp; or ideographic spaces, in any number.
     * Only <p> elements are matched, not other tags starting with "p" such
     * as <pre>.
     *
     * @param string $text
     * @param bool   $nested When true, repeat until no empty paragraph is
     *                       left, so paragraphs that become empty after an
     *                       inner removal are removed too.
     *
     * @return string|string[] The text with empty paragraphs removed; if the
     *                         regex engine fails, the last good text.
     */
    public function removeEmptyParagraph($text, $nested = true)
    {
        // $space is an alternation, so it is grouped and repeated. Inside a
        // character class it would match single letters like "n" or "p" and
        // would never match the multi-character "&nbsp;".
        $pattern = '/<p(?:\s[^>]*)?>(?:'.$this->space.')*<\/p\s*>/i';

        do {
            $result = preg_replace($pattern, '', $text, -1, $count);
            if ($result === null) {
                // Regex engine error: keep what we have instead of losing the text.
                break;
            }
            $text = $result;
        } while ($nested && $count > 0);

        return $text;
    }
380
381
    /**
     * Remove all empty HTML elements.
     *
     * An element is empty when its opening tag is followed, after any number
     * of space tokens (see $space), by the closing tag of the same name.
     * Opening tags whose attributes contain "/" are left alone, as before.
     *
     * @param string $text
     * @param bool   $nested When true, repeat until no empty element is left,
     *                       so elements that become empty after an inner
     *                       removal are removed too.
     *
     * @return string|string[] The text with empty elements removed; if the
     *                         regex engine fails, the last good text.
     */
    public function removeEmptyTag($text, $nested = true)
    {
        // - $space is an alternation and must be grouped; inside a character
        //   class it matched single letters ("n", "b", "s", "p") and removed
        //   elements with real content such as <b>ps</b>.
        // - \1 ties the closing tag to the opening tag, so a void element
        //   followed by an unrelated closing tag (<br></p>) is not removed.
        $pattern = '/<([a-z][a-z0-9:-]*)(?:\s[^\/>]*)?>(?:'.$this->space.')*<\/\1\s*>/i';

        do {
            $result = preg_replace($pattern, '', $text, -1, $count);
            if ($result === null) {
                // Regex engine error: keep what we have instead of losing the text.
                break;
            }
            $text = $result;
        } while ($nested && $count > 0);

        return $text;
    }
403
404
    /**
     * Remove the indentation at the start of a paragraph.
     *
     * Strips every space token (see $space) that directly follows an opening
     * <p> tag. Only <p> elements are matched, so the leading whitespace of
     * <pre> blocks is preserved. The opening tag itself is kept unchanged.
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function removeIndent($text)
    {
        return preg_replace('/(<p(?:\s[^>]*)?>)(?:'.$this->space.')+/i', '$1', $text);
    }
416
}
417