ChineseTypesetting::removeEmptyParagraph()   A
last analyzed

Complexity

Conditions 3
Paths 2

Size

Total Lines 12
Code Lines 7

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 3
eloc 7
nc 2
nop 2
dl 0
loc 12
rs 10
c 0
b 0
f 0
1
<?php
2
3
namespace Jxlwqq\ChineseTypesetting;
4
5
/**
 * Class ChineseTypesetting.
 *
 * Typesetting helpers for mixed Chinese/Latin text and small HTML fragments.
 * Every public method except correct() accepts the text as its first argument and
 * returns the transformed text, which is what lets correct() chain them by name.
 */
class ChineseTypesetting
{
    /**
     * Character-class body (no surrounding brackets) covering CJK code point ranges.
     * "cjk" is short for Chinese, Japanese and Korean.
     *
     * The ranges use PCRE \x{...} notation, so every pattern embedding this value
     * must be compiled with the "u" modifier.
     *
     * @link http://unicode-table.com/en/
     *
     * @var string
     */
    private $cjk = ''.
    '\x{2e80}-\x{2eff}'. // CJK Radicals Supplement
    '\x{2f00}-\x{2fdf}'. // Kangxi Radicals
    '\x{3040}-\x{309f}'. // Hiragana
    '\x{30a0}-\x{30ff}'. // Katakana
    '\x{3100}-\x{312f}'. // Bopomofo
    '\x{3200}-\x{32ff}'. // Enclosed CJK Letters and Months
    '\x{3400}-\x{4dbf}'. // CJK Unified Ideographs Extension A
    '\x{4e00}-\x{9fff}'. // CJK Unified Ideographs
    '\x{f900}-\x{faff}'; // CJK Compatibility Ideographs

    /**
     * Character-class body covering Latin letters, Greek letters (as used in
     * mathematics, science and engineering) and ASCII digits.
     * "ln" is short for letters and numbers.
     *
     * The Greek range is written with literal multi-byte characters, so patterns
     * embedding this value need the "u" modifier as well.
     *
     * @link https://en.wikipedia.org/wiki/Greek_letters_used_in_mathematics,_science,_and_engineering
     *
     * @var string
     */
    private $ln = ''.
    'A-Za-z'.
    'Α-Ωα-ω'.
    '0-9';

    /**
     * Full-width punctuation marks that are kept as they are.
     *
     * Used as a character-class body. The marks must be the full-width forms
     * (U+FF01, U+FF1F, U+FF0C, ...): with their ASCII look-alikes removeSpace()
     * would strip the space after every ordinary comma or colon.
     *
     * @var string
     */
    private $fullwidthPunctuation = '!?。,;:、“”‘’『』「」〖〗【】《》()';

    /**
     * Alternation (not a character-class body) of what counts as blank space:
     * any PCRE whitespace, the literal "&nbsp;" entity, or the ideographic space U+3000.
     *
     * U+3000 is spelled as its UTF-8 bytes so that it stays visible in the source and
     * matches regardless of whether the surrounding pattern uses the "u" modifier.
     *
     * @var string
     */
    private $space = '\s|&nbsp;|'."\xE3\x80\x80";

    /**
     * Correct typesetting by running all, or only the named, correction methods.
     *
     * When $methods is empty every public method of the object is used. Two ordering
     * rules are applied to the resulting list:
     *  - removeEmptyTag already covers removeEmptyParagraph, so the latter is dropped
     *    when both are present;
     *  - insertSpace always runs last.
     * Unknown method names and "correct" itself are skipped silently. Each method is
     * called with the text as its only argument.
     *
     * @param string $text
     * @param array  $methods names of the methods to apply; empty means "all"
     *
     * @throws \ReflectionException
     *
     * @return mixed the text after all selected corrections
     */
    public function correct($text, array $methods = [])
    {
        if (empty($methods)) {
            $class = new \ReflectionClass($this);
            foreach ($class->getMethods(\ReflectionMethod::IS_PUBLIC) as $methodObj) {
                $methods[] = $methodObj->name;
            }
        }

        $methods = array_unique($methods);

        // removeEmptyTag is a superset of removeEmptyParagraph; running both is redundant.
        if (in_array('removeEmptyTag', $methods, true)) {
            $methods = array_diff($methods, ['removeEmptyParagraph']);
        }

        // insertSpace must be the last element so it sees the already-corrected text.
        if (in_array('insertSpace', $methods, true)) {
            $methods = array_diff($methods, ['insertSpace']);
            $methods[] = 'insertSpace';
        }

        foreach ($methods as $method) {
            // Skip this method itself (avoids recursion) and names that do not exist.
            if (__FUNCTION__ === $method || !method_exists($this, $method)) {
                continue;
            }
            $text = $this->$method($text);
        }

        return $text;
    }

    /**
     * Fix incorrect punctuation.
     *
     * Steps, in order:
     *  1. three or more "。"/"." and every single "…" become "……"; runs of "……" collapse to one;
     *  2. an ASCII ! ? . , ( ) : ; directly after a CJK character or after ")", "》", "”"
     *     is replaced by its full-width counterpart;
     *  3. an immediately repeated full-width punctuation mark is reduced to a single one.
     *
     * Based on @link https://github.com/ricoa/copywriting-correct/blob/master/src/Correctors/CharacterCorrector.php
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function fixPunctuation($text)
    {
        // Use the ellipsis correctly.
        $text = preg_replace('/([。\.]){3,}|(…){1}/iu', '……', $text);
        $text = preg_replace('/(……){2,}/iu', '……', $text);

        // Half-width mark => full-width mark. The values have to be the full-width
        // code points; an ASCII-to-ASCII table would make this step a no-op.
        $replace = [
            '!' => '!',
            '?' => '?',
            '.' => '。',
            ',' => ',',
            '(' => '(',
            ')' => ')',
            ':' => ':',
            ';' => ';',
        ];

        // After Chinese text (or a closing full-width ")", "》", "”") use full-width punctuation.
        $text = preg_replace_callback(
            '/(['.$this->cjk.')》”])([!?\.,\(\):;])/iu',
            static function ($matches) use ($replace) {
                return $matches[1].$replace[$matches[2]];
            },
            $text
        );

        // Do not repeat a full-width mark; keep only the first occurrence.
        $text = preg_replace('/(['.$this->fullwidthPunctuation.'])\1{1,}/iu', '\1', $text);

        return $text;
    }

    /**
     * Limited full-width to half-width conversion.
     *
     * Full-width digits, Latin letters, the ideographic space and a fixed set of
     * symbols are replaced by their ASCII forms. Full-width punctuation such as
     * ",", "。" or "!" is deliberately left alone.
     *
     * @link https://github.com/mzlogin/chinese-copywriting-guidelines#全角和半角
     *
     * @param string $text
     *
     * @return string
     */
    public function full2Half($text)
    {
        // Keys are the full-width code points (U+FF10.., U+FF21.., U+FF41.., ...),
        // values their half-width equivalents.
        $arr = [
            '0' => '0', '1' => '1', '2' => '2', '3' => '3', '4' => '4',
            '5' => '5', '6' => '6', '7' => '7', '8' => '8', '9' => '9',
            'A' => 'A', 'B' => 'B', 'C' => 'C', 'D' => 'D', 'E' => 'E',
            'F' => 'F', 'G' => 'G', 'H' => 'H', 'I' => 'I', 'J' => 'J',
            'K' => 'K', 'L' => 'L', 'M' => 'M', 'N' => 'N', 'O' => 'O',
            'P' => 'P', 'Q' => 'Q', 'R' => 'R', 'S' => 'S', 'T' => 'T',
            'U' => 'U', 'V' => 'V', 'W' => 'W', 'X' => 'X', 'Y' => 'Y',
            'Z' => 'Z', 'a' => 'a', 'b' => 'b', 'c' => 'c', 'd' => 'd',
            'e' => 'e', 'f' => 'f', 'g' => 'g', 'h' => 'h', 'i' => 'i',
            'j' => 'j', 'k' => 'k', 'l' => 'l', 'm' => 'm', 'n' => 'n',
            'o' => 'o', 'p' => 'p', 'q' => 'q', 'r' => 'r', 's' => 's',
            't' => 't', 'u' => 'u', 'v' => 'v', 'w' => 'w', 'x' => 'x',
            'y' => 'y', 'z' => 'z',
            // "\xE3\x80\x80" is the ideographic space U+3000.
            '-' => '-', "\xE3\x80\x80" => ' ', '/' => '/',
            '%' => '%', '#' => '#', '@' => '@', '&' => '&', '<' => '<',
            '>' => '>', '[' => '[', ']' => ']', '{' => '{', '}' => '}',
            '\' => '\\', '|' => '|', '+' => '+', '=' => '=', '_' => '_',
            // The full-width macron (U+FFE3) is mapped to a tilde.
            '^' => '^', ' ̄' => '~', '`' => '`',
        ];

        return strtr($text, $arr);
    }

    /**
     * Insert a space between Chinese characters and English letters, Greek letters
     * or digits.
     *
     * The patterns are applied one after another in the order listed; later patterns
     * see the output of earlier ones. The "fix_*" entries remove the padding that an
     * earlier entry may have introduced just inside quotes or brackets.
     *
     * Based on @link https://github.com/Rakume/pangu.php/blob/master/pangu.php
     *
     * @link https://github.com/mzlogin/chinese-copywriting-guidelines#空格
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function insertSpace($text)
    {
        // Each entry is [pattern without delimiters, replacement], except "bracket",
        // which holds a primary rule plus a map of fallback rules.
        $patterns = [
            'cjk_quote' => [
                '(['.$this->cjk.'])(["\'])',
                '$1 $2',
            ],
            'quote_cjk' => [
                '(["\'])(['.$this->cjk.'])',
                '$1 $2',
            ],
            'fix_quote' => [
                '(["\']+)(\s*)(.+?)(\s*)(["\']+)',
                '$1$3$5',
            ],
            'cjk_hash' => [
                '(['.$this->cjk.'])(#(\S+))',
                '$1 $2',
            ],
            'hash_cjk' => [
                '((\S+)#)(['.$this->cjk.'])',
                '$1 $3',
            ],
            'cjk_operator_ans' => [
                '(['.$this->cjk.'])(['.$this->ln.'])([\+\-\*\/=&\\|<>])',
                '$1 $2 $3',
            ],
            'ans_operator_cjk' => [
                '([\+\-\*\/=&\\|<>])(['.$this->ln.'])(['.$this->cjk.'])',
                '$1 $2 $3',
            ],
            'bracket' => [
                [
                    '(['.$this->cjk.'])([<\[\{\(]+(.*?)[>\]\}\)]+)(['.$this->cjk.'])',
                    '$1 $2 $4',
                ],
                [
                    'cjk_bracket' => [
                        '(['.$this->cjk.'])([<>\[\]\{\}\(\)])',
                        '$1 $2',
                    ],
                    'bracket_cjk' => [
                        '([<>\[\]\{\}\(\)])(['.$this->cjk.'])',
                        '$1 $2',
                    ],
                ],
            ],
            'fix_bracket' => [
                '([<\[\{\(]+)(\s*)(.+?)(\s*)([>\]\}\)]+)',
                '$1$3$5',
            ],
            'cjk_ans' => [
                '(['.$this->cjk.'])(['.$this->ln.'`@&%\=\$\^\*\-\+\\/|\\\])',
                '$1 $2',
            ],
            'ans_cjk' => [
                '(['.$this->ln.'`~!%&=;\|\,\.\:\?\$\^\*\-\+\/\\\])(['.$this->cjk.'])',
                '$1 $2',
            ],
        ];

        foreach ($patterns as $key => $value) {
            if ($key === 'bracket') {
                // Try the "CJK (bracketed run) CJK" rule first; the one-sided
                // fallback rules run only when it changed nothing.
                $old = $text;
                $text = preg_replace('/'.$value[0][0].'/iu', $value[0][1], $text);
                if ($old === $text) {
                    foreach ($value[1] as $val) {
                        $text = preg_replace('/'.$val[0].'/iu', $val[1], $text);
                    }
                }
                continue;
            }
            $text = preg_replace('/'.$value[0].'/iu', $value[1], $text);
        }

        return $text;
    }

    /**
     * Remove whitespace next to full-width punctuation: no space is needed between a
     * full-width mark and any other character.
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function removeSpace($text)
    {
        $patterns = [
            // whitespace following a full-width mark
            'fullwidth_space' => [
                '(['.$this->fullwidthPunctuation.'])(\s)+',
                '$1',
            ],
            // whitespace preceding a full-width mark
            'space_fullwidth' => [
                '(\s)+(['.$this->fullwidthPunctuation.'])',
                '$2',
            ],
        ];

        foreach ($patterns as $value) {
            $text = preg_replace('/'.$value[0].'/u', $value[1], $text);
        }

        return $text;
    }

    /**
     * Give English proper nouns their correct capitalisation.
     *
     * Every dictionary entry is searched case-insensitively and replaced by the entry
     * itself. A match is skipped when it touches a "." or an ASCII letter on either
     * side, or when it sits inside an HTML tag.
     *
     * The base dictionary is loaded from ../data/dict.php, which is not visible here;
     * it is presumably a list of correctly-cased nouns — confirm against that file.
     *
     * @param string $text
     * @param array  $extend additional nouns merged into the dictionary
     * @param array  $ignore nouns removed from the dictionary
     *
     * @return null|string|string[]
     */
    public function properNoun($text, array $extend = [], array $ignore = [])
    {
        $dict = include __DIR__.'/../data/dict.php';
        if (!empty($extend)) {
            $dict = array_merge($dict, $extend);
        }
        if (!empty($ignore)) {
            $dict = array_diff($dict, $ignore);
        }

        foreach ($dict as $noun) {
            // Quote the noun so characters such as "." or "+" are matched literally,
            // and match only outside of HTML tags.
            $pattern = '/(?<!\.|[a-z])'.preg_quote($noun, '/').'(?!\.|[a-z])(?!([^<]+)?>)/i';

            // A callback keeps "$1"/"\1"-like sequences in the noun from being
            // interpreted as back-references in the replacement.
            $text = preg_replace_callback($pattern, static function () use ($noun) {
                return $noun;
            }, $text);
        }

        return $text;
    }

    /**
     * Remove class attributes.
     *
     * Only the form class="..." (double quotes, non-empty value, preceded by
     * whitespace) is matched; the leading whitespace character is removed too.
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function removeClass($text)
    {
        return preg_replace('#\s(class)="[^"]+"#', '', $text);
    }

    /**
     * Remove id attributes.
     *
     * Only the form id="..." (double quotes, non-empty value, preceded by
     * whitespace) is matched; the leading whitespace character is removed too.
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function removeId($text)
    {
        return preg_replace('#\s(id)="[^"]+"#', '', $text);
    }

    /**
     * Remove style attributes.
     *
     * Only the form style="..." (double quotes, non-empty value, preceded by
     * whitespace) is matched; the leading whitespace character is removed too.
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function removeStyle($text)
    {
        return preg_replace('#\s(style)="[^"]+"#', '', $text);
    }

    /**
     * Remove empty paragraph tags.
     *
     * A paragraph is empty when it contains nothing but blank space as defined by
     * $this->space (whitespace, "&nbsp;", ideographic space), in any amount.
     * Only real <p> elements are matched, not other tags starting with "p" such as <pre>.
     *
     * @param string $text
     * @param bool   $nested repeat until no empty paragraph is left, so paragraphs
     *                       that become empty after an inner removal go too
     *
     * @return null|string|string[]
     */
    public function removeEmptyParagraph($text, $nested = true)
    {
        // $this->space is an alternation, so it goes in a group, not a character class.
        $pattern = '/<p(?:\s[^>]*)?>(?:'.$this->space.')*<\/p\s*>/i';

        do {
            $text = preg_replace($pattern, '', $text, -1, $count);
        } while ($nested && $count > 0);

        return $text;
    }

    /**
     * Remove all empty HTML tags.
     *
     * An element is empty when its opening tag is followed by nothing but blank space
     * (see $this->space) and then by the closing tag of the same name. Opening tags
     * containing a "/" (self-closing tags, attribute values with slashes) are never matched.
     *
     * @param string $text
     * @param bool   $nested repeat until no empty element is left, so elements that
     *                       become empty after an inner removal go too
     *
     * @return null|string|string[]
     */
    public function removeEmptyTag($text, $nested = true)
    {
        // \1 ties the closing tag to the opening one, so an unrelated pair such as
        // "<br></p>" is not mistaken for an empty element.
        $pattern = '/<([^\s\/>]+)(?:\s[^\/>]*)?>(?:'.$this->space.')*<\/\1\s*>/i';

        do {
            $text = preg_replace($pattern, '', $text, -1, $count);
        } while ($nested && $count > 0);

        return $text;
    }

    /**
     * Remove the indentation at the start of paragraphs.
     *
     * Any run of blank space (see $this->space) directly after an opening <p> tag is
     * dropped; the tag itself, including its attributes, is kept unchanged.
     * Only real <p> elements are matched, so the content of <pre> is left untouched.
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function removeIndent($text)
    {
        return preg_replace('/(<p(?:\s[^>]*)?>)(?:'.$this->space.')+/i', '$1', $text);
    }
}
416