Completed
Pull Request — master (#7)
by jxlwqq
02:03
created

ChineseTypesetting::removeIndent()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 1
nc 1
nop 1
dl 0
loc 3
rs 10
c 0
b 0
f 0
1
<?php
2
/**
3
 * Created by PhpStorm.
4
 * User: jxlwqq
5
 * Date: 2018/7/26
6
 * Time: 09:33.
7
 */
8
9
namespace Jxlwqq\ChineseTypesetting;
10
11
class ChineseTypesetting
{
    /**
     * Body of a regex character class (ranges only, no surrounding brackets)
     * matching CJK (Chinese, Japanese and Korean) characters.
     *
     * @link http://unicode-table.com/en/
     *
     * @var string
     */
    private $cjk = ''.
    '\x{2e80}-\x{2eff}'. // CJK Radicals Supplement
    '\x{2f00}-\x{2fdf}'. // Kangxi Radicals
    '\x{3040}-\x{309f}'. // Hiragana
    '\x{30a0}-\x{30ff}'. // Katakana
    '\x{3100}-\x{312f}'. // Bopomofo
    '\x{3200}-\x{32ff}'. // Enclosed CJK Letters and Months
    '\x{3400}-\x{4dbf}'. // CJK Unified Ideographs Extension A
    '\x{4e00}-\x{9fff}'. // CJK Unified Ideographs
    '\x{f900}-\x{faff}'; // CJK Compatibility Ideographs

    /**
     * Body of a regex character class (ranges only, no surrounding brackets)
     * matching Latin letters, Greek letters (as used in mathematics, science
     * and engineering) and Arabic digits. "ln" is short for "letters and numbers".
     *
     * @link https://en.wikipedia.org/wiki/Greek_letters_used_in_mathematics,_science,_and_engineering
     *
     * @var string
     */
    private $ln = ''.
    'A-Za-z'.
    'Α-Ωα-ω'.
    '0-9';

    /**
     * Correct typesetting errors by running all, or only the named, correction methods.
     *
     * When $methods is empty, every public method of the runtime class is applied
     * (each is called with $text as its only argument). Names that do not resolve
     * to a method, non-string entries, and this method itself are skipped.
     *
     * @param string   $text    Text (optionally containing HTML) to correct.
     * @param string[] $methods Names of the correction methods to apply, in order.
     *
     * @throws \ReflectionException
     *
     * @return mixed The text after every selected method has been applied.
     */
    public function correct($text, array $methods = [])
    {
        if (empty($methods)) {
            $class = new \ReflectionClass($this);
            foreach ($class->getMethods(\ReflectionMethod::IS_PUBLIC) as $methodObj) {
                $methods[] = $methodObj->name;
            }
        }

        // Only strings can name a method; dropping everything else up front keeps
        // the strict comparisons below meaningful.
        $methods = array_unique(array_filter($methods, 'is_string'));

        // removeEmptyTag already covers what removeEmptyParagraph does, so the
        // latter is redundant when both are requested.
        if (in_array('removeEmptyTag', $methods, true)) {
            $methods = array_diff($methods, ['removeEmptyParagraph']);
        }

        // insertSpace must run last, after every other correction.
        if (in_array('insertSpace', $methods, true)) {
            $methods = array_diff($methods, ['insertSpace']);
            $methods[] = 'insertSpace';
        }

        foreach ($methods as $method) {
            // PHP method names are case-insensitive, so the self-call guard has to be too.
            if (strcasecmp($method, __FUNCTION__) === 0 || !method_exists($this, $method)) {
                continue;
            }
            $text = $this->$method($text);
        }

        return $text;
    }

    /**
     * Fix incorrect punctuation.
     *
     * - Runs of three or more full stops (ASCII or ideographic) and single
     *   ellipsis characters become one Chinese ellipsis "……".
     * - ASCII punctuation directly after a CJK character, a full-width closing
     *   parenthesis, "》" or "”" is replaced by its full-width counterpart.
     * - Consecutive repeats of the same Chinese punctuation mark are reduced to one.
     *
     * Based on @link https://github.com/ricoa/copywriting-correct/blob/master/src/Correctors/CharacterCorrector.php
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function fixPunctuation($text)
    {
        // Normalise ellipses; the second pass collapses the doubled "……" that the
        // first pass produces when the input already contained "……".
        $text = preg_replace('/([。\.]){3,}|(…){1}/iu', '……', $text);
        $text = preg_replace('/(……){2,}/iu', '……', $text);

        // Full-width replacements are written as UTF-8 byte escapes so that they
        // survive any width-folding of this source file.
        $fullWidth = [
            '!' => "\xEF\xBC\x81", // U+FF01 FULLWIDTH EXCLAMATION MARK
            '?' => "\xEF\xBC\x9F", // U+FF1F FULLWIDTH QUESTION MARK
            '.' => '。',            // U+3002 IDEOGRAPHIC FULL STOP
            ',' => "\xEF\xBC\x8C", // U+FF0C FULLWIDTH COMMA
            '(' => "\xEF\xBC\x88", // U+FF08 FULLWIDTH LEFT PARENTHESIS
            ')' => "\xEF\xBC\x89", // U+FF09 FULLWIDTH RIGHT PARENTHESIS
            ':' => "\xEF\xBC\x9A", // U+FF1A FULLWIDTH COLON
            ';' => "\xEF\xBC\x9B", // U+FF1B FULLWIDTH SEMICOLON
        ];

        // \x{ff09} is the full-width closing parenthesis.
        $text = preg_replace_callback(
            '/(['.$this->cjk.'\x{ff09}》”])([!?\.,\(\):;])/iu',
            function ($matches) use ($fullWidth) {
                return $matches[1].$fullWidth[$matches[2]];
            },
            $text
        );

        // Never repeat a Chinese punctuation mark; keep only the first occurrence.
        // Escapes in order: full-width ! ? , ; : and, at the end, ( ).
        $text = preg_replace(
            '/([\x{ff01}\x{ff1f}。\x{ff0c}\x{ff1b}\x{ff1a}、“”‘’『』〖〗《》\x{ff08}\x{ff09}])\1{1,}/iu',
            '\1',
            $text
        );

        return $text;
    }

    /**
     * Limited full-width to half-width conversion: digits, Latin letters, the
     * space and a fixed set of symbols are converted; full-width punctuation
     * such as the comma or full stop is deliberately left untouched.
     *
     * @link https://github.com/mzlogin/chinese-copywriting-guidelines#全角和半角
     *
     * @param string $text
     *
     * @return string
     */
    public function full2Half($text)
    {
        // Half-width characters whose full-width forms are folded back.
        $halfWidth = '0123456789'
            .'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
            .'abcdefghijklmnopqrstuvwxyz'
            .'-/%#@&<>[]{}\\|+=_^`';

        // These two have no counterpart at the regular offset.
        $map = [
            "\xE3\x80\x80" => ' ', // U+3000 IDEOGRAPHIC SPACE
            "\xEF\xBF\xA3" => '~', // U+FFE3 FULLWIDTH MACRON
        ];

        foreach (str_split($halfWidth) as $char) {
            // Full-width forms U+FF01..U+FF5E mirror ASCII U+0021..U+007E at a
            // fixed offset of 0xFEE0; encode the resulting code point as 3-byte UTF-8.
            $codePoint = ord($char) + 0xFEE0;
            $wide = chr(0xE0 | ($codePoint >> 12))
                .chr(0x80 | (($codePoint >> 6) & 0x3F))
                .chr(0x80 | ($codePoint & 0x3F));
            $map[$wide] = $char;
        }

        return strtr($text, $map);
    }

    /**
     * Insert a space between CJK characters and Latin letters, Greek letters,
     * digits and certain symbols.
     *
     * The patterns are applied one after another in the order listed, each on
     * the output of the previous one, so the order is significant.
     *
     * Based on @link https://github.com/Rakume/pangu.php/blob/master/pangu.php
     *
     * @link https://github.com/mzlogin/chinese-copywriting-guidelines#空格
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function insertSpace($text)
    {
        // Each entry is [pattern, replacement], except 'bracket', which holds a
        // primary [pattern, replacement] pair and a list of fallback pairs.
        $patterns = [
            'cjk_quote' => [
                '(['.$this->cjk.'])(["\'])',
                '$1 $2',
            ],
            'quote_cjk' => [
                '(["\'])(['.$this->cjk.'])',
                '$1 $2',
            ],
            'fix_quote' => [
                '(["\']+)(\s*)(.+?)(\s*)(["\']+)',
                '$1$3$5',
            ],
            'cjk_hash' => [
                '(['.$this->cjk.'])(#(\S+))',
                '$1 $2',
            ],
            'hash_cjk' => [
                '((\S+)#)(['.$this->cjk.'])',
                '$1 $3',
            ],
            'cjk_operator_ans' => [
                '(['.$this->cjk.'])(['.$this->ln.'])([\+\-\*\/=&\\|<>])',
                '$1 $2 $3',
            ],
            'ans_operator_cjk' => [
                '([\+\-\*\/=&\\|<>])(['.$this->ln.'])(['.$this->cjk.'])',
                '$1 $2 $3',
            ],
            'bracket' => [
                [
                    '(['.$this->cjk.'])([<\[\{\(]+(.*?)[>\]\}\)]+)(['.$this->cjk.'])',
                    '$1 $2 $4',
                ],
                [
                    'cjk_bracket' => [
                        '(['.$this->cjk.'])([<>\[\]\{\}\(\)])',
                        '$1 $2',
                    ],
                    'bracket_cjk' => [
                        '([<>\[\]\{\}\(\)])(['.$this->cjk.'])',
                        '$1 $2',
                    ],
                ],
            ],
            'fix_bracket' => [
                '([<\[\{\(]+)(\s*)(.+?)(\s*)([>\]\}\)]+)',
                '$1$3$5',
            ],
            'cjk_ans' => [
                '(['.$this->cjk.'])(['.$this->ln.'`@&%\=\$\^\*\-\+\\/|\\\])',
                '$1 $2',
            ],
            'ans_cjk' => [
                '(['.$this->ln.'`~!%&=;\|\,\.\:\?\$\^\*\-\+\/\\\])(['.$this->cjk.'])',
                '$1 $2',
            ],
        ];

        foreach ($patterns as $key => $value) {
            if ($key !== 'bracket') {
                $text = preg_replace('/'.$value[0].'/iu', $value[1], $text);
                continue;
            }

            // Try the "CJK (bracketed) CJK" form first; the one-sided fallbacks
            // run only when that primary pattern changed nothing.
            $before = $text;
            $text = preg_replace('/'.$value[0][0].'/iu', $value[0][1], $text);
            if ($before === $text) {
                foreach ($value[1] as $fallback) {
                    $text = preg_replace('/'.$fallback[0].'/iu', $fallback[1], $text);
                }
            }
        }

        return $text;
    }

    /**
     * Give English proper nouns their correct capitalisation.
     *
     * Every dictionary entry is matched case-insensitively as a whole word
     * (not adjacent to a Latin letter or a dot) outside of HTML tags and is
     * replaced by the entry's own spelling.
     *
     * The dictionary is loaded from data/dict.php, which is presumably a list
     * of correctly-cased nouns (that file is not visible here).
     *
     * @param string   $text
     * @param string[] $extend Additional nouns appended to the dictionary.
     * @param string[] $ignore Nouns removed from the dictionary.
     *
     * @return null|string|string[]
     */
    public function properNoun($text, array $extend = [], array $ignore = [])
    {
        $dict = include __DIR__.'/../data/dict.php';

        // Merging or diffing with an empty array leaves the entries unchanged,
        // so no emptiness checks are needed.
        $dict = array_diff(array_merge($dict, $extend), $ignore);

        foreach ($dict as $noun) {
            // Quote the noun so that entries such as "C++" or "Node.js" are
            // matched literally instead of being interpreted as regex syntax.
            $pattern = '/(?<!\.|[a-z])'.preg_quote($noun, '/').'(?!\.|[a-z])(?!([^<]+)?>)/i';

            // A callback returns the noun verbatim; a plain replacement string
            // would interpret "$1" or "\1" sequences inside the noun.
            $text = preg_replace_callback($pattern, function () use ($noun) {
                return $noun;
            }, $text);
        }

        return $text;
    }

    /**
     * Remove the class attribute from HTML tags.
     *
     * Handles double- and single-quoted values (including empty ones),
     * optional whitespace around "=" and any letter case.
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function removeClass($text)
    {
        return preg_replace('#\s+class\s*=\s*(?:"[^"]*"|\'[^\']*\')#i', '', $text);
    }

    /**
     * Remove the id attribute from HTML tags.
     *
     * Handles double- and single-quoted values (including empty ones),
     * optional whitespace around "=" and any letter case.
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function removeId($text)
    {
        return preg_replace('#\s+id\s*=\s*(?:"[^"]*"|\'[^\']*\')#i', '', $text);
    }

    /**
     * Remove the style attribute from HTML tags.
     *
     * Handles double- and single-quoted values (including empty ones),
     * optional whitespace around "=" and any letter case.
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function removeStyle($text)
    {
        return preg_replace('#\s+style\s*=\s*(?:"[^"]*"|\'[^\']*\')#i', '', $text);
    }

    /**
     * Remove empty paragraph tags, i.e. <p> elements containing nothing but
     * whitespace and/or "&nbsp;" entities.
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function removeEmptyParagraph($text)
    {
        // "(?:\s[^>]*)?" keeps tags that merely start with "p" (e.g. <pre>)
        // from being mistaken for paragraphs.
        return preg_replace('/<p(?:\s[^>]*)?>(?:\s|&nbsp;)*<\/p\s*>/i', '', $text);
    }

    /**
     * Remove all empty HTML tags, i.e. elements containing nothing but
     * whitespace and/or "&nbsp;" entities.
     *
     * Only one nesting level is removed per call. Opening tags whose
     * attributes contain a slash (for example a URL) are left alone.
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function removeEmptyTag($text)
    {
        // The back-reference \1 ties the closing tag to the opening tag's name,
        // so an unclosed tag followed by another element's closing tag
        // (e.g. "<br></p>") is never removed.
        return preg_replace('/<([a-z][^\s\/>]*)(?:\s[^\/>]*)?>(?:\s|&nbsp;)*<\/\1\s*>/i', '', $text);
    }

    /**
     * Remove the indent (leading whitespace and "&nbsp;" entities) at the
     * start of paragraphs.
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function removeIndent($text)
    {
        // Group 1 captures the paragraph's attributes, if any, so they are kept;
        // requiring whitespace before them excludes tags such as <pre>.
        return preg_replace('/<p((?:\s[^>]*)?)>(?:\s|&nbsp;)+/', '<p${1}>', $text);
    }
}
358