1
|
|
|
<?php |
2
|
|
|
/**
 * Created by PhpStorm.
 * User: jxlwqq
 * Date: 2018/7/26
 * Time: 09:33.
 */
8
|
|
|
|
9
|
|
|
namespace Jxlwqq\ChineseTypesetting; |
10
|
|
|
|
11
|
|
|
/**
 * Typesetting helpers for text that mixes Chinese with Latin letters, digits and HTML.
 *
 * Every public method except correct() takes the text as its first argument and
 * returns the transformed text; correct() chains them.
 */
class ChineseTypesetting
{
    /**
     * Character-class fragment for CJK (Chinese, Japanese and Korean) code points.
     *
     * The value is a list of PCRE `\x{...}` ranges, so it must be placed inside
     * `[...]` in a pattern compiled with the `u` modifier.
     *
     * @link http://unicode-table.com/en/
     *
     * @var string
     */
    private $cjk = ''.
        '\x{2e80}-\x{2eff}'. // CJK Radicals Supplement
        '\x{2f00}-\x{2fdf}'. // Kangxi Radicals
        '\x{3040}-\x{309f}'. // Hiragana
        '\x{30a0}-\x{30ff}'. // Katakana
        '\x{3100}-\x{312f}'. // Bopomofo
        '\x{3200}-\x{32ff}'. // Enclosed CJK Letters and Months
        '\x{3400}-\x{4dbf}'. // CJK Unified Ideographs Extension A
        '\x{4e00}-\x{9fff}'. // CJK Unified Ideographs
        '\x{f900}-\x{faff}'; // CJK Compatibility Ideographs

    /**
     * Character-class fragment for "letters and numbers": Latin letters, the Greek
     * letters used in mathematics, science and engineering, and ASCII digits.
     *
     * The Greek ranges are written as code-point escapes because Greek capitals
     * are visually indistinguishable from Latin ones; like $cjk this fragment
     * therefore needs `[...]` and the `u` modifier.
     *
     * @link https://en.wikipedia.org/wiki/Greek_letters_used_in_mathematics,_science,_and_engineering
     *
     * @var string
     */
    private $ln = ''.
        'A-Za-z'.
        '\x{0391}-\x{03a9}\x{03b1}-\x{03c9}'.
        '0-9';

    /**
     * Full-width punctuation marks that are kept as-is (character-class fragment).
     *
     * @var string
     */
    private $fullwidthPunctuation = '!?。,;:、“”‘’『』「」〖〗【】《》()';

    /**
     * Whitespace, as a regex ALTERNATION fragment: `\s`, NO-BREAK SPACE (U+00A0)
     * or IDEOGRAPHIC SPACE (U+3000).
     *
     * The two non-ASCII spaces are spelled as UTF-8 byte escapes so they stay
     * visible in the source. Because the fragment contains `|`, it must be
     * wrapped in a group -- `(?:...)` -- and never in `[...]`, where `|` would
     * become a literal member of the class.
     *
     * @var string
     */
    private $space = "\\s|\xC2\xA0|\xE3\x80\x80";

    /**
     * Run all, or only the named, correction methods over the text.
     *
     * With an empty $methods list every public method reported by reflection is
     * used. Each method is called with the text as its only argument, so optional
     * parameters keep their defaults. Two ordering rules are applied:
     *  - removeEmptyParagraph is dropped when removeEmptyTag is also requested,
     *    because the latter covers the former;
     *  - insertSpace always runs last.
     * Unknown names and this method itself are skipped.
     *
     * @param string   $text
     * @param string[] $methods names of the correction methods to apply
     *
     * @throws \ReflectionException
     *
     * @return mixed the text after the last applied method
     */
    public function correct($text, array $methods = [])
    {
        if (empty($methods)) {
            $reflection = new \ReflectionClass($this);
            foreach ($reflection->getMethods(\ReflectionMethod::IS_PUBLIC) as $reflected) {
                $methods[] = $reflected->name;
            }
        }

        $methods = array_unique($methods);

        if (in_array('removeEmptyTag', $methods, true)) {
            $methods = array_diff($methods, ['removeEmptyParagraph']);
        }

        // Move insertSpace to the end of the queue.
        if (in_array('insertSpace', $methods, true)) {
            $methods = array_diff($methods, ['insertSpace']);
            $methods[] = 'insertSpace';
        }

        foreach ($methods as $method) {
            // Method names are case-insensitive in PHP, so the self-call guard
            // has to be case-insensitive as well.
            if (!is_string($method)
                || strcasecmp(__FUNCTION__, $method) === 0
                || !method_exists($this, $method)
            ) {
                continue;
            }
            $text = $this->$method($text);
        }

        return $text;
    }

    /**
     * Fix incorrect punctuation.
     *
     * Steps, in order: normalise ellipses to "……"; turn a half-width
     * `! ? . , ( ) : ;` that directly follows a CJK character (or `)`, `》`, `”`)
     * into its full-width form; collapse a repeated full-width mark into one.
     *
     * Based on https://github.com/ricoa/copywriting-correct/blob/master/src/Correctors/CharacterCorrector.php
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function fixPunctuation($text)
    {
        // Three or more full stops (ASCII or ideographic) and every single "…"
        // become "……"; the second pass merges the runs this can create.
        $text = preg_replace('/([。\.]){3,}|(…){1}/iu', '……', $text);
        $text = preg_replace('/(……){2,}/iu', '……', $text);

        $halfToFull = [
            '!' => '!',
            '?' => '?',
            '.' => '。',
            ',' => ',',
            '(' => '(',
            ')' => ')',
            ':' => ':',
            ';' => ';',
        ];

        $text = preg_replace_callback(
            '/(['.$this->cjk.')》”])([!?\.,\(\):;])/iu',
            static function (array $matches) use ($halfToFull) {
                return $matches[1].$halfToFull[$matches[2]];
            },
            $text
        );

        // Keep only the first of a run of identical full-width marks.
        return preg_replace('/(['.$this->fullwidthPunctuation.'])\1{1,}/iu', '\1', $text);
    }

    /**
     * Limited full-width to half-width conversion.
     *
     * Converts full-width digits, Latin letters and the symbols
     * `- / % # @ & < > [ ] { } \ | + = _ ^` and the backtick to ASCII, the
     * ideographic space to a plain space and the full-width macron to `~`.
     * Full-width punctuation such as `,` or `。` is left alone.
     *
     * @link https://github.com/mzlogin/chinese-copywriting-guidelines#全角和半角
     *
     * @param string $text
     *
     * @return string
     */
    public function full2Half($text)
    {
        static $map = null;

        if ($map === null) {
            $map = [
                "\xE3\x80\x80" => ' ', // U+3000 IDEOGRAPHIC SPACE
                "\xEF\xBF\xA3" => '~', // U+FFE3 FULLWIDTH MACRON
            ];

            $convertible = '0123456789'
                .'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                .'abcdefghijklmnopqrstuvwxyz'
                .'-/%#@&<>[]{}\\|+=_^`';

            foreach (str_split($convertible) as $half) {
                // A full-width form sits exactly 0xFEE0 above its ASCII code
                // point; all of them need three bytes in UTF-8.
                $codePoint = ord($half) + 0xFEE0;
                $full = chr(0xE0 | ($codePoint >> 12))
                    .chr(0x80 | (($codePoint >> 6) & 0x3F))
                    .chr(0x80 | ($codePoint & 0x3F));
                $map[$full] = $half;
            }
        }

        return strtr($text, $map);
    }

    /**
     * Insert a space between CJK characters and English letters, Greek letters
     * used in mathematics/science/engineering, digits and some symbols.
     *
     * The rules run in a fixed order: quotes, hash tags, operators, brackets,
     * then the general CJK/alphanumeric boundaries. The "fix" rules strip the
     * whitespace directly inside a pair of quotes or brackets.
     *
     * Based on https://github.com/Rakume/pangu.php/blob/master/pangu.php
     *
     * @link https://github.com/mzlogin/chinese-copywriting-guidelines#空格
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function insertSpace($text)
    {
        $cjk = '['.$this->cjk.']';
        $ln = $this->ln;
        $operator = '[\+\-\*\/=&\\|<>]';

        // pattern => replacement, applied top to bottom.
        $beforeBrackets = [
            // CJK next to a quote
            '/('.$cjk.')(["\'])/iu' => '$1 $2',
            '/(["\'])('.$cjk.')/iu' => '$1 $2',
            // no whitespace directly inside quotes
            '/(["\']+)(\s*)(.+?)(\s*)(["\']+)/iu' => '$1$3$5',
            // CJK next to a #hash / hash#
            '/('.$cjk.')(#(\S+))/iu' => '$1 $2',
            '/((\S+)#)('.$cjk.')/iu' => '$1 $3',
            // CJK, one letter/digit, operator -- and the mirror image
            '/('.$cjk.')(['.$ln.'])('.$operator.')/iu' => '$1 $2 $3',
            '/('.$operator.')(['.$ln.'])('.$cjk.')/iu' => '$1 $2 $3',
        ];
        $text = preg_replace(array_keys($beforeBrackets), array_values($beforeBrackets), $text);

        // A bracketed span with CJK on both sides gets a space on each side.
        // Only when that rule changes nothing are the single-sided rules tried.
        $bracketed = preg_replace(
            '/('.$cjk.')([<\[\{\(]+(.*?)[>\]\}\)]+)('.$cjk.')/iu',
            '$1 $2 $4',
            $text
        );
        if ($bracketed === $text) {
            $bracketed = preg_replace(
                [
                    '/('.$cjk.')([<>\[\]\{\}\(\)])/iu',
                    '/([<>\[\]\{\}\(\)])('.$cjk.')/iu',
                ],
                '$1 $2',
                $text
            );
        }
        $text = $bracketed;

        $afterBrackets = [
            // no whitespace directly inside brackets
            '/([<\[\{\(]+)(\s*)(.+?)(\s*)([>\]\}\)]+)/iu' => '$1$3$5',
            // CJK followed by a letter/digit/symbol
            '/('.$cjk.')(['.$ln.'`@&%\=\$\^\*\-\+\\/|\\\])/iu' => '$1 $2',
            // letter/digit/symbol followed by CJK
            '/(['.$ln.'`~!%&=;\|\,\.\:\?\$\^\*\-\+\/\\\])('.$cjk.')/iu' => '$1 $2',
        ];

        return preg_replace(array_keys($afterBrackets), array_values($afterBrackets), $text);
    }

    /**
     * Remove whitespace next to full-width punctuation, which needs none.
     *
     * One whitespace character directly after and one directly before each
     * full-width mark is removed. Note that `\s` also covers line breaks.
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function removeSpace($text)
    {
        $mark = '(['.$this->fullwidthPunctuation.'])';
        // $space is an alternation, so it goes into a group; inside [...] its
        // `|` would match literal pipe characters.
        $blank = '(?:'.$this->space.')';

        $text = preg_replace('/'.$mark.$blank.'/u', '$1', $text);

        return preg_replace('/'.$blank.$mark.'/u', '$1', $text);
    }

    /**
     * Give English proper nouns their correct capitalisation.
     *
     * The word list is loaded from ../data/dict.php (expected to return an array
     * of correctly-cased nouns -- TODO confirm against that file), extended by
     * $extend and reduced by $ignore. A noun is matched case-insensitively when
     * it is not adjacent to a letter or a dot and is not inside an HTML tag.
     *
     * @param string   $text
     * @param string[] $extend additional proper nouns
     * @param string[] $ignore proper nouns to leave untouched
     *
     * @return null|string|string[]
     */
    public function properNoun($text, array $extend = [], array $ignore = [])
    {
        $dict = include __DIR__.'/../data/dict.php';
        if (!is_array($dict)) {
            // include yields false when the file is missing; keep going with
            // whatever the caller supplied.
            $dict = [];
        }
        if (!empty($extend)) {
            $dict = array_merge($dict, $extend);
        }
        if (!empty($ignore)) {
            $dict = array_diff($dict, $ignore);
        }

        foreach ($dict as $noun) {
            $noun = (string) $noun;
            if ($noun === '') {
                continue;
            }

            // Quote the noun so names such as "C++" or "Node.js" are matched
            // literally, and use a callback so "$" or "\" in the noun is not
            // read as a back-reference in the replacement.
            $pattern = '/(?<!\.|[a-z])'.preg_quote($noun, '/').'(?!\.|[a-z])(?!([^<]+)?>)/i';
            $text = preg_replace_callback(
                $pattern,
                static function () use ($noun) {
                    return $noun;
                },
                $text
            );
        }

        return $text;
    }

    /**
     * Remove `class` attributes.
     *
     * Matches ` class="..."` and ` class='...'` (any letter case, optional
     * spaces around `=`, value may be empty) anywhere in the text.
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function removeClass($text)
    {
        return preg_replace('#\sclass\s*=\s*(?:"[^"]*"|\'[^\']*\')#i', '', $text);
    }

    /**
     * Remove `id` attributes.
     *
     * Matches ` id="..."` and ` id='...'` (any letter case, optional spaces
     * around `=`, value may be empty) anywhere in the text.
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function removeId($text)
    {
        return preg_replace('#\sid\s*=\s*(?:"[^"]*"|\'[^\']*\')#i', '', $text);
    }

    /**
     * Remove `style` attributes.
     *
     * Matches ` style="..."` and ` style='...'` (any letter case, optional
     * spaces around `=`, value may be empty) anywhere in the text.
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function removeStyle($text)
    {
        return preg_replace('#\sstyle\s*=\s*(?:"[^"]*"|\'[^\']*\')#i', '', $text);
    }

    /**
     * Remove empty paragraph tags.
     *
     * A paragraph is empty when nothing but whitespace (see $space) sits between
     * `<p ...>` and `</p>`. Only real `p` elements match; `<pre>` does not.
     *
     * @param string $text
     * @param bool   $nested repeat until no empty paragraph is left
     *
     * @return null|string|string[]
     */
    public function removeEmptyParagraph($text, $nested = true)
    {
        $pattern = '/<p(?:\s[^>]*)?>(?:'.$this->space.')*<\/p\s*>/i';

        do {
            $stripped = preg_replace($pattern, '', $text, -1, $removed);
            if ($stripped === null) {
                // PCRE failure: keep the last good text instead of losing it.
                break;
            }
            $text = $stripped;
        } while ($nested && $removed > 0);

        return $text;
    }

    /**
     * Remove all empty HTML tags.
     *
     * An element is empty when its opening tag is followed, after optional
     * whitespace (see $space), by the closing tag of the SAME name. Opening tags
     * that contain a `/` (for example in a URL attribute) are never matched.
     *
     * @param string $text
     * @param bool   $nested repeat until no empty element is left, so that
     *                       elements which only wrapped empty elements go too
     *
     * @return null|string|string[]
     */
    public function removeEmptyTag($text, $nested = true)
    {
        // \1 ties the closing tag to the opening one; without it a void element
        // followed by an unrelated closing tag ("<img></p>") would be deleted.
        $pattern = '/<([a-z][a-z0-9:-]*)(?:\s[^\/>]*)?>(?:'.$this->space.')*<\/\1\s*>/i';

        do {
            $stripped = preg_replace($pattern, '', $text, -1, $removed);
            if ($stripped === null) {
                // PCRE failure: keep the last good text instead of losing it.
                break;
            }
            $text = $stripped;
        } while ($nested && $removed > 0);

        return $text;
    }

    /**
     * Remove first-line indentation.
     *
     * Strips the whitespace (see $space) that directly follows an opening
     * `<p ...>` tag. Only real `p` elements match, so `<pre>` content keeps its
     * leading whitespace.
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function removeIndent($text)
    {
        return preg_replace('/(<p(?:\s[^>]*)?>)(?:'.$this->space.')+/i', '$1', $text);
    }
}
417
|
|
|
|