1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace Jxlwqq\ChineseTypesetting; |
4
|
|
|
|
5
|
|
|
/**
 * Typesetting corrector for mixed Chinese / Latin text and simple HTML fragments.
 *
 * Every public method other than correct() is an independent filter that takes
 * a string and returns the corrected string. correct() chains several filters.
 * All filters are regular-expression or table based; no HTML parsing is done.
 */
class ChineseTypesetting
{
    /**
     * CJK code-point ranges, written as the body of a PCRE character class
     * (no surrounding brackets) so it can be spliced into patterns as
     * '['.$this->cjk.']'. Patterns using it must carry the "u" modifier.
     *
     * @link http://unicode-table.com/en/
     *
     * @var string
     */
    private $cjk = ''.
        '\x{2e80}-\x{2eff}'. // CJK Radicals Supplement
        '\x{2f00}-\x{2fdf}'. // Kangxi Radicals
        '\x{3040}-\x{309f}'. // Hiragana
        '\x{30a0}-\x{30ff}'. // Katakana
        '\x{3100}-\x{312f}'. // Bopomofo
        '\x{3200}-\x{32ff}'. // Enclosed CJK Letters and Months
        '\x{3400}-\x{4dbf}'. // CJK Unified Ideographs Extension A
        '\x{4e00}-\x{9fff}'. // CJK Unified Ideographs
        '\x{f900}-\x{faff}'; // CJK Compatibility Ideographs

    /**
     * "Letters and numbers": Latin letters, Greek letters (as used in
     * mathematics, science and engineering) and ASCII digits, written as the
     * body of a PCRE character class.
     *
     * @link https://en.wikipedia.org/wiki/Greek_letters_used_in_mathematics,_science,_and_engineering
     *
     * @var string
     */
    private $ln = ''.
        'A-Za-z'.
        'Α-Ωα-ω'.
        '0-9';

    /**
     * Full-width punctuation marks that are kept as-is, written as the body
     * of a PCRE character class. These must be the full-width forms: with
     * their ASCII look-alikes here, removeSpace() and fixPunctuation() would
     * rewrite ordinary half-width punctuation.
     *
     * @var string
     */
    private $fullwidthPunctuation = '!?。,;:、“”‘’『』「」〖〗【】《》()';

    /**
     * Whitespace alternatives, written as a PCRE alternation: regex
     * whitespace, the HTML entity "&nbsp;" and the ideographic space U+3000.
     * Because this is an alternation it must be wrapped in a group, never
     * placed inside a [...] character class.
     *
     * @var string
     */
    private $space = '\s|&nbsp;| ';

    /**
     * Run all filters, or only the named ones, over the text.
     *
     * When $methods is empty every public method of this object is used.
     * Ordering rules applied to the list:
     *  - duplicates are dropped;
     *  - removeEmptyParagraph is dropped when removeEmptyTag is present,
     *    because the latter already covers it;
     *  - insertSpace is always moved to the end.
     * Entries that are not strings, that name this method, or that name a
     * method which does not exist are skipped silently.
     *
     * @param string $text    Text to correct.
     * @param array  $methods Names of the filters to apply.
     *
     * @throws \ReflectionException
     *
     * @return mixed The corrected text (whatever the last filter returned).
     */
    public function correct($text, array $methods = [])
    {
        if (empty($methods)) {
            $class = new \ReflectionClass($this);
            foreach ($class->getMethods(\ReflectionMethod::IS_PUBLIC) as $methodObj) {
                $methods[] = $methodObj->name;
            }
        }

        $methods = array_unique($methods);

        // Strict comparison: a loose in_array() treats e.g. integer 0 as equal
        // to any method name on PHP < 8 and would schedule filters nobody asked for.
        if (in_array('removeEmptyTag', $methods, true)) {
            $methods = array_diff($methods, ['removeEmptyParagraph']);
        }

        if (in_array('insertSpace', $methods, true)) {
            $methods = array_diff($methods, ['insertSpace']);
            $methods[] = 'insertSpace';
        }

        foreach ($methods as $method) {
            if (!is_string($method)) {
                continue;
            }
            // Method names are case-insensitive in PHP, so compare accordingly
            // to avoid re-entering correct() through e.g. "Correct".
            if (strcasecmp(__FUNCTION__, $method) === 0 || !method_exists($this, $method)) {
                continue;
            }
            $text = $this->$method($text);
        }

        return $text;
    }

    /**
     * Fix incorrect punctuation.
     *
     * Steps, in order:
     *  1. three or more "。" / "." in a row, or a single "…", become "……",
     *     and any run of "……" is collapsed to one "……";
     *  2. an ASCII punctuation mark (! ? . , ( ) : ;) directly after a CJK
     *     character or after ")", "》" or "”" becomes its full-width form;
     *  3. a full-width punctuation mark repeated back-to-back is reduced to
     *     a single occurrence.
     *
     * Based on
     * https://github.com/ricoa/copywriting-correct/blob/master/src/Correctors/CharacterCorrector.php
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function fixPunctuation($text)
    {
        // Step 1: normalise the ellipsis.
        $text = preg_replace('/([。\.]){3,}|(…){1}/iu', '……', $text);
        $text = preg_replace('/(……){2,}/iu', '……', $text);

        // Step 2: half-width mark => full-width mark.
        $replace = [
            '!' => '!',
            '?' => '?',
            '.' => '。',
            ',' => ',',
            '(' => '(',
            ')' => ')',
            ':' => ':',
            ';' => ';',
        ];
        $text = preg_replace_callback(
            '/(['.$this->cjk.')》”])([!?\.,\(\):;])/iu',
            function ($matches) use ($replace) {
                return $matches[1].$replace[$matches[2]];
            },
            $text
        );

        // Step 3: keep only the first of identical consecutive full-width marks.
        $text = preg_replace('/(['.$this->fullwidthPunctuation.'])\1{1,}/iu', '\1', $text);

        return $text;
    }

    /**
     * Limited full-width to half-width conversion.
     *
     * Converts full-width digits, Latin letters, the ideographic space and a
     * fixed set of symbols to their half-width equivalents. Full-width
     * sentence punctuation (e.g. ",", "。", "!") is deliberately not in the
     * table and is left untouched.
     *
     * @link https://github.com/mzlogin/chinese-copywriting-guidelines#全角和半角
     *
     * @param string $text
     *
     * @return string
     */
    public function full2Half($text)
    {
        // Keys are the full-width characters, values their half-width forms.
        $arr = [
            '0' => '0', '1' => '1', '2' => '2', '3' => '3', '4' => '4',
            '5' => '5', '6' => '6', '7' => '7', '8' => '8', '9' => '9',
            'A' => 'A', 'B' => 'B', 'C' => 'C', 'D' => 'D', 'E' => 'E',
            'F' => 'F', 'G' => 'G', 'H' => 'H', 'I' => 'I', 'J' => 'J',
            'K' => 'K', 'L' => 'L', 'M' => 'M', 'N' => 'N', 'O' => 'O',
            'P' => 'P', 'Q' => 'Q', 'R' => 'R', 'S' => 'S', 'T' => 'T',
            'U' => 'U', 'V' => 'V', 'W' => 'W', 'X' => 'X', 'Y' => 'Y',
            'Z' => 'Z', 'a' => 'a', 'b' => 'b', 'c' => 'c', 'd' => 'd',
            'e' => 'e', 'f' => 'f', 'g' => 'g', 'h' => 'h', 'i' => 'i',
            'j' => 'j', 'k' => 'k', 'l' => 'l', 'm' => 'm', 'n' => 'n',
            'o' => 'o', 'p' => 'p', 'q' => 'q', 'r' => 'r', 's' => 's',
            't' => 't', 'u' => 'u', 'v' => 'v', 'w' => 'w', 'x' => 'x',
            'y' => 'y', 'z' => 'z',
            '-' => '-', ' ' => ' ', '/' => '/',
            '%' => '%', '#' => '#', '@' => '@', '&' => '&', '<' => '<',
            '>' => '>', '[' => '[', ']' => ']', '{' => '{', '}' => '}',
            '\' => '\\', '|' => '|', '+' => '+', '=' => '=', '_' => '_',
            '^' => '^', ' ̄' => '~', '`' => '`',
        ];

        return strtr($text, $arr);
    }

    /**
     * Insert a space between CJK characters and Latin/Greek letters, digits,
     * quotes, hash tags, operators and brackets.
     *
     * The rules are applied one after another in the order they are listed,
     * each on the output of the previous one. The "bracket" rule is special:
     * its primary pattern handles a bracketed run enclosed by CJK characters
     * on both sides; only if that pattern changed nothing are the two
     * one-sided fallback patterns applied.
     *
     * Based on https://github.com/Rakume/pangu.php/blob/master/pangu.php
     *
     * @link https://github.com/mzlogin/chinese-copywriting-guidelines#空格
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function insertSpace($text)
    {
        $patterns = [
            // CJK followed by a quote / quote followed by CJK.
            'cjk_quote' => [
                '(['.$this->cjk.'])(["\'])',
                '$1 $2',
            ],
            'quote_cjk' => [
                '(["\'])(['.$this->cjk.'])',
                '$1 $2',
            ],
            // Drop whitespace just inside a pair of quotes.
            'fix_quote' => [
                '(["\']+)(\s*)(.+?)(\s*)(["\']+)',
                '$1$3$5',
            ],
            // CJK followed by "#tag" / "tag#" followed by CJK.
            'cjk_hash' => [
                '(['.$this->cjk.'])(#(\S+))',
                '$1 $2',
            ],
            'hash_cjk' => [
                '((\S+)#)(['.$this->cjk.'])',
                '$1 $3',
            ],
            // CJK, one letter/digit, operator / operator, one letter/digit, CJK.
            'cjk_operator_ans' => [
                '(['.$this->cjk.'])(['.$this->ln.'])([\+\-\*\/=&\\|<>])',
                '$1 $2 $3',
            ],
            'ans_operator_cjk' => [
                '([\+\-\*\/=&\\|<>])(['.$this->ln.'])(['.$this->cjk.'])',
                '$1 $2 $3',
            ],
            'bracket' => [
                // Primary: CJK, bracketed run, CJK.
                [
                    '(['.$this->cjk.'])([<\[\{\(]+(.*?)[>\]\}\)]+)(['.$this->cjk.'])',
                    '$1 $2 $4',
                ],
                // Fallbacks, used only when the primary pattern changed nothing.
                [
                    'cjk_bracket' => [
                        '(['.$this->cjk.'])([<>\[\]\{\}\(\)])',
                        '$1 $2',
                    ],
                    'bracket_cjk' => [
                        '([<>\[\]\{\}\(\)])(['.$this->cjk.'])',
                        '$1 $2',
                    ],
                ],
            ],
            // Drop whitespace just inside a pair of brackets.
            'fix_bracket' => [
                '([<\[\{\(]+)(\s*)(.+?)(\s*)([>\]\}\)]+)',
                '$1$3$5',
            ],
            // CJK followed by letter/digit/symbol, and the reverse.
            'cjk_ans' => [
                '(['.$this->cjk.'])(['.$this->ln.'`@&%\=\$\^\*\-\+\\/|\\\])',
                '$1 $2',
            ],
            'ans_cjk' => [
                '(['.$this->ln.'`~!%&=;\|\,\.\:\?\$\^\*\-\+\/\\\])(['.$this->cjk.'])',
                '$1 $2',
            ],
        ];

        foreach ($patterns as $key => $value) {
            if ($key !== 'bracket') {
                $text = preg_replace('/'.$value[0].'/iu', $value[1], $text);
                continue;
            }

            $before = $text;
            $text = preg_replace('/'.$value[0][0].'/iu', $value[0][1], $text);
            if ($before === $text) {
                foreach ($value[1] as $fallback) {
                    $text = preg_replace('/'.$fallback[0].'/iu', $fallback[1], $text);
                }
            }
        }

        return $text;
    }

    /**
     * Remove whitespace directly before or after a full-width punctuation
     * mark; no space is needed between such a mark and its neighbours.
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function removeSpace($text)
    {
        $patterns = [
            // Mark followed by whitespace: keep the mark.
            'fullwidth_space' => [
                '(['.$this->fullwidthPunctuation.'])(\s)+',
                '$1',
            ],
            // Whitespace followed by mark: keep the mark.
            'space_fullwidth' => [
                '(\s)+(['.$this->fullwidthPunctuation.'])',
                '$2',
            ],
        ];

        foreach ($patterns as $value) {
            $text = preg_replace('/'.$value[0].'/u', $value[1], $text);
        }

        return $text;
    }

    /**
     * Rewrite proper nouns with their canonical capitalisation.
     *
     * The noun list is loaded from ../data/dict.php (relative to this file),
     * extended with $extend and reduced by $ignore. Each noun is matched
     * case-insensitively as literal text, provided it is not adjacent to a
     * "." or an ASCII letter and does not appear inside an HTML tag, and the
     * match is replaced by the noun exactly as spelled in the list.
     *
     * @param string $text
     * @param array  $extend Additional nouns, in their canonical spelling.
     * @param array  $ignore Nouns to leave alone.
     *
     * @return null|string|string[]
     */
    public function properNoun($text, array $extend = [], array $ignore = [])
    {
        $dict = include __DIR__.'/../data/dict.php';
        if (!empty($extend)) {
            $dict = array_merge($dict, $extend);
        }
        if (!empty($ignore)) {
            $dict = array_diff($dict, $ignore);
        }

        foreach ($dict as $noun) {
            // preg_quote(): nouns may contain regex metacharacters or the
            // delimiter ("." "+" "/"), which must match literally.
            $pattern = '/(?<!\.|[a-z])'.preg_quote($noun, '/').'(?!\.|[a-z])(?!([^<]+)?>)/i';

            // A callback is used so "$" or "\" in a noun is not interpreted
            // as a back-reference in the replacement.
            $text = preg_replace_callback($pattern, function () use ($noun) {
                return $noun;
            }, $text);
        }

        return $text;
    }

    /**
     * Remove class attributes of the form class="..." (double-quoted,
     * non-empty), together with the single whitespace character before them.
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function removeClass($text)
    {
        return preg_replace('#\s(class)="[^"]+"#', '', $text);
    }

    /**
     * Remove id attributes of the form id="..." (double-quoted, non-empty),
     * together with the single whitespace character before them.
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function removeId($text)
    {
        return preg_replace('#\s(id)="[^"]+"#', '', $text);
    }

    /**
     * Remove style attributes of the form style="..." (double-quoted,
     * non-empty), together with the single whitespace character before them.
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function removeStyle($text)
    {
        return preg_replace('#\s(style)="[^"]+"#', '', $text);
    }

    /**
     * Remove paragraph tags whose content is empty or whitespace only.
     *
     * @param string $text
     * @param bool   $nested When true, repeat until nothing more is removed,
     *                       so paragraphs that become empty once their empty
     *                       children are gone are removed as well.
     *
     * @return null|string|string[]
     */
    public function removeEmptyParagraph($text, $nested = true)
    {
        // $space is an alternation, so it is wrapped in a group rather than
        // a character class (which would also match a literal "|").
        $pattern = '/<p[^>]*>(?:'.$this->space.')*<\\/p[^>]*>/';

        do {
            $text = preg_replace($pattern, '', $text, -1, $count);
        } while ($nested && $count > 0);

        return $text;
    }

    /**
     * Remove every opening/closing tag pair whose content is empty or
     * whitespace only. The opening tag must not contain "/" and the names of
     * the opening and closing tags are not compared.
     *
     * @param string $text
     * @param bool   $nested When true, repeat until nothing more is removed,
     *                       so tags that become empty once their empty
     *                       children are gone are removed as well.
     *
     * @return null|string|string[]
     */
    public function removeEmptyTag($text, $nested = true)
    {
        // $space is an alternation, so it is wrapped in a group rather than
        // a character class (which would also match a literal "|").
        $pattern = '/<[^\/>]*>(?:'.$this->space.')*<\/[^>]*>/';

        do {
            $text = preg_replace($pattern, '', $text, -1, $count);
        } while ($nested && $count > 0);

        return $text;
    }

    /**
     * Remove indentation (leading whitespace) at the start of a paragraph,
     * keeping the opening tag and its attributes.
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function removeIndent($text)
    {
        return preg_replace('/<p([^>]*)>('.$this->space.')+/', '<p${1}>', $text);
    }
}
416
|
|
|
|