|
1
|
|
|
<?php |
|
2
|
|
|
/** |
|
3
|
|
|
* Created by PhpStorm. |
|
4
|
|
|
* User: jxlwqq |
|
5
|
|
|
* Date: 2018/7/26 |
|
6
|
|
|
* Time: 09:33. |
|
7
|
|
|
*/ |
|
8
|
|
|
|
|
9
|
|
|
namespace Jxlwqq\ChineseTypesetting; |
|
10
|
|
|
|
|
11
|
|
|
class ChineseTypesetting
{
    /**
     * Character-class body (no surrounding brackets) covering the CJK ranges
     * used by this class. "cjk" is short for Chinese, Japanese and Korean.
     *
     * @link http://unicode-table.com/en/
     *
     * @var string
     */
    private $cjk = ''.
        '\x{2e80}-\x{2eff}'.
        '\x{2f00}-\x{2fdf}'.
        '\x{3040}-\x{309f}'.
        '\x{30a0}-\x{30ff}'.
        '\x{3100}-\x{312f}'.
        '\x{3200}-\x{32ff}'.
        '\x{3400}-\x{4dbf}'.
        '\x{4e00}-\x{9fff}'.
        '\x{f900}-\x{faff}';

    /**
     * Character-class body (no surrounding brackets) covering Latin letters,
     * Greek letters (as used in mathematics, science and engineering) and
     * Arabic digits. "ln" is short for letters and numbers.
     *
     * @link https://en.wikipedia.org/wiki/Greek_letters_used_in_mathematics,_science,_and_engineering
     *
     * @var string
     */
    private $ln = ''.
        'A-Za-z'.
        'Α-Ωα-ω'.
        '0-9';

    /**
     * Regex fragment matching one "blank" unit inside HTML: ASCII whitespace,
     * the &nbsp; entity, a raw UTF-8 no-break space (U+00A0) or a raw UTF-8
     * ideographic space (U+3000). The raw characters are spelled as bytes so
     * the fragment works in patterns that do not use the "u" modifier.
     *
     * @var string
     */
    private $blank = '(?:\s|&nbsp;|\xc2\xa0|\xe3\x80\x80)';

    /**
     * Correct typesetting using all public correctors or only the given ones.
     *
     * When $methods is empty every public, non-static, non-magic method of the
     * object (except this one) is applied with its default arguments.
     * "removeEmptyParagraph" is dropped when "removeEmptyTag" is requested
     * (the latter covers it), and "insertSpace" is always run last.
     *
     * @param string $text    Text or HTML fragment to correct.
     * @param array  $methods Names of the corrector methods to apply.
     *
     * @return mixed The corrected text.
     */
    public function correct($text, array $methods = [])
    {
        if (empty($methods)) {
            $reflection = new \ReflectionClass($this);
            foreach ($reflection->getMethods(\ReflectionMethod::IS_PUBLIC) as $candidate) {
                // Constructors, other magic methods and static helpers are not correctors.
                if ($candidate->isStatic() || strpos($candidate->name, '__') === 0) {
                    continue;
                }
                $methods[] = $candidate->name;
            }
        }

        // Only string names can be dispatched; anything else is ignored.
        $methods = array_unique(array_filter($methods, 'is_string'));

        // removeEmptyTag already does the work of removeEmptyParagraph.
        if (in_array('removeEmptyTag', $methods, true)) {
            $methods = array_diff($methods, ['removeEmptyParagraph']);
        }

        // insertSpace must be the last step of the pipeline.
        if (in_array('insertSpace', $methods, true)) {
            $methods = array_diff($methods, ['insertSpace']);
            $methods[] = 'insertSpace';
        }

        foreach ($methods as $method) {
            // PHP method names are case-insensitive, so guard against re-entering
            // this method under any spelling ("Correct", "CORRECT", ...).
            if (strcasecmp(__FUNCTION__, $method) === 0 || !method_exists($this, $method)) {
                continue;
            }
            $text = $this->$method($text);
        }

        return $text;
    }

    /**
     * Fix incorrect punctuation.
     *
     * - Three or more "." / "。" in a row, or a single "…", become "……".
     * - A half-width ! ? . , ( ) : ; that directly follows a CJK character,
     *   ")", "》" or "”" is replaced by its full-width Chinese counterpart.
     * - A Chinese punctuation mark repeated consecutively is reduced to one.
     *
     * Based on https://github.com/ricoa/copywriting-correct/blob/master/src/Correctors/CharacterCorrector.php
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function fixPunctuation($text)
    {
        // Normalise ellipses; the second pass collapses runs created by the first.
        $text = preg_replace('/([。\.]){3,}|(…){1}/iu', '……', $text);
        $text = preg_replace('/(……){2,}/iu', '……', $text);

        // Half-width mark => full-width Chinese mark.
        $fullWidth = [
            '!' => '!',
            '?' => '?',
            '.' => '。',
            ',' => ',',
            '(' => '(',
            ')' => ')',
            ':' => ':',
            ';' => ';',
        ];

        $text = preg_replace_callback(
            '/(['.$this->cjk.')》”])([!?\.,\(\):;])/iu',
            function ($matches) use ($fullWidth) {
                return $matches[1].$fullWidth[$matches[2]];
            },
            $text
        );

        // Keep only the first of consecutively repeated Chinese punctuation marks.
        return preg_replace('/([!?。,;:、“”‘’『』〖〗《》()])\1{1,}/iu', '\1', $text);
    }

    /**
     * Limited full-width to half-width conversion: digits, Latin letters, the
     * ideographic space and a fixed set of symbols are mapped to their
     * half-width forms. Chinese punctuation is left untouched.
     *
     * @link https://github.com/mzlogin/chinese-copywriting-guidelines#全角和半角
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function full2Half($text)
    {
        // Keys are full-width code points, values their ASCII equivalents.
        $map = [
            '0' => '0', '1' => '1', '2' => '2', '3' => '3', '4' => '4',
            '5' => '5', '6' => '6', '7' => '7', '8' => '8', '9' => '9',
            'A' => 'A', 'B' => 'B', 'C' => 'C', 'D' => 'D', 'E' => 'E',
            'F' => 'F', 'G' => 'G', 'H' => 'H', 'I' => 'I', 'J' => 'J',
            'K' => 'K', 'L' => 'L', 'M' => 'M', 'N' => 'N', 'O' => 'O',
            'P' => 'P', 'Q' => 'Q', 'R' => 'R', 'S' => 'S', 'T' => 'T',
            'U' => 'U', 'V' => 'V', 'W' => 'W', 'X' => 'X', 'Y' => 'Y',
            'Z' => 'Z', 'a' => 'a', 'b' => 'b', 'c' => 'c', 'd' => 'd',
            'e' => 'e', 'f' => 'f', 'g' => 'g', 'h' => 'h', 'i' => 'i',
            'j' => 'j', 'k' => 'k', 'l' => 'l', 'm' => 'm', 'n' => 'n',
            'o' => 'o', 'p' => 'p', 'q' => 'q', 'r' => 'r', 's' => 's',
            't' => 't', 'u' => 'u', 'v' => 'v', 'w' => 'w', 'x' => 'x',
            'y' => 'y', 'z' => 'z',
            '-' => '-', ' ' => ' ', '/' => '/',
            '%' => '%', '#' => '#', '@' => '@', '&' => '&', '<' => '<',
            '>' => '>', '[' => '[', ']' => ']', '{' => '{', '}' => '}',
            '\' => '\\', '|' => '|', '+' => '+', '=' => '=', '_' => '_',
            '^' => '^', ' ̄' => '~', '`' => '`',
        ];

        return strtr($text, $map);
    }

    /**
     * Insert a space between CJK characters and adjacent Latin letters, Greek
     * letters, digits, quotes, hashes, operators and brackets.
     *
     * The rules are applied strictly in the order listed below. The
     * individual bracket rules only run when the combined
     * "CJK (bracketed) CJK" rule changed nothing.
     *
     * Based on https://github.com/Rakume/pangu.php/blob/master/pangu.php
     *
     * @link https://github.com/mzlogin/chinese-copywriting-guidelines#空格
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function insertSpace($text)
    {
        $cjk = $this->cjk;
        $ln = $this->ln;

        // [pattern body, replacement] pairs applied before the bracket rules.
        $leadingRules = [
            // CJK followed by a quote / quote followed by CJK.
            ['(['.$cjk.'])(["\'])', '$1 $2'],
            ['(["\'])(['.$cjk.'])', '$1 $2'],
            // Drop the whitespace just inside a quoted span.
            ['(["\']+)(\s*)(.+?)(\s*)(["\']+)', '$1$3$5'],
            // Hash marks next to CJK.
            ['(['.$cjk.'])(#(\S+))', '$1 $2'],
            ['((\S+)#)(['.$cjk.'])', '$1 $3'],
            // CJK, letter/number, operator and the mirrored form.
            ['(['.$cjk.'])(['.$ln.'])([\+\-\*\/=&\\|<>])', '$1 $2 $3'],
            ['([\+\-\*\/=&\\|<>])(['.$ln.'])(['.$cjk.'])', '$1 $2 $3'],
        ];

        // Bracketed span enclosed by CJK on both sides.
        $enclosedBracketRule = ['(['.$cjk.'])([<\[\{\(]+(.*?)[>\]\}\)]+)(['.$cjk.'])', '$1 $2 $4'];

        // Fallback rules used only when the enclosed rule did not change the text.
        $singleBracketRules = [
            ['(['.$cjk.'])([<>\[\]\{\}\(\)])', '$1 $2'],
            ['([<>\[\]\{\}\(\)])(['.$cjk.'])', '$1 $2'],
        ];

        // [pattern body, replacement] pairs applied after the bracket rules.
        $trailingRules = [
            // Drop the whitespace just inside a bracketed span.
            ['([<\[\{\(]+)(\s*)(.+?)(\s*)([>\]\}\)]+)', '$1$3$5'],
            // CJK followed by letter/number/symbol and the mirrored form.
            ['(['.$cjk.'])(['.$ln.'`@&%\=\$\^\*\-\+\\/|\\\])', '$1 $2'],
            ['(['.$ln.'`~!%&=;\|\,\.\:\?\$\^\*\-\+\/\\\])(['.$cjk.'])', '$1 $2'],
        ];

        foreach ($leadingRules as $rule) {
            $text = preg_replace('/'.$rule[0].'/iu', $rule[1], $text);
        }

        $beforeBrackets = $text;
        $text = preg_replace('/'.$enclosedBracketRule[0].'/iu', $enclosedBracketRule[1], $text);
        if ($beforeBrackets === $text) {
            foreach ($singleBracketRules as $rule) {
                $text = preg_replace('/'.$rule[0].'/iu', $rule[1], $text);
            }
        }

        foreach ($trailingRules as $rule) {
            $text = preg_replace('/'.$rule[0].'/iu', $rule[1], $text);
        }

        return $text;
    }

    /**
     * Give English proper nouns their canonical letter case.
     *
     * Nouns come from the bundled dictionary file plus $extend, minus
     * $ignore. Each noun is matched case-insensitively as literal text, only
     * when it is not adjacent to a letter or a dot, and only outside HTML tags.
     *
     * @param string $text
     * @param array  $extend Additional proper nouns (canonical spelling).
     * @param array  $ignore Proper nouns that must not be corrected.
     *
     * @return null|string|string[]
     */
    public function properNoun($text, array $extend = [], array $ignore = [])
    {
        $dict = include __DIR__.'/../data/dict.php';
        if (!is_array($dict)) {
            // A missing or malformed dictionary file must not break the extension list.
            $dict = [];
        }

        if (!empty($extend)) {
            $dict = array_merge($dict, $extend);
        }
        if (!empty($ignore)) {
            $dict = array_diff($dict, $ignore);
        }

        foreach ($dict as $noun) {
            if (!is_string($noun) || $noun === '') {
                continue;
            }

            // Quote the noun so entries such as "Node.js" or "C++" are matched
            // literally instead of being read as regex syntax.
            $pattern = '/(?<!\.|[a-z])'.preg_quote($noun, '/').'(?!\.|[a-z])(?!([^<]+)?>)/i';

            // A callback keeps "$" and "\" in the noun from being treated as
            // back-references in the replacement.
            $text = preg_replace_callback($pattern, function () use ($noun) {
                return $noun;
            }, $text);
        }

        return $text;
    }

    /**
     * Remove class attributes (double- or single-quoted, possibly empty).
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function removeClass($text)
    {
        return preg_replace('#\sclass\s*=\s*(?:"[^"]*"|\'[^\']*\')#i', '', $text);
    }

    /**
     * Remove id attributes (double- or single-quoted, possibly empty).
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function removeId($text)
    {
        return preg_replace('#\sid\s*=\s*(?:"[^"]*"|\'[^\']*\')#i', '', $text);
    }

    /**
     * Remove style attributes (double- or single-quoted, possibly empty).
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function removeStyle($text)
    {
        return preg_replace('#\sstyle\s*=\s*(?:"[^"]*"|\'[^\']*\')#i', '', $text);
    }

    /**
     * Remove paragraph elements that contain nothing but blanks (whitespace,
     * &nbsp;, no-break space, ideographic space).
     *
     * Only real "p" elements are matched; tags that merely start with "p",
     * such as "pre", are left alone.
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function removeEmptyParagraph($text)
    {
        return preg_replace('/<p(?:\s[^>]*)?>'.$this->blank.'*<\/p\s*>/i', '', $text);
    }

    /**
     * Remove every element that contains nothing but blanks (whitespace,
     * &nbsp;, no-break space, ideographic space).
     *
     * The closing tag must carry the same name as the opening tag. Opening
     * tags whose attributes contain "/" are not matched. The text is scanned
     * once, so an element that only becomes empty after its child was removed
     * is kept.
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function removeEmptyTag($text)
    {
        return preg_replace(
            '/<([a-z][a-z0-9:-]*)(?:\s[^\/>]*)?>'.$this->blank.'*<\/\1\s*>/i',
            '',
            $text
        );
    }

    /**
     * Remove the indentation (leading blanks) at the start of paragraphs.
     *
     * Only real "p" elements are matched, so preformatted "pre" content keeps
     * its leading whitespace.
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function removeIndent($text)
    {
        return preg_replace('/(<p(?:\s[^>]*)?>)'.$this->blank.'+/i', '$1', $text);
    }
}
|
358
|
|
|
|
This check marks implicit conversions of arrays to boolean values in a comparison. While in PHP an empty array is considered to be equal (but not identical) to false, this is not always apparent.
Consider making the comparison explicit by using `empty(...)` or `!empty(...)` instead.