1
|
|
|
<?php |
2
|
|
|
/** |
3
|
|
|
* Created by PhpStorm. |
4
|
|
|
* User: jxlwqq |
5
|
|
|
* Date: 2018/7/26 |
6
|
|
|
* Time: 09:33. |
7
|
|
|
*/ |
8
|
|
|
|
9
|
|
|
namespace Jxlwqq\ChineseTypesetting; |
10
|
|
|
|
11
|
|
|
/**
 * Typesetting corrector for Chinese text that may contain Latin letters,
 * digits and HTML markup.
 *
 * Every public method except correct() takes a text and returns the corrected
 * text; correct() chains them.
 */
class ChineseTypesetting
{
    /**
     * Body of a regex character class (no surrounding brackets) matching CJK characters.
     * cjk is short for Chinese, Japanese and Korean.
     *
     * @link http://unicode-table.com/en/
     *
     * @var string
     */
    private $cjk = ''.
        '\x{2e80}-\x{2eff}'. // CJK Radicals Supplement
        '\x{2f00}-\x{2fdf}'. // Kangxi Radicals
        '\x{3040}-\x{309f}'. // Hiragana
        '\x{30a0}-\x{30ff}'. // Katakana
        '\x{3100}-\x{312f}'. // Bopomofo
        '\x{3200}-\x{32ff}'. // Enclosed CJK Letters and Months
        '\x{3400}-\x{4dbf}'. // CJK Unified Ideographs Extension A
        '\x{4e00}-\x{9fff}'. // CJK Unified Ideographs
        '\x{f900}-\x{faff}'; // CJK Compatibility Ideographs

    /**
     * Body of a regex character class (no surrounding brackets) matching
     * Latin letters, Greek letters (as used in mathematics, science and
     * engineering) and Arabic digits.
     *
     * @link https://en.wikipedia.org/wiki/Greek_letters_used_in_mathematics,_science,_and_engineering
     *
     * @var string
     */
    private $ln = ''.
        'A-Za-z'.
        'Α-Ωα-ω'.
        '0-9';

    /**
     * Regex fragment matching one "blank" unit inside HTML: an ASCII whitespace
     * character, the `&nbsp;` entity, or a raw UTF-8 encoded U+00A0.
     *
     * Written byte-wise (\xC2\xA0) so that it works without the `u` modifier.
     *
     * @var string
     */
    private $blank = '(?:\s|&nbsp;|\xC2\xA0)';

    /**
     * Correct typesetting errors using all, or only the given, correction methods.
     *
     * When $methods is empty every public method of the object is applied.
     * `removeEmptyParagraph` is dropped when `removeEmptyTag` is also requested,
     * and `insertSpace` is always executed last.
     *
     * @param string $text    text to correct
     * @param array  $methods names of the correction methods to apply
     *
     * @throws \ReflectionException
     *
     * @return mixed the corrected text
     */
    public function correct($text, array $methods = [])
    {
        if (empty($methods)) {
            $class = new \ReflectionClass($this);
            foreach ($class->getMethods(\ReflectionMethod::IS_PUBLIC) as $methodObj) {
                $methods[] = $methodObj->name;
            }
        }

        $methods = array_unique($methods);

        // removeEmptyTag covers what removeEmptyParagraph does, so when both
        // are requested the paragraph-only variant is skipped.
        if (in_array('removeEmptyTag', $methods, true)) {
            $methods = array_diff($methods, ['removeEmptyParagraph']);
        }

        // insertSpace must run after every other correction.
        if (in_array('insertSpace', $methods, true)) {
            $methods = array_diff($methods, ['insertSpace']);
            $methods[] = 'insertSpace';
        }

        foreach ($methods as $method) {
            // Skip non-string entries, unknown methods and this method itself.
            // PHP method names are case-insensitive, hence strcasecmp().
            if (!is_string($method)
                || strcasecmp(__FUNCTION__, $method) === 0
                || !method_exists($this, $method)
            ) {
                continue;
            }
            $text = $this->$method($text);
        }

        return $text;
    }

    /**
     * Fix incorrect punctuation.
     *
     * update base on @link https://github.com/ricoa/copywriting-correct/blob/master/src/Correctors/CharacterCorrector.php
     *
     * @param string $text
     *
     * @return null|string|string[] null when PCRE fails (e.g. invalid UTF-8 input)
     */
    public function fixPunctuation($text)
    {
        // Normalise ellipses: three or more full stops (ASCII or ideographic)
        // or a single horizontal-ellipsis character become a double ellipsis,
        // then runs of double ellipses are collapsed into one.
        $text = preg_replace('/([。\.]){3,}|(…){1}/iu', '……', $text);
        $text = preg_replace('/(……){2,}/iu', '……', $text);

        // Half-width punctuation and its full-width replacement.
        $fullWidth = [
            '!' => '!',
            '?' => '?',
            '.' => '。',
            ',' => ',',
            '(' => '(',
            ')' => ')',
            ':' => ':',
            ';' => ';',
        ];

        // Directly after a CJK character or one of the full-width closers
        // (right parenthesis, right double angle bracket, right double quote),
        // half-width punctuation is replaced by its full-width form.
        $text = preg_replace_callback('/(['.$this->cjk.')》”])([!?\.,\(\):;])/iu', function ($matches) use ($fullWidth) {
            return $matches[1].$fullWidth[$matches[2]];
        }, $text);

        // Repeated full-width punctuation marks: keep only the first one.
        $text = preg_replace('/([!?。,;:、“”‘’『』〖〗《》()])\1{1,}/iu', '\1', $text);

        return $text;
    }

    /**
     * Limited full-width to half-width transformer: digits, Latin letters,
     * the ideographic space and a fixed set of symbols are converted.
     *
     * @link https://github.com/mzlogin/chinese-copywriting-guidelines#全角和半角
     *
     * @param string $text
     *
     * @return string
     */
    public function full2Half($text)
    {
        // Keys are the full-width forms, values their half-width counterparts.
        $arr = ['0' => '0', '1' => '1', '2' => '2', '3' => '3', '4' => '4',
            '5' => '5', '6' => '6', '7' => '7', '8' => '8', '9' => '9',
            'A' => 'A', 'B' => 'B', 'C' => 'C', 'D' => 'D', 'E' => 'E',
            'F' => 'F', 'G' => 'G', 'H' => 'H', 'I' => 'I', 'J' => 'J',
            'K' => 'K', 'L' => 'L', 'M' => 'M', 'N' => 'N', 'O' => 'O',
            'P' => 'P', 'Q' => 'Q', 'R' => 'R', 'S' => 'S', 'T' => 'T',
            'U' => 'U', 'V' => 'V', 'W' => 'W', 'X' => 'X', 'Y' => 'Y',
            'Z' => 'Z', 'a' => 'a', 'b' => 'b', 'c' => 'c', 'd' => 'd',
            'e' => 'e', 'f' => 'f', 'g' => 'g', 'h' => 'h', 'i' => 'i',
            'j' => 'j', 'k' => 'k', 'l' => 'l', 'm' => 'm', 'n' => 'n',
            'o' => 'o', 'p' => 'p', 'q' => 'q', 'r' => 'r', 's' => 's',
            't' => 't', 'u' => 'u', 'v' => 'v', 'w' => 'w', 'x' => 'x',
            'y' => 'y', 'z' => 'z',
            '-' => '-', ' ' => ' ', '/' => '/',
            '%' => '%', '#' => '#', '@' => '@', '&' => '&', '<' => '<',
            '>' => '>', '[' => '[', ']' => ']', '{' => '{', '}' => '}',
            '\' => '\\', '|' => '|', '+' => '+', '=' => '=', '_' => '_',
            '^' => '^', ' ̄' => '~', '`' => '`', ];

        return strtr($text, $arr);
    }

    /**
     * Insert a space between CJK characters and Latin letters, Greek letters
     * or digits (and a number of symbols).
     *
     * update base on @link https://github.com/Rakume/pangu.php/blob/master/pangu.php
     *
     * @link https://github.com/mzlogin/chinese-copywriting-guidelines#空格
     *
     * @param string $text
     *
     * @return null|string|string[] null when PCRE fails (e.g. invalid UTF-8 input)
     */
    public function insertSpace($text)
    {
        // Ordered rule list: each entry is [pattern body, replacement].
        // The order matters because later rules see the output of earlier ones.
        $patterns = [
            'cjk_quote' => [
                '(['.$this->cjk.'])(["\'])',
                '$1 $2',
            ],
            'quote_cjk' => [
                '(["\'])(['.$this->cjk.'])',
                '$1 $2',
            ],
            // Removes whitespace directly inside a pair of quotes.
            'fix_quote' => [
                '(["\']+)(\s*)(.+?)(\s*)(["\']+)',
                '$1$3$5',
            ],
            'cjk_hash' => [
                '(['.$this->cjk.'])(#(\S+))',
                '$1 $2',
            ],
            'hash_cjk' => [
                '((\S+)#)(['.$this->cjk.'])',
                '$1 $3',
            ],
            'cjk_operator_ans' => [
                '(['.$this->cjk.'])(['.$this->ln.'])([\+\-\*\/=&\\|<>])',
                '$1 $2 $3',
            ],
            'ans_operator_cjk' => [
                '([\+\-\*\/=&\\|<>])(['.$this->ln.'])(['.$this->cjk.'])',
                '$1 $2 $3',
            ],
            // Special case, see the loop below: index 0 is the primary rule
            // (a bracketed group between two CJK characters), index 1 holds
            // the fallback rules used only when the primary rule changed nothing.
            'bracket' => [
                [
                    '(['.$this->cjk.'])([<\[\{\(]+(.*?)[>\]\}\)]+)(['.$this->cjk.'])',
                    '$1 $2 $4',
                ],
                [
                    'cjk_bracket' => [
                        '(['.$this->cjk.'])([<>\[\]\{\}\(\)])',
                        '$1 $2',
                    ],
                    'bracket_cjk' => [
                        '([<>\[\]\{\}\(\)])(['.$this->cjk.'])',
                        '$1 $2',
                    ],
                ],
            ],
            // Removes whitespace directly inside a pair of brackets.
            'fix_bracket' => [
                '([<\[\{\(]+)(\s*)(.+?)(\s*)([>\]\}\)]+)',
                '$1$3$5',
            ],
            'cjk_ans' => [
                '(['.$this->cjk.'])(['.$this->ln.'`@&%\=\$\^\*\-\+\\/|\\\])',
                '$1 $2',
            ],
            'ans_cjk' => [
                '(['.$this->ln.'`~!%&=;\|\,\.\:\?\$\^\*\-\+\/\\\])(['.$this->cjk.'])',
                '$1 $2',
            ],
        ];

        foreach ($patterns as $key => $value) {
            if ($key === 'bracket') {
                $before = $text;
                $text = preg_replace('/'.$value[0][0].'/iu', $value[0][1], $text);
                if ($before === $text) {
                    // Primary bracket rule did not apply: run the fallbacks.
                    foreach ($value[1] as $fallback) {
                        $text = preg_replace('/'.$fallback[0].'/iu', $fallback[1], $text);
                    }
                }
                continue;
            }
            $text = preg_replace('/'.$value[0].'/iu', $value[1], $text);
        }

        return $text;
    }

    /**
     * Correct the capitalisation of English proper nouns.
     *
     * Nouns are matched case-insensitively, outside of HTML tags, and only
     * when not directly preceded or followed by a dot or an ASCII letter.
     *
     * @param string $text
     * @param array  $extend additional proper nouns (correctly capitalised)
     * @param array  $ignore proper nouns that must not be corrected
     *
     * @return null|string|string[] null when PCRE fails
     */
    public function properNoun($text, array $extend = [], array $ignore = [])
    {
        // The bundled dictionary is optional: fall back to an empty list when
        // the file is missing or does not return an array.
        $dictFile = __DIR__.'/../data/dict.php';
        $dict = is_file($dictFile) ? include $dictFile : [];
        if (!is_array($dict)) {
            $dict = [];
        }

        if (!empty($extend)) {
            $dict = array_merge($dict, $extend);
        }
        if (!empty($ignore)) {
            $dict = array_diff($dict, $ignore);
        }

        foreach ($dict as $noun) {
            $noun = (string) $noun;
            if ($noun === '') {
                continue;
            }

            // preg_quote() keeps regex metacharacters in the noun ("C++",
            // "Node.js", "/") literal. The trailing look-ahead rejects
            // matches located inside an HTML tag.
            $pattern = '/(?<!\.|[a-z])'.preg_quote($noun, '/').'(?!\.|[a-z])(?!([^<]+)?>)/i';

            // A callback is used so that "$1" or "\1" inside the noun is not
            // interpreted as a back-reference in the replacement.
            $text = preg_replace_callback($pattern, function () use ($noun) {
                return $noun;
            }, $text);
        }

        return $text;
    }

    /**
     * Remove double-quoted `class` attributes (including empty ones).
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function removeClass($text)
    {
        return preg_replace('#\s(class)="[^"]*"#', '', $text);
    }

    /**
     * Remove double-quoted `id` attributes (including empty ones).
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function removeId($text)
    {
        return preg_replace('#\s(id)="[^"]*"#', '', $text);
    }

    /**
     * Remove double-quoted `style` attributes (including empty ones).
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function removeStyle($text)
    {
        return preg_replace('#\s(style)="[^"]*"#', '', $text);
    }

    /**
     * Remove `<p>` elements that contain nothing but blanks
     * (whitespace, `&nbsp;` or U+00A0).
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function removeEmptyParagraph($text)
    {
        // "(?:\s[^>]*)?" bounds the tag name so that <pre>, <param>, ... are
        // not mistaken for <p>.
        return preg_replace('/<p(?:\s[^>]*)?>'.$this->blank.'*<\/p\s*>/', '', $text);
    }

    /**
     * Remove all HTML elements that contain nothing but blanks
     * (whitespace, `&nbsp;` or U+00A0). Runs a single pass, so an element
     * that only becomes empty through this removal is kept.
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function removeEmptyTag($text)
    {
        // The closing tag must carry the same name as the opening tag (\1);
        // otherwise sequences such as "<br></p>" would be deleted.
        // Opening tags whose attributes contain "/" are left alone.
        return preg_replace(
            '/<([a-z][a-z0-9:-]*)(?:\s[^\/>]*)?>'.$this->blank.'*<\/\1\s*>/i',
            '',
            $text
        );
    }

    /**
     * Remove the indentation (leading whitespace, `&nbsp;` or U+00A0)
     * at the start of `<p>` elements.
     *
     * @param string $text
     *
     * @return null|string|string[]
     */
    public function removeIndent($text)
    {
        // Group 1 keeps the attributes of the opening tag; the tag name is
        // bounded so that <pre> content is never touched.
        return preg_replace('/<p((?:\s[^>]*)?)>'.$this->blank.'+/', '<p${1}>', $text);
    }
}
358
|
|
|
|
This check marks implicit conversions of arrays to boolean values in a comparison. While in PHP an empty array is considered to be equal (but not identical) to false, this is not always apparent.
Consider making the comparison explicit by using `empty(...)` or `!empty(...)` instead.