|
1
|
|
|
<?php |
|
2
|
|
|
|
|
3
|
|
|
namespace dokuwiki\Utf8; |
|
4
|
|
|
|
|
5
|
|
|
/**
 * Methods and constants to handle Asian "words"
 *
 * This uses a crude regexp to determine which parts of an Asian string should be treated as words.
 * This is necessary because in some Asian languages a single unicode char represents a whole idea
 * without spaces separating them.
 */
class Asian
{
    /**
     * This defines a non-capturing group for the use in regular expressions to match any asian character that
     * needs to be treated as a word. Uses the Unicode-Ranges for Asian characters taken from
     * http://en.wikipedia.org/wiki/Unicode_block
     *
     * The group has three alternatives: a Thai character, a character from the listed CJK ranges, or a
     * Hiragana/Katakana character from the first kana class optionally followed by one character from the
     * second kana class (so a single "word" may span two characters).
     *
     * The pattern has no delimiters or modifiers; it has to be embedded in a pattern using the `u` modifier.
     */
    const REGEXP =
        '(?:' .

        '[\x{0E00}-\x{0E7F}]' . // Thai

        '|' .

        '[' .
        '\x{2E80}-\x{3040}' . // CJK -> Hangul
        '\x{309D}-\x{30A0}' .
        '\x{30FD}-\x{31EF}\x{3200}-\x{D7AF}' .
        '\x{F900}-\x{FAFF}' . // CJK Compatibility Ideographs
        '\x{FE30}-\x{FE4F}' . // CJK Compatibility Forms
        "\xF0\xA0\x80\x80-\xF0\xAA\x9B\x9F" . // CJK Extension B
        "\xF0\xAA\x9C\x80-\xF0\xAB\x9C\xBF" . // CJK Extension C
        "\xF0\xAB\x9D\x80-\xF0\xAB\xA0\x9F" . // CJK Extension D
        "\xF0\xAF\xA0\x80-\xF0\xAF\xAB\xBF" . // CJK Compatibility Supplement
        ']' .

        '|' .

        '[' . // Hiragana/Katakana (can be two characters)
        '\x{3042}\x{3044}\x{3046}\x{3048}' .
        '\x{304A}-\x{3062}\x{3064}-\x{3082}' .
        '\x{3084}\x{3086}\x{3088}-\x{308D}' .
        '\x{308F}-\x{3094}' .
        '\x{30A2}\x{30A4}\x{30A6}\x{30A8}' .
        '\x{30AA}-\x{30C2}\x{30C4}-\x{30E2}' .
        '\x{30E4}\x{30E6}\x{30E8}-\x{30ED}' .
        '\x{30EF}-\x{30F4}\x{30F7}-\x{30FA}' .
        '][' .
        '\x{3041}\x{3043}\x{3045}\x{3047}\x{3049}' .
        '\x{3063}\x{3083}\x{3085}\x{3087}\x{308E}\x{3095}-\x{309C}' .
        '\x{30A1}\x{30A3}\x{30A5}\x{30A7}\x{30A9}' .
        '\x{30C3}\x{30E3}\x{30E5}\x{30E7}\x{30EE}\x{30F5}\x{30F6}\x{30FB}\x{30FC}' .
        '\x{31F0}-\x{31FF}' .
        ']?' .
        ')';

    /**
     * Check if the given term contains Asian word characters
     *
     * @param string $term
     * @return bool true if at least one Asian word character was found; false if none was found or
     *              the regexp could not be applied (e.g. $term is not valid UTF-8)
     */
    public static function isAsianWords($term)
    {
        // preg_match() returns 1 on a match, 0 on no match and false on error
        return preg_match('/' . self::REGEXP . '/u', $term) === 1;
    }

    /**
     * Surround all Asian words in the given text with the given separator
     *
     * The separator is always inserted literally, even if it contains digits, `$` or backslashes.
     *
     * @param string $text Original text containing asian words
     * @param string $sep the separator to use
     * @return string Text with separated asian words; the unmodified text if the regexp failed
     */
    public static function separateAsianWords($text, $sep = ' ')
    {
        // A callback is used instead of a replacement template: in a template such as
        // $sep . '\1' . $sep a separator starting with a digit would merge with the backreference
        // (e.g. '\1' . '1' becomes '\11') and the matched character would be dropped.
        // The @ keeps the original best-effort behaviour (pattern may fail on older PHP versions).
        $asia = @preg_replace_callback(
            '/(' . self::REGEXP . ')/u',
            function ($match) use ($sep) {
                return $sep . $match[1] . $sep;
            },
            $text
        );

        // recover from regexp failure: null signals an error, keep the input untouched then
        return $asia === null ? $text : $asia;
    }

    /**
     * Split the given text into separate parts
     *
     * Each part is either a non-asian string, or a single asian word
     *
     * Empty parts are not returned. If the regexp can not be applied (e.g. $term is not valid UTF-8)
     * the whole term is returned as the only part.
     *
     * @param string $term
     * @return string[]
     */
    public static function splitAsianWords($term)
    {
        $parts = preg_split(
            '/(' . self::REGEXP . '+)/u',
            $term,
            -1,
            PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
        );

        // preg_split() returns false on failure; always hand back an array as documented
        if ($parts === false) {
            return [$term];
        }

        return $parts;
    }
}
|
100
|
|
|
|