<?php

namespace dokuwiki\Utf8;

/**
 * Methods and constants to handle Asian "words"
 *
 * This uses a crude regexp to determine which parts of an Asian string should be treated as words.
 * This is necessary because in some Asian languages a single unicode char can represent a whole word,
 * with no spaces separating the words.
 */
class Asian
{

    /**
     * This defines a non-capturing group for use in regular expressions to match any Asian character that
     * needs to be treated as a word. Uses the Unicode ranges for Asian characters taken from
     * http://en.wikipedia.org/wiki/Unicode_block
     */
    const REGEXP =
        '(?:' .

        '[\x{0E00}-\x{0E7F}]' . // Thai

        '|' .

        '[' .
        '\x{2E80}-\x{3040}' . // CJK -> Hangul
        '\x{309D}-\x{30A0}' .
        '\x{30FD}-\x{31EF}\x{3200}-\x{D7AF}' .
        '\x{F900}-\x{FAFF}' . // CJK Compatibility Ideographs
        '\x{FE30}-\x{FE4F}' . // CJK Compatibility Forms
        "\xF0\xA0\x80\x80-\xF0\xAA\x9B\x9F" . // CJK Extension B
        "\xF0\xAA\x9C\x80-\xF0\xAB\x9C\xBF" . // CJK Extension C
        "\xF0\xAB\x9D\x80-\xF0\xAB\xA0\x9F" . // CJK Extension D
        "\xF0\xAF\xA0\x80-\xF0\xAF\xAB\xBF" . // CJK Compatibility Supplement
        ']' .

        '|' .

        '[' . // Hiragana/Katakana (can be two characters)
        '\x{3042}\x{3044}\x{3046}\x{3048}' .
        '\x{304A}-\x{3062}\x{3064}-\x{3082}' .
        '\x{3084}\x{3086}\x{3088}-\x{308D}' .
        '\x{308F}-\x{3094}' .
        '\x{30A2}\x{30A4}\x{30A6}\x{30A8}' .
        '\x{30AA}-\x{30C2}\x{30C4}-\x{30E2}' .
        '\x{30E4}\x{30E6}\x{30E8}-\x{30ED}' .
        '\x{30EF}-\x{30F4}\x{30F7}-\x{30FA}' .
        '][' .
        '\x{3041}\x{3043}\x{3045}\x{3047}\x{3049}' .
        '\x{3063}\x{3083}\x{3085}\x{3087}\x{308E}\x{3095}-\x{309C}' .
        '\x{30A1}\x{30A3}\x{30A5}\x{30A7}\x{30A9}' .
        '\x{30C3}\x{30E3}\x{30E5}\x{30E7}\x{30EE}\x{30F5}\x{30F6}\x{30FB}\x{30FC}' .
        '\x{31F0}-\x{31FF}' .
        ']?' .
        ')';
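
    // Illustrative sketch of what the three REGEXP branches match, assuming PCRE with UTF-8
    // ('u' modifier) support; the example characters are just samples picked from the ranges above:
    //
    //   preg_match('/' . Asian::REGEXP . '/u', 'ก');       // Thai character -> expected to match
    //   preg_match('/' . Asian::REGEXP . '/u', '語');       // CJK ideograph -> expected to match
    //   preg_match('/^' . Asian::REGEXP . '$/u', 'きゃ');   // kana + small kana -> one two-character "word"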


    /**
     * Check if the given term contains Asian word characters
     *
     * @param string $term
     * @return bool
     */
    public static function isAsianWords($term)
    {
        return (bool)preg_match('/' . self::REGEXP . '/u', $term);
    }
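
    // Minimal usage sketch, assuming UTF-8 encoded input strings:
    //
    //   Asian::isAsianWords('東京');   // expected: true  (contains CJK characters)
    //   Asian::isAsianWords('Tokyo');  // expected: false (Latin only)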

    /**
     * Surround all Asian words in the given text with the given separator
     *
     * @param string $text Original text containing asian words
     * @param string $sep the separator to use
     * @return string Text with separated asian words
     */
    public static function separateAsianWords($text, $sep = ' ')
    {
        // handle asian chars as single words (may fail on older PHP versions)
        $asia = @preg_replace('/(' . self::REGEXP . ')/u', $sep . '\1' . $sep, $text);
        if (!is_null($asia)) $text = $asia; // recover from regexp failure

        return $text;
    }
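
    // Minimal usage sketch with the default separator (a single space); the exact spacing shown is
    // an assumption based on the replacement above, where every matched character gets wrapped:
    //
    //   Asian::separateAsianWords('日本語');        // roughly " 日  本  語 "
    //   Asian::separateAsianWords('日本語', '|');   // roughly "|日||本||語|"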

    /**
     * Split the given text into separate parts
     *
     * Each part is either a non-asian string, or a contiguous run of asian word characters
     *
     * @param string $term
     * @return string[]
     */
    public static function splitAsianWords($term)
    {
        return preg_split('/(' . self::REGEXP . '+)/u', $term, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
    }
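
    // Minimal usage sketch; the expected result is an assumption based on the pattern above,
    // where a contiguous run of asian characters is captured as a single delimiter part:
    //
    //   Asian::splitAsianWords('Hello世界!');  // roughly ['Hello', '世界', '!']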
}