1 | <?php |
||
12 | class Asian |
||
13 | { |
||
14 | |||
15 | /** |
||
16 | * Non-capturing group for use in regular expressions; matches a single Asian character that needs to be |
||
17 | * treated as a word (for kana: a base character plus one optional small kana or sound mark). Patterns that |
||
18 | * embed it need the "u" modifier. Unicode ranges taken from http://en.wikipedia.org/wiki/Unicode_block |
||
19 | */ |
||
20 | const REGEXP = |
||
21 | '(?:' . |
||
22 | |||
23 | '[\x{0E00}-\x{0E7F}]' . // Thai block: one character per match |
||
24 | |||
25 | '|' . |
||
26 | |||
27 | '[' . |
||
28 | '\x{2E80}-\x{3040}' . // CJK Radicals Supplement .. Hangul Syllables (with the next two lines), minus the kana matched below |
||
29 | '\x{309D}-\x{30A0}' . |
||
30 | '\x{30FD}-\x{31EF}\x{3200}-\x{D7AF}' . |
||
31 | '\x{F900}-\x{FAFF}' . // CJK Compatibility Ideographs |
||
32 | '\x{FE30}-\x{FE4F}' . // CJK Compatibility Forms |
||
33 | "\xF0\xA0\x80\x80-\xF0\xAA\x9B\x9F" . // CJK Extension B (U+20000-U+2A6DF, written as raw UTF-8 bytes) |
||
34 | "\xF0\xAA\x9C\x80-\xF0\xAB\x9C\xBF" . // CJK Extension C (U+2A700-U+2B73F, written as raw UTF-8 bytes) |
||
35 | "\xF0\xAB\x9D\x80-\xF0\xAB\xA0\x9F" . // CJK Extension D (U+2B740-U+2B81F, written as raw UTF-8 bytes) |
||
36 | "\xF0\xAF\xA0\x80-\xF0\xAF\xAB\xBF" . // CJK Compatibility Supplement (U+2F800-U+2FAFF, written as raw UTF-8 bytes) |
||
37 | ']' . |
||
38 | |||
39 | '|' . |
||
40 | |||
41 | '[' . // Hiragana/Katakana: one base kana from this class, optionally followed by one character of the second class |
||
42 | '\x{3042}\x{3044}\x{3046}\x{3048}' . |
||
43 | '\x{304A}-\x{3062}\x{3064}-\x{3082}' . |
||
44 | '\x{3084}\x{3086}\x{3088}-\x{308D}' . |
||
45 | '\x{308F}-\x{3094}' . |
||
46 | '\x{30A2}\x{30A4}\x{30A6}\x{30A8}' . |
||
47 | '\x{30AA}-\x{30C2}\x{30C4}-\x{30E2}' . |
||
48 | '\x{30E4}\x{30E6}\x{30E8}-\x{30ED}' . |
||
49 | '\x{30EF}-\x{30F4}\x{30F7}-\x{30FA}' . |
||
50 | '][' . |
||
51 | '\x{3041}\x{3043}\x{3045}\x{3047}\x{3049}' . |
||
52 | '\x{3063}\x{3083}\x{3085}\x{3087}\x{308E}\x{3095}-\x{309C}' . |
||
53 | '\x{30A1}\x{30A3}\x{30A5}\x{30A7}\x{30A9}' . |
||
54 | '\x{30C3}\x{30E3}\x{30E5}\x{30E7}\x{30EE}\x{30F5}\x{30F6}\x{30FB}\x{30FC}' . |
||
55 | '\x{31F0}-\x{31FF}' . |
||
56 | ']?' . |
||
57 | ')'; |
||
58 | |||
59 | |||
60 | /** |
||
61 | * Check if the given term contains Asian word characters |
||
62 | * |
||
63 | * @param string $term |
||
64 | * @return bool |
||
65 | */ |
||
66 | public static function isAsianWords($term) |
||
70 | |||
71 | /** |
||
72 | * Surround all Asian words in the given text with the given separator |
||
73 | * |
||
74 | * @param string $text Original text containing Asian words |
||
75 | * @param string $sep The separator to use |
||
76 | * @return string Text with separated Asian words |
||
77 | */ |
||
78 | public static function separateAsianWords($text, $sep = ' ') |
||
86 | |||
87 | /** |
||
88 | * Split the given text into separate parts |
||
89 | * |
||
90 | * Each part is either a non-Asian string, or a single Asian word |
||
91 | * |
||
92 | * @param string $term |
||
93 | * @return string[] |
||
94 | */ |
||
95 | public static function splitAsianWords($term) |
||
99 | } |
||
100 |