Completed
Push — psr2 ( b47790...42c62e )
by Andreas
02:47
created

Asian::splitAsianWords()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
nc 1
nop 1
dl 0
loc 4
rs 10
c 0
b 0
f 0
1
<?php
2
3
namespace dokuwiki\Utf8;
4
5
/**
 * Methods and constants to handle Asian "words"
 *
 * This uses a crude regexp to determine which parts of an Asian string should be treated as words.
 * This is necessary because in some Asian languages a single Unicode character represents a whole idea
 * and words are written without spaces separating them.
 */
12
class Asian
{

    /**
     * Non-capturing regular expression group matching one Asian "word".
     *
     * A word is either a single Thai or CJK character, or a Hiragana/Katakana base character optionally
     * followed by one small/modifier kana. The pattern has no delimiters or modifiers; callers wrap it
     * themselves and must use the /u modifier. Unicode ranges were taken from
     * http://en.wikipedia.org/wiki/Unicode_block
     *
     * The double-quoted parts embed raw UTF-8 byte sequences for code points above the BMP.
     */
    const REGEXP =
        '(?:' .

        '[\x{0E00}-\x{0E7F}]' . // Thai

        '|' .

        '[' .
        '\x{2E80}-\x{3040}' .  // CJK -> Hangul
        '\x{309D}-\x{30A0}' .
        '\x{30FD}-\x{31EF}\x{3200}-\x{D7AF}' .
        '\x{F900}-\x{FAFF}' .  // CJK Compatibility Ideographs
        '\x{FE30}-\x{FE4F}' .  // CJK Compatibility Forms
        "\xF0\xA0\x80\x80-\xF0\xAA\x9B\x9F" . // CJK Extension B
        "\xF0\xAA\x9C\x80-\xF0\xAB\x9C\xBF" . // CJK Extension C
        "\xF0\xAB\x9D\x80-\xF0\xAB\xA0\x9F" . // CJK Extension D
        "\xF0\xAF\xA0\x80-\xF0\xAF\xAB\xBF" . // CJK Compatibility Supplement
        ']' .

        '|' .

        '[' .                // Hiragana/Katakana (can be two characters)
        '\x{3042}\x{3044}\x{3046}\x{3048}' .
        '\x{304A}-\x{3062}\x{3064}-\x{3082}' .
        '\x{3084}\x{3086}\x{3088}-\x{308D}' .
        '\x{308F}-\x{3094}' .
        '\x{30A2}\x{30A4}\x{30A6}\x{30A8}' .
        '\x{30AA}-\x{30C2}\x{30C4}-\x{30E2}' .
        '\x{30E4}\x{30E6}\x{30E8}-\x{30ED}' .
        '\x{30EF}-\x{30F4}\x{30F7}-\x{30FA}' .
        '][' .
        '\x{3041}\x{3043}\x{3045}\x{3047}\x{3049}' .
        '\x{3063}\x{3083}\x{3085}\x{3087}\x{308E}\x{3095}-\x{309C}' .
        '\x{30A1}\x{30A3}\x{30A5}\x{30A7}\x{30A9}' .
        '\x{30C3}\x{30E3}\x{30E5}\x{30E7}\x{30EE}\x{30F5}\x{30F6}\x{30FB}\x{30FC}' .
        '\x{31F0}-\x{31FF}' .
        ']?' .
        ')';

    /**
     * Check if the given term contains Asian word characters
     *
     * A PCRE failure (preg_match() returning false) is reported as "no match".
     *
     * @param string $term
     * @return bool true if at least one Asian word is found in $term
     */
    public static function isAsianWords($term)
    {
        return (bool)preg_match('/' . self::REGEXP . '/u', $term);
    }

    /**
     * Surround all Asian words in the given text with the given separator
     *
     * The separator is inserted literally. A callback is used instead of a replacement string because
     * in a replacement string a separator starting with a digit would merge with the backreference
     * (e.g. "\1" followed by "0" is read as "\10") and "$n"/"\n" sequences inside the separator would be
     * expanded as backreferences.
     *
     * @param string $text Original text containing asian words
     * @param string $sep the separator to use
     * @return string Text with separated asian words; the unchanged input if the regexp fails
     */
    public static function separateAsianWords($text, $sep = ' ')
    {
        // Errors stay suppressed as a deliberate best effort: the pattern may fail on older PHP versions
        $separated = @preg_replace_callback(
            '/' . self::REGEXP . '/u',
            function (array $match) use ($sep) {
                return $sep . $match[0] . $sep;
            },
            $text
        );

        // recover from regexp failure by handing back the original text
        if (is_null($separated)) {
            return $text;
        }

        return $separated;
    }

    /**
     * Split the given text into separate parts
     *
     * Each part is either a non-asian string, or a run of one or more consecutive asian words
     * (the pattern is applied with a "+" quantifier and the matched run is kept as its own part).
     * Empty parts are dropped.
     *
     * @param string $term
     * @return string[] the parts in their original order; the unsplit term if the regexp fails
     */
    public static function splitAsianWords($term)
    {
        $parts = preg_split(
            '/(' . self::REGEXP . '+)/u',
            $term,
            -1,
            PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
        );

        if ($parts === false) {
            // preg_split() failed (e.g. invalid UTF-8 input): keep the documented string[] contract
            // and, like PREG_SPLIT_NO_EMPTY, never return an empty part
            return ((string)$term === '') ? [] : [$term];
        }

        return $parts;
    }
}
100