Failed Conditions
Push — psr2 ( 2b9c4a...b47790 )
by Andreas
05:48 queued 02:59
created

Clean::correctIdx()   B

Complexity

Conditions 8
Paths 6

Size

Total Lines 16

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 8
nc 6
nop 3
dl 0
loc 16
rs 8.4444
c 0
b 0
f 0
1
<?php
2
3
namespace dokuwiki\Utf8;
4
5
/**
 * Methods to assess and clean UTF-8 strings
 *
 * All methods are static and operate on PHP byte strings; none of them rely
 * on the mbstring extension.
 */
class Clean
{
    /**
     * Checks if a string contains 7bit ASCII only
     *
     * @author Andreas Haerter <[email protected]>
     *
     * @param string $str
     * @return bool true when no byte outside 0x00-0x7F is found
     */
    public static function isASCII($str)
    {
        return (preg_match('/(?:[^\x00-\x7F])/', $str) !== 1);
    }

    /**
     * Tries to detect if a string is in Unicode encoding
     *
     * This is a structural check only: every lead byte must be followed by
     * the number of 10bbbbbb continuation bytes it announces. Lead bytes
     * announcing up to 5 continuation bytes are accepted, and neither
     * overlong forms nor surrogates are rejected.
     *
     * @author <[email protected]>
     * @link   http://php.net/manual/en/function.utf8-encode.php
     *
     * @param string $str
     * @return bool
     */
    public static function isUtf8($str)
    {
        $len = strlen($str);
        for ($i = 0; $i < $len; $i++) {
            $b = ord($str[$i]);
            if ($b < 0x80) {
                continue; // 0bbbbbbb
            } elseif (($b & 0xE0) === 0xC0) {
                $n = 1; // 110bbbbb
            } elseif (($b & 0xF0) === 0xE0) {
                $n = 2; // 1110bbbb
            } elseif (($b & 0xF8) === 0xF0) {
                $n = 3; // 11110bbb
            } elseif (($b & 0xFC) === 0xF8) {
                $n = 4; // 111110bb
            } elseif (($b & 0xFE) === 0xFC) {
                $n = 5; // 1111110b
            } else {
                return false; // Does not match any model
            }

            // n bytes matching 10bbbbbb must follow; running off the end
            // of the string means the sequence is truncated
            for ($j = 0; $j < $n; $j++) {
                if ((++$i === $len) || ((ord($str[$i]) & 0xC0) !== 0x80)) {
                    return false;
                }
            }
        }
        return true;
    }

    /**
     * Strips all high byte chars
     *
     * Returns a pure ASCII7 string
     *
     * @author Andreas Gohr <[email protected]>
     *
     * @param string $str
     * @return string the input with every byte >= 0x80 removed
     */
    public static function strip($str)
    {
        $ascii = '';
        $len = strlen($str);
        for ($i = 0; $i < $len; $i++) {
            // square brackets: the curly-brace offset syntax is a parse
            // error as of PHP 8.0
            if (ord($str[$i]) < 128) {
                $ascii .= $str[$i];
            }
        }
        return $ascii;
    }

    /**
     * Removes special characters (nonalphanumeric) from a UTF-8 string
     *
     * In addition to the characters returned by Table::specialChars(), the
     * control chars 0x00 to 0x19 are stripped as well. Note that the range
     * ends at 0x19, so 0x1A to 0x1F are not covered by it.
     *
     * @author Andreas Gohr <[email protected]>
     *
     * @param  string $string The UTF8 string to strip of special chars
     * @param  string $repl Replace special with this string
     * @param  string $additional Additional chars to strip. Inserted verbatim into a
     *                            regexp char class, so the caller has to escape it
     * @return string
     */
    public static function stripspecials($string, $repl = '', $additional = '')
    {
        // quoted once and cached for the lifetime of the request
        static $specials = null;
        if ($specials === null) {
            $specials = preg_quote(Table::specialChars(), '/');
        }

        return preg_replace('/[' . $additional . '\x00-\x19' . $specials . ']/u', $repl, $string);
    }

    /**
     * Replace bad bytes with an alternative character
     *
     * ASCII character is recommended for replacement char
     *
     * PCRE Pattern to locate bad bytes in a UTF-8 string
     * Comes from W3 FAQ: Multilingual Forms
     * Note: modified to include full ASCII range including control chars
     *
     * @author Harry Fuecks <[email protected]>
     * @see http://www.w3.org/International/questions/qa-forms-utf-8
     *
     * @param string $str to search
     * @param string $replace to replace bad bytes with (defaults to '', i.e. bad bytes
     *                        are dropped) - use ASCII
     * @return string
     */
    public static function replaceBadBytes($str, $replace = '')
    {
        // The first eight alternatives consume one well-formed character,
        // the last one (capture group 2) consumes a single invalid byte.
        $UTF8_BAD =
            '([\x00-\x7F]' .                          // ASCII (including control chars)
            '|[\xC2-\xDF][\x80-\xBF]' .               // non-overlong 2-byte
            '|\xE0[\xA0-\xBF][\x80-\xBF]' .           // excluding overlongs
            '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}' .    // straight 3-byte
            '|\xED[\x80-\x9F][\x80-\xBF]' .           // excluding surrogates
            '|\xF0[\x90-\xBF][\x80-\xBF]{2}' .        // planes 1-3
            '|[\xF1-\xF3][\x80-\xBF]{3}' .            // planes 4-15
            '|\xF4[\x80-\x8F][\x80-\xBF]{2}' .        // plane 16
            '|(.{1}))';                               // invalid byte

        // Every byte is matched by one of the alternatives, so a single
        // left-to-right pass visits the whole string. This avoids output
        // buffering and re-copying the remaining string after each match.
        $result = preg_replace_callback(
            '/' . $UTF8_BAD . '/S',
            function ($matches) use ($replace) {
                // group 2 only takes part in the match for an invalid byte
                if (isset($matches[2]) && $matches[2] !== '') {
                    return $replace;
                }
                return $matches[0];
            },
            $str
        );

        // preg_replace_callback() yields null on a PCRE error; keep the
        // documented string return type in that case
        return ($result === null) ? '' : $result;
    }

    /**
     * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
     *
     * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
     * letters. Default is to deaccent both cases ($case = 0)
     *
     * @author Andreas Gohr <[email protected]>
     *
     * @param string $string
     * @param int $case
     * @return string
     */
    public static function deaccent($string, $case = 0)
    {
        if ($case <= 0) {
            $string = strtr($string, Table::lowerAccents());
        }
        if ($case >= 0) {
            $string = strtr($string, Table::upperAccents());
        }
        return $string;
    }

    /**
     * Romanize a non-latin string
     *
     * @author Andreas Gohr <[email protected]>
     *
     * @param string $string
     * @return string
     */
    public static function romanize($string)
    {
        if (self::isASCII($string)) {
            return $string; // nothing to do
        }

        return strtr($string, Table::romanization());
    }

    /**
     * adjust a byte index into a utf8 string to a utf8 character boundary
     *
     * An index that points at a continuation byte (10bbbbbb) is moved until it
     * no longer does. Indexes <= 0 yield 0, indexes at or beyond the end of the
     * string yield the string length.
     *
     * @author       chris smith <[email protected]>
     *
     * @param string $str utf8 character string
     * @param int $i byte index into $str
     * @param bool $next direction to search for the boundary: false moves towards the
     *                   start of the string (beginning of the current character), true
     *                   moves towards the end (beginning of the next character)
     * @return int byte index into $str now pointing to a utf8 character boundary
     */
    public static function correctIdx($str, $i, $next = false)
    {
        if ($i <= 0) {
            return 0;
        }

        $limit = strlen($str);
        if ($i >= $limit) {
            return $limit;
        }

        if ($next) {
            while (($i < $limit) && ((ord($str[$i]) & 0xC0) === 0x80)) {
                $i++;
            }
        } else {
            while ($i && ((ord($str[$i]) & 0xC0) === 0x80)) {
                $i--;
            }
        }

        return $i;
    }
}
205