|
1
|
|
|
<?php |
|
2
|
|
|
|
|
3
|
|
|
namespace dokuwiki\Utf8; |
|
4
|
|
|
|
|
5
|
|
|
/**
 * Methods to assess and clean UTF-8 strings
 *
 * All methods are static and operate on PHP byte strings.
 */
class Clean
{
    /**
     * Checks if a string contains 7bit ASCII only
     *
     * @author Andreas Haerter <[email protected]>
     *
     * @param string $str
     * @return bool true if no byte outside 0x00-0x7F was found
     */
    public static function isASCII($str)
    {
        // no "u" modifier: the subject is inspected byte by byte
        return preg_match('/[^\x00-\x7F]/', $str) !== 1;
    }

    /**
     * Tries to detect if a string is in Unicode encoding
     *
     * This is a structural check only: every lead byte has to be followed by
     * the number of continuation bytes (10bbbbbb) it announces. Overlong
     * encodings, surrogates and the historic 5 and 6 byte forms are accepted.
     *
     * @author <[email protected]>
     * @link   http://php.net/manual/en/function.utf8-encode.php
     *
     * @param string $str
     * @return bool
     */
    public static function isUtf8($str)
    {
        $len = strlen($str);
        for ($i = 0; $i < $len; $i++) {
            $b = ord($str[$i]);

            if ($b < 0x80) {
                continue; // 0bbbbbbb
            }

            if (($b & 0xE0) === 0xC0) {
                $n = 1; // 110bbbbb
            } elseif (($b & 0xF0) === 0xE0) {
                $n = 2; // 1110bbbb
            } elseif (($b & 0xF8) === 0xF0) {
                $n = 3; // 11110bbb
            } elseif (($b & 0xFC) === 0xF8) {
                $n = 4; // 111110bb
            } elseif (($b & 0xFE) === 0xFC) {
                $n = 5; // 1111110b
            } else {
                return false; // Does not match any model
            }

            // n bytes matching 10bbbbbb have to follow
            for ($j = 0; $j < $n; $j++) {
                if ((++$i === $len) || ((ord($str[$i]) & 0xC0) !== 0x80)) {
                    return false;
                }
            }
        }
        return true;
    }

    /**
     * Strips all high byte chars
     *
     * Returns a pure ASCII7 string
     *
     * @author Andreas Gohr <[email protected]>
     *
     * @param string $str
     * @return string
     */
    public static function strip($str)
    {
        $ascii = '';
        $len = strlen($str);
        for ($i = 0; $i < $len; $i++) {
            // square brackets: the curly brace offset syntax is deprecated
            // since PHP 7.4 and a parse error since PHP 8.0
            if (ord($str[$i]) < 128) {
                $ascii .= $str[$i];
            }
        }
        return $ascii;
    }

    /**
     * Removes special characters (nonalphanumeric) from a UTF-8 string
     *
     * In addition to the characters returned by Table::specialChars(), the
     * range 0x00 to 0x19 is stripped. Note that the range, as coded, ends at
     * 0x19, so 0x1A to 0x1F are not covered by it.
     *
     * @author Andreas Gohr <[email protected]>
     *
     * @param string $string The UTF8 string to strip of special chars
     * @param string $repl Replace special with this string
     * @param string $additional Additional chars to strip (used in regexp char class,
     *                           inserted as is - the caller has to escape it)
     * @return string
     */
    public static function stripspecials($string, $repl = '', $additional = '')
    {
        // the quoted special char list is built once and reused on later calls
        static $specials = null;
        if ($specials === null) {
            $specials = preg_quote(Table::specialChars(), '/');
        }

        return preg_replace('/[' . $additional . '\x00-\x19' . $specials . ']/u', $repl, $string);
    }

    /**
     * Replace bad bytes with an alternative character
     *
     * ASCII character is recommended for replacement char
     *
     * PCRE Pattern to locate bad bytes in a UTF-8 string
     * Comes from W3 FAQ: Multilingual Forms
     * Note: modified to include full ASCII range including control chars
     *
     * @author Harry Fuecks <[email protected]>
     * @see http://www.w3.org/International/questions/qa-forms-utf-8
     *
     * @param string $str to search
     * @param string $replace to replace bad bytes with (defaults to '', i.e. bad
     *                        bytes are removed) - use ASCII
     * @return string
     */
    public static function replaceBadBytes($str, $replace = '')
    {
        $UTF8_BAD =
            '([\x00-\x7F]' .                          // ASCII (including control chars)
            '|[\xC2-\xDF][\x80-\xBF]' .               // non-overlong 2-byte
            '|\xE0[\xA0-\xBF][\x80-\xBF]' .           // excluding overlongs
            '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}' .    // straight 3-byte
            '|\xED[\x80-\x9F][\x80-\xBF]' .           // excluding surrogates
            '|\xF0[\x90-\xBF][\x80-\xBF]{2}' .        // planes 1-3
            '|[\xF1-\xF3][\x80-\xBF]{3}' .            // planes 4-15
            '|\xF4[\x80-\x8F][\x80-\xBF]{2}' .        // plane 16
            '|(.{1}))';                               // invalid byte

        // Walk the string in a single pass: each match is either one valid
        // sequence (kept as is) or one invalid byte captured in group 2
        // (replaced). This avoids re-copying the remaining string for every
        // character and does not touch the output buffer.
        $cleaned = preg_replace_callback(
            '/' . $UTF8_BAD . '/S',
            function ($matches) use ($replace) {
                // group 2 only takes part in the match for an invalid byte
                if (isset($matches[2]) && $matches[2] !== '') {
                    return $replace;
                }
                return $matches[0];
            },
            $str
        );

        // preg_replace_callback() yields null on a PCRE error; keep the
        // documented string return type in that case
        if ($cleaned === null) {
            return '';
        }
        return $cleaned;
    }

    /**
     * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
     *
     * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
     * letters. Default is to deaccent both cases ($case = 0)
     *
     * @author Andreas Gohr <[email protected]>
     *
     * @param string $string
     * @param int $case
     * @return string
     */
    public static function deaccent($string, $case = 0)
    {
        // $case <= 0 covers "lower only" (-1) and "both" (0)
        if ($case <= 0) {
            $string = strtr($string, Table::lowerAccents());
        }
        // $case >= 0 covers "upper only" (1) and "both" (0)
        if ($case >= 0) {
            $string = strtr($string, Table::upperAccents());
        }
        return $string;
    }

    /**
     * Romanize a non-latin string
     *
     * @author Andreas Gohr <[email protected]>
     *
     * @param string $string
     * @return string
     */
    public static function romanize($string)
    {
        if (self::isASCII($string)) {
            return $string; // nothing to do
        }

        return strtr($string, Table::romanization());
    }

    /**
     * adjust a byte index into a utf8 string to a utf8 character boundary
     *
     * @author chris smith <[email protected]>
     *
     * @param string $str utf8 character string
     * @param int $i byte index into $str
     * @param bool $next direction to search for boundary,
     *                   false = move to lower indexes (start of the current character)
     *                   true = move to higher indexes (start of the next character)
     * @return int byte index into $str now pointing to a utf8 character boundary
     */
    public static function correctIdx($str, $i, $next = false)
    {
        if ($i <= 0) {
            return 0;
        }

        $limit = strlen($str);
        if ($i >= $limit) {
            return $limit;
        }

        // bytes of the form 10bbbbbb are continuation bytes, skip over them
        if ($next) {
            while (($i < $limit) && ((ord($str[$i]) & 0xC0) === 0x80)) {
                $i++;
            }
        } else {
            while ($i && ((ord($str[$i]) & 0xC0) === 0x80)) {
                $i--;
            }
        }

        return $i;
    }
}
|
205
|
|
|
|