1 | <?php |
||
2 | |||
/*
 * This file is part of Utf8.
 *     (c) Fabrice de Stefanis / https://github.com/fab2s/Utf8
 * This source file is licensed under the MIT license which you will
 * find in the LICENSE file or at https://opensource.org/licenses/MIT
 */
||
9 | |||
10 | namespace fab2s\Utf8; |
||
11 | |||
/**
 * UTF8 string manipulations
 *
 * Static helpers that pin the encoding to UTF-8 for the mbstring functions
 * they wrap, plus a few UTF-8 detection / clean-up utilities based on PCRE
 * and optional normalization through the intl \Normalizer class.
 */
class Utf8
{
    /**
     * utf8 charset name in mb dialect
     */
    const ENC_UTF8 = 'UTF-8';

    /**
     * Requests NFC from normalize().
     *
     * The value is the legacy numeric value of \Normalizer::NFC and is kept
     * as is for backward compatibility; normalize() translates it to whatever
     * \Normalizer::NFC is worth at runtime.
     */
    const NORMALIZE_NFC = 4;

    /**
     * Requests NFD from normalize().
     *
     * The value is the legacy numeric value of \Normalizer::NFD and is kept
     * as is for backward compatibility; normalize() translates it to whatever
     * \Normalizer::NFD is worth at runtime.
     */
    const NORMALIZE_NFD = 2;

    /**
     * Whether normalize() may rely on \Normalizer, see normalizerSupport()
     *
     * @var bool
     */
    protected static $normalizerSupport = false;

    /**
     * Whether mb_ord() / mb_chr() are available, see support()
     *
     * @var bool
     */
    protected static $ordSupport = false;

    /**
     * strrpos
     *
     * @param string   $haystack
     * @param string   $needle
     * @param int|null $offset   null is treated as 0
     *
     * @return int|false character position of the last occurrence, false when not found
     */
    public static function strrpos(string $haystack, string $needle, ?int $offset = 0)
    {
        // Emulate strrpos behaviour (no warning) on an empty haystack.
        // A strict comparison is required here: empty('0') is true, which
        // would wrongly discard the perfectly valid haystack '0'.
        if ($haystack === '') {
            return false;
        }

        // mb_strrpos() expects a real int offset, never forward null to it
        return mb_strrpos($haystack, $needle, $offset ?? 0, static::ENC_UTF8);
    }

    /**
     * strpos
     *
     * @param string   $haystack
     * @param string   $needle
     * @param int|null $offset   null is treated as 0
     *
     * @return int|false character position of the first occurrence, false when not found
     */
    public static function strpos(string $haystack, string $needle, $offset = 0)
    {
        return mb_strpos($haystack, $needle, $offset ?? 0, static::ENC_UTF8);
    }

    /**
     * strtolower
     *
     * @param string $string
     *
     * @return string
     */
    public static function strtolower(string $string): string
    {
        return mb_strtolower($string, static::ENC_UTF8);
    }

    /**
     * strtoupper
     *
     * @param string $string
     *
     * @return string
     */
    public static function strtoupper(string $string): string
    {
        return mb_strtoupper($string, static::ENC_UTF8);
    }

    /**
     * substr
     *
     * @param string   $string
     * @param int      $offset
     * @param int|null $length null means "up to the end of the string"
     *
     * @return string
     */
    public static function substr(string $string, int $offset, ?int $length = null): string
    {
        // an explicit length is always handed to mb_substr(): when none is
        // requested, the full character count is used instead of null
        $maxChars = $length ?? mb_strlen($string, static::ENC_UTF8);

        return mb_substr($string, $offset, $maxChars, static::ENC_UTF8);
    }

    /**
     * strlen
     *
     * @param string $string
     *
     * @return int number of characters (not bytes)
     */
    public static function strlen(string $string): int
    {
        return mb_strlen($string, static::ENC_UTF8);
    }

    /**
     * ucfirst
     *
     * @param string $string
     *
     * @return string
     */
    public static function ucfirst(string $string): string
    {
        $charCount = static::strlen($string);
        if ($charCount === 0) {
            return '';
        }

        if ($charCount === 1) {
            return static::strtoupper($string);
        }

        $head = static::substr($string, 0, 1);
        $tail = static::substr($string, 1);

        return static::strtoupper($head) . $tail;
    }

    /**
     * ucwords
     *
     * @param string $string
     *
     * @return string
     */
    public static function ucwords(string $string): string
    {
        return mb_convert_case($string, MB_CASE_TITLE, static::ENC_UTF8);
    }

    /**
     * ord
     *
     * Uses mb_ord() when available, ordCompat() otherwise.
     *
     * @param string $chr
     *
     * @return int|false false on empty input
     */
    public static function ord(string $chr)
    {
        $byteLen = strlen($chr);
        if ($byteLen === 0) {
            return false;
        }

        if (static::$ordSupport) {
            return mb_ord($chr, static::ENC_UTF8);
        }

        return static::ordCompat($chr, $byteLen);
    }

    /**
     * chr
     *
     * Uses mb_chr() when available, an HTML numeric entity round trip otherwise.
     *
     * @param int $num
     *
     * @return string|false
     */
    public static function chr(int $num)
    {
        if ($num === 0) {
            return "\0";
        }

        if (static::$ordSupport) {
            return mb_chr($num, static::ENC_UTF8);
        }

        // prolly the fastest
        $entity  = '&#' . $num . ';';
        $decoded = mb_convert_encoding($entity, static::ENC_UTF8, 'HTML-ENTITIES');

        // an entity that comes back untouched could not be converted
        return $decoded !== $entity ? $decoded : false;
    }

    /**
     * normalize an utf8 string to canonical form
     * Default to NFC
     *
     * The string is returned untouched when normalizer support is off.
     *
     * Pass self::NORMALIZE_NFC or self::NORMALIZE_NFD to select the form:
     * both are translated to the matching \Normalizer constant at runtime.
     * Any other value is forwarded to \Normalizer as is. Because of the
     * translation, the raw integers 4 and 2 always mean NFC and NFD here,
     * whatever they mean for the installed \Normalizer.
     *
     * @see https://stackoverflow.com/a/7934397/7630496
     *
     * @param string $string
     * @param int    $canonicalForm
     *
     * @return string
     */
    public static function normalize(string $string, int $canonicalForm = self::NORMALIZE_NFC): string
    {
        if (!static::$normalizerSupport) {
            return $string;
        }

        $normalized = \Normalizer::normalize($string, static::normalizerForm($canonicalForm));

        // \Normalizer::normalize() does not return a string on failure; this
        // used to be silently coerced to an empty string by the return type,
        // the same outcome is now explicit
        return is_string($normalized) ? $normalized : '';
    }

    /**
     * tels if a string contains utf8 chars (which may not be valid)
     *
     * @param string $string
     *
     * @return bool true when at least one well formed multi byte sequence is found
     */
    public static function hasUtf8(string $string): bool
    {
        // From http://w3.org/International/questions/qa-forms-utf-8.html
        // non-overlong 2-byte|excluding overlong|straight 3-byte|excluding surrogates|planes 1-3|planes 4-15|plane 16
        return (bool) preg_match('%(?:[\xC2-\xDF][\x80-\xBF]|\xE0[\xA0-\xBF][\x80-\xBF]|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}|\xED[\x80-\x9F][\x80-\xBF] |\xF0[\x90-\xBF][\x80-\xBF]{2}|[\xF1-\xF3][\x80-\xBF]{3}|\xF4[\x80-\x8F][\x80-\xBF]{2})+%xs', $string);
    }

    /**
     * tels if a string is valid utf8
     *
     * @param string $string
     *
     * @return bool
     */
    public static function isUtf8(string $string): bool
    {
        // with the u modifier, preg_match() fails on malformed UTF-8 subjects
        return (bool) preg_match('//u', $string);
    }

    /**
     * Remove any 4byte multi bit chars, useful to make sure we can insert in utf8-nonMb4 db tables
     *
     * @param string $string
     * @param string $replace
     *
     * @return string
     */
    public static function replaceMb4(string $string, string $replace = ''): string
    {
        return preg_replace('%(?:
            \xF0[\x90-\xBF][\x80-\xBF]{2}      # planes 1-3
            | [\xF1-\xF3][\x80-\xBF]{3}        # planes 4-15
            | \xF4[\x80-\x8F][\x80-\xBF]{2}    # plane 16
        )%xs', $replace, $string);
    }

    /**
     * Detects (or force disables) normalizer support
     *
     * @param bool $disable true to turn normalizer support off regardless of availability
     *
     * @return bool the resulting support state
     */
    public static function normalizerSupport(bool $disable = false): bool
    {
        static::$normalizerSupport = $disable ? false : function_exists('normalizer_normalize');

        return static::$normalizerSupport;
    }

    /**
     * Performs the few compatibility operations
     */
    public static function support()
    {
        static::normalizerSupport();
        static::$ordSupport = function_exists('mb_ord');
    }

    /**
     * Pure PHP code point decoder, used by ord() when mb_ord() is missing
     *
     * The sequence length is derived from the lead byte rather than from the
     * raw byte length, so only the first character of $chr is decoded and
     * trailing bytes are ignored.
     *
     * @param string $chr
     * @param int    $strLen byte length of $chr
     *
     * @return int|false false on empty input, invalid lead byte, truncated
     *                   sequence or invalid continuation byte
     */
    public static function ordCompat(string $chr, int $strLen)
    {
        // never read past the bytes actually present, whatever was announced
        $available = min($strLen, strlen($chr));
        if ($available < 1) {
            return false;
        }

        $lead = ord($chr[0]);
        if ($lead < 0x80) {
            // single byte (ASCII)
            return $lead;
        }

        if (($lead & 0xE0) === 0xC0) {
            $seqLen    = 2;
            $codePoint = $lead & 0x1F;
        } elseif (($lead & 0xF0) === 0xE0) {
            $seqLen    = 3;
            $codePoint = $lead & 0x0F;
        } elseif (($lead & 0xF8) === 0xF0) {
            $seqLen    = 4;
            $codePoint = $lead & 0x07;
        } else {
            // stray continuation byte or out of range lead byte
            return false;
        }

        if ($available < $seqLen) {
            return false;
        }

        for ($i = 1; $i < $seqLen; ++$i) {
            $byte = ord($chr[$i]);
            if (($byte & 0xC0) !== 0x80) {
                return false;
            }

            // each continuation byte contributes its 6 low bits
            $codePoint = ($codePoint << 6) | ($byte & 0x3F);
        }

        return $codePoint;
    }

    /**
     * Translates this class normalization constants to the \Normalizer ones
     *
     * The numeric values behind \Normalizer::NFC / \Normalizer::NFD differ
     * between the legacy intl API (4 / 2) and the current one, so they are
     * looked up at runtime instead of being trusted from the class constants.
     * Must only be called when \Normalizer is available.
     *
     * @param int $canonicalForm
     *
     * @return int
     */
    protected static function normalizerForm(int $canonicalForm): int
    {
        switch ($canonicalForm) {
            case self::NORMALIZE_NFC:
                return \Normalizer::NFC;
            case self::NORMALIZE_NFD:
                return \Normalizer::NFD;
            default:
                return $canonicalForm;
        }
    }
}
||
301 | |||
// OMG a dynamic static anti pattern ^^
// Runs once when this file is loaded so the static capability flags
// (normalizer and mb_ord support) are set before any helper is used.
Utf8::support();
||
304 |