Completed
Push — master ( 1a3b2f...b50fb4 )
by cam
09:59
created

charsets.php ➔ unicode_to_utf_8()   B

Complexity

Conditions 8
Paths 4

Size

Total Lines 28

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 8
nc 4
nop 1
dl 0
loc 28
rs 8.4444
c 0
b 0
f 0
1
<?php
2
3
/***************************************************************************\
4
 *  SPIP, Systeme de publication pour l'internet                           *
5
 *                                                                         *
6
 *  Copyright (c) 2001-2018                                                *
7
 *  Arnaud Martin, Antoine Pitrou, Philippe Riviere, Emmanuel Saint-James  *
8
 *                                                                         *
9
 *  Ce programme est un logiciel libre distribue sous licence GNU/GPL.     *
10
 *  Pour plus de details voir le fichier COPYING.txt ou l'aide en ligne.   *
11
\***************************************************************************/
12
13
/**
14
 * Gestion des charsets et des conversions
15
 *
16
 * Ce fichier contient les fonctions relatives à la gestion de charsets,
17
 * à la conversion de textes dans différents charsets et
18
 * propose des fonctions émulant la librairie mb si elle est absente
19
 *
20
 * @package SPIP\Core\Texte\Charsets
21
 **/
22
23
// Security guard: stop executing this file when _ECRIRE_INC_VERSION is not defined
24
if (!defined('_ECRIRE_INC_VERSION')) {
25
	return;
26
}
27
28
// Load inc/config to make reading the charset easier
29
include_spip('inc/config');
30
31
/**
 * Resolve a charset name and look up its conversion table.
 *
 * The name is lower-cased and trimmed; 'AUTO' is replaced by the value of
 * $GLOBALS['meta']['charset']. A few aliases are mapped to another name
 * (empty string to iso-8859-1, windows-125x to cp125x) before the table is
 * searched as "<charset>.php" under charsets/ through find_in_path().
 *
 * @param string $charset
 *     Charset to load. Defaults to 'AUTO' (charset of the site).
 * @return string|false
 *     - Normalised charset name when it is already registered in
 *       $GLOBALS['CHARSET'], when it is utf-8, or when find_in_path() succeeds
 *     - false when find_in_path() fails: the failure is logged and an empty
 *       table is registered under that name
 **/
function load_charset($charset = 'AUTO') {
	// Aliases and the name they are looked up under
	$synonymes = array(
		'' => 'iso-8859-1',
		'windows-1250' => 'cp1250',
		'windows-1251' => 'cp1251',
		'windows-1256' => 'cp1256',
	);

	if ($charset == 'AUTO') {
		$charset = $GLOBALS['meta']['charset'];
	}
	$charset = trim(strtolower($charset));

	// Already registered: nothing more to do
	if (isset($GLOBALS['CHARSET'][$charset])) {
		return $charset;
	}

	// utf-8 is registered with an empty table
	if ($charset == 'utf-8') {
		$GLOBALS['CHARSET'][$charset] = array();

		return $charset;
	}

	if (isset($synonymes[$charset])) {
		$charset = $synonymes[$charset];
	}

	if (!find_in_path($charset . '.php', 'charsets/', true)) {
		spip_log("Erreur: pas de fichier de conversion 'charsets/$charset'");
		// Register an empty table so the isset() test above succeeds next time
		$GLOBALS['CHARSET'][$charset] = array();

		return false;
	}

	return $charset;
}
85
86
87
/**
 * Check that the mbstring functions used by this file are available.
 *
 * The outcome is memoised in a static variable. On success the mbstring
 * internal encoding is set to utf-8.
 *
 * @return bool
 *     true when every required mb_* function exists and mb_detect_order()
 *     accepts the configured charset
 **/
function init_mb_string() {
	static $etat;

	if (!$etat) {
		$requises = array(
			'mb_internal_encoding',
			'mb_detect_order',
			'mb_substr',
			'mb_strlen',
			'mb_strtolower',
			'mb_strtoupper',
			'mb_encode_mimeheader',
			'mb_encode_numericentity',
			'mb_decode_numericentity',
		);

		$disponible = true;
		foreach ($requises as $fonction) {
			if (!function_exists($fonction)) {
				$disponible = false;
				break;
			}
		}

		// The configured charset must also be known to mbstring
		if ($disponible && mb_detect_order(lire_config('charset', _DEFAULT_CHARSET))) {
			mb_internal_encoding('utf-8');
			$etat = 1;
		} else {
			$etat = -1;
		}
	}

	return ($etat == 1);
}
119
120
/**
 * Check that iconv works correctly.
 *
 * Some versions truncate the string when a character does not belong to
 * the charset. The check converts a sample string to utf-32 and back through
 * utf_32_to_unicode(); the outcome is memoised in a static variable.
 *
 * @link http://php.net/manual/fr/function.iconv.php
 *
 * @return bool
 *     true when iconv exists and the round trip returns the sample unchanged
 **/
function test_iconv() {
	static $etat;

	if (!$etat) {
		$correct = function_exists('iconv')
			&& utf_32_to_unicode(@iconv('utf-8', 'utf-32', 'chaine de test')) == 'chaine de test';
		$etat = $correct ? 1 : -1;
	}

	return ($etat == 1);
}
148
149
150
/**
 * Check that PCRE handles UTF-8 subjects (the "u" modifier).
 *
 * Workaround for a Debian Woody bug. The outcome is memoised in a static
 * variable.
 *
 * @return bool
 *     true when a three-character UTF-8 word between two spaces is matched
 *     by \W...\W in "u" mode
 **/
function test_pcre_unicode() {
	static $etat = 0;

	if (!$etat) {
		// space, e-acute, "t", e-acute, space; the accented letters are two bytes each
		$echantillon = " \xC3\xA9t\xC3\xA9 ";
		$etat = preg_match(',\W...\W,u', $echantillon) ? 1 : -1;
	}

	return $etat == 1;
}
172
173
/**
 * Return a range of unicode alphanumeric characters (incomplete...).
 *
 * The value is meant to be placed between square brackets in a regular
 * expression: [$plage]. It is computed once and kept in a static variable.
 *
 * @internal
 *    Not used any more.
 *    Was used by inc/ortho, which has been retired.
 * @return string
 *    Character range: '\w', extended with \x{...} ranges when
 *    test_pcre_unicode() succeeds
 **/
function pcre_lettres_unicode() {
	static $plage;

	if ($plage) {
		return $plage;
	}

	// Cheap fallback, also the first part of the full range (iso-latin)
	$plage = '\w';

	if (test_pcre_unicode()) {
		// cf. http://www.unicode.org/charts/
		$plage .= '\x{100}-\x{24f}'; // extended european
		$plage .= '\x{300}-\x{1cff}'; // lots of other things
	}

	return $plage;
}
203
204
205
/**
 * Return a pattern for the unicode punctuation characters 0x2000 to 0x206F.
 *
 * The pattern is written on UTF-8 bytes: the lead byte E2 followed by
 * either 80 and a byte in 80-BF, or 81 and a byte in 80-AF.
 *
 * @internal
 *    Not used any more.
 *    Was used by inc/ortho, which has been retired.
 * @return string
 *    Regular expression fragment
 **/
function plage_punct_unicode() {
	$octet_initial = '\xE2';
	$suites = array(
		'\x80[\x80-\xBF]',
		'\x81[\x80-\xAF]',
	);

	return $octet_initial . '(' . implode('|', $suites) . ')';
}
221
222
/**
 * Fix the non-conforming characters 128-159.
 *
 * Cf. charsets/iso-8859-1.php (copied here to go faster).
 * A target charset can be given to speed up iso-8859-1 -> other charset.
 *
 * Only texts in utf-8 or iso-8859-1 are modified; any other source charset
 * returns the text untouched. Translation tables are cached per
 * (source charset, target charset) pair in a static variable.
 *
 * @param string|array $texte
 *     Text to fix; arrays are processed element by element with the same
 *     $charset and $charset_cible
 * @param string $charset
 *     Charset of the text.
 *     Defaults to 'AUTO': the charset read with lire_config()
 * @param string $charset_cible
 *     Target charset ('unicode' by default, i.e. numeric entities);
 *     any other value sends each entity through unicode2charset()
 * @return string|array
 *     Fixed text
 **/
function corriger_caracteres_windows($texte, $charset = 'AUTO', $charset_cible = 'unicode') {
	static $trans;

	if (is_array($texte)) {
		// Recurse explicitly so that both charsets are propagated
		// (array_map() on the bare function name fell back to the defaults)
		$corrige = array();
		foreach ($texte as $cle => $valeur) {
			$corrige[$cle] = corriger_caracteres_windows($valeur, $charset, $charset_cible);
		}

		return $corrige;
	}

	if ($charset == 'AUTO') {
		$charset = lire_config('charset', _DEFAULT_CHARSET);
	}

	if ($charset == 'utf-8') {
		// In utf-8 the codes 128-159 appear as byte 194 followed by the code
		$p = chr(194);
		// Strict test: a match at offset 0 must not be read as "not found"
		if (strpos($texte, $p) === false) {
			return $texte;
		}
	} elseif ($charset == 'iso-8859-1') {
		$p = '';
	} else {
		return $texte;
	}

	if (!isset($trans[$charset][$charset_cible])) {
		// code => replacement entity (' ' for the unassigned codes)
		$entites = array(
			128 => "&#8364;",
			129 => ' ', # unassigned
			130 => "&#8218;",
			131 => "&#402;",
			132 => "&#8222;",
			133 => "&#8230;",
			134 => "&#8224;",
			135 => "&#8225;",
			136 => "&#710;",
			137 => "&#8240;",
			138 => "&#352;",
			139 => "&#8249;",
			140 => "&#338;",
			141 => ' ', # unassigned
			142 => "&#381;",
			143 => ' ', # unassigned
			144 => ' ', # unassigned
			145 => "&#8216;",
			146 => "&#8217;",
			147 => "&#8220;",
			148 => "&#8221;",
			149 => "&#8226;",
			150 => "&#8211;",
			151 => "&#8212;",
			152 => "&#732;",
			153 => "&#8482;",
			154 => "&#353;",
			155 => "&#8250;",
			156 => "&#339;",
			157 => ' ', # unassigned
			158 => "&#382;",
			159 => "&#376;",
		);

		$table = array();
		foreach ($entites as $code => $entite) {
			$table[$p . chr($code)] = ($charset_cible != 'unicode')
				? unicode2charset($entite, $charset_cible)
				: $entite;
		}
		$trans[$charset][$charset_cible] = $table;
	}

	// strtr() replaces in a single pass: bytes written by one replacement are
	// never matched again by a later key (which sequential str_replace() did
	// when the replacement itself contained bytes in the 128-159 range)
	return strtr((string) $texte, $trans[$charset][$charset_cible]);
}
306
307
308
/**
 * Convert named HTML entities using the 'html' table.
 *
 * Each "&name;" whose name is a key of $GLOBALS['CHARSET']['html'] is
 * replaced by the value stored for it. The translation table is built on
 * first use and kept in a static variable.
 *
 * @param string $texte
 *     Text to convert
 * @param bool $secure
 *     true to leave &amp; &quot; &lt; &gt; as they are; false (default) to
 *     also turn them into the literal characters & " < >
 * @return string
 *     Converted text
 **/
function html2unicode($texte, $secure = false) {
	static $trans = array();

	// No ampersand, no entity
	if (strpos($texte, '&') === false) {
		return $texte;
	}

	if (!$trans) {
		load_charset('html');
		foreach ($GLOBALS['CHARSET']['html'] as $key => $val) {
			$trans["&$key;"] = $val;
		}
	}

	$converti = str_replace(array_keys($trans), array_values($trans), $texte);
	if ($secure) {
		return $converti;
	}

	return str_replace(array('&amp;', '&quot;', '&lt;', '&gt;'), array('&', '"', '<', '>'), $converti);
}
340
341
342
/**
 * Convert mathematical (MathML) entities using the 'mathml' table.
 *
 * Each "&name;" whose name is a key of $GLOBALS['CHARSET']['mathml'] is
 * replaced by the value stored for it (e.g. &angle; becomes &#x2220;).
 * The translation table is built on first use and kept in a static variable.
 *
 * @param string $texte
 *     Text to convert
 * @return string
 *     Converted text
 **/
function mathml2unicode($texte) {
	// Initialised to an array so that an empty or missing table never sends
	// null to array_keys()/array_values() below
	static $trans = array();

	// Every key of the table starts with '&': nothing to do without one
	if (strpos($texte, '&') === false) {
		return $texte;
	}

	if (!$trans) {
		load_charset('mathml');

		if (isset($GLOBALS['CHARSET']['mathml']) and is_array($GLOBALS['CHARSET']['mathml'])) {
			foreach ($GLOBALS['CHARSET']['mathml'] as $key => $val) {
				$trans["&$key;"] = $val;
			}
		}
	}

	return str_replace(array_keys($trans), array_values($trans), $texte);
}
364
365
366
/**
 * Convert a text into unicode numeric entities (&#129;).
 *
 * Strategies are tried in this order: mbstring (when init_mb_string()
 * succeeds), then a table obtained through load_charset(), then iconv
 * (when test_iconv() succeeds). When none applies the failure is logged
 * and the text is returned unchanged.
 *
 * @internal
 *     Note: the $forcer argument is obsolete: it was meant to leave
 *     iso-8859-1 accents unconverted
 *
 * @param string $texte
 *     Text to convert
 * @param string $charset
 *     Current charset of the text.
 *     Defaults to 'AUTO': the charset read with lire_config()
 * @return string
 *     Text converted to unicode entities
 **/
function charset2unicode($texte, $charset = 'AUTO' /* $forcer: obsolete*/) {
	static $trans;

	if ($charset == 'AUTO') {
		$charset = lire_config('charset', _DEFAULT_CHARSET);
	}

	if ($charset == '') {
		$charset = 'iso-8859-1';
	}
	$charset = strtolower($charset);

	switch ($charset) {
		case 'utf-8':
		case 'utf8':
			return utf_8_to_unicode($texte);

		case 'iso-8859-1':
			$texte = corriger_caracteres_windows($texte, 'iso-8859-1');
		// no break here: carry on with the default case

		default:
			// Is mbstring available?
			if (init_mb_string()) {
				$order = mb_detect_order();
				$s = false;
				// Does mbstring know $charset?
				if ($order and mb_detect_order($charset)) {
					$s = mb_convert_encoding($texte, 'utf-8', $charset);
				}
				// Put the detection order back on every path,
				// including the successful return below
				if ($order) {
					mb_detect_order($order);
				}
				if ($s && $s != $texte) {
					return utf_8_to_unicode($s);
				}
			}

			// Otherwise, maybe we have a table for this charset
			if (!isset($trans[$charset])) {
				// Always leave an array behind so the test below is safe
				// even when no table could be loaded
				$trans[$charset] = array();
				if ($cset = load_charset($charset)
					and is_array($GLOBALS['CHARSET'][$cset])
				) {
					foreach ($GLOBALS['CHARSET'][$cset] as $key => $val) {
						$trans[$charset][chr($key)] = '&#' . $val . ';';
					}
				}
			}
			if (count($trans[$charset])) {
				return str_replace(array_keys($trans[$charset]), array_values($trans[$charset]), $texte);
			}

			// Otherwise ask iconv (even though it truncates when a character
			// does not belong to the charset; that is mostly a problem in
			// utf-8, which is handled above)
			if (test_iconv()) {
				$s = iconv($charset, 'utf-32le', $texte);
				if ($s) {
					return utf_32_to_unicode($s);
				}
			}

			// Last resort: do nothing
			spip_log("erreur charset '$charset' non supporte");

			return $texte;
	}
}
448
449
450
/**
 * Convert unicode numeric entities (&#129;) into the given charset.
 *
 * Entities below &#128; are left alone: if they were encoded that way,
 * it was on purpose.
 *
 * For utf-8 the work is delegated to unicode_to_utf_8(). For any other
 * charset a replacement table is built once (and cached in a static
 * variable) for the entities 128 to 254, in decimal and hexadecimal form,
 * each with zero, one or two leading zeros.
 *
 * @param string $texte
 *     Unicode text to convert
 * @param string $charset
 *     Charset to apply.
 *     Defaults to 'AUTO': the charset read with lire_config()
 * @return string
 *     Text converted into the requested charset
 **/
function unicode2charset($texte, $charset = 'AUTO') {
	static $trans = array();

	if ($charset == 'AUTO') {
		$charset = lire_config('charset', _DEFAULT_CHARSET);
	}

	if ($charset == 'utf-8') {
		return unicode_to_utf_8($texte);
	}

	$charset = load_charset($charset);
	// load_charset() returns false when it has no table: use a fixed cache key
	$cle = ($charset === false) ? '' : $charset;

	if (!isset($trans[$cle])) {
		// Reverse table: unicode number => code in the charset.
		// An unknown charset gets an empty reverse table instead of
		// passing null to array_flip()
		$inverse = array();
		if ($charset !== false
			and isset($GLOBALS['CHARSET'][$charset])
			and is_array($GLOBALS['CHARSET'][$charset])
		) {
			$inverse = array_flip($GLOBALS['CHARSET'][$charset]);
		}

		$t = array();
		// NOTE(review): the upper bound excludes 255, so &#255; is never
		// converted; kept as in the original, confirm whether intended
		for ($e = 128; $e < 255; $e++) {
			$h = dechex($e);
			// Code from the reverse table when there is one, else the number itself
			$octet = chr(isset($inverse[$e]) ? $inverse[$e] : $e);
			$t['&#' . $e . ';'] = $t['&#0' . $e . ';'] = $t['&#00' . $e . ';'] = $octet;
			$t['&#x' . $h . ';'] = $t['&#x0' . $h . ';'] = $t['&#x00' . $h . ';'] = $octet;
		}
		$trans[$cle] = $t;
	}

	return str_replace(array_keys($trans[$cle]), array_values($trans[$cle]), $texte);
}
504
505
506
/**
 * Import a text from an external charset into the charset of the site.
 *
 * Unresolved characters are left as `&#123;` entities.
 *
 * The pair iso-8859-1 -> utf-8 is handled directly (mbstring when
 * init_mb_string() succeeds, otherwise a table obtained through
 * load_charset()); every other case goes through
 * unicode2charset(charset2unicode()).
 *
 * @param string $texte
 *     Text to import
 * @param string $charset
 *     Charset the text comes from.
 *     Defaults to 'AUTO', which is passed on to charset2unicode()
 * @return string
 *     Text converted into the charset of the site
 **/
function importer_charset($texte, $charset = 'AUTO') {
	static $trans = array();

	// The most frequent case, iso-8859-1 to utf-8, is handled directly to go faster
	if (($charset == 'iso-8859-1') && ($GLOBALS['meta']['charset'] == 'utf-8')) {
		$texte = corriger_caracteres_windows($texte, 'iso-8859-1', $GLOBALS['meta']['charset']);

		if (init_mb_string()) {
			// NOTE(review): $texte has already had its bytes 128-159 rewritten
			// in the target charset by the call above; converting it again from
			// iso-8859-1 looks like it re-encodes those characters. Confirm.
			$order = mb_detect_order();
			$s = null;
			// Does mbstring know $charset?
			if ($order and mb_detect_order($charset)) {
				$s = mb_convert_encoding($texte, 'utf-8', $charset);
			}
			// Put the detection order back as it was
			if ($order) {
				mb_detect_order($order);
			}
			if (is_string($s)) {
				return $s;
			}
			// mbstring converted nothing: fall through to the table below
			// instead of returning an undefined value
		}

		// Otherwise, maybe we have a table for this charset
		if (!isset($trans[$charset])) {
			// Always leave an array behind so the test below is safe
			$trans[$charset] = array();
			if ($cset = load_charset($charset)
				and is_array($GLOBALS['CHARSET'][$cset])
			) {
				foreach ($GLOBALS['CHARSET'][$cset] as $key => $val) {
					$trans[$charset][chr($key)] = unicode2charset('&#' . $val . ';');
				}
			}
		}
		if (count($trans[$charset])) {
			return str_replace(array_keys($trans[$charset]), array_values($trans[$charset]), $texte);
		}

		return $texte;
	}

	return unicode2charset(charset2unicode($texte, $charset));
}
552
553
554
/**
 * Convert a UTF-8 text into unicode numeric entities.
 *
 * Uses mbstring when init_mb_string() succeeds. Otherwise the text is read
 * byte by byte: a byte of 192 or more opens a sequence of 2, 3 or 4 bytes
 * that is turned into one decimal entity; any other byte is copied as is.
 *
 * @param string $source
 *    UTF-8 text to convert
 * @return string
 *    Text converted to unicode entities
 **/
function utf_8_to_unicode($source) {

	// mbstring: fast method
	if (init_mb_string()) {
		return mb_encode_numericentity($source, array(0x7F, 0xFFFFFF, 0x0, 0xFFFFFF), 'UTF-8');
	}

	// Otherwise step by step (after php.net, by Ronen).
	// Value subtracted from the first byte, indexed by the length of the sequence
	$retrait = array(1 => 0, 2 => 192, 3 => 224, 4 => 240);

	$resultat = '';
	$total = strlen($source);
	$i = 0;
	while ($i < $total) {
		$octet = substr($source, $i, 1);
		$code = ord($octet);

		if ($code >= 240) {
			$attendu = 4;
		} elseif ($code >= 224) {
			$attendu = 3;
		} elseif ($code >= 192) {
			$attendu = 2;
		} else {
			// single byte: copied unchanged
			$resultat .= $octet;
			$i++;
			continue;
		}

		$sequence = substr($source, $i, $attendu);
		$i += $attendu;

		// The sequence may be shorter than expected at the end of the text;
		// the length actually read drives the computation
		$taille = strlen($sequence);
		$numero = 0;
		for ($k = 0; $k < $taille; $k++) {
			$valeur = ord(substr($sequence, $k, 1)) - ($k == 0 ? $retrait[$taille] : 128);
			// each following byte contributes 6 bits less than the previous one
			$numero += (intval($valeur) << (6 * ($taille - 1 - $k)));
		}

		$resultat .= "&#" . preg_replace('/^0+/', '', $numero) . ';';
	}

	return $resultat;
}
653
654
/**
 * Convert a UTF-32 text into unicode numeric entities.
 *
 * UTF-32 is only needed when going through iconv, i.e. when mbstring is
 * missing or does not know the charset. The conversion is nevertheless
 * done with mbstring when init_mb_string() succeeds; all of this becomes
 * obsolete once mbstring can be relied on.
 *
 * Both paths read the text as little-endian 32-bit values.
 *
 * @param string $source
 *    UTF-32 text to convert
 * @return string
 *    Text converted to unicode entities
 **/
function utf_32_to_unicode($source) {

	// mbstring: fast method
	if (init_mb_string()) {
		$entites = mb_encode_numericentity($source, array(0x7F, 0xFFFFFF, 0x0, 0xFFFFFF), 'UTF-32LE');

		// drop the NUL bytes
		return str_replace(chr(0), '', $entites);
	}

	// Otherwise the slow method, 1024 bytes at a time
	$resultat = '';
	while ($source) {
		$bloc = substr($source, 0, 1024);
		$source = substr($source, 1024);
		$points = unpack("V*", $bloc);
		foreach ($points as $point) {
			if ($point < 128) {
				$resultat .= chr($point);
				continue;
			}
			// skip the BOM - http://www.unicode.org/faq/utf_bom.html
			if ($point != 65279) {
				$resultat .= '&#' . $point . ';';
			}
		}
	}

	return $resultat;

}
698
699
700
/**
 * Convert a unicode number into the corresponding UTF-8 character.
 *
 * Based on a snippet from php.net.
 *
 * @author Ronen
 *
 * @param int $num
 *    Unicode number of the entity
 * @return string
 *    UTF-8 encoded character, or '' when $num is not a valid unicode
 *    character number (negative, in the surrogate range 0xD800-0xDFFF,
 *    or above 0x10FFFF)
 **/
function caractere_utf_8($num) {
	$num = intval($num);

	// These values have no valid UTF-8 encoding: encoding them anyway
	// would emit byte sequences that are not UTF-8
	if ($num < 0 or $num > 0x10FFFF or ($num >= 0xD800 and $num <= 0xDFFF)) {
		return '';
	}

	// 1 byte
	if ($num < 128) {
		return chr($num);
	}
	// 2 bytes
	if ($num < 2048) {
		return chr(($num >> 6) + 192) . chr(($num & 63) + 128);
	}
	// 3 bytes
	if ($num < 65536) {
		return chr(($num >> 12) + 224) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
	}

	// 4 bytes
	return chr(($num >> 18) + 240) . chr((($num >> 12) & 63) + 128) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
}
729
730
/**
 * Convert unicode numeric entities of a text into UTF-8 characters.
 *
 * Two kinds of entities are converted through caractere_utf_8():
 * decimal entities of at least three digits whose value is above 127,
 * and hexadecimal entities of at least three digits (i.e. above &#xFF;).
 * Everything else is left untouched.
 *
 * @param string $texte
 *     Text to convert
 * @return string
 *     Converted text
 **/
function unicode_to_utf_8($texte) {
	// entity as written in the text => UTF-8 replacement
	$remplacements = array();

	// 1. Entities &#128; and above
	if (preg_match_all(',&#0*([1-9][0-9][0-9]+);,S', $texte, $decimales, PREG_SET_ORDER)) {
		foreach ($decimales as $trouve) {
			list($entite, $numero) = $trouve;
			if ($numero > 127 and !isset($remplacements[$entite])) {
				$remplacements[$entite] = caractere_utf_8($numero);
			}
		}
	}

	// 2. Entities above &#xFF;
	if (preg_match_all(',&#x0*([1-9a-f][0-9a-f][0-9a-f]+);,iS', $texte, $hexadecimales, PREG_SET_ORDER)) {
		foreach ($hexadecimales as $trouve) {
			list($entite, $numero) = $trouve;
			if (!isset($remplacements[$entite])) {
				$remplacements[$entite] = caractere_utf_8(hexdec($numero));
			}
		}
	}

	// Both kinds are replaced in one go
	return str_replace(array_keys($remplacements), array_values($remplacements), $texte);

}
766
767
/**
 * Convert decimal unicode entities (&#264;) into javascript escapes (\u0108).
 *
 * Every decimal entity of the text is converted in a single pass, whatever
 * its zero padding. Numbers above 0xFFFF are written as a surrogate pair
 * (two \uXXXX escapes), since a javascript \u escape takes exactly four
 * hexadecimal digits. Numbers above 0x10FFFF are left as they are.
 *
 * @param string $texte
 *     Text to convert
 * @return string
 *     Converted text
 **/
function unicode_to_javascript($texte) {
	$converti = preg_replace_callback(
		',&#0*([0-9]+);,S',
		function ($regs) {
			// More than 7 digits cannot be a unicode number (max 1114111)
			if (strlen($regs[1]) > 7) {
				return $regs[0];
			}
			$num = intval($regs[1]);
			if ($num > 0x10FFFF) {
				return $regs[0];
			}
			if ($num < 0x10000) {
				return sprintf('\u%04x', $num);
			}
			// Beyond the 16-bit range: high surrogate then low surrogate
			$num -= 0x10000;

			return sprintf('\u%04x\u%04x', 0xD800 | ($num >> 10), 0xDC00 | ($num & 0x3FF));
		},
		$texte
	);

	// preg_replace_callback() returns null on a PCRE error: keep the text then
	return ($converti === null) ? $texte : $converti;
}
786
787
/**
 * Convert the %uXXXX sequences (sent by javascript) into &#yyy; unicode entities.
 *
 * Only sequences made of exactly four upper-case hexadecimal digits are
 * recognised.
 *
 * @param string $texte
 *     Text to convert
 * @return string
 *     Converted text
 **/
function javascript_to_unicode($texte) {
	if (!preg_match_all(",%u([0-9A-F][0-9A-F][0-9A-F][0-9A-F]),", $texte, $trouves, PREG_SET_ORDER)) {
		return $texte;
	}

	// sequence as written => decimal entity, each distinct sequence once
	$table = array();
	foreach ($trouves as $trouve) {
		$table[$trouve[0]] = "&#" . hexdec($trouve[1]) . ";";
	}

	return str_replace(array_keys($table), array_values($table), $texte);
}
802
803
/**
 * Converts %E9 sequences (as sent by the browser) into raw bytes
 *
 * Each `%XX` sequence (two uppercase hexadecimal digits) is replaced by the
 * single byte of that value. The text is scanned again after every
 * replacement, so a sequence that only appears once a previous one has been
 * decoded (`%2541` gives `%41`, then `A`) is decoded as well.
 *
 * @param string $texte
 *     Text to convert
 * @return string
 *     Converted text
 **/
function javascript_to_binary($texte) {
	$motif = ",%([0-9A-F][0-9A-F]),";

	while (preg_match($motif, $texte, $trouve) === 1) {
		$octet = chr(hexdec($trouve[1]));
		// replaces every occurrence of this sequence at once
		$texte = str_replace($trouve[0], $octet, $texte);
	}

	return $texte;
}
818
819
820
/**
 * Fast substitution of each grapheme using a predefined transliteration table
 *
 * The table named `translit` followed by `$complexe` is loaded with
 * load_charset() on first use; its keys are turned into UTF-8 characters
 * and the resulting map is kept in a static cache, one entry per value
 * of `$complexe`.
 *
 * @uses load_charset()
 * @uses caractere_utf_8()
 *
 * @global array $CHARSET
 * @staticvar array $trans
 *
 * @param string $texte
 *     Text to transliterate (the table keys are UTF-8 characters)
 * @param string $charset
 *     Not used by this function; kept so that the signature stays compatible
 * @param string $complexe
 *     Suffix of the table to use ('' gives `translit`)
 * @return string
 *     Transliterated text
 */
function translitteration_rapide($texte, $charset = 'AUTO', $complexe = '') {
	// Replacement maps already built, indexed by $complexe
	static $trans = array();

	if (!strlen($texte)) {
		return $texte;
	}

	$table_translit = 'translit' . $complexe;

	// Build the map from the predefined table the first time it is needed.
	// empty() avoids reading an undefined offset, and still retries the
	// loading when a previous attempt produced nothing.
	if (empty($trans[$complexe])) {
		load_charset($table_translit);
		$trans[$complexe] = array();
		if (
			isset($GLOBALS['CHARSET'][$table_translit])
			and is_array($GLOBALS['CHARSET'][$table_translit])
		) {
			foreach ($GLOBALS['CHARSET'][$table_translit] as $key => $val) {
				$trans[$complexe][caractere_utf_8($key)] = $val;
			}
		}
	}

	return str_replace(array_keys($trans[$complexe]), array_values($trans[$complexe]), $texte);
}
854
855
/**
 * Transliteration charset => ascii (used for indexing)
 *
 * Among other things this removes accents, which do not exist in the
 * non-extended ASCII table.
 *
 * Beware: characters that are not recognised are returned in utf-8.
 *
 * @uses corriger_caracteres()
 * @uses unicode_to_utf_8()
 * @uses html2unicode()
 * @uses charset2unicode()
 * @uses translitteration_rapide()
 *
 * @param string $texte
 *     Text to transliterate
 * @param string $charset
 *     Charset of the text, passed to charset2unicode() and translitteration_rapide()
 * @param string $complexe
 *     Suffix of the transliteration table, passed to translitteration_rapide()
 * @return string
 *     Transliterated text
 */
function translitteration($texte, $charset = 'AUTO', $complexe = '') {
	// 0. Clean up the characters with the filter provided by inc/filtres
	include_spip('inc/filtres');
	$corrige = corriger_caracteres($texte);

	// 1. Go through unicode entities (charset, then named html entities)...
	$unicode = html2unicode(charset2unicode($corrige, $charset, true));

	// ...and from there to utf-8
	$utf8 = unicode_to_utf_8($unicode);

	// 2. Apply the transliteration table
	return translitteration_rapide($utf8, $charset, $complexe);
}
884
885
/**
 * Complex transliteration
 *
 * The text is transliterated with the `complexe` table, in which an accent
 * is written as a marker character placed after its letter (`&agrave;` is
 * returned as ``a` `` and not as `à`).
 * When `$chiffres` is true, each group made of a letter among `aeiuoyd`
 * followed by one or two markers has its markers replaced by digits,
 * see translitteration_chiffree().
 *
 * @uses translitteration()
 * @uses translitteration_chiffree()
 *
 * @param string $texte
 *     Text to transliterate
 * @param bool $chiffres
 *     true to replace the accent markers by digits
 * @return string
 *     Transliterated text
 */
function translitteration_complexe($texte, $chiffres = false) {
	$texte = translitteration($texte, 'AUTO', 'complexe');

	if ($chiffres) {
		// A callback is used instead of the /e modifier, which no longer
		// exists since PHP 7
		$texte = preg_replace_callback(
			"/[aeiuoyd]['`?~.^+(-]{1,2}/S",
			function ($groupe) {
				return translitteration_chiffree($groupe[0]);
			},
			$texte
		);
	}

	return $texte;
}
906
907
/**
 * Numbered transliteration
 *
 * Replaces each accent marker of a string by a digit from 1 to 9;
 * the other characters are left as they are.
 *
 * @param string $car
 *     String containing the markers
 * @return string
 *     String with the markers replaced by digits
 */
function translitteration_chiffree($car) {
	$chiffres = array(
		"'" => '1',
		'`' => '2',
		'?' => '3',
		'~' => '4',
		'.' => '5',
		'^' => '6',
		'+' => '7',
		'(' => '8',
		'-' => '9',
	);

	return strtr($car, $chiffres);
}
918
919
920
/**
 * Detects the utf-8 BOM (bytes 0xEF 0xBB 0xBF) at the start of a text
 *
 * @param string $texte
 *    Text in which the BOM is looked for
 * @return bool
 *    true when the text starts with a BOM
 **/
function bom_utf8($texte) {
	$bom = "\xEF\xBB\xBF";

	return strncmp($texte, $bom, 3) === 0;
}
931
932
/**
 * Checks that a string is valid utf-8
 *
 * Every well-formed sequence is removed from the string; the string is
 * valid when nothing is left.
 *
 * Note from the original author: going through preg_replace works around
 * a "stack overflow" in PCRE.
 *
 * @link http://us2.php.net/manual/fr/function.mb-detect-encoding.php#50087
 * @link http://w3.org/International/questions/qa-forms-utf-8.html
 *
 * @param string $string
 *     Text to check
 * @return bool
 *     true when the text is valid utf-8
 **/
function is_utf8($string) {
	$sequences_valides = array(
		'[\x09\x0A\x0D\x20-\x7E]',            // ASCII
		'[\xC2-\xDF][\x80-\xBF]',             // non-overlong 2-byte
		'\xE0[\xA0-\xBF][\x80-\xBF]',         // excluding overlongs
		'[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}',  // straight 3-byte
		'\xED[\x80-\x9F][\x80-\xBF]',         // excluding surrogates
		'\xF0[\x90-\xBF][\x80-\xBF]{2}',      // planes 1-3
		'[\xF1-\xF3][\x80-\xBF]{3}',          // planes 4-15
		'\xF4[\x80-\x8F][\x80-\xBF]{2}',      // plane 16
	);
	$motif = ',' . implode('|', $sequences_valides) . ',sS';

	$reste = preg_replace($motif, '', $string);

	return strlen($reste) === 0;
}
959
960
/**
 * Checks that a string is valid ascii
 *
 * The accepted characters are tab, line feed, carriage return and the
 * printable range 0x20-0x7E: they are removed from the string, which is
 * ascii when nothing is left.
 *
 * @param string $string
 *     Text to check
 * @return bool
 *     true when the text is ascii
 **/
function is_ascii($string) {
	$reste = preg_replace(',[\x09\x0A\x0D\x20-\x7E],sS', '', $string);

	return strlen($reste) === 0;
}
974
975
/**
 * Transcodes a page to the charset of the site
 *
 * Transcodes a page (fetched from the web, or a template) to the charset
 * of the site, trying every available hint to find out its charset.
 * The hints are tried in this order:
 * 1. a utf-8 BOM (which is then removed from the text);
 * 2. the `encoding` of an xml declaration;
 * 3. a `charset` found in a meta, html or body tag;
 * 4. a `charset=` found in the HTTP headers.
 * When none of them gives a result, the text is taken as utf-8 if it is
 * valid utf-8, and as iso-8859-1 otherwise.
 *
 * @uses is_ascii()
 * @uses bom_utf8()
 * @uses is_utf8()
 * @uses importer_charset()
 *
 * @param string $texte
 *     Page to transcode, whose charset has to be discovered
 * @param string $headers
 *     HTTP headers associated with the page, if any
 * @return string
 *     Text transcoded into the charset of the site
 **/
function transcoder_page($texte, $headers = '') {

	// Pure ascii needs no conversion at all
	if (is_ascii($texte)) {
		return $texte;
	}

	if (bom_utf8($texte)) {
		// utf-8 BOM (0xEFBBBF)
		$charset = 'utf-8';
		$texte = substr($texte, 3);
	} elseif (preg_match(',<[?]xml[^>]*encoding[^>]*=[^>]*([-_a-z0-9]+?),UimsS', $texte, $regs)) {
		// charset given by the content (xml)
		$charset = trim(strtolower($regs[1]));
	} elseif (
		preg_match(',<(meta|html|body)[^>]*charset[^>]*=[^>]*([-_a-z0-9]+?),UimsS', $texte, $regs)
		// do not be fooled by the #CHARSET tag of the templates
		and (($tmp = trim(strtolower($regs[2]))) != 'charset')
	) {
		// charset given by the content (html)
		$charset = $tmp;
	} elseif (preg_match(',charset=([-_a-z0-9]+),i', $headers, $regs)) {
		// charset of the http response
		$charset = trim(strtolower($regs[1]));
	} else {
		$charset = '';
	}

	// Normalise the various names of the japanese shift-jis
	if (preg_match(',^(x|shift)[_-]s?jis$,i', $charset)) {
		$charset = 'shift-jis';
	}

	if (!$charset) {
		// Nothing was declared: fall back on a guess
		$charset = is_utf8($texte) ? 'utf-8' : 'iso-8859-1';
		spip_log("charset probable: $charset");
	} else {
		spip_log("charset: $charset");
	}

	return importer_charset($texte, $charset);
}
1044
1045
1046
//
1047
// Gerer les outils mb_string
1048
//
1049
1050
/**
 * Cuts a text the way substr() does
 *
 * Cuts a string with the mb_* tools when the site is in utf-8, and with
 * the plain byte-wise substr() otherwise.
 *
 * In the substr() and mb_substr() branches, a `$length` that is null or 0
 * means "up to the end of the text".
 * NOTE(review): spip_substr_manuelle() returns '' for a length of 0, so the
 * three branches do not agree on that value; confirm which one callers expect.
 *
 * @link http://fr.php.net/manual/fr/function.mb-substr.php
 * @link http://www.php.net/manual/fr/function.substr.php
 * @uses spip_substr_manuelle() when the php mb functions are missing
 *
 * @param string $c The text
 * @param int $start Start offset
 * @param null|int $length Length, or end offset when negative
 * @return string
 *     The cut text
 **/
function spip_substr($c, $start = 0, $length = null) {
	// Not utf-8: one byte is one character, substr() is enough.
	// Both cases return here, so that the multibyte code below never
	// runs on a text that is not utf-8.
	if ($GLOBALS['meta']['charset'] != 'utf-8') {
		if ($length) {
			return substr($c, $start, $length);
		}

		return substr($c, $start);
	}

	// utf-8: use mb_string when it is available
	if (init_mb_string()) {
		if ($length) {
			return mb_substr($c, $start, $length);
		}

		return mb_substr($c, $start);
	}

	// Otherwise use the manual version
	return spip_substr_manuelle($c, $start, $length);
}
1087
1088
1089
/**
 * Cuts a text the way mb_substr() does
 *
 * Manual version of a utf-8 substr, for old and/or badly installed php.
 * Offsets and lengths are counted in characters, as given by spip_strlen().
 * A negative start, or a negative length, reaching before the beginning of
 * the text is clamped to the beginning, as mb_substr() does.
 *
 * @link http://fr.php.net/manual/fr/function.mb-substr.php
 * @uses spip_strlen()
 *
 * @param string $c The text
 * @param int $start Start offset
 * @param null|int $length Length, or end offset when negative
 * @return string
 *     The cut text
 **/
function spip_substr_manuelle($c, $start, $length = null) {

	// Degenerate case: an explicit length of zero
	if ($length === 0) {
		return '';
	}

	// A negative start is counted from the end. It is clamped to 0 so that
	// the recursive call always gets a non-negative start: without that, an
	// empty text (or an offset larger than the text) would recurse forever
	// or start from the wrong character.
	if ($start < 0) {
		return spip_substr_manuelle($c, max(0, spip_strlen($c) + $start), $length);
	}

	// A positive start: drop the bytes of the first $start characters
	if ($start > 0) {
		$c = substr($c, strlen(spip_substr_manuelle($c, 0, $start)));
	}

	// No length: everything that remains
	if (!$length) {
		return $c;
	}

	// A negative length leaves that many characters out at the end;
	// nothing remains when it is larger than the text.
	if ($length < 0) {
		$restant = spip_strlen($c) + $length;

		return ($restant > 0) ? spip_substr_manuelle($c, 0, $restant) : '';
	}

	// $length > 0
	// $n ends up one above the longest run of continuation bytes found in
	// the text, i.e. the largest number of bytes one of its characters uses:
	// taking $n bytes per wanted character is sure to be enough.
	$n = 0;
	do {
		$n++;
	} while (preg_match(',[\x80-\xBF]{' . $n . '},', $c));
	$c = substr($c, 0, $n * $length);

	// Then, as long as there are too many characters, cut at the end
	while (($l = spip_strlen($c)) > $length) {
		$c = substr($c, 0, $length - $l);
	}

	return $c;
}
1139
1140
/**
 * Puts the first character of a utf-8 string in upper case
 *
 * utf-8 version of ucfirst: the mb_* functions are used when they are
 * available and the site is in utf-8, the native ucfirst() otherwise.
 *
 * @uses init_mb_string()
 * @uses spip_substr()
 *
 * @param string $c
 *     The string to transform
 * @return string
 *     The string with its first character in upper case
 */
function spip_ucfirst($c) {
	if (init_mb_string() and $GLOBALS['meta']['charset'] == 'utf-8') {
		// first character in upper case, followed by the untouched rest
		return mb_strtoupper(spip_substr($c, 0, 1)) . spip_substr($c, 1);
	}

	return ucfirst($c);
}
1160
1161
/**
 * Puts a utf-8 string in lower case
 *
 * utf-8 version of strtolower: mb_strtolower() is used when the mb_*
 * functions are available and the site is in utf-8, the native
 * strtolower() otherwise.
 *
 * @uses init_mb_string()
 *
 * @param string $c
 *     The string to transform
 * @return string
 *     The string in lower case
 */
function spip_strtolower($c) {
	if (init_mb_string() and $GLOBALS['meta']['charset'] == 'utf-8') {
		return mb_strtolower($c);
	}

	return strtolower($c);
}
1179
1180
/**
 * Returns the length of a utf-8 string
 *
 * utf-8 version of strlen. A `\r\n` line break is counted as one single
 * character, whatever the charset.
 *
 * @uses init_mb_string()
 *
 * @param string $c
 *     The string to measure
 * @return int
 *     Length of the string
 */
function spip_strlen($c) {
	// Line breaks are normalised so that "\r\n" does not count twice
	$c = str_replace("\r\n", "\n", $c);

	// Outside utf-8 one byte is one character
	if ($GLOBALS['meta']['charset'] != 'utf-8') {
		return strlen($c);
	}

	// In utf-8, mb_strlen() when available. Otherwise count by hand: once
	// the continuation bytes (10......) are removed, what is left is the
	// ascii bytes (0.......) and the leading bytes (11......), that is to
	// say one byte per character.
	return init_mb_string()
		? mb_strlen($c)
		: strlen(preg_replace(',[\x80-\xBF],S', '', $c));
}
1209
1210
// Initialisation: the table of loaded charsets starts empty
$GLOBALS['CHARSET'] = array();

// Take this opportunity to record, in the pcre_u meta, whether the /u flag
// can be used in preg_replace so as not to break some accented letters:
// in utf-8 chr(195).chr(160) is an "a" with a grave accent, whereas in
// iso-latin chr(160) is a non-breaking space.
// This is done when the meta is unknown, or when the page is requested with
// var_mode and without var_profile.
if (
	!isset($GLOBALS['meta']['pcre_u'])
	or (isset($_GET['var_mode']) and !isset($_GET['var_profile']))
) {
	include_spip('inc/meta');

	// 'u' only for a utf-8 site whose PCRE handles unicode
	$u = '';
	if (lire_config('charset', _DEFAULT_CHARSET) == 'utf-8' and test_pcre_unicode()) {
		$u = 'u';
	}
	ecrire_meta('pcre_u', $u);
}
1226
1227
1228
/**
 * Turns a utf-8 string into utf-8 without the high "planes"
 *
 * This makes it possible to hand the string to the MySQL "utf8" charset,
 * which is not a complete utf-8. The alternative would be to use utf8mb4.
 *
 * Every 4-byte utf-8 character of the string is replaced by the result of
 * utf_8_to_unicode() applied to it.
 *
 * @uses utf_8_to_unicode()
 *
 * @param string $x
 *     The string to transform
 * @return string
 *     The string with the utf-8 characters of the high "planes" escaped
 *     as unicode entities: &#128169;
 */
function utf8_noplanes($x) {
	$regexp_utf8_4bytes = '/(
      \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
   | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
   |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
)/xS';

	// Nothing to escape: the string is returned as is
	if (!preg_match_all($regexp_utf8_4bytes, $x, $trouves, PREG_PATTERN_ORDER)) {
		return $x;
	}

	foreach ($trouves[0] as $caractere) {
		$entite = utf_8_to_unicode($caractere);
		$x = str_replace($caractere, $entite, $x);
	}

	return $x;
}
1254