TextUtil::fixWrongUTF8Encoding() - Code Metrics - Inspection of "config minor ExternRefWorker : summary, !skip AdQ" - Dispositif/Wikibot - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( 732591...229bc5 )

by Dispositif

created 2023-11-07 17:46 UTC

TextUtil::fixWrongUTF8Encoding() A

↳ Parent: TextUtil

Complexity

Conditions	1
Paths	1

Size

Total Lines	44
Code Lines	36

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	0
CRAP Score	2

Importance

Changes	1
Bugs	0	Features	1

Metric	Value
eloc	36
c	1
b	0
f	1
dl	0
loc	44
rs	9.344
ccs	0
cts	0
cp	0
cc	1
nc	1
nop	1
crap	2

<?php
/*
 * This file is part of dispositif/wikibot application (@github)
 * 2019-2023 © Philippe M./Irønie  <[email protected]>
 * For the full copyright and MIT license information, view the license file.
 */

declare(strict_types=1);

namespace App\Domain\Utils;

/**
 * Class TextUtil.
 */
abstract class TextUtil
{
    /** Ellipsis appended by cutTextOnSpace() to a text it has shortened. */
    public const SYMBOL_TEXT_CUT = '…';

    /** Sanitized inputs for which predictCorrectParam() returns no approximate match. */
    public const SKIP_PREDICT_PARAM = ['issue'];

    /** No-break space (U+00A0) written as its UTF-8 bytes. */
    public const NO_BREAK_SPACE = "\xC2\xA0"; // &#160;

    /** Narrow no-break space (U+202F, &#8239;) written as its UTF-8 bytes. */
    public const NO_BREAK_THIN_SPACE = "\xE2\x80\xAF";

    /**
     * Characters removed by stripPunctuation().
     * The backtick and '+' each appear twice in the list.
     * TODO: decide whether '-' and '.' should be added.
     */
    public const ALL_PUNCTUATION
        = [
            '!',
            '"',
            '«',
            '»',
            '#',
            '$',
            '%',
            "'",
            '’',
            '´',
            '`',
            '^',
            '…',
            '‽',
            '(',
            ')',
            '*',
            '⁂',
            '+',
            '–',
            '—',
            '/',
            ':',
            ';',
            '?',
            '@',
            '[',
            '\\',
            ']',
            '_',
            '`',
            '{',
            '|',
            '¦',
            '}',
            '~',
            '<',
            '>',
            '№',
            '©',
            '®',
            '°',
            '†',
            '§',
            '∴',
            '∵',
            '¶',
            '•',
            '+',
        ];

    // Typographic constants that are currently disabled, with their HTML entities:
    //    const ELLIPSIS = '…';
    //    const LAQUO = '«'; // &laquo;
    //    const RAQUO = '»'; // &raquo;
    //    const RSQUO = '’'; // &rsquo;
    //    const TIMES = '×'; // &times;
    //    const NDASH = '–'; // &ndash; or &#x2013;
    //    const MDASH = '—'; // &mdash; or &#x2014;
    //    const LDQUO = '“'; // &ldquo; or &#8220;
    //    const RDQUO = '”'; // &rdquo; or &#8221;
    //    const BDQUO = '„'; // &bdquo; or &#8222;
    //    const SHY = "\xC2\xAD"; // &shy;
    //    const TRADE = '™'; // &trade;
    //    const REG = '®'; // &reg;
    //    const COPY = '©'; // &copy;

    // Regex alternation: narrow no-break space, soft hyphen (\xC2\xAD), no-break space or any \s
    // whitespace. It is a bare alternation, so wrap it in a group when embedding it in a pattern.
    public const ALL_SPACES = "\xE2\x80\xAF|\xC2\xAD|\xC2\xA0|\\s"; // Used in regexps. Better than \s

    /**
     * Multibyte-aware ucfirst(): upper-cases the first character only.
     * Example: "économie" => "Économie".
     *
     * @param string      $str text whose first character is upper-cased
     * @param string|null $e   encoding handed to the mb_* functions
     */
    public static function mb_ucfirst(string $str, ?string $e = 'UTF-8'): string
    {
        $head = mb_substr($str, 0, 1, $e);
        $tail = mb_substr($str, 1, mb_strlen($str, $e), $e);

        return mb_strtoupper($head, $e) . $tail;
    }

    /**
     * Multibyte-aware lcfirst(): lower-cases the first character only.
     * Example: "Économie" => "économie".
     *
     * @param string      $str text whose first character is lower-cased
     * @param string|null $e   encoding handed to the mb_* functions
     */
    public static function mb_lowerfirst(string $str, ?string $e = 'UTF-8'): string
    {
        $head = mb_substr($str, 0, 1, $e);
        $tail = mb_substr($str, 1, mb_strlen($str, $e), $e);

        return mb_strtolower($head, $e) . $tail;
    }

    /**
     * Replaces every no-break space and narrow no-break space with a regular space.
     */
    public static function replaceNonBreakingSpaces(string $text): string
    {
        $nonBreakingSpaces = [self::NO_BREAK_SPACE, self::NO_BREAK_THIN_SPACE];

        return str_replace($nonBreakingSpaces, ' ', $text);
    }

    /**
     * Strips no-break spaces (U+00A0), narrow no-break spaces (U+202F), line feeds,
     * tabs and carriage returns from both ends of a string.
     *
     * The native trim() reads its character list byte by byte. Passing it the
     * multibyte no-break spaces therefore also stripped single bytes of unrelated
     * characters (a trailing "à" lost its last byte, a leading "«" its first one)
     * and produced invalid UTF-8. Matching whole code points with a /u pattern
     * avoids that.
     *
     * NOTE(review): the regular space is, as before, not part of the trimmed set —
     * confirm whether callers expect it to be.
     */
    public static function trim(string $string): string
    {
        $trimmed = preg_replace(
            '/\A[\x{00A0}\x{202F}\n\t\r]+|[\x{00A0}\x{202F}\n\t\r]+\z/u',
            '',
            $string
        );

        // preg_replace() returns null when $string is not valid UTF-8; in that case
        // only the single-byte characters can be trimmed safely.
        return $trimmed ?? trim($string, "\n\t\r");
    }

    /**
     * Todo verify/correct.
     *
     * @param string $str
     *
     * @return bool
     */
    //    static public function containsNonLatinCharacters(string $str): bool
    //    {
    //        return preg_match('/[^\\p{Common}\\p{Latin}]/u', $str);
    //    }
    /**
     * Guesses which known parameter name a (possibly misspelled) name stands for.
     *
     * Each candidate is compared twice with levenshtein(): once on the raw strings
     * and once on their sanitized forms (see sanitizeParamForPredict()). The smaller
     * of the two distances is the candidate's score, and the candidate with the
     * lowest score wins. Previously only the raw distance was stored, so a candidate
     * selected for its sanitized distance could be displaced by a worse one or
     * rejected against $max.
     *
     * levenshtein() counts bytes, hence odd results with non-ASCII characters:
     * levenshtein('notre','nôtre') => 2
     * TODO move.
     *
     * @param string   $str   name to correct
     * @param string[] $names known parameter names
     * @param int      $max   maximum accepted distance (substitutions/insertions/deletions)
     *
     * @return string|null the exact or closest name, or null when nothing is close enough
     *                     or when the sanitized input is listed in SKIP_PREDICT_PARAM
     */
    public static function predictCorrectParam(string $str, array $names, int $max = 2): ?string
    {
        $sanitized = self::sanitizeParamForPredict($str);
        $closest = null;
        $shortest = null;
        foreach ($names as $name) {
            $sanitizedName = self::sanitizeParamForPredict($name);
            if ($str === $name || $sanitized === $sanitizedName) {
                return $name; // exact match
            }

            // score of the candidate: best of the raw and the sanitized distance
            $distance = min(
                levenshtein($str, $name),
                levenshtein($sanitized, $sanitizedName)
            );

            if ($shortest === null || $distance < $shortest) {
                $closest = $name;
                $shortest = $distance;
            }
        }

        if ($shortest === null || $shortest > $max) {
            return null;
        }
        // Approximate matches are never proposed for the skipped names.
        if (in_array($sanitized, self::SKIP_PREDICT_PARAM, true)) {
            return null;
        }

        return $closest;
    }

    /**
     * Normalizes a parameter name for predictCorrectParam(): accents and punctuation
     * are stripped, the text is lower-cased, every character other than a-z, 0-9 and
     * the space is dropped, and the result is trimmed.
     */
    private static function sanitizeParamForPredict(string $str): string
    {
        $withoutAccents = self::stripAccents($str);
        $withoutPunctuation = self::stripPunctuation($withoutAccents);
        $lowered = mb_strtolower($withoutPunctuation);
        $alphanumeric = preg_replace('#[^a-z0-9 ]#', '', $lowered);

        return trim($alphanumeric);
    }

    /**
     * Removes every character listed in ALL_PUNCTUATION from the string.
     * The entries are matched as whole strings by str_replace(), so multibyte
     * punctuation marks are handled ('-' and '.' are not in the list).
     * See http://fr.wikipedia.org/wiki/Ponctuation.
     */
    public static function stripPunctuation(string $str): string
    {
        $punctuationMarks = self::ALL_PUNCTUATION;

        return str_replace($punctuationMarks, '', $str);
    }

    /**
     * Replaces accented letters with their unaccented counterpart and returns UTF-8.
     * Covers Latin diacritics (grave, acute, circumflex, tilde, diaeresis, macron,
     * caron, breve, ring) plus a few Greek and Cyrillic letters.
     *
     * The previous implementation went through utf8_decode(), which turns every
     * character outside ISO-8859-1 into '?'. Most of the table therefore collapsed
     * onto '?', a literal '?' in the input came out as 'y', and the result was
     * Latin-1 instead of UTF-8. The explicit map below is applied to whole UTF-8
     * sequences by strtr(), and characters without an entry are left untouched.
     */
    public static function stripAccents(string $string): string
    {
        static $map = [
            'à' => 'a', 'á' => 'a', 'â' => 'a', 'ã' => 'a', 'ä' => 'a',
            'ç' => 'c',
            'è' => 'e', 'é' => 'e', 'ê' => 'e', 'ë' => 'e',
            'ì' => 'i', 'í' => 'i', 'î' => 'i', 'ï' => 'i',
            'ñ' => 'n',
            'ò' => 'o', 'ó' => 'o', 'ô' => 'o', 'õ' => 'o', 'ö' => 'o',
            'ù' => 'u', 'ú' => 'u', 'û' => 'u', 'ü' => 'u',
            'ý' => 'y', 'ÿ' => 'y',
            'À' => 'A', 'Á' => 'A', 'Â' => 'A', 'Ã' => 'A', 'Ä' => 'A',
            'Ç' => 'C',
            'È' => 'E', 'É' => 'E', 'Ê' => 'E', 'Ë' => 'E',
            'Ì' => 'I', 'Í' => 'I', 'Î' => 'I', 'Ï' => 'I',
            'Ñ' => 'N',
            'Ò' => 'O', 'Ó' => 'O', 'Ô' => 'O', 'Õ' => 'O', 'Ö' => 'O',
            'Ù' => 'U', 'Ú' => 'U', 'Û' => 'U', 'Ü' => 'U',
            'Ý' => 'Y',
            'ø' => 'o',
            'ā' => 'a', 'ǟ' => 'a', 'ǡ' => 'a',
            'ē' => 'e', 'ḕ' => 'e', 'ḗ' => 'e',
            'ḡ' => 'g',
            'ḹ' => 'l',
            'ī' => 'i',
            'ō' => 'o', 'ȫ' => 'o', 'ȭ' => 'o', 'ȱ' => 'o', 'ṑ' => 'o', 'ṓ' => 'o', 'ǭ' => 'o',
            'ṝ' => 'r',
            'ū' => 'u', 'ǖ' => 'u', 'ṻ' => 'u',
            'ȳ' => 'y',
            'ǣ' => 'æ',
            'ӣ' => 'и',
            'ᾱ' => 'α', 'ῑ' => 'ι', 'ῡ' => 'υ',
            'č' => 'c', 'š' => 's', 'ž' => 'z',
            'й' => 'и',
            'ў' => 'y',
            'ŭ' => 'u',
            'ă' => 'a',
            'ӗ' => 'e',
            'ğ' => 'g',
            'Å' => 'A', 'å' => 'a',
            'ů' => 'u',
            'ẘ' => 'w',
            'ẙ' => 'y',
        ];

        // Array form of strtr(): whole keys are matched, in a single pass.
        return strtr($string, $map);
    }

    /**
     * Multibyte-aware equivalent of PHP 8 str_ends_with().
     * An empty needle always matches.
     */
    public static function str_ends_with(string $haystack, string $needle): bool
    {
        $needleLength = mb_strlen($needle);

        return $needleLength === 0 || mb_substr($haystack, -$needleLength) === $needle;
    }

    /**
     * Multibyte-aware equivalent of PHP 8 str_starts_with().
     * Compares the leading characters of the haystack, as many as the needle has,
     * with the needle itself.
     */
    public static function str_starts_with(string $haystack, string $needle): bool
    {
        $prefix = mb_substr($haystack, 0, mb_strlen($needle));

        return $prefix === $needle;
    }

    /**
     * Shortens a text to at most $maxLength characters and appends SYMBOL_TEXT_CUT.
     * The cut is made on the last space of the first $maxLength characters when that
     * space's index is greater than $maxLength - 12; otherwise the text is cut at
     * $maxLength. Texts no longer than $maxLength are returned unchanged.
     */
    public static function cutTextOnSpace(string $text, int $maxLength = 70): string
    {
        if (mb_strlen($text) <= $maxLength) {
            return $text;
        }

        $window = mb_substr($text, 0, $maxLength);
        $lastSpace = mb_strrpos($window, ' ');

        // no space at all, or a space too far from the end of the window: hard cut
        $cutAt = $maxLength;
        if ($lastSpace !== false && $lastSpace > ($maxLength - 12)) {
            $cutAt = $lastSpace;
        }

        return trim(mb_substr($text, 0, $cutAt)) . self::SYMBOL_TEXT_CUT;
    }

    /**
     * Counts the space-separated words longer than two characters that are equal to
     * their own upper-cased form.
     */
    public static function countAllCapsWords(string $text): int
    {
        $allCapsWords = array_filter(
            explode(' ', $text),
            static function (string $word): bool {
                return mb_strlen($word) > 2 && mb_strtoupper($word) === $word;
            }
        );

        return count($allCapsWords);
    }

    /**
     * Repairs "mojibake": UTF-8 text that was decoded as Windows-1252 and saved again
     * (e.g. "Ã©" instead of "é"). Also collapses two consecutive spaces into one.
     *
     * code source:  https://github.com/devgeniem/wp-sanitize-accented-uploads/blob/master/plugin.php#L152
     * table source: http://www.i18nqa.com/debug/utf8-debug.html
     *
     * Two precautions compared with a plain str_replace() over the table:
     * - keys ending with an invisible character (control characters U+0081…U+009D,
     *   no-break space, soft hyphen) are written with \u{…} escapes. If such a
     *   character is lost, the key degrades to a bare 'Ã' or 'Å' and valid letters
     *   get corrupted;
     * - strtr() is used because it works in one pass and prefers the longest key.
     *   str_replace() applies the pairs one after the other and re-scans its own
     *   output, so that "Ã…" followed by a no-break space ended up as "Š".
     *
     * @param string|string[] $inputString text to repair (each entry of an array is repaired)
     *
     * @return string|string[]
     */
    public static function fixWrongUTF8Encoding($inputString)
    {
        if (is_array($inputString)) {
            // str_replace() accepted an array subject; keep doing so.
            return array_map([self::class, 'fixWrongUTF8Encoding'], $inputString);
        }

        $fix_list = [
            // 3 char errors
            'â€š' => '‚', 'â€ž' => '„', 'â€¦' => '…', 'â€¡' => '‡',
            'â€°' => '‰', 'â€¹' => '‹', 'â€˜' => '‘', 'â€™' => '’',
            'â€œ' => '“', 'â€¢' => '•', 'â€“' => '–', 'â€”' => '—',
            'â„¢' => '™', 'â€º' => '›', 'â‚¬' => '€',
            "â€\u{A0}" => '†', // E2 80 A0: third character is a no-break space
            "â€\u{9D}" => '”', // E2 80 9D: third character is the control character U+009D
            // 2 char errors
            'Ã‚' => 'Â', 'Æ’' => 'ƒ', 'Ãƒ' => 'Ã', 'Ã„' => 'Ä',
            'Ã…' => 'Å', 'Ã†' => 'Æ', 'Ã‡' => 'Ç',
            'Ë†' => 'ˆ', 'Ãˆ' => 'È', 'Ã‰' => 'É', 'ÃŠ' => 'Ê',
            'Ã‹' => 'Ë', 'Å’' => 'Œ', 'ÃŒ' => 'Ì', 'Å½' => 'Ž',
            'ÃŽ' => 'Î', 'Ã‘' => 'Ñ', 'Ã’' => 'Ò', 'Ã“' => 'Ó',
            'Ã”' => 'Ô', 'Ã•' => 'Õ', 'Ã–' => 'Ö',
            'Ã—' => '×', 'Ëœ' => '˜', 'Ã˜' => 'Ø', 'Ã™' => 'Ù',
            'Å¡' => 'š', 'Ãš' => 'Ú', 'Ã›' => 'Û', 'Å“' => 'œ',
            'Ãœ' => 'Ü', 'Å¾' => 'ž', 'Ãž' => 'Þ', 'Å¸' => 'Ÿ',
            'ÃŸ' => 'ß', 'Â¡' => '¡', 'Ã¡' => 'á', 'Â¢' => '¢',
            'Ã¢' => 'â', 'Â£' => '£', 'Ã£' => 'ã', 'Â¤' => '¤',
            'Ã¤' => 'ä', 'Â¥' => '¥', 'Ã¥' => 'å', 'Â¦' => '¦',
            'Ã¦' => 'æ', 'Â§' => '§', 'Ã§' => 'ç', 'Â¨' => '¨',
            'Ã¨' => 'è', 'Â©' => '©', 'Ã©' => 'é', 'Âª' => 'ª',
            'Ãª' => 'ê', 'Â«' => '«', 'Ã«' => 'ë', 'Â¬' => '¬',
            'Ã¬' => 'ì', 'Â®' => '®', 'Ã®' => 'î', 'Â¯' => '¯',
            'Ã¯' => 'ï', 'Â°' => '°', 'Ã°' => 'ð', 'Â±' => '±',
            'Ã±' => 'ñ', 'Â²' => '²', 'Ã²' => 'ò', 'Â³' => '³',
            'Ã³' => 'ó', 'Â´' => '´', 'Ã´' => 'ô', 'Âµ' => 'µ',
            'Ãµ' => 'õ', 'Â¶' => '¶', 'Ã¶' => 'ö', 'Â·' => '·',
            'Ã·' => '÷', 'Â¸' => '¸', 'Ã¸' => 'ø', 'Â¹' => '¹',
            'Ã¹' => 'ù', 'Âº' => 'º', 'Ãº' => 'ú', 'Â»' => '»',
            'Ã»' => 'û', 'Â¼' => '¼', 'Ã¼' => 'ü', 'Â½' => '½',
            'Ã½' => 'ý', 'Â¾' => '¾', 'Ã¾' => 'þ', 'Â¿' => '¿',
            'Ã¿' => 'ÿ', 'Ã€' => 'À',
            '  ' => ' ', // double space
            // errors whose second character is invisible (shown as 1 char)
            "Ã\u{81}" => 'Á', "Å\u{A0}" => 'Š', "Ã\u{8D}" => 'Í', "Ã\u{8F}" => 'Ï',
            "Ã\u{90}" => 'Ð', "Ã\u{9D}" => 'Ý', "Ã\u{A0}" => 'à', "Ã\u{AD}" => 'í',
        ];

        return strtr((string) $inputString, $fix_list);
    }
}


1		<?php
2		/*
3		* This file is part of dispositif/wikibot application (@github)
4		* 2019-2023 © Philippe M./Irønie <[email protected]>
5		* For the full copyright and MIT license information, view the license file.
6		*/
7
8		declare(strict_types=1);
9
10		namespace App\Domain\Utils;
11
12		/**
13		* Class TextUtil.
14		*/
15		abstract class TextUtil
16		{
17		public const SYMBOL_TEXT_CUT = '…';
18
19		public const SKIP_PREDICT_PARAM = ['issue'];
20
21		public const NO_BREAK_SPACE = "\xC2\xA0"; //
22
23		public const NO_BREAK_THIN_SPACE = "\xE2\x80\xAF";
24
25		/** TODO ? add '-' and '.' ??? */
26		public const ALL_PUNCTUATION
27		= [
28		'!',
29		'"',
30		'«',
31		'»',
32		'#',
33		'$',
34		'%',
35		"'",
36		'’',
37		'´',
38		'`',
39		'^',
40		'…',
41		'‽',
42		'(',
43		')',
44		'*',
45		'⁂',
46		'+',
47		'–',
48		'—',
49		'/',
50		':',
51		';',
52		'?',
53		'@',
54		'[',
55		'\\',
56		']',
57		'_',
58		'`',
59		'{',
60		'\|',
61		'¦',
62		'}',
63		'~',
64		'<',
65		'>',
66		'№',
67		'©',
68		'®',
69		'°',
70		'†',
71		'§',
72		'∴',
73		'∵',
74		'¶',
75		'•',
76		'+',
77		];
78
79		//
80		// const ELLIPSIS = '…';
81		// const LAQUO = '«'; // «
82		// const RAQUO = '»'; // »
83		// const RSQUO = '’'; // ’
84		// const TIMES = '×'; // ×
85		// const NDASH = '–'; // – or –
86		// const MDASH = '—'; // — or —
87		// const LDQUO = '“'; // “ or “
88		// const RDQUO = '”'; // ” or ”
89		// const BDQUO = '„'; // &bdquo; or „
90		// const SHY = "\xC2\xAD"; //
91		// const TRADE = '™'; // ™
92		// const REG = '®'; // ®
93		// const COPY = '©'; // ©
94		public const ALL_SPACES = "\xE2\x80\xAF\|\xC2\xAD\|\xC2\xA0\|\\s"; // Used in regexps. Better than \s
95
96		/**
97		* UTF8 first letter in upper case.
98		* "économie" => "Économie".
99		*/
100		public static function mb_ucfirst(string $str, ?string $e = 'UTF-8'): string
101		{
102	28	$first = mb_strtoupper(mb_substr($str, 0, 1, $e), $e);
103		$rest = mb_substr($str, 1, mb_strlen($str, $e), $e);
104	28
105	28	return $first . $rest;
106		}
107	28
108		/**
109		* UTF8 first letter in lower case.
110		* "Économie" => "économie".
111		* @return string
112		*/
113		public static function mb_lowerfirst(string $str, ?string $e = 'UTF-8'): string
114		{
115		$first = mb_strtolower(mb_substr($str, 0, 1, $e), $e);
116		$rest = mb_substr($str, 1, mb_strlen($str, $e), $e);
117
118		return $first . $rest;
119	1	}
120
121	1	public static function replaceNonBreakingSpaces(string $text): string
122	1	{
123		return str_replace([self::NO_BREAK_SPACE, self::NO_BREAK_THIN_SPACE], ' ', $text);
124	1	}
125
126		/**
127		* Trim also non-breaking space and carriage return.
128		*/
129		public static function trim(string $string): string
130		{
131		return trim($string, self::NO_BREAK_SPACE . self::NO_BREAK_THIN_SPACE . "\n\t\r");
132	33	}
133
134	33	/**
135		* Todo verify/correct.
136		*
137		* @param string $str
138		*
139		* @return bool
140		*/
141		// static public function containsNonLatinCharacters(string $str): bool
142		// {
143		// return preg_match('/[^\\p{Common}\\p{Latin}]/u', $str);
144	1	// }
145		/**
146	1	* Simplest levenshtein distance prediction of the correct param name.
147		* Weird results with ASCII extended chars :
148		* levenshtein('notre','nôtre') => 2
149		* TODO move.
150		*
151		* @param int $max Maximum number of permutation/add/subtraction)
152		*
153		* @return string\|null
154		*/
155		public static function predictCorrectParam(string $str, array $names, int $max = 2): ?string
156		{
157		$sanitized = self::sanitizeParamForPredict($str);
158		$closest = null;
159		foreach ($names as $name) {
160		$sanitizedName = self::sanitizeParamForPredict($name);
161		if ($str === $name \|\| $sanitized === $sanitizedName) {
162		return $name; // exact match
163		}
164		$lev = levenshtein($str, $name);
165		$lev2 = levenshtein($sanitized, $sanitizedName);
166
167		if (!isset($shortest) \|\| $lev < $shortest \|\| $lev2 < $shortest) {
168		$closest = $name;
169		$shortest = $lev;
170		}
171		}
172		if (isset($shortest) && $shortest <= $max && !in_array($sanitized, self::SKIP_PREDICT_PARAM)) {
173	3	return $closest;
174		}
175	3
176	3	return null;
177	3	}
178	3
179	3	/**
180		* For predictCorrectParam().
181		* @return string
182	3	*/
183	3	private static function sanitizeParamForPredict(string $str): string
184		{
185	3	$sanitized = mb_strtolower(self::stripPunctuation(self::stripAccents($str)));
186	3
187	3	return trim(preg_replace('#[^a-z0-9 ]#', '', $sanitized));
188		}
189
190	3	/**
191	3	* Strip punctuation
192		* UTF-8 compatible ??
193		* Note : can't use str_split() which cut on 1 byte length
194		* See http://fr.wikipedia.org/wiki/Ponctuation.
195		*/
196		public static function stripPunctuation(string $str): string
197		{
198		return str_replace(
199		self::ALL_PUNCTUATION,
200		'',
201		$str
202		);
203		}
204	3
205		/**
206	3	* Strip accents
207	3	* OK : grec, cyrillique, macron, hatchek, brève, rond en chef, tilde
208		* UTF-8 compatible.
209	3	* @return string
210		*/
211		public static function stripAccents(string $string): string
212		{
213		return strtr(
214		utf8_decode($string),
215		utf8_decode(
216		'àáâãäçèéêëìíîïñòóôõöùúûüýÿÀÁÂÃÄÇÈÉÊËÌÍÎÏÑÒÓÔÕÖÙÚÛÜÝøāǟǡēḕḗḡḹīōȫȭȱṑṓǭṝūǖṻȳǣӣᾱῑῡčšžйўŭăӗğÅåůẘẙ'
217		),
218		utf8_decode(
219		'aaaaaceeeeiiiinooooouuuuyyAAAAACEEEEIIIINOOOOOUUUUYoaaaeeeglioooooooruuuyæиαιυcszиyuaegAauwy'
220		)
221		);
222	22	}
223
224	22	/**
225	22	* Like PHP8 str_ends_with(). Multibytes ok.
226	22	* @return bool
227	22	*/
228		public static function str_ends_with(string $haystack, string $needle): bool
229		{
230		$len = mb_strlen($needle);
231		if ($len === 0) {
232		return true;
233		}
234
235		return (mb_substr($haystack, -$len) === $needle);
236		}
237
238		/**
239		* Like PHP8 str_starts_with().
240	22	* @return bool
241		*/
242	22	public static function str_starts_with(string $haystack, string $needle): bool
243	22	{
244	22	$len = mb_strlen($needle);
245	22
246		return (mb_substr($haystack, 0, $len) === $needle);
247	22	}
248	22
249		/**
250		* Cut string at position of last space before Xth character.
251		*/
252		public static function cutTextOnSpace(string $text, int $maxLength = 70): string
253		{
254		if (mb_strlen($text) > $maxLength) {
255		$spacePos = mb_strrpos(mb_substr($text, 0, $maxLength), ' ');
256		$spacePos = ($spacePos > ($maxLength - 12)) ? $spacePos : $maxLength;
257		$text = trim(mb_substr($text, 0, $spacePos)) . self::SYMBOL_TEXT_CUT;
258		}
259
260		return $text;
261		}
262
263		public static function countAllCapsWords(string $text): int
264		{
265		$words = explode(' ', $text);
266		$count = 0;
267		foreach ($words as $word) {
268		if (mb_strlen($word) > 2 && mb_strtoupper($word) === $word) {
269		++$count;
270		}
271		}
272
273		return $count;
274		}
275
276		/**
277		* code source: https://github.com/devgeniem/wp-sanitize-accented-uploads/blob/master/plugin.php#L152
278		* table source: http://www.i18nqa.com/debug/utf8-debug.html
279		*/
280		public static function fixWrongUTF8Encoding($inputString)
281		{
282		$fix_list = [
283		// 3 char errors first
284		'â€š' => '‚', 'â€ž' => '„', 'â€¦' => '…', 'â€¡' => '‡',
285		'â€°' => '‰', 'â€¹' => '‹', 'â€˜' => '‘', 'â€™' => '’',
286		'â€œ' => '“', 'â€¢' => '•', 'â€“' => '–', 'â€”' => '—',
287		'â„¢' => '™', 'â€º' => '›', 'â‚¬' => '€',
288		// 2 char errors
289		'Ã‚' => 'Â', 'Æ’' => 'ƒ', 'Ãƒ' => 'Ã', 'Ã„' => 'Ä',
290		'Ã…' => 'Å', 'â€' => '†', 'Ã†' => 'Æ', 'Ã‡' => 'Ç',
291		'Ë†' => 'ˆ', 'Ãˆ' => 'È', 'Ã‰' => 'É', 'ÃŠ' => 'Ê',
292		'Ã‹' => 'Ë', 'Å’' => 'Œ', 'ÃŒ' => 'Ì', 'Å½' => 'Ž',
293		'ÃŽ' => 'Î', 'Ã‘' => 'Ñ', 'Ã’' => 'Ò', 'Ã“' => 'Ó',
294		'â€' => '”', 'Ã”' => 'Ô', 'Ã•' => 'Õ', 'Ã–' => 'Ö',
295		'Ã—' => '×', 'Ëœ' => '˜', 'Ã˜' => 'Ø', 'Ã™' => 'Ù',
296		'Å¡' => 'š', 'Ãš' => 'Ú', 'Ã›' => 'Û', 'Å“' => 'œ',
297		'Ãœ' => 'Ü', 'Å¾' => 'ž', 'Ãž' => 'Þ', 'Å¸' => 'Ÿ',
298		'ÃŸ' => 'ß', 'Â¡' => '¡', 'Ã¡' => 'á', 'Â¢' => '¢',
299		'Ã¢' => 'â', 'Â£' => '£', 'Ã£' => 'ã', 'Â¤' => '¤',
300		'Ã¤' => 'ä', 'Â¥' => '¥', 'Ã¥' => 'å', 'Â¦' => '¦',
301		'Ã¦' => 'æ', 'Â§' => '§', 'Ã§' => 'ç', 'Â¨' => '¨',
302		'Ã¨' => 'è', 'Â©' => '©', 'Ã©' => 'é', 'Âª' => 'ª',
303		'Ãª' => 'ê', 'Â«' => '«', 'Ã«' => 'ë', 'Â¬' => '¬',
304		'Ã¬' => 'ì', 'Â®' => '®', 'Ã®' => 'î', 'Â¯' => '¯',
305		'Ã¯' => 'ï', 'Â°' => '°', 'Ã°' => 'ð', 'Â±' => '±',
306		'Ã±' => 'ñ', 'Â²' => '²', 'Ã²' => 'ò', 'Â³' => '³',
307		'Ã³' => 'ó', 'Â´' => '´', 'Ã´' => 'ô', 'Âµ' => 'µ',
308		'Ãµ' => 'õ', 'Â¶' => '¶', 'Ã¶' => 'ö', 'Â·' => '·',
309		'Ã·' => '÷', 'Â¸' => '¸', 'Ã¸' => 'ø', 'Â¹' => '¹',
310		'Ã¹' => 'ù', 'Âº' => 'º', 'Ãº' => 'ú', 'Â»' => '»',
311		'Ã»' => 'û', 'Â¼' => '¼', 'Ã¼' => 'ü', 'Â½' => '½',
312		'Ã½' => 'ý', 'Â¾' => '¾', 'Ã¾' => 'þ', 'Â¿' => '¿',
313		'Ã¿' => 'ÿ', 'Ã€' => 'À',
314		' ' => ' ', // double space
315		// 1 char errors last
316		'Ã' => 'Á', 'Å' => 'Š', 'Ã' => 'Í', 'Ã' => 'Ï',
317		'Ã' => 'Ð', 'Ã' => 'Ý', 'Ã' => 'à', 'Ã' => 'í',
318		];
319
320		$error_chars = array_keys($fix_list);
321		$real_chars = array_values($fix_list);
322
323		return str_replace($error_chars, $real_chars, $inputString);
324		}
325		}
326

Dispositif / Wikibot

Push — master ( 732591...229bc5 )

TextUtil::fixWrongUTF8Encoding() A

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like