Passed
Push — master ( 732591...229bc5 )
by Dispositif
03:49
created

TextUtil::fixWrongUTF8Encoding()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 44
Code Lines 36

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 2

Importance

Changes 1
Bugs 0 Features 1
Metric Value
eloc 36
c 1
b 0
f 1
dl 0
loc 44
rs 9.344
ccs 0
cts 0
cp 0
cc 1
nc 1
nop 1
crap 2
1
<?php
2
/*
3
 * This file is part of dispositif/wikibot application (@github)
4
 * 2019-2023 © Philippe M./Irønie  <[email protected]>
5
 * For the full copyright and MIT license information, view the license file.
6
 */
7
8
declare(strict_types=1);
9
10
namespace App\Domain\Utils;
11
12
/**
 * Static helpers for UTF-8 text: case of the first letter, special spaces,
 * punctuation/accent stripping, parameter-name prediction, truncation and
 * repair of mis-decoded ("mojibake") text.
 */
abstract class TextUtil
{
    /** Character appended by cutTextOnSpace() to a truncated text. */
    public const SYMBOL_TEXT_CUT = '…';

    /** Sanitized parameter names for which predictCorrectParam() never suggests a correction. */
    public const SKIP_PREDICT_PARAM = ['issue'];

    public const NO_BREAK_SPACE = "\xC2\xA0"; // U+00A0, &#160;

    public const NO_BREAK_THIN_SPACE = "\xE2\x80\xAF"; // U+202F, &#8239;

    /** TODO ? add '-' and '.' ???  */
    public const ALL_PUNCTUATION
        = [
            '!',
            '"',
            '«',
            '»',
            '#',
            '$',
            '%',
            "'",
            '’',
            '´',
            '`',
            '^',
            '…',
            '‽',
            '(',
            ')',
            '*',
            '⁂',
            '+',
            '–',
            '—',
            '/',
            ':',
            ';',
            '?',
            '@',
            '[',
            '\\',
            ']',
            '_',
            '`',
            '{',
            '|',
            '¦',
            '}',
            '~',
            '<',
            '>',
            '№',
            '©',
            '®',
            '°',
            '†',
            '§',
            '∴',
            '∵',
            '¶',
            '•',
            '+',
        ];

    //    const ELLIPSIS = '…';
    //    const LAQUO = '«'; // &laquo;
    //    const RAQUO = '»'; // &raquo;
    //    const RSQUO = '’'; // &rsquo;
    //    const TIMES = '×'; // &times;
    //    const NDASH = '–'; // &ndash; or &#x2013;
    //    const MDASH = '—'; // &mdash; or &#x2014;
    //    const LDQUO = '“'; // &ldquo; or &#8220;
    //    const RDQUO = '”'; // &rdquo; or &#8221;
    //    const BDQUO = '„'; // &bdquo; or &#8222;
    //    const SHY = "\xC2\xAD"; // &shy;
    //    const TRADE = '™'; // &trade;
    //    const REG = '®'; // &reg;
    //    const COPY = '©'; // &copy;

    /**
     * Regexp alternation (no delimiters): narrow no-break space, soft hyphen,
     * no-break space, or anything matched by \s.
     */
    public const ALL_SPACES = "\xE2\x80\xAF|\xC2\xAD|\xC2\xA0|\\s"; // Used in regexps. Better than \s

    /**
     * Leading/trailing run of the characters removed by trim():
     * U+00A0, U+202F, line feed, tab, carriage return.
     */
    private const TRIM_PATTERN = '/^[\x{00A0}\x{202F}\n\t\r]+|[\x{00A0}\x{202F}\n\t\r]+$/Du';

    /**
     * cutTextOnSpace() only cuts on a space located less than this many
     * characters before the limit.
     */
    private const CUT_SPACE_TOLERANCE = 12;

    /**
     * Accented letter => base letter pairs used by stripAccents().
     * Greek and Cyrillic letters keep their own script (ᾱ => α, й => и, ў => у, ӗ => е).
     */
    private const ACCENT_MAP
        = [
            'à' => 'a', 'á' => 'a', 'â' => 'a', 'ã' => 'a', 'ä' => 'a', 'ç' => 'c',
            'è' => 'e', 'é' => 'e', 'ê' => 'e', 'ë' => 'e',
            'ì' => 'i', 'í' => 'i', 'î' => 'i', 'ï' => 'i', 'ñ' => 'n',
            'ò' => 'o', 'ó' => 'o', 'ô' => 'o', 'õ' => 'o', 'ö' => 'o',
            'ù' => 'u', 'ú' => 'u', 'û' => 'u', 'ü' => 'u', 'ý' => 'y', 'ÿ' => 'y',
            'À' => 'A', 'Á' => 'A', 'Â' => 'A', 'Ã' => 'A', 'Ä' => 'A', 'Ç' => 'C',
            'È' => 'E', 'É' => 'E', 'Ê' => 'E', 'Ë' => 'E',
            'Ì' => 'I', 'Í' => 'I', 'Î' => 'I', 'Ï' => 'I', 'Ñ' => 'N',
            'Ò' => 'O', 'Ó' => 'O', 'Ô' => 'O', 'Õ' => 'O', 'Ö' => 'O',
            'Ù' => 'U', 'Ú' => 'U', 'Û' => 'U', 'Ü' => 'U', 'Ý' => 'Y',
            'ø' => 'o',
            'ā' => 'a', 'ǟ' => 'a', 'ǡ' => 'a', 'ē' => 'e', 'ḕ' => 'e', 'ḗ' => 'e',
            'ḡ' => 'g', 'ḹ' => 'l', 'ī' => 'i',
            'ō' => 'o', 'ȫ' => 'o', 'ȭ' => 'o', 'ȱ' => 'o', 'ṑ' => 'o', 'ṓ' => 'o', 'ǭ' => 'o',
            'ṝ' => 'r', 'ū' => 'u', 'ǖ' => 'u', 'ṻ' => 'u', 'ȳ' => 'y',
            'ǣ' => 'æ', 'ӣ' => 'и', 'ᾱ' => 'α', 'ῑ' => 'ι', 'ῡ' => 'υ',
            'č' => 'c', 'š' => 's', 'ž' => 'z', 'й' => 'и', 'ў' => 'у', 'ŭ' => 'u',
            'ă' => 'a', 'ӗ' => 'е', 'ğ' => 'g', 'Å' => 'A', 'å' => 'a', 'ů' => 'u',
            'ẘ' => 'w', 'ẙ' => 'y',
        ];

    /**
     * Windows-1252 bytes 0x80-0x9F => Unicode code point, for the bytes where
     * Windows-1252 differs from ISO-8859-1. Bytes absent from this table
     * (0x81, 0x8D, 0x8F, 0x90, 0x9D) are treated as the code point of the same value.
     */
    private const CP1252_SPECIALS
        = [
            0x80 => 0x20AC, 0x82 => 0x201A, 0x83 => 0x0192, 0x84 => 0x201E,
            0x85 => 0x2026, 0x86 => 0x2020, 0x87 => 0x2021, 0x88 => 0x02C6,
            0x89 => 0x2030, 0x8A => 0x0160, 0x8B => 0x2039, 0x8C => 0x0152,
            0x8E => 0x017D, 0x91 => 0x2018, 0x92 => 0x2019, 0x93 => 0x201C,
            0x94 => 0x201D, 0x95 => 0x2022, 0x96 => 0x2013, 0x97 => 0x2014,
            0x98 => 0x02DC, 0x99 => 0x2122, 0x9A => 0x0161, 0x9B => 0x203A,
            0x9C => 0x0153, 0x9E => 0x017E, 0x9F => 0x0178,
        ];

    /**
     * UTF8 first letter in upper case.
     * "économie" => "Économie".
     *
     * @param string|null $e Character encoding; null means the mbstring internal encoding.
     */
    public static function mb_ucfirst(string $str, ?string $e = 'UTF-8'): string
    {
        // Resolve null here so the mb_* calls always receive a real encoding name.
        $encoding = $e ?? mb_internal_encoding();

        return mb_strtoupper(mb_substr($str, 0, 1, $encoding), $encoding)
            . mb_substr($str, 1, null, $encoding);
    }

    /**
     * UTF8 first letter in lower case.
     * "Économie" => "économie".
     *
     * @param string|null $e Character encoding; null means the mbstring internal encoding.
     */
    public static function mb_lowerfirst(string $str, ?string $e = 'UTF-8'): string
    {
        // Resolve null here so the mb_* calls always receive a real encoding name.
        $encoding = $e ?? mb_internal_encoding();

        return mb_strtolower(mb_substr($str, 0, 1, $encoding), $encoding)
            . mb_substr($str, 1, null, $encoding);
    }

    /**
     * Replace every no-break space (U+00A0) and narrow no-break space (U+202F)
     * by an ordinary space.
     */
    public static function replaceNonBreakingSpaces(string $text): string
    {
        return str_replace([self::NO_BREAK_SPACE, self::NO_BREAK_THIN_SPACE], ' ', $text);
    }

    /**
     * Remove leading and trailing no-break spaces (U+00A0, U+202F), line feeds,
     * tabs and carriage returns. Ordinary spaces are NOT removed.
     *
     * The native trim() treats its character list as single bytes, so passing
     * multi-byte characters to it strips stray bytes of unrelated characters
     * (e.g. the final byte of "à"). A Unicode-aware regexp is used instead.
     */
    public static function trim(string $string): string
    {
        $trimmed = preg_replace(self::TRIM_PATTERN, '', $string);

        // preg_replace() returns null when the subject is not valid UTF-8:
        // fall back to the single-byte characters, which are safe to strip bytewise.
        return $trimmed ?? trim($string, "\n\t\r");
    }

    /**
     * Todo verify/correct.
     */
    //    static public function containsNonLatinCharacters(string $str): bool
    //    {
    //        return preg_match('/[^\\p{Common}\\p{Latin}]/u', $str);
    //    }

    /**
     * Simplest levenshtein distance prediction of the correct param name.
     * Weird results with ASCII extended chars :
     * levenshtein('notre','nôtre') => 2
     * TODO move.
     *
     * A name identical to $str, raw or after sanitization, is returned immediately.
     * Otherwise each name is scored by the smaller of its raw and sanitized
     * levenshtein distances; the first name with the lowest score is returned
     * when that score does not exceed $max.
     *
     * @param string   $str   Parameter name to correct
     * @param string[] $names Known parameter names
     * @param int      $max   Maximum number of permutation/add/subtraction
     *
     * @return string|null Null when $names is empty, when the best score exceeds $max,
     *                     or when the sanitized $str is listed in SKIP_PREDICT_PARAM
     */
    public static function predictCorrectParam(string $str, array $names, int $max = 2): ?string
    {
        $sanitized = self::sanitizeParamForPredict($str);
        $closest = null;
        $shortest = null;

        foreach ($names as $name) {
            $sanitizedName = self::sanitizeParamForPredict($name);
            if ($str === $name || $sanitized === $sanitizedName) {
                return $name; // exact match
            }

            // Record the distance that actually made the candidate win, so the
            // tracked minimum can never increase from one candidate to the next.
            $distance = min(levenshtein($str, $name), levenshtein($sanitized, $sanitizedName));
            if ($shortest === null || $distance < $shortest) {
                $closest = $name;
                $shortest = $distance;
            }
        }

        if ($shortest !== null
            && $shortest <= $max
            && !in_array($sanitized, self::SKIP_PREDICT_PARAM, true)
        ) {
            return $closest;
        }

        return null;
    }

    /**
     * For predictCorrectParam().
     * Strips accents and punctuation, lower-cases, then keeps only a-z, 0-9 and
     * spaces, and trims the result.
     */
    private static function sanitizeParamForPredict(string $str): string
    {
        $sanitized = mb_strtolower(self::stripPunctuation(self::stripAccents($str)));

        // The cast guards against a null result of preg_replace().
        return trim((string) preg_replace('#[^a-z0-9 ]#', '', $sanitized));
    }

    /**
     * Strip punctuation: removes every occurrence of the ALL_PUNCTUATION entries.
     * Note : can't use str_split() which cut on 1 byte length
     * See http://fr.wikipedia.org/wiki/Ponctuation.
     */
    public static function stripPunctuation(string $str): string
    {
        return str_replace(self::ALL_PUNCTUATION, '', $str);
    }

    /**
     * Strip accents.
     * Covers the letters listed in ACCENT_MAP: Latin letters with grave, acute,
     * circumflex, tilde, diaeresis, macron, caron, breve or ring above, plus a
     * few Greek and Cyrillic letters.
     *
     * Input and output are UTF-8; characters absent from the map are returned
     * unchanged.
     */
    public static function stripAccents(string $string): string
    {
        // Array form of strtr() replaces whole UTF-8 sequences, never single bytes.
        return strtr($string, self::ACCENT_MAP);
    }

    /**
     * Like PHP8 str_ends_with(). Multibytes ok.
     * An empty needle always matches.
     */
    public static function str_ends_with(string $haystack, string $needle): bool
    {
        $len = mb_strlen($needle);
        if ($len === 0) {
            return true;
        }

        return mb_substr($haystack, -$len) === $needle;
    }

    /**
     * Like PHP8 str_starts_with().
     * An empty needle always matches.
     */
    public static function str_starts_with(string $haystack, string $needle): bool
    {
        return mb_substr($haystack, 0, mb_strlen($needle)) === $needle;
    }

    /**
     * Cut string at position of last space before Xth character.
     *
     * A text longer than $maxLength characters is shortened and SYMBOL_TEXT_CUT
     * is appended. The cut happens on the last space of the first $maxLength
     * characters when that space is less than 12 characters before the limit,
     * otherwise exactly at $maxLength. Shorter texts are returned unchanged.
     */
    public static function cutTextOnSpace(string $text, int $maxLength = 70): string
    {
        if (mb_strlen($text) <= $maxLength) {
            return $text;
        }

        $spacePos = mb_strrpos(mb_substr($text, 0, $maxLength), ' ');
        $cutPos = ($spacePos !== false && $spacePos > $maxLength - self::CUT_SPACE_TOLERANCE)
            ? $spacePos
            : $maxLength;

        return trim(mb_substr($text, 0, $cutPos)) . self::SYMBOL_TEXT_CUT;
    }

    /**
     * Count the space-separated words longer than 2 characters that are equal
     * to their own upper-case form.
     * Note: a word without any cased letter (digits, symbols) also satisfies
     * that comparison and is counted.
     */
    public static function countAllCapsWords(string $text): int
    {
        $count = 0;
        foreach (explode(' ', $text) as $word) {
            if (mb_strlen($word) > 2 && mb_strtoupper($word) === $word) {
                ++$count;
            }
        }

        return $count;
    }

    /**
     * Repair UTF-8 text that was decoded as Windows-1252 and re-encoded as
     * UTF-8 (e.g. "Ã©" instead of "é"). Also collapses two consecutive spaces
     * into one.
     *
     * code source:  https://github.com/devgeniem/wp-sanitize-accented-uploads/blob/master/plugin.php#L152
     * table source: http://www.i18nqa.com/debug/utf8-debug.html
     *
     * @param string|string[] $inputString
     *
     * @return string|string[]
     */
    public static function fixWrongUTF8Encoding($inputString)
    {
        static $fixList = null;
        if ($fixList === null) {
            $fixList = self::buildMojibakeFixList();
        }

        // str_replace() applies the pairs one after the other, in table order.
        return str_replace(array_keys($fixList), array_values($fixList), $inputString);
    }

    /**
     * Build the "broken sequence => real character" table of fixWrongUTF8Encoding().
     *
     * The broken sequences are computed from the real characters rather than
     * stored as literals: literal mojibake contains invisible characters and
     * is easily damaged by editors or encoding-repair tools.
     *
     * @return array<string, string>
     */
    private static function buildMojibakeFixList(): array
    {
        // Same order as the reference table: longest broken sequences first.
        $firstPass = [
            // 3 char errors first
            '‚', '„', '…', '‡', '‰', '‹', '‘', '’', '“', '•', '–', '—', '™', '›', '€',
            // 2 char errors
            'Â', 'ƒ', 'Ã', 'Ä', 'Å', '†', 'Æ', 'Ç', 'ˆ', 'È', 'É', 'Ê',
            'Ë', 'Œ', 'Ì', 'Ž', 'Î', 'Ñ', 'Ò', 'Ó', '”', 'Ô', 'Õ', 'Ö',
            '×', '˜', 'Ø', 'Ù', 'š', 'Ú', 'Û', 'œ', 'Ü', 'ž', 'Þ', 'Ÿ',
            'ß', '¡', 'á', '¢', 'â', '£', 'ã', '¤', 'ä', '¥', 'å', '¦',
            'æ', '§', 'ç', '¨', 'è', '©', 'é', 'ª', 'ê', '«', 'ë', '¬',
            'ì', '®', 'î', '¯', 'ï', '°', 'ð', '±', 'ñ', '²', 'ò', '³',
            'ó', '´', 'ô', 'µ', 'õ', '¶', 'ö', '·', '÷', '¸', 'ø', '¹',
            'ù', 'º', 'ú', '»', 'û', '¼', 'ü', '½', 'ý', '¾', 'þ', '¿',
            'ÿ', 'À',
        ];
        // 1 char errors last: their second character is invisible
        // (control character, no-break space or soft hyphen).
        $lastPass = ['Á', 'Š', 'Í', 'Ï', 'Ð', 'Ý', 'à', 'í'];

        $fixList = [];
        foreach ($firstPass as $char) {
            $fixList[self::toMojibake($char)] = $char;
        }
        $fixList['  '] = ' '; // double space
        foreach ($lastPass as $char) {
            $fixList[self::toMojibake($char)] = $char;
        }

        return $fixList;
    }

    /**
     * Return what $char looks like once its UTF-8 bytes have been read as
     * Windows-1252 and written back as UTF-8.
     */
    private static function toMojibake(string $char): string
    {
        $broken = '';
        for ($i = 0, $length = strlen($char); $i < $length; ++$i) {
            $byte = ord($char[$i]);
            $broken .= mb_chr(self::CP1252_SPECIALS[$byte] ?? $byte, 'UTF-8');
        }

        return $broken;
    }
}
326