Passed
Push — master ( 5eccc7...1faa52 )
by Dispositif
02:45
created

TextUtil::mb_ucfirst()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 6
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 4
CRAP Score 1

Importance

Changes 0
Metric Value
cc 1
eloc 3
nc 1
nop 2
dl 0
loc 6
ccs 4
cts 4
cp 1
crap 1
rs 10
c 0
b 0
f 0
1
<?php
2
/**
3
 * This file is part of dispositif/wikibot application
4
 * 2019 : Philippe M. <[email protected]>
5
 * For the full copyright and MIT license information, please view the LICENSE file.
6
 */
7
8
declare(strict_types=1);
9
10
namespace App\Domain\Utils;
11
12
/**
 * Static helpers for UTF-8 text handling: case of the first letter,
 * non-breaking spaces, punctuation/accent stripping and prediction of a
 * misspelled parameter name.
 */
abstract class TextUtil
{
    /**
     * Sanitized parameter names for which predictCorrectParam() never
     * proposes an approximate correction (exact matches are still returned).
     */
    public const SKIP_PREDICT_PARAM = ['issue'];

    /** U+00A0 NO-BREAK SPACE, UTF-8 encoded (&#160;). */
    public const NO_BREAK_SPACE = "\xC2\xA0";

    /** U+202F NARROW NO-BREAK SPACE, UTF-8 encoded (&#8239;). */
    public const NO_BREAK_THIN_SPACE = "\xE2\x80\xAF";

    /**
     * Strings removed by stripPunctuation().
     * The list contains a few duplicates ('`', '+'); they are harmless for str_replace().
     */
    public const ALL_PUNCTUATION
        = [
            '!', '"', '«', '»', '#', '$', '%', "'", '’', '´',
            '`', '^', '…', '‽', '(', ')', '*', '⁂', '+', '–',
            '—', '/', ':', ';', '?', '@', '[', '\\', ']', '_',
            '`', '{', '|', '¦', '}', '~', '<', '>', '№', '©',
            '®', '°', '†', '§', '∴', '∵', '¶', '•', '+',
        ];

    //    const ELLIPSIS = '…';
    //    const LAQUO = '«'; // &laquo;
    //    const RAQUO = '»'; // &raquo;
    //    const RSQUO = '’'; // &rsquo;
    //    const TIMES = '×'; // &times;
    //    const NDASH = '–'; // &ndash; or &#x2013;
    //    const MDASH = '—'; // &mdash; or &#x2014;
    //    const LDQUO = '“'; // &ldquo; or &#8220;
    //    const RDQUO = '”'; // &rdquo; or &#8221;
    //    const BDQUO = '„'; // &bdquo; or &#8222;
    //    const SHY = "\xC2\xAD"; // &shy;
    //    const TRADE = '™'; // &trade;
    //    const REG = '®'; // &reg;
    //    const COPY = '©'; // &copy;

    /**
     * Regexp alternation fragment: narrow no-break space, soft hyphen (U+00AD),
     * no-break space, or any \s character. Used in regexps; better than \s alone.
     */
    public const ALL_SPACES = "\xE2\x80\xAF|\xC2\xAD|\xC2\xA0|\\s";

    /**
     * Accented character => replacement, used by stripAccents().
     * Keys and values are UTF-8 strings, so the map is applied with the
     * array form of strtr() and never touches partial byte sequences.
     */
    private const ACCENT_MAP = [
        // a
        'à' => 'a', 'á' => 'a', 'â' => 'a', 'ã' => 'a', 'ä' => 'a',
        'ā' => 'a', 'ǟ' => 'a', 'ǡ' => 'a', 'ă' => 'a', 'å' => 'a',
        // c
        'ç' => 'c', 'č' => 'c',
        // e
        'è' => 'e', 'é' => 'e', 'ê' => 'e', 'ë' => 'e',
        'ē' => 'e', 'ḕ' => 'e', 'ḗ' => 'e', 'ӗ' => 'e',
        // g
        'ḡ' => 'g', 'ğ' => 'g',
        // i
        'ì' => 'i', 'í' => 'i', 'î' => 'i', 'ï' => 'i', 'ī' => 'i',
        // l, n, r, s, w, z
        'ḹ' => 'l', 'ñ' => 'n', 'ṝ' => 'r', 'š' => 's', 'ẘ' => 'w', 'ž' => 'z',
        // o
        'ò' => 'o', 'ó' => 'o', 'ô' => 'o', 'õ' => 'o', 'ö' => 'o', 'ø' => 'o',
        'ō' => 'o', 'ȫ' => 'o', 'ȭ' => 'o', 'ȱ' => 'o', 'ṑ' => 'o', 'ṓ' => 'o', 'ǭ' => 'o',
        // u
        'ù' => 'u', 'ú' => 'u', 'û' => 'u', 'ü' => 'u',
        'ū' => 'u', 'ǖ' => 'u', 'ṻ' => 'u', 'ŭ' => 'u', 'ů' => 'u',
        // y
        'ý' => 'y', 'ÿ' => 'y', 'ȳ' => 'y', 'ў' => 'y', 'ẙ' => 'y',
        // upper case
        'À' => 'A', 'Á' => 'A', 'Â' => 'A', 'Ã' => 'A', 'Ä' => 'A', 'Å' => 'A',
        'Ç' => 'C',
        'È' => 'E', 'É' => 'E', 'Ê' => 'E', 'Ë' => 'E',
        'Ì' => 'I', 'Í' => 'I', 'Î' => 'I', 'Ï' => 'I',
        'Ñ' => 'N',
        'Ò' => 'O', 'Ó' => 'O', 'Ô' => 'O', 'Õ' => 'O', 'Ö' => 'O',
        'Ù' => 'U', 'Ú' => 'U', 'Û' => 'U', 'Ü' => 'U',
        'Ý' => 'Y',
        // ligature, cyrillic, greek
        'ǣ' => 'æ',
        'ӣ' => 'и', 'й' => 'и',
        'ᾱ' => 'α', 'ῑ' => 'ι', 'ῡ' => 'υ',
    ];

    /**
     * UTF8 first letter in upper case.
     * "économie" => "Économie".
     *
     * @param string      $str
     * @param string|null $e   Character encoding; null means the mbstring internal encoding
     *
     * @return string
     */
    public static function mb_ucfirst(string $str, ?string $e = 'UTF-8'): string
    {
        // Resolve null here: under strict types, PHP 7.x mb_* functions reject a null encoding.
        $encoding = $e ?? mb_internal_encoding();

        return mb_strtoupper(mb_substr($str, 0, 1, $encoding), $encoding)
            .mb_substr($str, 1, null, $encoding);
    }

    /**
     * UTF8 first letter in lower case.
     * "Économie" => "économie".
     *
     * @param string      $str
     * @param string|null $e   Character encoding; null means the mbstring internal encoding
     *
     * @return string
     */
    public static function mb_lowerfirst(string $str, ?string $e = 'UTF-8'): string
    {
        // Resolve null here: under strict types, PHP 7.x mb_* functions reject a null encoding.
        $encoding = $e ?? mb_internal_encoding();

        return mb_strtolower(mb_substr($str, 0, 1, $encoding), $encoding)
            .mb_substr($str, 1, null, $encoding);
    }

    /**
     * Replace every no-break space and narrow no-break space by an ordinary space.
     *
     * @param string $text
     *
     * @return string
     */
    public static function replaceNonBreakingSpaces(string $text)
    {
        return str_replace([self::NO_BREAK_SPACE, self::NO_BREAK_THIN_SPACE], ' ', $text);
    }

    /**
     * Trim no-break space, narrow no-break space, line feed, tab and carriage
     * return from both ends of the string.
     *
     * The ordinary ASCII space is not part of the trimmed set.
     * A Unicode-aware regexp is used instead of native trim(): the character
     * list of trim() is a list of single bytes, so giving it multi-byte
     * sequences would also strip stray bytes of unrelated characters
     * (e.g. the last byte of a trailing "à").
     *
     * @param string $string
     *
     * @return string
     */
    public static function trim(string $string)
    {
        // Single-quoted '\n\t\r' is left for PCRE to interpret inside the character class.
        $class = '['.self::NO_BREAK_SPACE.self::NO_BREAK_THIN_SPACE.'\n\t\r]+';
        $trimmed = preg_replace('/\A'.$class.'|'.$class.'\z/u', '', $string);

        // preg_replace() returns null when $string is not valid UTF-8:
        // fall back to trimming only the single-byte characters.
        return $trimmed ?? trim($string, "\n\t\r");
    }

    // Todo verify/correct.
    //    static public function containsNonLatinCharacters(string $str): bool
    //    {
    //        return preg_match('/[^\\p{Common}\\p{Latin}]/u', $str);
    //    }

    /**
     * Simplest levenshtein distance prediction of the correct param name.
     *
     * Each candidate is scored with the smaller of two distances: the one
     * between the raw strings and the one between their sanitized forms
     * (see sanitizeParamForPredict()). The first candidate with the lowest
     * score wins, provided the score does not exceed $max.
     *
     * Weird results with ASCII extended chars, because levenshtein() counts bytes :
     * levenshtein('notre','nôtre') => 2
     * TODO move.
     *
     * @param string $str   Name to correct
     * @param array  $names Valid names
     * @param int    $max   Maximum number of permutation/add/subtraction
     *
     * @return string|null The matching or closest name, null when nothing is close enough
     */
    public static function predictCorrectParam(string $str, array $names, int $max = 2): ?string
    {
        $sanitized = self::sanitizeParamForPredict($str);
        $closest = null;
        $shortest = null;

        foreach ($names as $name) {
            $sanitizedName = self::sanitizeParamForPredict($name);
            if ($str === $name || $sanitized === $sanitizedName) {
                return $name; // exact match
            }

            // Keep the distance that actually qualified the candidate, so that the
            // later comparisons and the $max check use a consistent score.
            $distance = min(levenshtein($str, $name), levenshtein($sanitized, $sanitizedName));
            if (null === $shortest || $distance < $shortest) {
                $closest = $name;
                $shortest = $distance;
            }
        }

        if (null === $shortest || $shortest > $max) {
            return null;
        }
        if (in_array($sanitized, self::SKIP_PREDICT_PARAM, true)) {
            return null;
        }

        return $closest;
    }

    /**
     * For predictCorrectParam().
     * Strip accents and punctuation, lower-case, then keep only a-z, 0-9 and
     * spaces, and trim the result.
     *
     * @param string $str
     *
     * @return string
     */
    private static function sanitizeParamForPredict(string $str): string
    {
        $lowered = mb_strtolower(self::stripPunctuation(self::stripAccents($str)));

        return trim((string) preg_replace('#[^a-z0-9 ]#', '', $lowered));
    }

    /**
     * Strip punctuation : remove every occurrence of the ALL_PUNCTUATION strings.
     *
     * str_replace() works on bytes, which is safe on valid UTF-8 input because
     * the encoding of one character never appears inside the encoding of another.
     * Note : can't use str_split() which cut on 1 byte length
     * See http://fr.wikipedia.org/wiki/Ponctuation.
     *
     * @param string $str
     *
     * @return string
     */
    public static function stripPunctuation(string $str)
    {
        return str_replace(self::ALL_PUNCTUATION, '', $str);
    }

    /**
     * Strip accents
     * OK : grec, cyrillique, macron, hatchek, brève, rond en chef, tilde
     * UTF-8 compatible : characters absent from ACCENT_MAP are returned unchanged
     * and the result stays UTF-8 encoded.
     *
     * @param string $string
     *
     * @return string
     */
    public static function stripAccents(string $string): string
    {
        // Array form of strtr(): whole UTF-8 sequences are replaced, nothing is
        // converted to ISO-8859-1 (which would turn unsupported characters into '?').
        return strtr($string, self::ACCENT_MAP);
    }
}
253