Test Failed
Push — master ( 2eb953...f148c7 )
by Dispositif
08:32
created

TextUtil::predictCorrectParam()   B

Complexity

Conditions 10
Paths 7

Size

Total Lines 22
Code Lines 14

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 13
CRAP Score 10.2368

Importance

Changes 0
Metric Value
cc 10
eloc 14
c 0
b 0
f 0
nc 7
nop 3
dl 0
loc 22
ccs 13
cts 15
cp 0.8667
crap 10.2368
rs 7.6666

How to fix   Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
/*
3
 * This file is part of dispositif/wikibot application (@github)
4
 * 2019/2020 © Philippe/Irønie  <[email protected]>
5
 * For the full copyright and MIT license information, view the license file.
6
 */
7
8
declare(strict_types=1);
9
10
namespace App\Domain\Utils;
11
12
/**
13
 * Class TextUtil.
14
 */
15
/**
 * Static helpers for UTF-8 text: case of the first letter, non-breaking
 * spaces, punctuation/accent stripping and fuzzy prediction of a parameter name.
 */
abstract class TextUtil
{
    /**
     * Sanitized parameter names for which predictCorrectParam() never returns
     * a fuzzy (non-exact) match.
     */
    public const SKIP_PREDICT_PARAM = ['issue'];

    public const NO_BREAK_SPACE = "\xC2\xA0"; // U+00A0, &#160;

    public const NO_BREAK_THIN_SPACE = "\xE2\x80\xAF"; // U+202F, &#8239;

    /**
     * Characters removed by stripPunctuation().
     */
    public const ALL_PUNCTUATION
        = [
            '!',
            '"',
            '«',
            '»',
            '#',
            '$',
            '%',
            "'",
            '’',
            '´',
            '`',
            '^',
            '…',
            '‽',
            '(',
            ')',
            '*',
            '⁂',
            '+',
            '–',
            '—',
            '/',
            ':',
            ';',
            '?',
            '@',
            '[',
            '\\',
            ']',
            '_',
            '{',
            '|',
            '¦',
            '}',
            '~',
            '<',
            '>',
            '№',
            '©',
            '®',
            '°',
            '†',
            '§',
            '∴',
            '∵',
            '¶',
            '•',
        ];

    // Typographic characters kept for reference (currently unused):
    //    const ELLIPSIS = '…';
    //    const LAQUO = '«'; // &laquo;
    //    const RAQUO = '»'; // &raquo;
    //    const RSQUO = '’'; // &rsquo;
    //    const TIMES = '×'; // &times;
    //    const NDASH = '–'; // &ndash; or &#x2013;
    //    const MDASH = '—'; // &mdash; or &#x2014;
    //    const LDQUO = '“'; // &ldquo; or &#8220;
    //    const RDQUO = '”'; // &rdquo; or &#8221;
    //    const BDQUO = '„'; // &bdquo; or &#8222;
    //    const SHY = "\xC2\xAD"; // &shy;
    //    const TRADE = '™'; // &trade;
    //    const REG = '®'; // &reg;
    //    const COPY = '©'; // &copy;

    /**
     * Regexp alternation fragment: narrow no-break space, soft hyphen (U+00AD),
     * no-break space, or any "\s" whitespace. Broader than "\s" alone.
     */
    public const ALL_SPACES = "\xE2\x80\xAF|\xC2\xAD|\xC2\xA0|\\s";

    /**
     * Accented characters replaced by stripAccents(). The character at position
     * N is replaced by the character at position N of ACCENT_REPLACEMENTS.
     */
    private const ACCENTED_CHARS = 'àáâãäçèéêëìíîïñòóôõöùúûüýÿÀÁÂÃÄÇÈÉÊËÌÍÎÏÑÒÓÔÕÖÙÚÛÜÝøāǟǡēḕḗḡḹīōȫȭȱṑṓǭṝūǖṻȳǣӣᾱῑῡčšžйўŭăӗğÅåůẘẙ';

    private const ACCENT_REPLACEMENTS = 'aaaaaceeeeiiiinooooouuuuyyAAAAACEEEEIIIINOOOOOUUUUYoaaaeeeglioooooooruuuyæиαιυcszиyuaegAauwy';

    /**
     * UTF8 first letter in upper case.
     * "économie" => "Économie".
     *
     * @param string      $str
     * @param string|null $e   Character encoding; null means the mbstring internal encoding.
     *
     * @return string
     */
    public static function mb_ucfirst(string $str, ?string $e = 'UTF-8'): string
    {
        // mb_strtoupper() rejects a null encoding under strict types before PHP 8.
        $encoding = $e ?? mb_internal_encoding();
        $first = mb_strtoupper(mb_substr($str, 0, 1, $encoding), $encoding);

        return $first.mb_substr($str, 1, null, $encoding);
    }

    /**
     * UTF8 first letter in lower case.
     * "Économie" => "économie".
     *
     * @param string      $str
     * @param string|null $e   Character encoding; null means the mbstring internal encoding.
     *
     * @return string
     */
    public static function mb_lowerfirst(string $str, ?string $e = 'UTF-8'): string
    {
        // mb_strtolower() rejects a null encoding under strict types before PHP 8.
        $encoding = $e ?? mb_internal_encoding();
        $first = mb_strtolower(mb_substr($str, 0, 1, $encoding), $encoding);

        return $first.mb_substr($str, 1, null, $encoding);
    }

    /**
     * Replace no-break spaces (U+00A0) and narrow no-break spaces (U+202F)
     * by a regular space.
     *
     * @param string $text
     *
     * @return string
     */
    public static function replaceNonBreakingSpaces(string $text)
    {
        return str_replace([self::NO_BREAK_SPACE, self::NO_BREAK_THIN_SPACE], ' ', $text);
    }

    /**
     * Trim no-break spaces (U+00A0, U+202F), line feeds, tabs and carriage
     * returns from both ends of a string. Regular spaces are left untouched.
     *
     * PHP's trim() works on single bytes: giving it the UTF-8 bytes of the
     * no-break spaces would also cut the first/last byte of characters such as
     * "à" or "«". A Unicode-aware regexp is used instead.
     *
     * @param string $string
     *
     * @return string
     */
    public static function trim(string $string)
    {
        $trimmed = preg_replace(
            '/\A[\x{00A0}\x{202F}\n\t\r]+|[\x{00A0}\x{202F}\n\t\r]+\z/u',
            '',
            $string
        );

        // preg_replace() returns null on malformed UTF-8: give the input back unchanged.
        return $trimmed ?? $string;
    }

    // Todo verify/correct.
    //    static public function containsNonLatinCharacters(string $str): bool
    //    {
    //        return preg_match('/[^\\p{Common}\\p{Latin}]/u', $str);
    //    }

    /**
     * Simplest levenshtein distance prediction of the correct param name.
     *
     * Each candidate is compared twice: raw strings, and sanitized strings
     * (lower case, no accents, no punctuation). The smaller of the two
     * distances is the candidate's score. levenshtein() counts bytes, so raw
     * accented strings give inflated scores: levenshtein('notre','nôtre') => 2.
     * TODO move.
     *
     * @param string $str   Name to correct
     * @param array  $names Valid names
     * @param int    $max   Maximum number of permutation/add/subtraction
     *
     * @return string|null An exact match (raw or sanitized), else the first
     *                     closest name within $max edits, else null
     */
    public static function predictCorrectParam(string $str, array $names, int $max = 2): ?string
    {
        $sanitized = self::sanitizeParamForPredict($str);
        $closest = null;
        $shortest = null;

        foreach ($names as $name) {
            $sanitizedName = self::sanitizeParamForPredict($name);
            if ($str === $name || $sanitized === $sanitizedName) {
                return $name; // exact match
            }

            // Score with the best of both comparisons, so that the stored
            // minimum is the distance that actually selected the candidate.
            $distance = min(levenshtein($str, $name), levenshtein($sanitized, $sanitizedName));
            if ($shortest === null || $distance < $shortest) {
                $closest = $name;
                $shortest = $distance;
            }
        }

        if ($shortest === null || $shortest > $max) {
            return null;
        }
        if (in_array($sanitized, self::SKIP_PREDICT_PARAM, true)) {
            return null; // no fuzzy prediction for these names
        }

        return $closest;
    }

    /**
     * For predictCorrectParam().
     * Lower case, without accents nor punctuation, restricted to [a-z0-9 ].
     *
     * @param string $str
     *
     * @return string
     */
    private static function sanitizeParamForPredict(string $str): string
    {
        $lowered = mb_strtolower(self::stripPunctuation(self::stripAccents($str)));

        return trim((string) preg_replace('#[^a-z0-9 ]#', '', $lowered));
    }

    /**
     * Strip punctuation: remove every character listed in ALL_PUNCTUATION.
     * str_replace() compares whole byte sequences, which is safe on valid UTF-8.
     * Note : can't use str_split() which cut on 1 byte length
     * See http://fr.wikipedia.org/wiki/Ponctuation.
     *
     * @param string $str
     *
     * @return string
     */
    public static function stripPunctuation(string $str)
    {
        return str_replace(self::ALL_PUNCTUATION, '', $str);
    }

    /**
     * Strip accents
     * Covers: Greek, Cyrillic, macron, caron (hacek), breve, ring above, tilde.
     * UTF-8 in, UTF-8 out: characters absent from the table are kept as is.
     *
     * The replacement is done character by character on the UTF-8 string. A
     * Latin-1 round trip (utf8_decode) would turn every non Latin-1 character
     * into "?" and make the table remap literal question marks.
     *
     * @param string $string
     *
     * @return string
     */
    public static function stripAccents(string $string): string
    {
        static $map = null;
        if ($map === null) {
            // Split both tables into UTF-8 characters and pair them by position.
            $map = array_combine(
                preg_split('//u', self::ACCENTED_CHARS, -1, PREG_SPLIT_NO_EMPTY),
                preg_split('//u', self::ACCENT_REPLACEMENTS, -1, PREG_SPLIT_NO_EMPTY)
            );
        }

        return strtr($string, $map);
    }

    /**
     * Like PHP8 str_ends_with(). Multibytes ok.
     * Byte comparison: on valid UTF-8 a matching byte suffix is a matching
     * character suffix. An empty needle always matches.
     *
     * @param string $haystack
     * @param string $needle
     *
     * @return bool
     */
    public static function str_ends_with(string $haystack, string $needle): bool
    {
        if ($needle === '') {
            return true;
        }

        return substr($haystack, -strlen($needle)) === $needle;
    }

    /**
     * Like PHP8 str_starts_with(). Multibytes ok.
     * Byte comparison, an empty needle always matches.
     *
     * @param string $haystack
     * @param string $needle
     *
     * @return bool
     */
    public static function str_starts_with(string $haystack, string $needle): bool
    {
        return strncmp($haystack, $needle, strlen($needle)) === 0;
    }
}
286