Passed
Push — master ( dff8a4...2556d0 )
by Dispositif
08:19
created

TextUtil::cutTextOnSpace()   A

Complexity

Conditions 3
Paths 3

Size

Total Lines 9
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 12

Importance

Changes 0
Metric Value
cc 3
eloc 5
c 0
b 0
f 0
nc 3
nop 2
dl 0
loc 9
ccs 0
cts 0
cp 0
crap 12
rs 10
1
<?php
2
/*
3
 * This file is part of dispositif/wikibot application (@github)
4
 * 2019-2023 © Philippe M./Irønie  <[email protected]>
5
 * For the full copyright and MIT license information, view the license file.
6
 */
7
8
declare(strict_types=1);
9
10
namespace App\Domain\Utils;
11
12
/**
 * Static helpers for UTF-8 text handling: case of first letter, trimming of
 * typographic spaces, punctuation/accent stripping, parameter-name prediction
 * and text truncation.
 *
 * The class is abstract only to prevent instantiation; every method is static.
 */
abstract class TextUtil
{
    /** Symbol appended by cutTextOnSpace() when the text has been shortened. */
    public const SYMBOL_TEXT_CUT = '…';

    /** Sanitized parameter names for which predictCorrectParam() never proposes a correction. */
    public const SKIP_PREDICT_PARAM = ['issue'];

    public const NO_BREAK_SPACE = "\xC2\xA0"; // &#160; (U+00A0)

    public const NO_BREAK_THIN_SPACE = "\xE2\x80\xAF"; // &#8239; (U+202F)

    /** Characters removed by stripPunctuation(). */
    public const ALL_PUNCTUATION
        = [
            '!', '"', '«', '»', '#', '$', '%', "'", '’', '´',
            '`', '^', '…', '‽', '(', ')', '*', '⁂', '+', '–',
            '—', '/', ':', ';', '?', '@', '[', '\\', ']', '_',
            '`', '{', '|', '¦', '}', '~', '<', '>', '№', '©',
            '®', '°', '†', '§', '∴', '∵', '¶', '•', '+',
        ];

    //    const ELLIPSIS = '…';
    //    const LAQUO = '«'; // &laquo;
    //    const RAQUO = '»'; // &raquo;
    //    const RSQUO = '’'; // &rsquo;
    //    const TIMES = '×'; // &times;
    //    const NDASH = '–'; // &ndash; or &#x2013;
    //    const MDASH = '—'; // &mdash; or &#x2014;
    //    const LDQUO = '“'; // &ldquo; or &#8220;
    //    const RDQUO = '”'; // &rdquo; or &#8221;
    //    const BDQUO = '„'; // &bdquo; or &#8222;
    //    const SHY = "\xC2\xAD"; // &shy;
    //    const TRADE = '™'; // &trade;
    //    const REG = '®'; // &reg;
    //    const COPY = '©'; // &copy;

    /**
     * Regex alternation fragment (to be embedded in a pattern): U+202F, U+00AD,
     * U+00A0 or any "\s" character. Used in regexps. Better than \s alone.
     */
    public const ALL_SPACES = "\xE2\x80\xAF|\xC2\xAD|\xC2\xA0|\\s";

    /**
     * UTF8 first letter in upper case.
     * "économie" => "Économie".
     *
     * @param string      $str Text to transform.
     * @param string|null $e   Character encoding handed to the mb_* functions.
     */
    public static function mb_ucfirst(string $str, ?string $e = 'UTF-8'): string
    {
        $head = mb_substr($str, 0, 1, $e);
        $tail = mb_substr($str, 1, null, $e); // null length = up to the end of the string

        return mb_strtoupper($head, $e).$tail;
    }

    /**
     * UTF8 first letter in lower case.
     * "Économie" => "économie".
     *
     * @param string      $str Text to transform.
     * @param string|null $e   Character encoding handed to the mb_* functions.
     */
    public static function mb_lowerfirst(string $str, ?string $e = 'UTF-8'): string
    {
        $head = mb_substr($str, 0, 1, $e);
        $tail = mb_substr($str, 1, null, $e); // null length = up to the end of the string

        return mb_strtolower($head, $e).$tail;
    }

    /**
     * Replace every no-break space (U+00A0) and narrow no-break space (U+202F)
     * by a plain ASCII space.
     */
    public static function replaceNonBreakingSpaces(string $text): string
    {
        return str_replace([self::NO_BREAK_SPACE, self::NO_BREAK_THIN_SPACE], ' ', $text);
    }

    /**
     * Trim no-break spaces (U+00A0, U+202F), line feeds, tabs and carriage returns
     * from both ends of the string. Plain ASCII spaces are NOT trimmed.
     *
     * Works on whole code points: the native trim() treats its character list as
     * single bytes, which would strip the last byte of "à" or the first byte of "«"
     * and leave invalid UTF-8 behind.
     */
    public static function trim(string $string): string
    {
        $trimmed = preg_replace(
            '/\A[\x{00A0}\x{202F}\n\t\r]+|[\x{00A0}\x{202F}\n\t\r]+\z/u',
            '',
            $string
        );

        // preg_replace() yields null when $string is not valid UTF-8:
        // fall back to trimming only the single-byte characters.
        return $trimmed ?? trim($string, "\n\t\r");
    }

    /**
     * Todo verify/correct.
     *
     * @param string $str
     *
     * @return bool
     */
    //    static public function containsNonLatinCharacters(string $str): bool
    //    {
    //        return preg_match('/[^\\p{Common}\\p{Latin}]/u', $str);
    //    }

    /**
     * Simplest levenshtein distance prediction of the correct param name.
     * Each candidate is scored by the smaller of two distances: raw strings, and
     * sanitized strings (accents/punctuation stripped, lower case).
     * Weird results with ASCII extended chars (levenshtein() is byte based) :
     * levenshtein('notre','nôtre') => 2
     * TODO move.
     *
     * @param string $str   Name to correct.
     * @param array  $names Candidate names (strings).
     * @param int    $max   Maximum number of permutation/add/subtraction
     *
     * @return string|null The exact or closest candidate, or null when no candidate is
     *                     within $max edits or when the sanitized $str is listed in
     *                     SKIP_PREDICT_PARAM (exact matches are still returned).
     */
    public static function predictCorrectParam(string $str, array $names, int $max = 2): ?string
    {
        $sanitized = self::sanitizeParamForPredict($str);
        $closest = null;
        $shortest = null;

        foreach ($names as $name) {
            $sanitizedName = self::sanitizeParamForPredict($name);
            if ($str === $name || $sanitized === $sanitizedName) {
                return $name; // exact match
            }

            // Keep the best of both measures, so that the stored score is really the
            // one that made this candidate win.
            $distance = min(levenshtein($str, $name), levenshtein($sanitized, $sanitizedName));
            if ($shortest === null || $distance < $shortest) {
                $closest = $name;
                $shortest = $distance;
            }
        }

        if ($shortest !== null
            && $shortest <= $max
            && !in_array($sanitized, self::SKIP_PREDICT_PARAM, true)
        ) {
            return $closest;
        }

        return null;
    }

    /**
     * For predictCorrectParam().
     * Strips accents and punctuation, lower-cases, then keeps only [a-z0-9 ].
     */
    private static function sanitizeParamForPredict(string $str): string
    {
        $sanitized = mb_strtolower(self::stripPunctuation(self::stripAccents($str)));

        return trim(preg_replace('#[^a-z0-9 ]#', '', $sanitized));
    }

    /**
     * Strip punctuation : removes every character listed in ALL_PUNCTUATION.
     * Safe on UTF-8 text: str_replace() matches whole byte sequences, and a UTF-8
     * sequence cannot match in the middle of another character.
     * Note : can't use str_split() which cut on 1 byte length
     * See http://fr.wikipedia.org/wiki/Ponctuation.
     */
    public static function stripPunctuation(string $str): string
    {
        return str_replace(self::ALL_PUNCTUATION, '', $str);
    }

    /**
     * Strip accents
     * OK : grec, cyrillique, macron, hatchek, brève, rond en chef, tilde
     * UTF-8 compatible: input and output are UTF-8, characters without a
     * replacement are returned untouched.
     *
     * The two strings below are aligned character by character. They are split on
     * UTF-8 code points and used with the array form of strtr(); the previous
     * utf8_decode() + 3-argument strtr() worked on ISO-8859-1 bytes, turned every
     * character outside Latin-1 into "?" and translated a literal "?" into "y".
     *
     * @param string $string UTF-8 text.
     *
     * @return string UTF-8 text with the known accented letters replaced.
     */
    public static function stripAccents(string $string): string
    {
        static $map = null;

        if ($map === null) {
            $from = preg_split(
                '//u',
                'àáâãäçèéêëìíîïñòóôõöùúûüýÿÀÁÂÃÄÇÈÉÊËÌÍÎÏÑÒÓÔÕÖÙÚÛÜÝøāǟǡēḕḗḡḹīōȫȭȱṑṓǭṝūǖṻȳǣӣᾱῑῡčšžйўŭăӗğÅåůẘẙ',
                -1,
                PREG_SPLIT_NO_EMPTY
            );
            $to = preg_split(
                '//u',
                'aaaaaceeeeiiiinooooouuuuyyAAAAACEEEEIIIINOOOOOUUUUYoaaaeeeglioooooooruuuyæиαιυcszиyuaegAauwy',
                -1,
                PREG_SPLIT_NO_EMPTY
            );
            // Like the 3-argument strtr(): ignore the surplus of the longer list.
            $size = min(count($from), count($to));
            $map = array_combine(array_slice($from, 0, $size), array_slice($to, 0, $size));
        }

        return strtr($string, $map);
    }

    /**
     * Like PHP8 str_ends_with(). Multibytes ok.
     * An empty needle always matches.
     *
     * @param string $haystack
     * @param string $needle
     *
     * @return bool
     */
    public static function str_ends_with(string $haystack, string $needle): bool
    {
        $needleLength = mb_strlen($needle);
        if ($needleLength === 0) {
            return true; // mb_substr($haystack, -0) would return the whole haystack
        }

        return mb_substr($haystack, -$needleLength) === $needle;
    }

    /**
     * Like PHP8 str_starts_with().
     * An empty needle always matches.
     *
     * @param string $haystack
     * @param string $needle
     *
     * @return bool
     */
    public static function str_starts_with(string $haystack, string $needle): bool
    {
        return mb_substr($haystack, 0, mb_strlen($needle)) === $needle;
    }

    /**
     * Cut string at position of last space before Xth character.
     * The cut is made on that space only if it lies within the last 12 characters
     * of the allowed length; otherwise the text is cut at exactly $maxLength
     * characters. SYMBOL_TEXT_CUT is appended to a shortened text, so the result
     * may be one character longer than $maxLength.
     *
     * @param string $text      Text to shorten.
     * @param int    $maxLength Maximum number of characters kept from $text.
     *
     * @return string $text unchanged when it is not longer than $maxLength.
     */
    public static function cutTextOnSpace(string $text, int $maxLength = 70): string
    {
        if (mb_strlen($text) <= $maxLength) {
            return $text;
        }

        $cutPos = $maxLength;
        $spacePos = mb_strrpos(mb_substr($text, 0, $maxLength), ' ');
        // mb_strrpos() returns false when the kept part contains no space.
        if ($spacePos !== false && $spacePos > $maxLength - 12) {
            $cutPos = $spacePos;
        }

        return trim(mb_substr($text, 0, $cutPos)).self::SYMBOL_TEXT_CUT;
    }

    /**
     * Count the space-separated words of more than 2 characters written in capitals.
     * A word must contain at least one cased letter: tokens such as "2019" or "---"
     * are equal to their upper-case form but are not counted.
     */
    public static function countAllCapsWords(string $text): int
    {
        $count = 0;
        foreach (explode(' ', $text) as $word) {
            if (mb_strlen($word) > 2
                && mb_strtoupper($word) === $word
                && mb_strtolower($word) !== $word
            ) {
                ++$count;
            }
        }

        return $count;
    }
}
296