TextUtil::mb_ucfirst() - Code Metrics - Inspection of "Style & logic (scrutiniz)" - Dispositif/Wikibot - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( 5eccc7...1faa52 )

by Dispositif

created 2020-04-18 20:24 UTC

TextUtil::mb_ucfirst() A

↳ Parent: TextUtil

Complexity

Conditions	1
Paths	1

Size

Total Lines	6
Code Lines	3

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	4
CRAP Score	1

Importance

Changes

Metric	Value
cc	1
eloc	3
nc	1
nop	2
dl	0
loc	6
ccs	4
cts	4
cp	1
crap	1
rs	10
c	0
b	0
f	0

<?php
/**
 * This file is part of dispositif/wikibot application
 * 2019 : Philippe M. <[email protected]>
 * For the full copyright and MIT license information, please view the LICENSE file.
 */

declare(strict_types=1);

namespace App\Domain\Utils;

/**
 * Static helpers for UTF-8 text: first-letter casing, non-breaking space
 * handling, punctuation/accent stripping and a Levenshtein-based guess of a
 * misspelled parameter name.
 */
abstract class TextUtil
{
    // Sanitized parameter names for which predictCorrectParam() never proposes a correction.
    const SKIP_PREDICT_PARAM = ['issue'];

    const NO_BREAK_SPACE = "\xC2\xA0"; // U+00A0, &#160;

    const NO_BREAK_THIN_SPACE = "\xE2\x80\xAF"; // U+202F, &#8239;

    // Characters removed by stripPunctuation().
    // NOTE(review): '.', ',' and '-' are not listed, and '`' and '+' appear twice — confirm intent.
    const ALL_PUNCTUATION
        = [
            '!',
            '"',
            '«',
            '»',
            '#',
            '$',
            '%',
            "'",
            '’',
            '´',
            '`',
            '^',
            '…',
            '‽',
            '(',
            ')',
            '*',
            '⁂',
            '+',
            '–',
            '—',
            '/',
            ':',
            ';',
            '?',
            '@',
            '[',
            '\\',
            ']',
            '_',
            '`',
            '{',
            '|',
            '¦',
            '}',
            '~',
            '<',
            '>',
            '№',
            '©',
            '®',
            '°',
            '†',
            '§',
            '∴',
            '∵',
            '¶',
            '•',
            '+',
        ];

    // Typographic characters kept for reference (currently unused):
    //    const ELLIPSIS = '…';
    //    const LAQUO = '«'; // &laquo;
    //    const RAQUO = '»'; // &raquo;
    //    const RSQUO = '’'; // &rsquo;
    //    const TIMES = '×'; // &times;
    //    const NDASH = '–'; // &ndash; or &#x2013;
    //    const MDASH = '—'; // &mdash; or &#x2014;
    //    const LDQUO = '“'; // &ldquo; or &#8220;
    //    const RDQUO = '”'; // &rdquo; or &#8221;
    //    const BDQUO = '„'; // &bdquo; or &#8222;
    //    const SHY = "\xC2\xAD"; // &shy;
    //    const TRADE = '™'; // &trade;
    //    const REG = '®'; // &reg;
    //    const COPY = '©'; // &copy;

    // Regexp alternation: narrow no-break space, soft hyphen, no-break space, or \s.
    const ALL_SPACES         = "\xE2\x80\xAF|\xC2\xAD|\xC2\xA0|\\s"; // Used in regexps. Better than \s

    // Accented character => replacement, used by stripAccents().
    // Same pairs as the two parallel strings of the previous implementation.
    // NOTE(review): the targets for 'ў' and 'ӗ' are Latin 'y' / 'e' as they appeared — confirm.
    private const ACCENT_MAP
        = [
            'à' => 'a', 'á' => 'a', 'â' => 'a', 'ã' => 'a', 'ä' => 'a',
            'ç' => 'c',
            'è' => 'e', 'é' => 'e', 'ê' => 'e', 'ë' => 'e',
            'ì' => 'i', 'í' => 'i', 'î' => 'i', 'ï' => 'i',
            'ñ' => 'n',
            'ò' => 'o', 'ó' => 'o', 'ô' => 'o', 'õ' => 'o', 'ö' => 'o',
            'ù' => 'u', 'ú' => 'u', 'û' => 'u', 'ü' => 'u',
            'ý' => 'y', 'ÿ' => 'y',
            'À' => 'A', 'Á' => 'A', 'Â' => 'A', 'Ã' => 'A', 'Ä' => 'A',
            'Ç' => 'C',
            'È' => 'E', 'É' => 'E', 'Ê' => 'E', 'Ë' => 'E',
            'Ì' => 'I', 'Í' => 'I', 'Î' => 'I', 'Ï' => 'I',
            'Ñ' => 'N',
            'Ò' => 'O', 'Ó' => 'O', 'Ô' => 'O', 'Õ' => 'O', 'Ö' => 'O',
            'Ù' => 'U', 'Ú' => 'U', 'Û' => 'U', 'Ü' => 'U',
            'Ý' => 'Y',
            'ø' => 'o',
            'ā' => 'a', 'ǟ' => 'a', 'ǡ' => 'a',
            'ē' => 'e', 'ḕ' => 'e', 'ḗ' => 'e',
            'ḡ' => 'g',
            'ḹ' => 'l',
            'ī' => 'i',
            'ō' => 'o', 'ȫ' => 'o', 'ȭ' => 'o', 'ȱ' => 'o', 'ṑ' => 'o', 'ṓ' => 'o', 'ǭ' => 'o',
            'ṝ' => 'r',
            'ū' => 'u', 'ǖ' => 'u', 'ṻ' => 'u',
            'ȳ' => 'y',
            'ǣ' => 'æ',
            'ӣ' => 'и',
            'ᾱ' => 'α', 'ῑ' => 'ι', 'ῡ' => 'υ',
            'č' => 'c', 'š' => 's', 'ž' => 'z',
            'й' => 'и',
            'ў' => 'y',
            'ŭ' => 'u',
            'ă' => 'a',
            'ӗ' => 'e',
            'ğ' => 'g',
            'Å' => 'A', 'å' => 'a',
            'ů' => 'u',
            'ẘ' => 'w',
            'ẙ' => 'y',
        ];

    /**
     * Upper-case the first character of a multibyte string.
     * "économie" => "Économie".
     *
     * @param string      $str Text to transform
     * @param string|null $e   Character encoding handed to the mb_* functions
     *
     * @return string
     */
    public static function mb_ucfirst(string $str, ?string $e = 'UTF-8'): string
    {
        $first = mb_strtoupper(mb_substr($str, 0, 1, $e), $e);
        $rest = mb_substr($str, 1, mb_strlen($str, $e), $e);

        return $first.$rest;
    }

    /**
     * Lower-case the first character of a multibyte string.
     * "Économie" => "économie".
     *
     * @param string      $str Text to transform
     * @param string|null $e   Character encoding handed to the mb_* functions
     *
     * @return string
     */
    public static function mb_lowerfirst(string $str, ?string $e = 'UTF-8'): string
    {
        $first = mb_strtolower(mb_substr($str, 0, 1, $e), $e);
        $rest = mb_substr($str, 1, mb_strlen($str, $e), $e);

        return $first.$rest;
    }

    /**
     * Replace every no-break space (U+00A0) and narrow no-break space (U+202F)
     * with a plain space.
     *
     * @param string $text
     *
     * @return string
     */
    public static function replaceNonBreakingSpaces(string $text)
    {
        return str_replace([self::NO_BREAK_SPACE, self::NO_BREAK_THIN_SPACE], ' ', $text);
    }

    /**
     * Strip no-break spaces (U+00A0, U+202F), line feeds, tabs and carriage
     * returns from both ends of a string.
     *
     * The native trim() works byte by byte, so giving it the multibyte spaces
     * as a character list also removed the lone bytes C2, A0, E2, 80 and AF and
     * corrupted edge characters such as "à" (C3 A0) or "«" (C2 AB). The trimming
     * is therefore done with a Unicode-aware regular expression.
     *
     * NOTE(review): the plain space is not trimmed, as before — confirm this is intended.
     *
     * @param string $string
     *
     * @return string
     */
    public static function trim(string $string)
    {
        $trimmed = preg_replace(
            '/\A[\x{00A0}\x{202F}\n\t\r]+|[\x{00A0}\x{202F}\n\t\r]+\z/u',
            '',
            $string
        );

        // preg_replace() yields null when the subject is not valid UTF-8:
        // keep the historical byte-wise behaviour for such input.
        if (null === $trimmed) {
            return trim($string, self::NO_BREAK_SPACE.self::NO_BREAK_THIN_SPACE."\n\t\r");
        }

        return $trimmed;
    }

    // Disabled helper, kept for reference. Todo verify/correct.
    //    static public function containsNonLatinCharacters(string $str): bool
    //    {
    //        return preg_match('/[^\\p{Common}\\p{Latin}]/u', $str);
    //    }

    /**
     * Simplest levenshtein distance prediction of the correct param name.
     *
     * Each candidate is measured twice: on the raw strings and on their
     * sanitized forms (accents and punctuation stripped, lower-cased), because
     * the byte-wise levenshtein() over-counts accented characters:
     * levenshtein('notre','nôtre') => 2.
     * The smaller of the two distances is the candidate's score.
     * TODO move.
     *
     * @param string $str   Name to correct
     * @param array  $names Valid names to choose from
     * @param int    $max   Maximum accepted distance (number of
     *                      substitutions/insertions/deletions)
     *
     * @return string|null The exact or closest name, or null when nothing is
     *                     close enough or the sanitized input is listed in
     *                     SKIP_PREDICT_PARAM
     */
    public static function predictCorrectParam(string $str, array $names, int $max = 2): ?string
    {
        $sanitized = self::sanitizeParamForPredict($str);
        $closest = null;
        $shortest = null;
        foreach ($names as $name) {
            $sanitizedName = self::sanitizeParamForPredict($name);
            if ($str === $name || $sanitized === $sanitizedName) {
                return $name; // exact match
            }
            $lev = levenshtein($str, $name);
            $lev2 = levenshtein($sanitized, $sanitizedName);

            // Record the distance that actually selected the candidate. Storing
            // only $lev let a candidate picked through $lev2 overwrite a better
            // one with a larger recorded distance.
            $distance = min($lev, $lev2);
            if (null === $shortest || $distance < $shortest) {
                $closest = $name;
                $shortest = $distance;
            }
        }
        if (null !== $shortest && $shortest <= $max && !in_array($sanitized, self::SKIP_PREDICT_PARAM, true)) {
            return $closest;
        }

        return null;
    }

    /**
     * Normalize a parameter name for predictCorrectParam(): strip accents and
     * punctuation, lower-case, keep only [a-z0-9 ] and trim.
     *
     * @param string $str
     *
     * @return string
     */
    private static function sanitizeParamForPredict(string $str): string
    {
        $sanitized = mb_strtolower(self::stripPunctuation(self::stripAccents($str)));
        $sanitized = trim(preg_replace('#[^a-z0-9 ]#', '', $sanitized));

        return $sanitized;
    }

    /**
     * Remove every character listed in ALL_PUNCTUATION.
     *
     * Each entry is replaced as a whole string by str_replace(), so multibyte
     * punctuation is handled without splitting the input byte by byte
     * (str_split() would cut on 1 byte length).
     * See http://fr.wikipedia.org/wiki/Ponctuation.
     *
     * @param string $str
     *
     * @return string
     */
    public static function stripPunctuation(string $str)
    {
        return str_replace(
            self::ALL_PUNCTUATION,
            '',
            $str
        );
    }

    /**
     * Replace accented characters with their base character, as listed in
     * ACCENT_MAP (Latin letters with accent, macron, caron, breve, ring or
     * tilde, plus a few Greek and Cyrillic letters).
     *
     * The replacement uses the array form of strtr(), which matches whole
     * multibyte sequences. The former utf8_decode()-based version turned every
     * character outside ISO-8859-1 into "?", rewrote a literal "?" as "y" and
     * returned Latin-1 bytes instead of UTF-8. Input and output are UTF-8;
     * characters absent from the map are returned unchanged.
     *
     * @param string $string
     *
     * @return string
     */
    public static function stripAccents(string $string): string
    {
        return strtr($string, self::ACCENT_MAP);
    }
}


1		<?php
2		/**
3		* This file is part of dispositif/wikibot application
4		* 2019 : Philippe M. <[email protected]>
5		* For the full copyright and MIT license information, please view the LICENSE file.
6		*/
7
8		declare(strict_types=1);
9
10		namespace App\Domain\Utils;
11
12		/**
13		* Class TextUtil.
14		*/
15		abstract class TextUtil
16		{
17		const SKIP_PREDICT_PARAM = ['issue'];
18
19		const NO_BREAK_SPACE = "\xC2\xA0"; //
20
21		const NO_BREAK_THIN_SPACE = "\xE2\x80\xAF";
22
23		const ALL_PUNCTUATION
24		= [
25		'!',
26		'"',
27		'«',
28		'»',
29		'#',
30		'$',
31		'%',
32		"'",
33		'’',
34		'´',
35		'`',
36		'^',
37		'…',
38		'‽',
39		'(',
40		')',
41		'*',
42		'⁂',
43		'+',
44		'–',
45		'—',
46		'/',
47		':',
48		';',
49		'?',
50		'@',
51		'[',
52		'\\',
53		']',
54		'_',
55		'`',
56		'{',
57		'\|',
58		'¦',
59		'}',
60		'~',
61		'<',
62		'>',
63		'№',
64		'©',
65		'®',
66		'°',
67		'†',
68		'§',
69		'∴',
70		'∵',
71		'¶',
72		'•',
73		'+',
74		];
75
76		//
77		// const ELLIPSIS = '…';
78		// const LAQUO = '«'; // «
79		// const RAQUO = '»'; // »
80		// const RSQUO = '’'; // ’
81		// const TIMES = '×'; // ×
82		// const NDASH = '–'; // – or –
83		// const MDASH = '—'; // — or —
84		// const LDQUO = '“'; // “ or “
85		// const RDQUO = '”'; // ” or ”
86		// const BDQUO = '„'; // &bdquo; or „
87		// const SHY = "\xC2\xAD"; //
88		// const TRADE = '™'; // ™
89		// const REG = '®'; // ®
90		// const COPY = '©'; // ©
91		const ALL_SPACES = "\xE2\x80\xAF\|\xC2\xAD\|\xC2\xA0\|\\s"; // Used in regexps. Better than \s
92
93		/**
94		* UTF8 first letter in upper case.
95		* "économie" => "Économie".
96		*
97		* @param string $str
98		* @param string\|null $e
99		*
100		* @return string
101		*/
102	28	public static function mb_ucfirst(string $str, ?string $e = 'UTF-8'): string
103		{
104	28	$first = mb_strtoupper(mb_substr($str, 0, 1, $e), $e);
105	28	$rest = mb_substr($str, 1, mb_strlen($str, $e), $e);
106
107	28	return $first.$rest;
108		}
109
110		/**
111		* UTF8 first letter in lower case.
112		* "Économie" => "économie".
113		*
114		* @param string $str
115		* @param string\|null $e
116		*
117		* @return string
118		*/
119	1	public static function mb_lowerfirst(string $str, ?string $e = 'UTF-8'): string
120		{
121	1	$first = mb_strtolower(mb_substr($str, 0, 1, $e), $e);
122	1	$rest = mb_substr($str, 1, mb_strlen($str, $e), $e);
123
124	1	return $first.$rest;
125		}
126
127		/**
128		* @param string $text
129		*
130		* @return mixed
131		*/
132	33	public static function replaceNonBreakingSpaces(string $text)
133		{
134	33	return str_replace([self::NO_BREAK_SPACE, self::NO_BREAK_THIN_SPACE], ' ', $text);
135		}
136
137		/**
138		* Trim also non-breaking space and carriage return.
139		*
140		* @param string $string
141		*
142		* @return string
143		*/
144	1	public static function trim(string $string)
145		{
146	1	return trim($string, self::NO_BREAK_SPACE.self::NO_BREAK_THIN_SPACE."\n\t\r");
147		}
148
149		/**
150		* Todo verify/correct.
151		*
152		* @param string $str
153		*
154		* @return bool
155		*/
156		// static public function containsNonLatinCharacters(string $str): bool
157		// {
158		// return preg_match('/[^\\p{Common}\\p{Latin}]/u', $str);
159		// }
160
161		/**
162		* Simplest levenshtein distance prediction of the correct param name.
163		* Weird results with ASCII extended chars :
164		* levenshtein('notre','nôtre') => 2
165		* TODO move.
166		*
167		* @param string $str
168		* @param array $names
169		* @param int $max Maximum number of permutation/add/subtraction)
170		*
171		* @return string\|null
172		*/
173	3	public static function predictCorrectParam(string $str, array $names, int $max = 2): ?string
174		{
175	3	$sanitized = self::sanitizeParamForPredict($str);
176	3	$closest = null;
177	3	foreach ($names as $name) {
178	3	$sanitizedName = self::sanitizeParamForPredict($name);
179	3	if ($str === $name \|\| $sanitized === $sanitizedName) {
180		return $name; // exact match
181		}
182	3	$lev = levenshtein($str, $name);
183	3	$lev2 = levenshtein($sanitized, $sanitizedName);
184
185	3	if (!isset($shortest) \|\| $lev < $shortest \|\| $lev2 < $shortest) {
186	3	$closest = $name;
187	3	$shortest = $lev;
188		}
189		}
190	3	if (isset($shortest) && $shortest <= $max && !in_array($sanitized, self::SKIP_PREDICT_PARAM)) {
191	3	return $closest;
192		}
193
194		return null;
195		}
196
197		/**
198		* For predictCorrectParam().
199		*
200		* @param string $str
201		*
202		* @return string
203		*/
204	3	private static function sanitizeParamForPredict(string $str): string
205		{
206	3	$sanitized = mb_strtolower(self::stripPunctuation(self::stripAccents($str)));
207	3	$sanitized = trim(preg_replace('#[^a-z0-9 ]#', '', $sanitized));
208
209	3	return $sanitized;
210		}
211
212		/**
213		* Strip punctuation
214		* UTF-8 compatible ??
215		* Note : can't use str_split() which cut on 1 byte length
216		* See http://fr.wikipedia.org/wiki/Ponctuation.
217		*
218		* @param string $str
219		*
220		* @return string
221		*/
222	22	public static function stripPunctuation(string $str)
223		{
224	22	return str_replace(
225	22	self::ALL_PUNCTUATION,
226	22	'',
227	22	$str
228		);
229		}
230
231		/**
232		* Strip accents
233		* OK : grec, cyrillique, macron, hatchek, brève, rond en chef, tilde
234		* UTF-8 compatible.
235		*
236		* @param string $string
237		*
238		* @return string
239		*/
240	22	public static function stripAccents(string $string): string
241		{
242	22	return strtr(
243	22	utf8_decode($string),
244	22	utf8_decode(
245	22	'àáâãäçèéêëìíîïñòóôõöùúûüýÿÀÁÂÃÄÇÈÉÊËÌÍÎÏÑÒÓÔÕÖÙÚÛÜÝøāǟǡēḕḗḡḹīōȫȭȱṑṓǭṝūǖṻȳǣӣᾱῑῡčšžйўŭăӗğÅåůẘẙ'
246		),
247	22	utf8_decode(
248	22	'aaaaaceeeeiiiinooooouuuuyyAAAAACEEEEIIIINOOOOOUUUUYoaaaeeeglioooooooruuuyæиαιυcszиyuaegAauwy'
249		)
250		);
251		}
252		}
253

Dispositif / Wikibot

Push — master ( 5eccc7...1faa52 )

TextUtil::mb_ucfirst() A

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like