Strings::normalizeEol() - Code Metrics - fab2s/Strings - Measure and Improve Code Quality continuously with Scrutinizer

Strings::normalizeEol() A
last analyzed 2020-10-18 16:21 UTC

↳ Parent: Strings

Complexity

Conditions	3
Paths	3

Size

Total Lines	20
Code Lines	10

Duplication

Lines	0
Ratio	0 %

Importance

Changes	1
Bugs	0	Features	0

Metric	Value
cc	3
eloc	10
c	1
b	0
f	0
nc	3
nop	3
dl	0
loc	20
rs	9.9332

<?php

/*
 * This file is part of Strings
 *     (c) Fabrice de Stefanis / https://github.com/fab2s/Strings
 * This source file is licensed under the MIT license which you will
 * find in the LICENSE file or at https://opensource.org/licenses/MIT
 */

namespace fab2s\Strings;

use fab2s\Bom\Bom;
use fab2s\Utf8\Utf8;

/**
 * class Strings
 *
 * Static helpers to filter, normalize, escape, convert and compare strings.
 * Every regular expression in this class uses the `u` modifier, so subjects
 * are expected to be valid UTF-8.
 */
class Strings
{
    /**
     * The canonical EOL for normalization
     */
    public const EOL = "\n";

    /**
     * The canonical encoding
     */
    public const ENCODING = 'UTF-8';

    /**
     * Regex character class content (to be wrapped in [])
     *
     * U+200B zero width space
     * U+FEFF zero width no-break space
     */
    public const ZERO_WIDTH_WS_CLASS = '\x{200B}\x{FEFF}';

    /**
     * Regex character class content (to be wrapped in [])
     *
     * U+00A0  no-break space
     * U+2000  en quad
     * U+2001  em quad
     * U+2002  en space
     * U+2003  em space
     * U+2004  three-per-em space
     * U+2005  four-per-em space
     * U+2006  six-per-em space
     * U+2007  figure space
     * U+2008  punctuation space
     * U+2009  thin space
     * U+200A  hair space
     * U+202F  narrow no-break space
     * U+3000  ideographic space
     */
    public const NON_STANDARD_WS_CLASS = '\x{00A0}\x{2000}-\x{200A}\x{202F}\x{3000}';

    /**
     * Strip NUL and zero width characters, normalize EOL to LF (dropping
     * white space that trails a line) and pass the result to Utf8::normalize()
     *
     * @param string $string
     *
     * @return string
     */
    public static function filter(string $string): string
    {
        // a single pass removes both U+0000 and the zero width characters
        $stripped = preg_replace('`[\x{00}' . static::ZERO_WIDTH_WS_CLASS . ']+`u', '', $string);

        return Utf8::normalize(static::normalizeEol($stripped));
    }

    /**
     * Collapse repeated horizontal white spaces
     *
     * @param string $string
     * @param bool   $normalize   false to collapse each run of one repeated horizontal
     *                            white space character to a single occurrence of that
     *                            same character, true to collapse every run of
     *                            white spaces to a single regular space (' ')
     * @param bool   $includeTabs only used when $normalize is true: true to treat tabs
     *                            (\t) as white spaces to normalize
     *
     * @return string
     */
    public static function singleWsIze(string $string, bool $normalize = false, bool $includeTabs = true): string
    {
        if ($normalize) {
            // multiple horizontal ws to a single low ws (eg ' ')
            // the explicit max of 1 is what actually collapses runs, without
            // it normalizeWs only swaps characters one for one
            return static::normalizeWs($string, $includeTabs, 1);
        }

        // back reference: only runs of the very same character are collapsed
        return preg_replace('`(\h)(?:\1+)`u', '$1', $string);
    }

    /**
     * Replace every run of line breaks, together with the white space
     * preceding it, with a single regular space
     *
     * @param string $string
     *
     * @return string
     */
    public static function singleLineIze(string $string): string
    {
        return preg_replace('`\s*\R+`u', ' ', $string);
    }

    /**
     * Remove zero width white spaces (U+200B and U+FEFF)
     *
     * @param string $string
     *
     * @return string
     */
    public static function dropZwWs(string $string): string
    {
        return preg_replace('`[' . static::ZERO_WIDTH_WS_CLASS . ']+`u', '', $string);
    }

    /**
     * Replace non standard white spaces with regular spaces (' ') and
     * optionally limit the number of consecutive spaces
     *
     * @param string   $string
     * @param bool     $includeTabs    true to also replace tabs (\t) with ws ( )
     * @param int|null $maxConsecutive null to only replace characters one for one,
     *                                 otherwise runs of $maxConsecutive or more spaces
     *                                 (after replacement) are cut down to exactly
     *                                 $maxConsecutive spaces
     *
     * @return string
     */
    public static function normalizeWs(string $string, bool $includeTabs = true, ?int $maxConsecutive = null): string
    {
        // regular ws is not part of this class: replacing ' ' with ' ' is a no-op
        $wsClass   = ($includeTabs ? "\t" : '') . static::NON_STANDARD_WS_CLASS;
        $normalize = '`[' . $wsClass . ']`u';
        if ($maxConsecutive === null) {
            return preg_replace($normalize, ' ', $string);
        }

        // normalize every character first, then cap the runs. Capping mixed
        // runs directly would leave tabs and non standard ws untouched in
        // runs shorter than $maxConsecutive
        return preg_replace(
            [
                $normalize,
                '` {' . $maxConsecutive . ',}`u',
            ],
            [
                ' ',
                str_repeat(' ', $maxConsecutive),
            ],
            $string
        );
    }

    /**
     * Replace every line break sequence (\R) with $eol, dropping the white
     * space that trails each line, and optionally limit consecutive EOLs
     *
     * @param string   $string
     * @param int|null $maxConsecutive null to keep every line break, otherwise the
     *                                 maximum number of consecutive EOLs to keep
     * @param string   $eol            the EOL sequence to use in the result
     *
     * @return string
     */
    public static function normalizeEol(string $string, ?int $maxConsecutive = null, string $eol = self::EOL): string
    {
        if ($maxConsecutive === null) {
            // lazy \s*? : trailing ws is dropped but each line break is kept
            return preg_replace('`\s*?\R`u', $eol, $string);
        }

        if ($maxConsecutive === 1) {
            // greedy \s* : a whole run of blank lines is a single match
            return preg_replace('`\s*\R`u', $eol, $string);
        }

        $normalized = preg_replace(
            [
                // start with normalizing with LF (faster than CRLF)
                '`\s*?\R`u',
                // then remove high dupes
                "`\n{" . $maxConsecutive . ',}`u',
            ],
            [
                "\n",
                str_repeat("\n", $maxConsecutive),
            ],
            $string
        );

        if ($normalized === null || $eol === "\n") {
            // nothing to restore (null is preg_replace's error value and is
            // passed through untouched)
            return $normalized;
        }

        // restore EOL on every line break, including runs shorter than
        // $maxConsecutive which would otherwise be left as LF
        return str_replace("\n", $eol, $normalized);
    }

    /**
     * Normalizes a text document: applies filter() then trims the result
     *
     * @param string $text
     *
     * @return string
     */
    public static function normalizeText(string $text): string
    {
        return trim(static::filter($text));
    }

    /**
     * Normalizes a title: normalized text, on a single line, with every
     * run of white spaces reduced to a single regular space, passed
     * to Utf8::ucfirst()
     *
     * @param string $title
     *
     * @return string
     */
    public static function normalizeTitle(string $title): string
    {
        $singleLine = static::singleLineIze(static::normalizeText($title));

        return Utf8::ucfirst(static::normalizeWs($singleLine, true, 1));
    }

    /**
     * Normalizes a name: same as normalizeTitle() with the result passed
     * to Utf8::ucwords()
     *
     * @param string $name
     *
     * @return string
     */
    public static function normalizeName(string $name): string
    {
        return Utf8::ucwords(static::normalizeTitle($name));
    }

    /**
     * wrapper for htmlspecialchars with utf-8 and ENT_COMPAT set as default
     *
     * @param string $string
     * @param int    $flag       htmlspecialchars flags
     * @param bool   $hardEscape forwarded as htmlspecialchars' double_encode
     *                           argument: false to leave existing entities alone
     *
     * @return string
     */
    public static function escape(string $string, int $flag = ENT_COMPAT, bool $hardEscape = true)
    {
        return htmlspecialchars($string, $flag, static::ENCODING, $hardEscape);
    }

    /**
     * wrapper for htmlspecialchars with utf-8 and ENT_COMPAT set
     * which prevents double encoding
     *
     * @param string $string
     * @param int    $flag
     *
     * @return string
     */
    public static function softEscape(string $string, int $flag = ENT_COMPAT): string
    {
        return static::escape($string, $flag, false);
    }

    /**
     * wrapper for htmlspecialchars_decode with ENT_COMPAT set
     *
     * @param string $string
     * @param int    $quoteStyle
     *
     * @return string
     */
    public static function unEscape(string $string, int $quoteStyle = ENT_COMPAT): string
    {
        return htmlspecialchars_decode($string, $quoteStyle);
    }

    /**
     * Convert $string to the $to encoding
     *
     * @param string      $string
     * @param string|null $from   source encoding, detected with detectEncoding()
     *                            when null or empty
     * @param string      $to     target encoding
     *
     * @return string
     */
    public static function convert(string $string, ?string $from = null, string $to = self::ENCODING): string
    {
        return mb_convert_encoding($string, $to, $from ?: static::detectEncoding($string));
    }

    /**
     * Detect the encoding of $string: UTF-8 first, then the encoding of the
     * BOM if one is found, then mb_detect_encoding() limited to ISO-8859-1
     * and Windows-1252
     *
     * @param string $string
     *
     * @return string|null null when no encoding could be detected
     */
    public static function detectEncoding(string $string): ?string
    {
        if (Utf8::isUtf8($string)) {
            return static::ENCODING;
        }

        if ($bom = Bom::extract($string)) {
            return Bom::getBomEncoding($bom);
        }

        // NOTE(review): every byte sequence is valid ISO-8859-1, so Windows-1252
        // may never be reported by a strict detection in this order - confirm
        return mb_detect_encoding($string, 'ISO-8859-1,Windows-1252', true) ?: null;
    }

    /**
     * Truly constant time string comparison for Timing Attack protection
     *
     * Many implementations will stop after length comparison which can
     * leak length (not much I agree, but what topic is this?), or try to
     * be smart at failing to compare portion of the $reference which again
     * could leak $reference length
     *
     * This method just goes through exactly the same number of operations
     * in every cases
     *
     * @param string $userInput
     * @param string $reference
     *
     * @return bool true when both strings are identical
     */
    public static function secureCompare(string $userInput, string $reference): bool
    {
        if (strlen($userInput) !== strlen($reference)) {
            // preserve full comparison loop
            $comparison = $reference ^ $reference;
            // and return false
            $result = 1;
        } else {
            // byte wise xor: any non zero byte flags a difference
            $comparison = $userInput ^ $reference;
            $result     = 0;
        }

        $len = strlen($comparison);
        for ($i = $len - 1; $i >= 0; --$i) {
            $result |= ord($comparison[$i]);
        }

        return !$result;
    }

    /**
     * Generate a pretty reliable hash to identify strings
     * Adding the length reduces collisions by quite a lot
     *
     * @param string $content
     *
     * @return string the byte length and the sha256 hex digest joined with '_'
     */
    public static function contentHash(string $content): string
    {
        return strlen($content) . '_' . hash('sha256', $content);
    }
}


1			<?php
2
3			/*
4			* This file is part of Strings
5			* (c) Fabrice de Stefanis / https://github.com/fab2s/Strings
6			* This source file is licensed under the MIT license which you will
7			* find in the LICENSE file or at https://opensource.org/licenses/MIT
8			*/
9
10			namespace fab2s\Strings;
11
12			use fab2s\Bom\Bom;
13			use fab2s\Utf8\Utf8;
14
15			/**
16			* class Strings
17			*/
18			class Strings
19			{
20			/**
21			* The canonical EOL for normalization
22			*/
23			const EOL = "\n";
24
25			/**
26			* The canonical encoding
27			*/
28			const ENCODING = 'UTF-8';
29
30			/**
31			* U+200B zero width space
32			* U+FEFF zero width no-break space
33			*/
34			const ZERO_WIDTH_WS_CLASS = '\x{200B}\x{FEFF}';
35
36			/**
37			* U+00A0 no-break space
38			* U+2000 en quad
39			* U+2001 em quad
40			* U+2002 en space
41			* U+2003 em space
42			* U+2004 three-per-em space
43			* U+2005 four-per-em space
44			* U+2006 six-per-em space
45			* U+2007 figure space
46			* U+2008 punctuation space
47			* U+2009 thin space
48			* U+200A hair space
49			* U+202F narrow no-break space
50			* U+3000 ideographic space
51			*/
52			const NON_STANDARD_WS_CLASS = '\x{00A0}\x{2000}-\x{200A}\x{202F}\x{3000}';
53
54			/**
55			* normalize EOL to LF and strip null bit
56			*
57			* @param string $string
58			*
59			* @return string
60			*/
61			public static function filter(string $string): string
62			{
63			/*
64			* U+00 null bit
65			* Zero width ws
66			* normalized eol
67			* normalized utf8
68			*/
69			return Utf8::normalize(static::normalizeEol(preg_replace('`[\x{00}' . static::ZERO_WIDTH_WS_CLASS . ']+`u', '', $string)));
70			}
71
72			/**
73			* @param string $string
74			* @param bool $normalize
75			* @param bool $includeTabs
76			*
77			* @return string
78			*/
79			public static function singleWsIze(string $string, bool $normalize = false, bool $includeTabs = true): string
80			{
81			if ($normalize) {
82			// multiple horizontal ws to a single low ws (eg ' ')
83			return static::normalizeWs($string, $includeTabs);
84			}
85
86			return preg_replace('`(\h)(?:\1+)`u', '$1', $string);
87			}
88
89			/**
90			* @param string $string
91			*
92			* @return string
93			*/
94			public static function singleLineIze(string $string): string
95			{
96			return preg_replace("`\s*\R+`u", ' ', $string);
97			}
98
99			/**
100			* @param $string string
101			*
102			* @return string
103			*/
104			public static function dropZwWs(string $string): string
105			{
106			return preg_replace('`[' . static::ZERO_WIDTH_WS_CLASS . ']+`u', '', $string);
107			}
108
109			/**
110			* @param string $string
111			* @param bool $includeTabs true to also replace tabs (\t) with ws ( )
112			* @param int|null $maxConsecutive
113			*
114			* @return string
115			*/
116			public static function normalizeWs(string $string, bool $includeTabs = true, ?int $maxConsecutive = null): string
117			{
118			// don't include regular ws unless we want to handle consecutive
119			$extraWs = $includeTabs ? "\t" : '';
120			$length = '';
121			$replace = ' ';
122			if (isset($maxConsecutive)) {
123			// as regular ws should be the majority, put it first
124			$extraWs = " $extraWs";
125			$length = '{' . $maxConsecutive . ',}';
126			$replace = str_repeat($replace, $maxConsecutive);
127			}
128
129			return preg_replace("`[$extraWs" . static::NON_STANDARD_WS_CLASS . "]$length`u", $replace, $string);
130			}
131
132			/**
133			* @param string $string
134			* @param int|null $maxConsecutive
135			* @param string|null $eol
136			*
137			* @return string
138			*/
139			public static function normalizeEol(string $string, ?int $maxConsecutive = null, string $eol = self::EOL): string
140			{
141			if ($maxConsecutive === null) {
142			return preg_replace('`\s*?\R`u', $eol, $string);
143			}
144
145			if ($maxConsecutive === 1) {
146			return preg_replace('`\s*\R`u', $eol, $string);
147			}
148
149			return preg_replace([
150			// start with normalizing with LF (faster than CRLF)
151			'`\s*?\R`u',
152			// then remove high dupes
153			"`\n{" . $maxConsecutive . ',}`u',
154			], [
155			"\n",
156			// restore EOL and set max consecutive
157			str_repeat($eol, $maxConsecutive),
158			], $string);
159			}
160
161			/**
162			* Normalizes a text document
163			*
164			* @param string $text
165			*
166			* @return string
167			*/
168			public static function normalizeText(string $text): string
169			{
170			return trim(static::filter($text));
171			}
172
173			/**
174			* Normalizes a title
175			*
176			* @param string $title
177			*
178			* @return string
179			*/
180			public static function normalizeTitle(string $title): string
181			{
182			return Utf8::ucfirst(static::normalizeWs(static::singleLineIze(static::normalizeText($title)), true, 1));
183			}
184
185			/**
186			* @param string $name
187			*
188			* @return string
189			*/
190			public static function normalizeName(string $name): string
191			{
192			return Utf8::ucwords(static::normalizeTitle($name));
193			}
194
195			/**
196			* wrapper for htmlspecialchars with utf-8 and ENT_COMPAT set as default
197			*
198			* @param string $string
199			* @param int $flag
200			* @param bool $hardEscape
201			*
202			* @return string
203			*/
204			public static function escape(string $string, int $flag = ENT_COMPAT, bool $hardEscape = true)
205			{
206			return htmlspecialchars($string, $flag, static::ENCODING, (bool) $hardEscape);
207			}
208
209			/**
210			* wrapper for htmlspecialchars with utf-8 and ENT_COMPAT set
211			* which prevents double encoding
212			*
213			* @param string $string
214			* @param int $flag
215			*
216			* @return string
217			*/
218			public static function softEscape(string $string, int $flag = ENT_COMPAT): string
219			{
220			return static::escape($string, $flag, false);
221			}
222
223			/**
224			* wrapper for htmlspecialchars_decode with ENT_COMPAT set
225			*
226			* @param string $string
227			* @param int $quoteStyle
228			*
229			* @return string
230			*/
231			public static function unEscape(string $string, int $quoteStyle = ENT_COMPAT): string
232			{
233			return htmlspecialchars_decode($string, $quoteStyle);
234			}
235
236			/**
237			* @param string $string
238			* @param string|null $from
239			* @param string $to
240			*
241			* @return string
242			*/
243			public static function convert(string $string, ?string $from = null, string $to = self::ENCODING): string
244			{
245			return mb_convert_encoding($string, $to, $from ? $from : static::detectEncoding($string));
246			}
247
248			/**
249			* @param string $string
250			*
251			* @return string|null
252			*/
253			public static function detectEncoding(string $string): ? string
254			{
255			if (Utf8::isUtf8($string)) {
256			return static::ENCODING;
257			}
258
259			if ($bom = Bom::extract($string)) {
260			return Bom::getBomEncoding($bom);
261			}
262
263			return mb_detect_encoding($string, 'ISO-8859-1,Windows-1252', true) ?: null;
264			}
265
266			/**
267			* Truly constant time string comparison for Timing Attack protection
268			*
269			* Many implementations will stop after length comparison which can
270			* leak length (not much I agree, but what topic is this?), or try to
271			* be smart at failing to compare portion of the $reference which again
272			* could leak $reference length
273			*
274			* This method just goes through exactly the same number of operations
275			* in every cases
276			*
277			* @param string $userInput
278			* @param string $reference
279			*
280			* @return bool
281			*/
282			public static function secureCompare(string $userInput, string $reference): bool
283			{
284			if (strlen($userInput) !== strlen($reference)) {
285			// preserve full comparison loop
286			$comparison = $reference ^ $reference;
287			// and return false
288			$result = 1;
289			} else {
290			$comparison = $userInput ^ $reference;
291			$result = 0;
292			}
293
294			$len = strlen($comparison);
295			for ($i = $len - 1; $i >= 0; --$i) {
296			$result |= ord($comparison[$i]);
297			}
298
299			return !$result;
300			}
301
302			/**
303			* Generate a pretty reliable hash to identify strings
304			* Adding the length reduces collisions by quite a lot
305			*
306			* @param string $content
307			*
308			* @return string
309			*/
310			public static function contentHash(string $content): string
311			{
312			return strlen($content) . '_' . hash('sha256', $content);
313			}
314			}
315

fab2s / Strings

Strings::normalizeEol() A last analyzed 2020-10-18 16:21 UTC

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like

Strings::normalizeEol() A
last analyzed 2020-10-18 16:21 UTC