Clean::replaceBadBytes() - Code Metrics - Inspection of "Merge branch 'utf8refactor' into psr2" - splitbrain/dokuwiki - Measure and Improve Code Quality continuously with Scrutinizer

Failed Conditions

Push — psr2 ( 2b9c4a...b47790 )

by Andreas

created 2019-07-14 19:11 UTC

Clean::replaceBadBytes() A

↳ Parent: Clean

Complexity

Conditions	3
Paths	3

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	3
nc	3
nop	2
dl	0
loc	23
rs	9.552
c	0
b	0
f	0

<?php

namespace dokuwiki\Utf8;

/**
 * Methods to assess and clean UTF-8 strings
 *
 * All methods are static and operate on PHP byte strings; none of them
 * depends on the mbstring extension.
 */
class Clean
{
    /**
     * Checks if a string contains 7bit ASCII only
     *
     * @author Andreas Haerter <[email protected]>
     *
     * @param string $str
     * @return bool true when no byte outside 0x00-0x7F was found
     */
    public static function isASCII($str)
    {
        return (preg_match('/(?:[^\x00-\x7F])/', $str) !== 1);
    }

    /**
     * Tries to detect if a string is in Unicode encoding
     *
     * This is a structural check only: it verifies that every lead byte is
     * followed by the announced number of continuation bytes (10bbbbbb).
     * Overlong forms, surrogates and 5/6 byte sequences are not rejected.
     *
     * @author <[email protected]>
     * @link   http://php.net/manual/en/function.utf8-encode.php
     *
     * @param string $str
     * @return bool
     */
    public static function isUtf8($str)
    {
        $len = strlen($str);
        for ($i = 0; $i < $len; $i++) {
            $b = ord($str[$i]);
            if ($b < 0x80) {
                continue; // 0bbbbbbb
            } elseif (($b & 0xE0) === 0xC0) {
                $n = 1; // 110bbbbb
            } elseif (($b & 0xF0) === 0xE0) {
                $n = 2; // 1110bbbb
            } elseif (($b & 0xF8) === 0xF0) {
                $n = 3; // 11110bbb
            } elseif (($b & 0xFC) === 0xF8) {
                $n = 4; // 111110bb
            } elseif (($b & 0xFE) === 0xFC) {
                $n = 5; // 1111110b
            } else {
                return false; // Does not match any model
            }

            // n bytes matching 10bbbbbb have to follow
            for ($j = 0; $j < $n; $j++) {
                if ((++$i === $len) || ((ord($str[$i]) & 0xC0) !== 0x80)) {
                    return false;
                }
            }
        }
        return true;
    }

    /**
     * Strips all high byte chars
     *
     * Returns a pure ASCII7 string
     *
     * @author Andreas Gohr <[email protected]>
     *
     * @param string $str
     * @return string
     */
    public static function strip($str)
    {
        $ascii = '';
        $len = strlen($str);
        for ($i = 0; $i < $len; $i++) {
            // square brackets: the curly brace offset syntax is a parse error in PHP 8
            if (ord($str[$i]) < 128) {
                $ascii .= $str[$i];
            }
        }
        return $ascii;
    }

    /**
     * Removes special characters (nonalphanumeric) from a UTF-8 string
     *
     * This function adds the controlchars 0x00 to 0x19 to the array of
     * stripped chars (they are not included in $UTF8_SPECIAL_CHARS).
     * Note that the range used really ends at 0x19, so the bytes
     * 0x1A to 0x1F are not covered by it.
     *
     * @author Andreas Gohr <[email protected]>
     *
     * @param  string $string The UTF8 string to strip of special chars
     * @param  string $repl Replace special with this string
     * @param  string $additional Additional chars to strip (used in regexp char class,
     *                            it is inserted unescaped)
     * @return string
     */
    public static function stripspecials($string, $repl = '', $additional = '')
    {
        // the quoted list never changes, so it is built once per request
        static $specials = null;
        if ($specials === null) {
            $specials = preg_quote(Table::specialChars(), '/');
        }

        return preg_replace('/[' . $additional . '\x00-\x19' . $specials . ']/u', $repl, $string);
    }

    /**
     * Replace bad bytes with an alternative character
     *
     * ASCII character is recommended for replacement char
     *
     * PCRE Pattern to locate bad bytes in a UTF-8 string
     * Comes from W3 FAQ: Multilingual Forms
     * Note: modified to include full ASCII range including control chars
     *
     * @author Harry Fuecks <[email protected]>
     * @see http://www.w3.org/International/questions/qa-forms-utf-8
     *
     * @param string $str to search
     * @param string $replace to replace bad bytes with (defaults to '', which
     *                        removes the bad bytes) - use ASCII
     * @return string
     */
    public static function replaceBadBytes($str, $replace = '')
    {
        $UTF8_BAD =
            '([\x00-\x7F]' .                          # ASCII (including control chars)
            '|[\xC2-\xDF][\x80-\xBF]' .               # non-overlong 2-byte
            '|\xE0[\xA0-\xBF][\x80-\xBF]' .           # excluding overlongs
            '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}' .    # straight 3-byte
            '|\xED[\x80-\x9F][\x80-\xBF]' .           # excluding surrogates
            '|\xF0[\x90-\xBF][\x80-\xBF]{2}' .        # planes 1-3
            '|[\xF1-\xF3][\x80-\xBF]{3}' .            # planes 4-15
            '|\xF4[\x80-\x8F][\x80-\xBF]{2}' .        # plane 16
            '|(.{1}))';                               # invalid byte

        $result = '';
        $offset = 0;
        $length = strlen($str);

        // Walk the string sequence by sequence. The A modifier anchors each
        // match at $offset, so the input never has to be copied with substr().
        while ($offset < $length && preg_match('/' . $UTF8_BAD . '/A', $str, $matches, 0, $offset)) {
            if (!isset($matches[2])) {
                // a well formed sequence, keep it
                $result .= $matches[0];
            } else {
                // group 2 only takes part in the match for an invalid byte
                $result .= $replace;
            }
            $offset += strlen($matches[0]);
        }
        return $result;
    }

    /**
     * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
     *
     * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
     * letters. Default is to deaccent both cases ($case = 0)
     *
     * @author Andreas Gohr <[email protected]>
     *
     * @param string $string
     * @param int $case
     * @return string
     */
    public static function deaccent($string, $case = 0)
    {
        if ($case <= 0) {
            $string = strtr($string, Table::lowerAccents());
        }
        if ($case >= 0) {
            $string = strtr($string, Table::upperAccents());
        }
        return $string;
    }

    /**
     * Romanize a non-latin string
     *
     * @author Andreas Gohr <[email protected]>
     *
     * @param string $string
     * @return string
     */
    public static function romanize($string)
    {
        if (self::isASCII($string)) {
            return $string; //nothing to do
        }

        return strtr($string, Table::romanization());
    }

    /**
     * adjust a byte index into a utf8 string to a utf8 character boundary
     *
     * Indexes at or below 0 give 0, indexes at or beyond the string length
     * give the string length.
     *
     * @author       chris smith <[email protected]>
     *
     * @param string $str utf8 character string
     * @param int $i byte index into $str
     * @param bool $next direction to search for boundary, false = up (current character) true = down (next character)
     * @return int byte index into $str now pointing to a utf8 character boundary
     */
    public static function correctIdx($str, $i, $next = false)
    {
        if ($i <= 0) {
            return 0;
        }

        $limit = strlen($str);
        if ($i >= $limit) {
            return $limit;
        }

        // continuation bytes (10bbbbbb) are skipped in the requested direction
        if ($next) {
            while (($i < $limit) && ((ord($str[$i]) & 0xC0) === 0x80)) {
                $i++;
            }
        } else {
            while ($i && ((ord($str[$i]) & 0xC0) === 0x80)) {
                $i--;
            }
        }

        return $i;
    }
}


1			<?php
2
3			namespace dokuwiki\Utf8;
4
5			/**
6			* Methods to assess and clean UTF-8 strings
7			*/
8			class Clean
9			{
10			/**
11			* Checks if a string contains 7bit ASCII only
12			*
13			* @author Andreas Haerter <[email protected]>
14			*
15			* @param string $str
16			* @return bool
17			*/
18			public static function isASCII($str)
19			{
20			return (preg_match('/(?:[^\x00-\x7F])/', $str) !== 1);
21			}
22
23			/**
24			* Tries to detect if a string is in Unicode encoding
25			*
26			* @author <[email protected]>
27			* @link http://php.net/manual/en/function.utf8-encode.php
28			*
29			* @param string $str
30			* @return bool
31			*/
32			public static function isUtf8($str)
33			{
34			$len = strlen($str);
35			for ($i = 0; $i < $len; $i++) {
36			$b = ord($str[$i]);
37			if ($b < 0x80) continue; # 0bbbbbbb
38			elseif (($b & 0xE0) === 0xC0) $n = 1; # 110bbbbb
39			elseif (($b & 0xF0) === 0xE0) $n = 2; # 1110bbbb
40			elseif (($b & 0xF8) === 0xF0) $n = 3; # 11110bbb
41			elseif (($b & 0xFC) === 0xF8) $n = 4; # 111110bb
42			elseif (($b & 0xFE) === 0xFC) $n = 5; # 1111110b
43			else return false; # Does not match any model
44
45			for ($j = 0; $j < $n; $j++) { # n bytes matching 10bbbbbb follow ?
46			if ((++$i === $len) || ((ord($str[$i]) & 0xC0) !== 0x80))
47			return false;
48			}
49			}
50			return true;
51			}
52
53			/**
54			* Strips all high byte chars
55			*
56			* Returns a pure ASCII7 string
57			*
58			* @author Andreas Gohr <[email protected]>
59			*
60			* @param string $str
61			* @return string
62			*/
63			public static function strip($str)
64			{
65			$ascii = '';
66			$len = strlen($str);
67			for ($i = 0; $i < $len; $i++) {
68			if (ord($str{$i}) < 128) {
69			$ascii .= $str{$i};
70			}
71			}
72			return $ascii;
73			}
74
75			/**
76			* Removes special characters (nonalphanumeric) from a UTF-8 string
77			*
78			* This function adds the controlchars 0x00 to 0x19 to the array of
79			* stripped chars (they are not included in $UTF8_SPECIAL_CHARS)
80			*
81			* @author Andreas Gohr <[email protected]>
82			*
83			* @param string $string The UTF8 string to strip of special chars
84			* @param string $repl Replace special with this string
85			* @param string $additional Additional chars to strip (used in regexp char class)
86			* @return string
87			*/
88			public static function stripspecials($string, $repl = '', $additional = '')
89			{
90			static $specials = null;
91			if ($specials === null) {
92			$specials = preg_quote(Table::specialChars(), '/');
93			}
94
95			return preg_replace('/[' . $additional . '\x00-\x19' . $specials . ']/u', $repl, $string);
96			}
97
98			/**
99			* Replace bad bytes with an alternative character
100			*
101			* ASCII character is recommended for replacement char
102			*
103			* PCRE Pattern to locate bad bytes in a UTF-8 string
104			* Comes from W3 FAQ: Multilingual Forms
105			* Note: modified to include full ASCII range including control chars
106			*
107			* @author Harry Fuecks <[email protected]>
108			* @see http://www.w3.org/International/questions/qa-forms-utf-8
109			*
110			* @param string $str to search
111			* @param string $replace to replace bad bytes with (defaults to '?') - use ASCII
112			* @return string
113			*/
114			public static function replaceBadBytes($str, $replace = '')
115			{
116			$UTF8_BAD =
117			'([\x00-\x7F]' . # ASCII (including control chars)
118			'|[\xC2-\xDF][\x80-\xBF]' . # non-overlong 2-byte
119			'|\xE0[\xA0-\xBF][\x80-\xBF]' . # excluding overlongs
120			'|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}' . # straight 3-byte
121			'|\xED[\x80-\x9F][\x80-\xBF]' . # excluding surrogates
122			'|\xF0[\x90-\xBF][\x80-\xBF]{2}' . # planes 1-3
123			'|[\xF1-\xF3][\x80-\xBF]{3}' . # planes 4-15
124			'|\xF4[\x80-\x8F][\x80-\xBF]{2}' . # plane 16
125			'|(.{1}))'; # invalid byte
126			ob_start();
127			while (preg_match('/' . $UTF8_BAD . '/S', $str, $matches)) {
128			if (!isset($matches[2])) {
129			echo $matches[0];
130			} else {
131			echo $replace;
132			}
133			$str = substr($str, strlen($matches[0]));
134			}
135			return ob_get_clean();
136			}
137
138
139			/**
140			* Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
141			*
142			* Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
143			* letters. Default is to deaccent both cases ($case = 0)
144			*
145			* @author Andreas Gohr <[email protected]>
146			*
147			* @param string $string
148			* @param int $case
149			* @return string
150			*/
151			public static function deaccent($string, $case = 0)
152			{
153			if ($case <= 0) {
154			$string = strtr($string, Table::lowerAccents());
155			}
156			if ($case >= 0) {
157			$string = strtr($string, Table::upperAccents());
158			}
159			return $string;
160			}
161
162			/**
163			* Romanize a non-latin string
164			*
165			* @author Andreas Gohr <[email protected]>
166			*
167			* @param string $string
168			* @return string
169			*/
170			public static function romanize($string)
171			{
172			if (self::isASCII($string)) return $string; //nothing to do
173
174			return strtr($string, Table::romanization());
175			}
176
177			/**
178			* adjust a byte index into a utf8 string to a utf8 character boundary
179			*
180			* @author chris smith <[email protected]>
181			*
182			* @param string $str utf8 character string
183			* @param int $i byte index into $str
184			* @param bool $next direction to search for boundary, false = up (current character) true = down (next character)
185			* @return int byte index into $str now pointing to a utf8 character boundary
186			*/
187			public static function correctIdx($str, $i, $next = false)
188			{
189
190			if ($i <= 0) return 0;
191
192			$limit = strlen($str);
193			if ($i >= $limit) return $limit;
194
195			if ($next) {
196			while (($i < $limit) && ((ord($str[$i]) & 0xC0) === 0x80)) $i++;
197			} else {
198			while ($i && ((ord($str[$i]) & 0xC0) === 0x80)) $i--;
199			}
200
201			return $i;
202			}
203
204			}
205

splitbrain / dokuwiki

Push — psr2 ( 2b9c4a...b47790 )

Clean::replaceBadBytes() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like