Completed
Push — master ( 30aab1...cadcfa )
by Asmir
07:00
created

UTF8Utils   A

Complexity

Total Complexity 12

Size/Duplication

Total Lines 141
Duplicated Lines 0 %

Coupling/Cohesion

Dependencies 1

Test Coverage

Coverage 60%

Importance

Changes 0
Metric Value
wmc 12
cbo 1
dl 0
loc 141
ccs 21
cts 35
cp 0.6
rs 10
c 0
b 0
f 0

3 Methods

Rating   Name   Duplication   Size   Complexity  
A countChars() 0 17 4
B convertToUTF8() 0 45 5
A checkForIllegalCodepoints() 0 36 3
1
<?php
2
namespace Masterminds\HTML5\Parser;
3
/*
4
 *
5
* Portions based on code from html5lib files with the following copyright:
6
7
Copyright 2009 Geoffrey Sneddon <http://gsnedders.com/>
8
9
Permission is hereby granted, free of charge, to any person obtaining a
10
copy of this software and associated documentation files (the
11
    "Software"), to deal in the Software without restriction, including
12
without limitation the rights to use, copy, modify, merge, publish,
13
distribute, sublicense, and/or sell copies of the Software, and to
14
permit persons to whom the Software is furnished to do so, subject to
15
the following conditions:
16
17
The above copyright notice and this permission notice shall be included
18
in all copies or substantial portions of the Software.
19
20
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
21
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
23
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
24
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
25
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
26
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27
28
*/
29
30
use Masterminds\HTML5\Exception;
31
32
/**
33
 * UTF-8 Utilities
34
 */
35
class UTF8Utils
{
    /**
     * The Unicode replacement character (U+FFFD), encoded as UTF-8.
     */
    const FFFD = "\xEF\xBF\xBD";

    /**
     * Count the number of characters in a UTF-8 string.
     *
     * The first available strategy is used, in this order: mbstring,
     * iconv, utf8_decode(), and finally a byte-frequency counter that
     * needs no extension at all.
     *
     * @todo Move this to a general utility class.
     *
     * @param string $string The UTF-8 encoded string to measure.
     *
     * @return int The number of characters (not bytes) in $string.
     */
    public static function countChars($string)
    {
        if (function_exists('mb_strlen')) {
            return mb_strlen($string, 'utf-8');
        }

        if (function_exists('iconv_strlen')) {
            return iconv_strlen($string, 'utf-8');
        }

        if (function_exists('utf8_decode')) {
            // utf8_decode() emits exactly one byte per decoded character
            // (a "?" for anything that does not fit ISO-8859-1), so the byte
            // length of its result equals the character count.
            return strlen(utf8_decode($string));
        }

        // Last resort: every character contributes exactly one byte that is
        // either ASCII or a multi-byte lead byte; continuation bytes
        // (0x80-0xBF) are deliberately not counted.
        $count = count_chars($string);

        // 0x80 = 0x7F - 0x00 + 1 (one added to get inclusive range)
        // 0x33 = 0xF4 - 0xC2 + 1 (one added to get inclusive range)
        return array_sum(array_slice($count, 0, 0x80)) + array_sum(array_slice($count, 0xC2, 0x33));
    }

    /**
     * Convert data from the given encoding to UTF-8.
     *
     * This has not yet been tested with character sets other than UTF-8.
     * It should work with ISO-8859-1/-13 and standard Latin Win charsets.
     *
     * A single leading UTF-8 byte order mark is removed from the result.
     *
     * @param string $data
     *            The data to convert.
     * @param string $encoding
     *            A valid encoding. Examples: http://www.php.net/manual/en/mbstring.supported-encodings.php
     *
     * @return string The converted data, without a leading BOM.
     *
     * @throws Exception When neither mbstring nor iconv can perform the conversion.
     */
    public static function convertToUTF8($data, $encoding = 'UTF-8')
    {
        /*
         * From the HTML5 spec: Given an encoding, the bytes in the input stream must be converted to Unicode characters for the tokeniser, as described by the rules for that encoding, except that the leading U+FEFF BYTE ORDER MARK character, if any, must not be stripped by the encoding layer (it is stripped by the rule below). Bytes or sequences of bytes in the original byte stream that could not be converted to Unicode characters must be converted to U+FFFD REPLACEMENT CHARACTER code points.
         */

        // mb_convert_encoding is chosen over iconv because of a bug. The best
        // details for the bug are on http://us1.php.net/manual/en/function.iconv.php#108643
        // which contains links to the actual bug reports as well as work around
        // details.
        if (function_exists('mb_convert_encoding')) {
            // mb library has the following behaviors:
            // - UTF-16 surrogates result in false.
            // - Overlongs and outside Plane 16 result in empty strings.

            // Before we run mb_convert_encoding we need to tell it what to do with
            // characters it does not know. This could be different than the parent
            // application executing this library so we store the value, change it
            // to our needs, and then change it back when we are done.
            $save = mb_substitute_character();
            mb_substitute_character('none');

            // The setting is process-wide, so it must be restored even when the
            // conversion throws (PHP 8 raises ValueError for an unknown encoding).
            // Two catch clauses are used instead of "finally" so the file keeps
            // parsing on PHP 5.3; on PHP 5 the \Throwable clause never matches.
            try {
                $data = mb_convert_encoding($data, 'UTF-8', $encoding);
            } catch (\Exception $e) {
                mb_substitute_character($save);
                throw $e;
            } catch (\Throwable $e) {
                mb_substitute_character($save);
                throw $e;
            }

            mb_substitute_character($save);
        } elseif (function_exists('iconv') && 'auto' !== $encoding) {
            // @todo Get iconv running in at least some environments if that is possible.
            // iconv has the following behaviors:
            // - Overlong representations are ignored.
            // - Beyond Plane 16 is replaced with a lower char.
            // - Incomplete sequences generate a warning.
            $data = @iconv($encoding, 'UTF-8//IGNORE', $data);
        } else {
            throw new Exception('Not implemented, please install mbstring or iconv');
        }

        /*
         * One leading U+FEFF BYTE ORDER MARK character must be ignored if any are present.
         */
        if (substr($data, 0, 3) === "\xEF\xBB\xBF") {
            // The cast keeps the return type a string on PHP 5, where substr()
            // yields false when the input consists of nothing but the BOM.
            $data = (string) substr($data, 3);
        }

        return $data;
    }

    /**
     * Checks for Unicode code points that are not valid in a document.
     *
     * The input is only inspected, never modified: one message is produced
     * per offending character. All "null-character" messages come first,
     * followed by all "invalid-codepoint" messages.
     *
     * @param string $data A UTF-8 encoded string to analyze.
     *
     * @return array An array of (string) error messages produced by the scanning.
     */
    public static function checkForIllegalCodepoints($data)
    {
        // Vestigial error handling.
        $errors = array();

        /*
         * Any occurrence of a U+0000 null character is a parse error. (The spec also asks for it to be replaced by U+FFFD; this method only reports it.)
         */
        $nullCount = substr_count($data, "\0");
        for ($i = 0; $i < $nullCount; ++$i) {
            $errors[] = 'null-character';
        }

        /*
         * Any occurrences of any characters in the ranges U+0001 to U+0008, U+000B, U+000E to U+001F, U+007F to U+009F, U+D800 to U+DFFF , U+FDD0 to U+FDEF, and characters U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, U+3FFFE, U+3FFFF, U+4FFFE, U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE, U+6FFFF, U+7FFFE, U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF, U+AFFFE, U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE, U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE, U+FFFFF, U+10FFFE, and U+10FFFF are parse errors. (These are all control characters or permanently undefined Unicode characters.)
         */
        // The pattern works on raw bytes (no "u" modifier). In the last
        // alternative the second byte is restricted to 8F, 9F, AF or BF: only
        // those values put 0xF in the low nibble of the plane-relative offset,
        // which is what makes the code point end in FFFE or FFFF. A full
        // 8F-BF range would also flag valid characters such as U+20FFE.
        $invalidCount = preg_match_all(
            '/(?:
        [\x01-\x08\x0B\x0E-\x1F\x7F] # U+0001 to U+0008, U+000B,  U+000E to U+001F and U+007F
      |
        \xC2[\x80-\x9F] # U+0080 to U+009F
      |
        \xED(?:\xA0[\x80-\xFF]|[\xA1-\xBE][\x00-\xFF]|\xBF[\x00-\xBF]) # U+D800 to U+DFFF
      |
        \xEF\xB7[\x90-\xAF] # U+FDD0 to U+FDEF
      |
        \xEF\xBF[\xBE\xBF] # U+FFFE and U+FFFF
      |
        [\xF0-\xF4][\x8F\x9F\xAF\xBF]\xBF[\xBE\xBF] # U+nFFFE and U+nFFFF (1 <= n <= 10_{16})
      )/x',
            $data,
            $matches
        );

        // preg_match_all() yields false on a PCRE failure; the loop then
        // simply adds nothing.
        for ($i = 0; $i < $invalidCount; ++$i) {
            $errors[] = 'invalid-codepoint';
        }

        return $errors;
    }
}
176