Completed
Pull Request — master (#107)
by Asmir
03:02
created

StringInputStream::current()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 1
Bugs 0 Features 0
Metric Value
c 1
b 0
f 0
dl 0
loc 4
ccs 2
cts 2
cp 1
rs 10
cc 1
eloc 2
nc 1
nop 0
crap 1
1
<?php
2
/**
3
 * Loads a string to be parsed.
4
 */
5
namespace Masterminds\Html5\Parser;
6
7
/*
8
 *
9
* Based on code from html5lib:
10
11
Copyright 2009 Geoffrey Sneddon <http://gsnedders.com/>
12
13
Permission is hereby granted, free of charge, to any person obtaining a
14
copy of this software and associated documentation files (the
15
    "Software"), to deal in the Software without restriction, including
16
without limitation the rights to use, copy, modify, merge, publish,
17
distribute, sublicense, and/or sell copies of the Software, and to
18
permit persons to whom the Software is furnished to do so, subject to
19
the following conditions:
20
21
The above copyright notice and this permission notice shall be included
22
in all copies or substantial portions of the Software.
23
24
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
25
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
27
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
28
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
29
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
30
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31
32
*/
33
34
// Some conventions:
35
// - /* */ indicates verbatim text from the HTML 5 specification
36
//   MPB: Not sure which version of the spec. Moving from HTML5lib to
37
//   HTML5-PHP, I have been using this version:
38
//   http://www.w3.org/TR/2012/CR-html5-20121217/Overview.html#contents
39
//
40
// - // indicates regular comments
41
42
class StringInputStream implements InputStream
{
    /**
     * The string data we're parsing, as returned by
     * UTF8Utils::convertToUTF8() and then passed through replaceLinefeeds().
     */
    private $data;

    /**
     * The current integer byte position we are at in $data.
     */
    private $char;

    /**
     * Length of $data in bytes; when $char === $EOF, we are at the end-of-file.
     */
    private $EOF;

    /**
     * Parse errors, as reported by UTF8Utils::checkForIllegalCodepoints().
     */
    public $errors = array();

    /**
     * Create a new InputStream wrapper.
     *
     * @param string $data
     *            Data to parse.
     * @param string $encoding
     *            The encoding $data is supplied in; it is handed to
     *            UTF8Utils::convertToUTF8() together with the data.
     */
    public function __construct($data, $encoding = 'UTF-8')
    {
        $data = UTF8Utils::convertToUTF8($data, $encoding);

        // There is good reason to question whether it makes sense to
        // do this here, since most of these checks are done during
        // parsing, and since this check doesn't actually *do* anything.
        $this->errors = UTF8Utils::checkForIllegalCodepoints($data);

        $data = $this->replaceLinefeeds($data);

        $this->data = $data;
        $this->char = 0;
        $this->EOF = strlen($data);
    }

    /**
     * Replace linefeed characters according to the spec.
     *
     * Besides newline normalization, NUL bytes are replaced with the UTF-8
     * encoding of U+FFFD (REPLACEMENT CHARACTER).
     *
     * @param string $data
     *            The data to normalize.
     *
     * @return string The normalized data.
     */
    protected function replaceLinefeeds($data)
    {
        /*
         * U+000D CARRIAGE RETURN (CR) characters and U+000A LINE FEED (LF) characters are treated specially. Any CR characters that are followed by LF characters must be removed, and any CR characters not followed by LF characters must be converted to LF characters. Thus, newlines in HTML DOMs are represented by LF characters, and there are never any CR characters in the input to the tokenization stage.
         */
        // strtr() with an array tries the longest key first, so "\r\n" is
        // collapsed to a single "\n" before the lone "\r" rule can apply.
        $crlfTable = array(
            "\0" => "\xEF\xBF\xBD",
            "\r\n" => "\n",
            "\r" => "\n"
        );

        return strtr($data, $crlfTable);
    }

    /**
     * Returns the current line that the tokenizer is at.
     *
     * @return int The 1-based line number of the current position.
     */
    public function currentLine()
    {
        // Empty input, or nothing consumed yet: we are on the first line.
        if (0 === $this->EOF || 0 === $this->char) {
            return 1;
        }

        // Count the newlines in the bytes before the current position. The
        // length is clamped to EOF because next() may have moved the pointer
        // past the end of the data.
        return substr_count($this->data, "\n", 0, min($this->char, $this->EOF)) + 1;
    }

    /**
     * Alias of currentLine().
     *
     * @deprecated
     *
     * @return int The 1-based line number of the current position.
     */
    public function getCurrentLine()
    {
        // The original called the bare function currentLine(), which does
        // not exist; the call has to go through $this.
        return $this->currentLine();
    }

    /**
     * Returns the current column of the current line that the tokenizer is at.
     *
     * Newlines are column 0. The first char after a newline is column 1.
     *
     * @return int The column number.
     */
    public function columnOffset()
    {
        // Short circuit for the first char.
        if (0 === $this->char) {
            return 0;
        }

        // strrpos is weird, and the offset needs to be negative for what we
        // want (i.e., the last \n before $this->char). This needs to not have
        // one (to make it point to the next character, the one we want the
        // position of) added to it because strrpos's behaviour includes the
        // final offset byte.
        $backwardFrom = $this->char - 1 - strlen($this->data);
        $lastLine = strrpos($this->data, "\n", $backwardFrom);

        if (false !== $lastLine) {
            // Everything between the last newline and the current position.
            $findLengthOf = substr($this->data, $lastLine + 1, $this->char - 1 - $lastLine);
        } else {
            // No newline precedes the current position: we are still on the
            // first line, so measure from the start of the data.
            $findLengthOf = substr($this->data, 0, $this->char);
        }

        // The slice is measured by UTF8Utils::countChars() rather than by
        // its byte length.
        return UTF8Utils::countChars($findLengthOf);
    }

    /**
     * Alias of columnOffset().
     *
     * @deprecated
     *
     * @return int The column number.
     */
    public function getColumnOffset()
    {
        return $this->columnOffset();
    }

    /**
     * Get the current character.
     *
     * This reads a single byte at the current position without any bounds
     * check; callers are expected to consult valid() first.
     *
     * @return string The current character.
     */
    public function current()
    {
        return $this->data[$this->char];
    }

    /**
     * Advance the pointer.
     * This is part of the Iterator interface.
     */
    public function next()
    {
        ++$this->char;
    }

    /**
     * Rewind to the start of the string.
     */
    public function rewind()
    {
        $this->char = 0;
    }

    /**
     * Is the current pointer location valid.
     *
     * @return bool True while the pointer is before the end of the data.
     */
    public function valid()
    {
        return $this->char < $this->EOF;
    }

    /**
     * Get all characters until EOF.
     *
     * This reads to the end of the file, and sets the read marker at the
     * end of the file.
     *
     * @note This performs bounds checking
     *
     * @return string Returns the remaining text. If called when the InputStream is
     *         already exhausted, it returns an empty string.
     */
    public function remainingChars()
    {
        if ($this->char < $this->EOF) {
            $data = substr($this->data, $this->char);
            $this->char = $this->EOF;

            return $data;
        }

        return '';
    }

    /**
     * Read to a particular match (or until $max bytes are consumed).
     *
     * This operates on byte sequences, not characters.
     *
     * Matches as far as possible until we reach a certain set of bytes
     * and returns the matched substring. The pointer is advanced past the
     * returned bytes.
     *
     * @param string $bytes
     *            Bytes to match.
     * @param int $max
     *            Maximum number of bytes to scan.
     *
     * @return string|false The consumed substring (possibly empty), or false
     *         if the stream is already exhausted. Use strict comparison when
     *         checking the result, since an empty string is also falsy.
     */
    public function charsUntil($bytes, $max = null)
    {
        if ($this->char >= $this->EOF) {
            return false;
        }

        // An explicit 0 is a valid limit and must not be treated as "no limit".
        if (0 === $max || $max) {
            $len = strcspn($this->data, $bytes, $this->char, $max);
        } else {
            $len = strcspn($this->data, $bytes, $this->char);
        }

        $string = (string) substr($this->data, $this->char, $len);
        $this->char += $len;

        return $string;
    }

    /**
     * Returns the string so long as $bytes matches.
     *
     * Matches as far as possible with a certain set of bytes
     * and returns the matched substring. The pointer is advanced past the
     * returned bytes.
     *
     * @param string $bytes
     *            A mask of bytes to match. If ANY byte in this mask matches the
     *            current char, the pointer advances and the char is part of the
     *            substring.
     * @param int $max
     *            The max number of chars to read.
     *
     * @return string|false The consumed substring (possibly empty), or false
     *         if the stream is already exhausted.
     */
    public function charsWhile($bytes, $max = null)
    {
        if ($this->char >= $this->EOF) {
            return false;
        }

        // An explicit 0 is a valid limit and must not be treated as "no limit".
        if (0 === $max || $max) {
            $len = strspn($this->data, $bytes, $this->char, $max);
        } else {
            $len = strspn($this->data, $bytes, $this->char);
        }

        $string = (string) substr($this->data, $this->char, $len);
        $this->char += $len;

        return $string;
    }

    /**
     * Unconsume characters.
     *
     * If stepping back $howMany positions would move the pointer before the
     * start of the data, the call is ignored and the pointer stays put.
     *
     * @param int $howMany
     *            The number of characters to unconsume.
     */
    public function unconsume($howMany = 1)
    {
        if (($this->char - $howMany) >= 0) {
            $this->char -= $howMany;
        }
    }

    /**
     * Look ahead without moving cursor.
     *
     * @return string|false The byte following the current one, or false when
     *         there is no such byte.
     */
    public function peek()
    {
        // Strictly less than EOF: index $EOF itself is one past the last
        // byte, and reading it triggers an "Uninitialized string offset"
        // error instead of reaching the false branch below.
        if (($this->char + 1) < $this->EOF) {
            return $this->data[$this->char + 1];
        }

        return false;
    }

    /**
     * Get the current pointer position.
     *
     * @return int The current byte offset into the data.
     */
    public function key()
    {
        return $this->char;
    }
}
326