Complex classes like Tokenizer often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use Tokenizer, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
27 | class Tokenizer |
||
28 | { |
||
29 | protected $scanner; |
||
30 | |||
31 | protected $events; |
||
32 | |||
33 | protected $tok; |
||
34 | |||
35 | /** |
||
36 | * Buffer for text. |
||
37 | */ |
||
38 | protected $text = ''; |
||
39 | |||
40 | // When this goes to false, the parser stops. |
||
41 | protected $carryOn = true; |
||
42 | |||
43 | protected $textMode = 0; // TEXTMODE_NORMAL; |
||
44 | protected $untilTag = null; |
||
45 | |||
46 | const CONFORMANT_XML = 'xml'; |
||
47 | const CONFORMANT_HTML = 'html'; |
||
48 | protected $mode = self::CONFORMANT_HTML; |
||
49 | |||
/**
 * Build a tokenizer around an input scanner and an output event handler.
 *
 * The constructor only stores its three arguments. The usual workflow is
 * to create the tokenizer with a scanner (input) and an event handler
 * (output), and then call Tokenizer::parse().
 *
 * @param Scanner      $scanner      A scanner initialized with an input stream.
 * @param EventHandler $eventHandler An event handler, initialized and ready to receive events.
 * @param string       $mode         Conformance mode; defaults to self::CONFORMANT_HTML.
 */
public function __construct($scanner, $eventHandler, $mode = self::CONFORMANT_HTML)
{
    $this->mode = $mode;
    $this->events = $eventHandler;
    $this->scanner = $scanner;
}
|
67 | |||
/**
 * Tokenize the document.
 *
 * Calls consumeData() repeatedly, at least once, until the carryOn flag
 * has been cleared. Tokens are emitted into the event handler as they
 * are recognized. Errors are emitted into the event handler as well, and
 * the parser attempts to keep going until the entire input stream has
 * been read.
 */
public function parse()
{
    // FIXME: Add infinite loop protection.
    for (;;) {
        $this->consumeData();

        if (!$this->carryOn) {
            return;
        }
    }
}
|
86 | |||
/**
 * Select how the character data reader treats upcoming text.
 *
 * HTML5 defines three different modes for reading text:
 * - Normal: Read until a tag is encountered.
 * - RCDATA: Read until a tag is encountered, but skip a few otherwise-
 *   special characters.
 * - Raw: Read until a special closing tag is encountered (viz. pre, script)
 *
 * Normally the event handler selects the mode through a special return
 * code on startTag(), but this method allows it to be set manually.
 *
 * Only the Elements::TEXT_RAW and Elements::TEXT_RCDATA bits of the given
 * value are kept; every other bit is masked away.
 *
 * @param int    $textmode One of Elements::TEXT_*.
 * @param string $untilTag The tag that should stop RAW or RCDATA mode. Normal mode does not
 *                         use this indicator.
 */
public function setTextMode($textmode, $untilTag = null)
{
    $supportedModes = Elements::TEXT_RAW | Elements::TEXT_RCDATA;

    $this->untilTag = $untilTag;
    $this->textMode = $textmode & $supportedModes;
}
|
110 | |||
/**
 * Consume a character and make a move.
 * HTML5 8.2.4.1.
 *
 * One call performs up to three steps, in this order:
 *  1. If the current token is '&', decode the character reference and
 *     buffer the result.
 *  2. If the current token is '<', flush the buffered text and dispatch
 *     on the following character to the markup declaration, end tag,
 *     processing instruction or tag name handler. Anything else is
 *     reported as an illegal tag opening.
 *  3. If the scanner is exhausted, signal EOF; otherwise read character
 *     data according to the current text mode.
 *
 * @return bool The carryOn flag; eof() sets it to false.
 */
protected function consumeData()
{
    $tok = $this->scanner->current();

    if ('&' === $tok) {
        // Character reference
        $ref = $this->decodeCharacterReference();
        $this->buffer($ref);

        $tok = $this->scanner->current();
    }

    // Parse tag
    if ('<' === $tok) {
        // Any buffered text data can go out now.
        $this->flushBuffer();

        $tok = $this->scanner->next();

        if ('!' === $tok) {
            $this->markupDeclaration();
        } elseif ('/' === $tok) {
            $this->endTag();
        } elseif ('?' === $tok) {
            $this->processingInstruction();
        } elseif (is_string($tok) && ctype_alpha($tok)) {
            // The is_string() guard matters when the input ends right
            // after '<': the token is then false, and passing a
            // non-string to ctype_alpha() is deprecated as of PHP 8.1.
            $this->tagName();
        } else {
            $this->parseError('Illegal tag opening');
            // TODO is this necessary ?
            $this->characterData();
        }

        $tok = $this->scanner->current();
    }

    if (false === $tok) {
        // Handle end of document
        $this->eof();
    } else {
        // Parse character
        switch ($this->textMode) {
            case Elements::TEXT_RAW:
                $this->rawText($tok);
                break;

            case Elements::TEXT_RCDATA:
                $this->rcdata($tok);
                break;

            default:
                // Markup and character references are handled by the
                // next call, not here.
                if ('<' === $tok || '&' === $tok) {
                    break;
                }

                // NULL character: report it, but still keep it in the text.
                if ("\00" === $tok) {
                    $this->parseError('Received null character.');

                    $this->text .= $tok;
                    $this->scanner->consume();

                    break;
                }

                // Append a whole run of plain text to the buffer in one go.
                $this->text .= $this->scanner->charsUntil("<&\0");
        }
    }

    return $this->carryOn;
}
||
186 | |||
/**
 * Handle whatever currently looks like character data.
 *
 * The handler is chosen by the active text mode: rawText() for
 * Elements::TEXT_RAW, rcdata() for Elements::TEXT_RCDATA, and text() for
 * everything else.
 *
 * @see Elements::TEXT_RAW Elements::TEXT_RCDATA.
 *
 * @return bool False when the scanner is exhausted, or when the token is
 *              '<' or '&' in the default mode; otherwise the result of
 *              the selected handler.
 */
protected function characterData()
{
    $current = $this->scanner->current();
    if (false === $current) {
        return false;
    }

    // Loose comparison on purpose: it mirrors the semantics of a switch.
    if ($this->textMode == Elements::TEXT_RAW) {
        return $this->rawText($current);
    }

    if ($this->textMode == Elements::TEXT_RCDATA) {
        return $this->rcdata($current);
    }

    $startsMarkup = '<' === $current || '&' === $current;

    return $startsMarkup ? false : $this->text($current);
}
||
213 | |||
/**
 * Append the current token to the text buffer and advance the scanner.
 *
 * A NULL character is reported as a parse error but is still buffered.
 *
 * @param string $tok The current token.
 *
 * @return bool True once the token has been buffered; false if the token
 *              is false.
 */
protected function text($tok)
{
    // A false token should never reach this method...
    if (false !== $tok) {
        // NULL character
        if ("\00" === $tok) {
            $this->parseError('Received null character.');
        }

        $this->buffer($tok);
        $this->scanner->consume();

        return true;
    }

    return false;
}
||
238 | |||
/**
 * Read text in RAW mode.
 *
 * If no terminating tag has been configured, the token is handled by
 * text(). Otherwise everything up to the sequence "</" + untilTag + ">"
 * is read and emitted as one text event, the text mode is reset to 0,
 * and processing continues with endTag().
 *
 * @param string $tok The current token.
 *
 * @return bool
 */
protected function rawText($tok)
{
    if (null === $this->untilTag) {
        return $this->text($tok);
    }

    $closingSequence = "</{$this->untilTag}>";
    $this->events->text($this->readUntilSequence($closingSequence));
    $this->setTextMode(0);

    return $this->endTag();
}
||
259 | |||
/**
 * Read text in RCDATA mode.
 *
 * If no terminating tag has been configured, the token is handled by
 * text(). Otherwise characters are collected, with character references
 * decoded, until the input runs out or the scanner reports that "</" +
 * untilTag comes next. The collected text is emitted as one text event,
 * the text mode is reset to 0, and processing continues with endTag().
 *
 * @param string $tok The current token.
 *
 * @return bool
 */
protected function rcdata($tok)
{
    if (null === $this->untilTag) {
        return $this->text($tok);
    }

    $closer = "</{$this->untilTag}";
    $caseSensitive = !Elements::isHtml5Element($this->untilTag);
    $collected = '';

    while (false !== $tok) {
        // Stop as soon as the closing sequence is next in the input.
        if ('<' == $tok && $this->scanner->sequenceMatches($closer, $caseSensitive)) {
            break;
        }

        if ('&' == $tok) {
            $collected .= $this->decodeCharacterReference();
            $tok = $this->scanner->current();

            continue;
        }

        $collected .= $tok;
        $tok = $this->scanner->next();
    }

    // Look past the closing sequence and any whitespace to check for the
    // terminating '>', then rewind by exactly the amount that was read.
    $lookahead = strlen($closer);
    $this->scanner->consume($lookahead);
    $lookahead += $this->scanner->whitespace();
    if ('>' !== $this->scanner->current()) {
        $this->parseError('Unclosed RCDATA end tag');
    }
    $this->scanner->unconsume($lookahead);

    $this->events->text($collected);
    $this->setTextMode(0);

    return $this->endTag();
}
||
299 | |||
/**
 * Finish the document: flush pending text and emit an EOF event.
 */
protected function eof()
{
    $this->flushBuffer();
    $this->events->eof();

    // parse() keeps looping only while this flag is set.
    $this->carryOn = false;
}
|
310 | |||
/**
 * Dispatch a "<!" construct to the matching handler.
 *
 * The character after "<!" decides what follows: "--" starts a comment,
 * "D" or "d" a doctype, and "[" a CDATA section. Anything else is
 * reported as a parse error and consumed as a bogus comment.
 *
 * @return bool The result of the selected handler, or true after a bogus
 *              comment.
 */
protected function markupDeclaration()
{
    $tok = $this->scanner->next();

    // Comment
    if ('-' == $tok && '-' == $this->scanner->peek()) {
        $this->scanner->consume(2);

        return $this->comment();
    }

    // Doctype
    if ('D' == $tok || 'd' == $tok) {
        return $this->doctype();
    }

    // CDATA section
    if ('[' == $tok) {
        return $this->cdataSection();
    }

    // FINISH
    $this->parseError('Expected <!--, <![CDATA[, or <!DOCTYPE. Got <!%s', $tok);
    $this->bogusComment('<!');

    return true;
}
||
335 | |||
336 | /** |
||
337 | * Consume an end tag. See section 8.2.4.9. |
||
338 | */ |
||
339 | 113 | protected function endTag() |
|
376 | |||
377 | /** |
||
378 | * Consume a tag name and body. See section 8.2.4.10. |
||
379 | */ |
||
380 | 116 | protected function tagName() |
|
381 | { |
||
382 | // We know this is at least one char. |
||
383 | 116 | $name = $this->scanner->charsWhile(':_-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'); |
|
384 | 116 | $name = self::CONFORMANT_XML === $this->mode ? $name : strtolower($name); |
|
385 | 116 | $attributes = array(); |
|
386 | 116 | $selfClose = false; |
|
409 | |||
410 | /** |
||
411 | * Check if the scanner has reached the end of a tag. |
||
412 | */ |
||
413 | 116 | protected function isTagEnd(&$selfClose) |
|
449 | |||
450 | /** |
||
451 | * Parse attributes from inside of a tag. |
||
452 | * |
||
453 | * @param string[] $attributes |
||
454 | * |
||
455 | * @return bool |
||
456 | * |
||
457 | * @throws ParseError |
||
458 | */ |
||
459 | 116 | protected function attribute(&$attributes) |
|
513 | |||
514 | /** |
||
515 | * Consume an attribute value. See section 8.2.4.37 and after. |
||
516 | * |
||
517 | * @return string|null |
||
518 | */ |
||
519 | 84 | protected function attributeValue() |
|
555 | |||
556 | /** |
||
557 | * Get an attribute value string. |
||
558 | * |
||
559 | * @param string $quote IMPORTANT: This is a series of chars! Any one of which will be considered |
||
560 | * termination of an attribute's value. E.g. "\"'" will stop at either |
||
561 | * ' or ". |
||
562 | * |
||
563 | * @return string The attribute value. |
||
564 | */ |
||
565 | 80 | protected function quotedAttributeValue($quote) |
|
589 | |||
590 | 1 | protected function unquotedAttributeValue() |
|
628 | |||
629 | /** |
||
630 | * Consume malformed markup as if it were a comment. |
||
631 | * 8.2.4.44. |
||
632 | * |
||
633 | * The spec requires that the ENTIRE tag-like thing be enclosed inside of |
||
634 | * the comment. So this will generate comments like: |
||
635 | * |
||
636 | * <!--</+foo>--> |
||
637 | * |
||
638 | * @param string $leading Prepend any leading characters. This essentially |
||
639 | * negates the need to backtrack, but it's sort of a hack. |
||
640 | * |
||
641 | * @return bool |
||
642 | */ |
||
643 | 3 | protected function bogusComment($leading = '') |
|
661 | |||
662 | /** |
||
663 | * Read a comment. |
||
664 | * Expects the first tok to be inside of the comment. |
||
665 | * |
||
666 | * @return bool |
||
667 | */ |
||
668 | 6 | protected function comment() |
|
697 | |||
698 | /** |
||
699 | * Check if the scanner has reached the end of a comment. |
||
700 | * |
||
701 | * @return bool |
||
702 | */ |
||
703 | 6 | protected function isCommentEnd() |
|
730 | |||
731 | /** |
||
732 | * Parse a DOCTYPE. |
||
733 | * |
||
734 | * Parse a DOCTYPE declaration. This method has strong bearing on whether or |
||
735 | * not Quirksmode is enabled on the event handler. |
||
736 | * |
||
737 | * @todo This method is a little long. Should probably refactor. |
||
738 | * |
||
739 | * @return bool |
||
740 | */ |
||
741 | 98 | protected function doctype() |
|
852 | |||
853 | /** |
||
854 | * Utility for reading a quoted string. |
||
855 | * |
||
856 | * @param string $stopchars Characters (in addition to a close-quote) that should stop the string. |
||
857 | * E.g. sometimes '>' is higher precedence than '"' or "'". |
||
858 | * |
||
859 | * @return mixed String if one is found (quotations omitted). |
||
860 | */ |
||
861 | 1 | protected function quotedString($stopchars) |
|
879 | |||
880 | /** |
||
881 | * Handle a CDATA section. |
||
882 | * |
||
883 | * @return bool |
||
884 | */ |
||
885 | 7 | protected function cdataSection() |
|
916 | |||
917 | // ================================================================ |
||
918 | // Non-HTML5 |
||
919 | // ================================================================ |
||
920 | |||
921 | /** |
||
922 | * Handle a processing instruction. |
||
923 | * |
||
924 | * XML processing instructions are supposed to be ignored in HTML5, |
||
925 | * treated as "bogus comments". However, since we're not a user |
||
926 | * agent, we allow them. We consume until ?> and then issue a |
||
927 | * EventListener::processingInstruction() event. |
||
928 | * |
||
929 | * @return bool |
||
930 | */ |
||
931 | 7 | protected function processingInstruction() |
|
968 | |||
969 | // ================================================================ |
||
970 | // UTILITY FUNCTIONS |
||
971 | // ================================================================ |
||
972 | |||
973 | /** |
||
974 | * Read from the input stream until we get to the desired sequence |
||
975 | * or hit the end of the input stream. |
||
976 | * |
||
977 | * @param string $sequence |
||
978 | * |
||
979 | * @return string |
||
980 | */ |
||
981 | 8 | protected function readUntilSequence($sequence) |
|
1003 | |||
1004 | /** |
||
1005 | * Check if upcoming chars match the given sequence. |
||
1006 | * |
||
1007 | * This will read the stream for the $sequence. If it's |
||
1008 | * found, this will return true. If not, return false. |
||
1009 | * Since this unconsumes any chars it reads, the caller |
||
1010 | * will still need to read the next sequence, even if |
||
1011 | * this returns true. |
||
1012 | * |
||
1013 | * Example: $this->scanner->sequenceMatches('</script>') will |
||
1014 | * see if the input stream is at the start of a |
||
1015 | * '</script>' string. |
||
1016 | * |
||
1017 | * @param string $sequence |
||
1018 | * @param bool $caseSensitive |
||
1019 | * |
||
1020 | * @return bool |
||
1021 | */ |
||
1022 | protected function sequenceMatches($sequence, $caseSensitive = true) |
||
1028 | |||
1029 | /** |
||
1030 | * Send a TEXT event with the contents of the text buffer. |
||
1031 | * |
||
1032 | * This emits an EventHandler::text() event with the current contents of the |
||
1033 | * temporary text buffer. (The buffer is used to group as much PCDATA |
||
1034 | * as we can instead of emitting lots and lots of TEXT events.) |
||
1035 | */ |
||
1036 | 130 | protected function flushBuffer() |
|
1044 | |||
1045 | /** |
||
1046 | * Add text to the temporary buffer. |
||
1047 | * |
||
1048 | * @see flushBuffer() |
||
1049 | * |
||
1050 | * @param string $str |
||
1051 | */ |
||
1052 | 10 | protected function buffer($str) |
|
1056 | |||
1057 | /** |
||
1058 | * Emit a parse error. |
||
1059 | * |
||
1060 | * A parse error always returns false because it never consumes any |
||
1061 | * characters. |
||
1062 | * |
||
1063 | * @param string $msg |
||
1064 | * |
||
1065 | * @return bool |
||
1066 | */ |
||
1067 | 16 | protected function parseError($msg) |
|
1082 | |||
1083 | /** |
||
1084 | * Decode a character reference and return the string. |
||
1085 | * |
||
1086 | * If $inAttribute is set to true, a bare & will be returned as-is. |
||
1087 | * |
||
1088 | * @param bool $inAttribute Set to true if the text is inside of an attribute value. |
||
1089 | * false otherwise. |
||
1090 | * |
||
1091 | * @return string |
||
1092 | */ |
||
1093 | 13 | protected function decodeCharacterReference($inAttribute = false) |
|
1196 | } |
||
1197 |
This check looks for type mismatches where the missing type is `false`. This is usually indicative of an error condition.
Consider the following example: a function such as PHP's `date_create()` either returns a new `DateTime` object or `false` if there was an error. This is a typical pattern in PHP programming to show that an error has occurred without raising an exception. The calling code should check for this returned `false` before passing on the value to another function or method that may not be able to handle a `false`.