Conditions | 6 |
Paths | 9 |
Total Lines | 105 |
Code Lines | 34 |
Lines | 12 |
Ratio | 11.43 % |
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.
Commonly applied refactorings include:
- Extract Method
If many parameters/temporary variables are present:
- Replace Temp with Query
- Introduce Parameter Object
- Preserve Whole Object
1 | <?php |
||
/**
 * Pre-processes raw input into the byte string that is stored for the
 * tokenization stage, recording parse errors along the way.
 *
 * Steps, in order:
 *  1. Drop byte sequences that are not valid UTF-8 (via iconv).
 *  2. Strip one leading U+FEFF BYTE ORDER MARK.
 *  3. Record one 'null-character' parse error per U+0000, then replace
 *     each U+0000 with U+FFFD and normalise CRLF / CR to LF.
 *  4. Record one 'invalid-codepoint' parse error per control character,
 *     surrogate or noncharacter found (only when PCRE is loaded).
 *
 * Afterwards $this->data holds the processed string, $this->char is 0 and
 * $this->EOF is the byte length of the processed string.
 *
 * @param string $data Raw input; currently assumed to be UTF-8.
 *
 * @throws Exception When the iconv extension is not loaded.
 */
public function __construct($data) {

    /* Given an encoding, the bytes in the input stream must be
    converted to Unicode characters for the tokeniser, as
    described by the rules for that encoding, except that the
    leading U+FEFF BYTE ORDER MARK character, if any, must not
    be stripped by the encoding layer (it is stripped by the rule below).

    Bytes or sequences of bytes in the original byte stream that
    could not be converted to Unicode characters must be converted
    to U+FFFD REPLACEMENT CHARACTER code points. */

    // XXX currently assuming input data is UTF-8; once we
    // build encoding detection this will no longer be the case
    //
    // We previously had an mbstring implementation here, but that
    // implementation is heavily non-conforming, so it's been
    // omitted.
    if (!extension_loaded('iconv')) {
        // we can make a conforming native implementation
        // NOTE(review): the message mentions mbstring although only iconv
        // is consulted here; kept unchanged so existing callers see the
        // same text.
        throw new Exception('Not implemented, please install mbstring or iconv');
    }

    // non-conforming: invalid sequences are dropped (//IGNORE) rather than
    // replaced with U+FFFD as the rule above requires.
    $data = @iconv('UTF-8', 'UTF-8//IGNORE', $data);
    if ($data === false) {
        // iconv() reports failure by returning false. Make the outcome
        // explicit (an empty input) instead of letting false leak into the
        // string functions below.
        // NOTE(review): the whole input is discarded in this case; consider
        // surfacing it to the caller.
        $data = '';
    }

    /* One leading U+FEFF BYTE ORDER MARK character must be
    ignored if any are present. */
    if (substr($data, 0, 3) === "\xEF\xBB\xBF") {
        $data = substr($data, 3);
    }

    /* All U+0000 NULL characters in the input must be replaced
    by U+FFFD REPLACEMENT CHARACTERs. Any occurrences of such
    characters is a parse error. */
    $nullCount = substr_count($data, "\0");
    for ($i = 0; $i < $nullCount; $i++) {
        $this->errors[] = array(
            'type' => HTML5_Tokenizer::PARSEERROR,
            'data' => 'null-character'
        );
    }

    /* U+000D CARRIAGE RETURN (CR) characters and U+000A LINE FEED
    (LF) characters are treated specially. Any CR characters
    that are followed by LF characters must be removed, and any
    CR characters not followed by LF characters must be converted
    to LF characters. Thus, newlines in HTML DOMs are represented
    by LF characters, and there are never any CR characters in the
    input to the tokenization stage. */
    // Order matters: "\r\n" must be replaced before the lone "\r" so that a
    // CRLF pair collapses to a single LF.
    $data = str_replace(
        array(
            "\0",
            "\r\n",
            "\r"
        ),
        array(
            "\xEF\xBF\xBD",
            "\n",
            "\n"
        ),
        $data
    );

    /* Any occurrences of any characters in the ranges U+0001 to
    U+0008, U+000B,  U+000E to U+001F,  U+007F  to U+009F,
    U+D800 to U+DFFF , U+FDD0 to U+FDEF, and
    characters U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF,
    U+3FFFE, U+3FFFF, U+4FFFE, U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE,
    U+6FFFF, U+7FFFE, U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF,
    U+AFFFE, U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE,
    U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE, U+FFFFF, U+10FFFE, and
    U+10FFFF are parse errors. (These are all control characters
    or permanently undefined Unicode characters.) */
    // Check PCRE is loaded.
    if (extension_loaded('pcre')) {
        // The pattern works on raw bytes (no u modifier). In the last
        // alternative the second byte is restricted to 8F, 9F, AF or BF:
        // those are the only values that yield a code point ending in
        // FFFE or FFFF. A wider range would also flag ordinary characters
        // such as U+10FFE (F0 90 BF BE).
        $count = preg_match_all(
            '/(?:
                [\x01-\x08\x0B\x0E-\x1F\x7F] # U+0001 to U+0008, U+000B,  U+000E to U+001F and U+007F
            |
                \xC2[\x80-\x9F] # U+0080 to U+009F
            |
                \xED(?:\xA0[\x80-\xFF]|[\xA1-\xBE][\x00-\xFF]|\xBF[\x00-\xBF]) # U+D800 to U+DFFF
            |
                \xEF\xB7[\x90-\xAF] # U+FDD0 to U+FDEF
            |
                \xEF\xBF[\xBE\xBF] # U+FFFE and U+FFFF
            |
                [\xF0-\xF4][\x8F\x9F\xAF\xBF]\xBF[\xBE\xBF] # U+nFFFE and U+nFFFF (1 <= n <= 10_{16})
            )/x',
            $data,
            $matches
        );
        // One parse error per offending character. If preg_match_all()
        // returns false the loop simply does not run.
        for ($i = 0; $i < $count; $i++) {
            $this->errors[] = array(
                'type' => HTML5_Tokenizer::PARSEERROR,
                'data' => 'invalid-codepoint'
            );
        }
    } else {
        // XXX: Need non-PCRE impl, probably using substr_count
    }

    $this->data = $data;
    $this->char = 0;
    $this->EOF  = strlen($data);
}
||
161 | |||
285 |
You can fix this by adding a namespace to your class:
namespace YourVendor;
class YourClass { }
When choosing a vendor namespace, try to pick something that is not too generic to avoid conflicts with other libraries.