1
|
|
|
<?php |
2
|
|
|
/**
 * CodeIgniter_Sniffs_Files_Utf8EncodingSniff.
 *
 * PHP version 5
 *
 * @category  PHP
 * @package   PHP_CodeSniffer
 * @author    Thomas Ernest <[email protected]>
 * @copyright 2006 Thomas Ernest
 * @license   http://thomas.ernest.fr/developement/php_cs/licence GNU General Public License
 * @link      http://pear.php.net/package/PHP_CodeSniffer
 */
14
|
|
|
|
15
|
|
|
/**
 * CodeIgniter_Sniffs_Files_Utf8EncodingSniff.
 *
 * Ensures that PHP files are encoded with Unicode (UTF-8) encoding.
 *
 * @category  PHP
 * @package   PHP_CodeSniffer
 * @author    Thomas Ernest <[email protected]>
 * @copyright 2006 Thomas Ernest
 * @license   http://thomas.ernest.fr/developement/php_cs/licence GNU General Public License
 * @link      http://pear.php.net/package/PHP_CodeSniffer
 */
27
|
|
|
|
28
|
|
|
namespace CodeIgniter\Sniffs\Files; |
29
|
|
|
|
30
|
|
|
use PHP_CodeSniffer\Sniffs\Sniff; |
31
|
|
|
use PHP_CodeSniffer\Files\File; |
32
|
|
|
|
33
|
|
|
/**
 * Reports PHP files whose raw bytes are not valid UTF-8.
 *
 * The file is read once, when its first open tag is reached, and its content
 * is run through three independent validators (mbstring, the W3C regular
 * expression and the RFC 3629 octet grammar). Every validator that rejects
 * the content adds its own error on the first token of the file.
 */
class Utf8EncodingSniff implements Sniff
{

    /**
     * Returns an array of tokens this test wants to listen for.
     *
     * @return array
     */
    public function register()
    {
        return array(T_OPEN_TAG);

    }//end register()


    /**
     * Processes this test, when one of its tokens is encountered.
     *
     * @param File $phpcsFile The current file being scanned.
     * @param int  $stackPtr  The position of the current token
     *                        in the stack passed in $tokens.
     *
     * @return void
     */
    public function process(File $phpcsFile, $stackPtr)
    {
        // We are only interested if this is the first open tag.
        if ($stackPtr !== 0
            && $phpcsFile->findPrevious(T_OPEN_TAG, ($stackPtr - 1)) !== false
        ) {
            return;
        }

        $file_path = $phpcsFile->getFilename();

        // The raw bytes are re-read from disk. When the reported name is not
        // a readable regular file there is nothing to validate, so skip the
        // check instead of letting file_get_contents() raise a warning.
        if (is_file($file_path) === false || is_readable($file_path) === false) {
            return;
        }

        $file_content = file_get_contents($file_path);
        if ($file_content === false) {
            return;
        }

        $file_name = basename($file_path);

        // Each error carries its own code: File::addError() requires one, and
        // it lets rulesets enable, disable or re-level the checks separately.
        if (mb_check_encoding($file_content, 'UTF-8') === false) {
            $error = 'File "' . $file_name . '" should be saved with Unicode (UTF-8) encoding.';
            $phpcsFile->addError($error, 0, 'InvalidEncoding');
        }

        if (self::_checkUtf8W3c($file_content) === false) {
            $error = 'File "' . $file_name . '" should be saved with Unicode (UTF-8) encoding, but it did not successfully pass the W3C test.';
            $phpcsFile->addError($error, 0, 'W3cCheckFailed');
        }

        if (self::_checkUtf8Rfc3629($file_content) === false) {
            $error = 'File "' . $file_name . '" should be saved with Unicode (UTF-8) encoding, but it did not meet RFC3629 requirements.';
            $phpcsFile->addError($error, 0, 'Rfc3629CheckFailed');
        }

    }//end process()


    /**
     * Checks that the string $content contains only valid UTF-8 chars
     * using W3C's method.
     *
     * The content is validated chunk by chunk so that the repeated group of
     * the pattern is never applied to one very large subject. Besides
     * well-formed multi-byte sequences, the pattern only accepts tab, line
     * feed, carriage return and the printable ASCII range 0x20-0x7E.
     *
     * @param string $content String to check.
     *
     * @return bool true if $content contains only UTF-8 chars, false otherwise.
     *
     * @see http://w3.org/International/questions/qa-forms-utf-8.html
     */
    private static function _checkUtf8W3c($content)
    {
        foreach (self::mb_chunk_split($content, 4096, '') as $content_chunk) {
            $preg_result = preg_match(
                '%^(?:
                      [\x09\x0A\x0D\x20-\x7E]            # ASCII
                    | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
                    | \xE0[\xA0-\xBF][\x80-\xBF]         # excluding overlongs
                    | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
                    | \xED[\x80-\x9F][\x80-\xBF]         # excluding surrogates
                    | \xF0[\x90-\xBF][\x80-\xBF]{2}      # planes 1-3
                    | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
                    | \xF4[\x80-\x8F][\x80-\xBF]{2}      # plane 16
                )*$%xs',
                $content_chunk
            );

            // 0 means "no match"; false means the regex engine itself failed.
            // Both are treated as a validation failure.
            if ($preg_result !== 1) {
                return false;
            }
        }

        return true;

    }//end _checkUtf8W3c()


    /**
     * Checks that the string $content contains only valid UTF-8 chars
     * using the octet grammar of RFC 3629.
     *
     * Rejected input: stray continuation bytes, the lead bytes 0xC0, 0xC1 and
     * 0xF5-0xFF, truncated sequences, overlong 3- and 4-byte forms, UTF-16
     * surrogates and code points above U+10FFFF.
     *
     * @param string $content String to check.
     *
     * @return bool true if $content contains only UTF-8 chars, false otherwise.
     *
     * @see https://www.rfc-editor.org/rfc/rfc3629#section-4
     */
    private static function _checkUtf8Rfc3629($content)
    {
        $len = strlen($content);
        for ($i = 0; $i < $len; $i++) {
            $lead = ord($content[$i]);
            if ($lead < 0x80) {
                // Single-byte (ASCII) character.
                continue;
            }

            // Allowed range for the byte that follows the lead byte. Some
            // lead bytes narrow it to exclude overlong forms, surrogates and
            // values beyond U+10FFFF.
            $min = 0x80;
            $max = 0xBF;

            if ($lead >= 0xC2 && $lead <= 0xDF) {
                $bytes = 2;
            } elseif ($lead >= 0xE0 && $lead <= 0xEF) {
                $bytes = 3;
                if ($lead === 0xE0) {
                    $min = 0xA0;
                } elseif ($lead === 0xED) {
                    $max = 0x9F;
                }
            } elseif ($lead >= 0xF0 && $lead <= 0xF4) {
                $bytes = 4;
                if ($lead === 0xF0) {
                    $min = 0x90;
                } elseif ($lead === 0xF4) {
                    $max = 0x8F;
                }
            } else {
                // Continuation byte without a lead byte, or an octet that
                // never appears in UTF-8 (0xC0, 0xC1, 0xF5-0xFF).
                return false;
            }

            // The sequence must not run past the end of the content.
            if (($i + $bytes) > $len) {
                return false;
            }

            for ($k = 1; $k < $bytes; $k++) {
                $i++;
                $b = ord($content[$i]);
                if ($b < $min || $b > $max) {
                    return false;
                }

                // Only the first trailing byte has a restricted range.
                $min = 0x80;
                $max = 0xBF;
            }
        }

        return true;

    }//end _checkUtf8Rfc3629()


    /**
     * Splits a string to chunks of at most the given size.
     *
     * This keeps the subject handed to the W3C regular expression small,
     * which avoids failures of the regex engine on large text.
     *
     * The split is done on bytes in a single pass. A boundary is moved back
     * by up to three bytes while it points at a UTF-8 continuation byte
     * (10xxxxxx), so a well-formed multi-byte sequence is never cut in two.
     * Joining the returned chunks (without glue) gives back the input.
     *
     * @param string $str  String to split.
     * @param int    $len  Maximum number of bytes per chunk; values
     *                     below 4 are raised to 4.
     * @param string $glue String prepended to every chunk but the first.
     *
     * @return array List of chunks; empty when $str is an empty string.
     *
     * @see http://php.net/manual/en/function.chunk-split.php
     */
    private static function mb_chunk_split($str, $len, $glue)
    {
        $str   = (string) $str;
        $total = strlen($str);

        // A chunk must be longer than the maximal back-off (3 bytes),
        // otherwise a boundary adjustment could produce an empty chunk.
        $len = max(4, (int) $len);

        $chunks = array();
        $offset = 0;
        while ($offset < $total) {
            $end = min(($offset + $len), $total);

            if ($end < $total) {
                $backoff = 0;
                while ($backoff < 3 && (ord($str[$end]) & 0xC0) === 0x80) {
                    $end--;
                    $backoff++;
                }
            }

            $chunk = substr($str, $offset, ($end - $offset));
            if (empty($chunks) === false) {
                $chunk = $glue . $chunk;
            }

            $chunks[] = $chunk;
            $offset   = $end;
        }

        return $chunks;

    }//end mb_chunk_split()


}//end class
221
|
|
|
|
222
|
|
|
?> |
|
|
|
|
223
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.