|
1
|
|
|
<?php |
|
2
|
|
|
namespace Rmccue\Requests; |
|
3
|
|
|
|
|
4
|
|
|
use Rmccue\Requests\Exception as Exception; |
|
5
|
|
|
/**
 * IDNA URL encoder
 *
 * Converts internationalised hostnames to their ASCII-compatible (ACE) form
 * by applying the ToASCII operation to every dot-separated label.
 *
 * Note: Not fully compliant, as nameprep does nothing yet.
 *
 * @package Rmccue\Requests
 * @subpackage Utilities
 * @see https://tools.ietf.org/html/rfc3490 IDNA specification
 * @see https://tools.ietf.org/html/rfc3492 Punycode/Bootstrap specification
 */
class IDNAEncoder {
	/**
	 * ACE prefix used for IDNA
	 *
	 * @see https://tools.ietf.org/html/rfc3490#section-5
	 * @var string
	 */
	const ACE_PREFIX = 'xn--';

	/**#@+
	 * Bootstrap constant for Punycode
	 *
	 * @see https://tools.ietf.org/html/rfc3492#section-5
	 * @var int
	 */
	const BOOTSTRAP_BASE = 36;
	const BOOTSTRAP_TMIN = 1;
	const BOOTSTRAP_TMAX = 26;
	const BOOTSTRAP_SKEW = 38;
	const BOOTSTRAP_DAMP = 700;
	const BOOTSTRAP_INITIAL_BIAS = 72;
	const BOOTSTRAP_INITIAL_N = 128;
	/**#@-*/

	/**
	 * Encode a hostname using Punycode
	 *
	 * The hostname is split on "." and each label is converted independently
	 * with {@see IDNAEncoder::to_ascii()}; the labels are then re-joined.
	 *
	 * @throws \Rmccue\Requests\Exception If any label cannot be converted (see to_ascii())
	 *
	 * @param string $string Hostname
	 * @return string Punycode-encoded hostname
	 */
	public static function encode($string) {
		$parts = explode('.', $string);

		// Write back by index rather than by reference so that no dangling
		// reference to the last element is left behind after the loop.
		foreach ($parts as $index => $part) {
			$parts[$index] = self::to_ascii($part);
		}

		return implode('.', $parts);
	}

	/**
	 * Convert a UTF-8 string to an ASCII string using Punycode
	 *
	 * Follows the numbered steps of the ToASCII operation. A label is accepted
	 * only while it is shorter than 64 bytes (i.e. at most 63).
	 *
	 * @throws \Rmccue\Requests\Exception Provided string longer than 63 ASCII characters (`idna.provided_too_long`)
	 * @throws \Rmccue\Requests\Exception Prepared string longer than 63 ASCII characters (`idna.prepared_too_long`)
	 * @throws \Rmccue\Requests\Exception Provided string already begins with xn-- (`idna.provided_is_prefixed`)
	 * @throws \Rmccue\Requests\Exception Encoded string longer than 63 ASCII characters (`idna.encoded_too_long`)
	 * @throws \Rmccue\Requests\Exception Invalid UTF-8 input (`idna.invalidcodepoint`)
	 *
	 * @param string $string ASCII or UTF-8 string (max length 63 bytes)
	 * @return string ASCII string
	 */
	public static function to_ascii($string) {
		// Step 1: Check if the string is already ASCII
		if (self::is_ascii($string)) {
			// Skip to step 8 (size check): nothing needs encoding
			if (strlen($string) < 64) {
				return $string;
			}

			throw new Exception('Provided string is too long', 'idna.provided_too_long', $string);
		}

		// Step 2: nameprep
		$string = self::nameprep($string);

		// Step 3: UseSTD3ASCIIRules is false, continue

		// Step 4: Check if it's ASCII now
		if (self::is_ascii($string)) {
			// Skip to step 8 (size check): nothing needs encoding
			if (strlen($string) < 64) {
				return $string;
			}

			throw new Exception('Prepared string is too long', 'idna.prepared_too_long', $string);
		}

		// Step 5: Check ACE prefix
		if (strpos($string, self::ACE_PREFIX) === 0) {
			throw new Exception('Provided string begins with ACE prefix', 'idna.provided_is_prefixed', $string);
		}

		// Step 6: Encode with Punycode
		$string = self::punycode_encode($string);

		// Step 7: Prepend ACE prefix
		$string = self::ACE_PREFIX . $string;

		// Step 8: Check size
		if (strlen($string) < 64) {
			return $string;
		}

		throw new Exception('Encoded string is too long', 'idna.encoded_too_long', $string);
	}

	/**
	 * Check whether a given string contains only ASCII characters
	 *
	 * @internal (Testing found regex was the fastest implementation)
	 *
	 * @param string $string
	 * @return bool Is the string ASCII-only?
	 */
	protected static function is_ascii($string) {
		// True unless a byte outside 0x00-0x7F is found.
		return (preg_match('/(?:[^\x00-\x7F])/', $string) !== 1);
	}

	/**
	 * Prepare a string for use as an IDNA name
	 *
	 * Currently a no-op: the input is returned unchanged.
	 *
	 * @todo Implement this based on RFC 3491 and the newer 5891
	 * @param string $string
	 * @return string Prepared string
	 */
	protected static function nameprep($string) {
		return $string;
	}

	/**
	 * Convert a UTF-8 string to a UCS-4 codepoint array
	 *
	 * Based on Rmccue\Requests\IRI::replace_invalid_with_pct_encoding()
	 *
	 * @throws \Rmccue\Requests\Exception Invalid UTF-8 codepoint (`idna.invalidcodepoint`)
	 * @param string $input
	 * @return array Unicode code points
	 */
	protected static function utf8_to_codepoints($input) {
		$codepoints = array();

		// Get number of bytes
		$strlen = strlen($input);

		for ($position = 0; $position < $strlen; $position++) {
			$value = ord($input[$position]);

			if (($value & 0x80) === 0) {
				// One byte sequence (high bit clear):
				$character = $value;
				$length    = 1;
				$remaining = 0;
			}
			elseif (($value & 0xE0) === 0xC0) {
				// Two byte sequence:
				$character = ($value & 0x1F) << 6;
				$length    = 2;
				$remaining = 1;
			}
			elseif (($value & 0xF0) === 0xE0) {
				// Three byte sequence:
				$character = ($value & 0x0F) << 12;
				$length    = 3;
				$remaining = 2;
			}
			elseif (($value & 0xF8) === 0xF0) {
				// Four byte sequence:
				$character = ($value & 0x07) << 18;
				$length    = 4;
				$remaining = 3;
			}
			else {
				// Invalid lead byte:
				throw new Exception('Invalid Unicode codepoint', 'idna.invalidcodepoint', $value);
			}

			if ($remaining > 0) {
				// The sequence would run past the end of the input
				if ($position + $length > $strlen) {
					throw new Exception('Invalid Unicode codepoint', 'idna.invalidcodepoint', $character);
				}

				for ($position++; $remaining > 0; $position++) {
					$value = ord($input[$position]);

					// Every trailing byte must be a continuation byte (10xxxxxx);
					// anything else makes the whole sequence invalid.
					if (($value & 0xC0) !== 0x80) {
						throw new Exception('Invalid Unicode codepoint', 'idna.invalidcodepoint', $character);
					}

					$character |= ($value & 0x3F) << (--$remaining * 6);
				}

				// Step back one byte: the outer loop increments again
				$position--;
			}

			if (
				// Non-shortest form sequences are invalid
				($length > 1 && $character <= 0x7F)
				|| ($length > 2 && $character <= 0x7FF)
				|| ($length > 3 && $character <= 0xFFFF)
				// Outside of range of ucschar codepoints
				// Noncharacters
				|| ($character & 0xFFFE) === 0xFFFE
				|| ($character >= 0xFDD0 && $character <= 0xFDEF)
				|| (
					// Everything else not in ucschar
					($character > 0xD7FF && $character < 0xF900)
					|| $character < 0x20
					|| ($character > 0x7E && $character < 0xA0)
					|| $character > 0xEFFFD
				)
			) {
				throw new Exception('Invalid Unicode codepoint', 'idna.invalidcodepoint', $character);
			}

			$codepoints[] = $character;
		}

		return $codepoints;
	}

	/**
	 * RFC3492-compliant encoder
	 *
	 * All intermediate values are kept as integers; the "div" operations of
	 * the pseudo-code are done with floor() and cast back to int.
	 *
	 * @internal Pseudo-code from Section 6.3 is commented with "#" next to relevant code
	 * @throws \Rmccue\Requests\Exception On character outside of the domain (never happens with Punycode) (`idna.character_outside_domain`)
	 * @throws \Rmccue\Requests\Exception Invalid UTF-8 codepoint (`idna.invalidcodepoint`)
	 *
	 * @param string $input UTF-8 encoded string to encode
	 * @return string Punycode-encoded string
	 */
	public static function punycode_encode($input) {
		$output = '';
		# let n = initial_n
		$n = self::BOOTSTRAP_INITIAL_N;
		# let delta = 0
		$delta = 0;
		# let bias = initial_bias
		$bias = self::BOOTSTRAP_INITIAL_BIAS;
		# let h = b = the number of basic code points in the input
		$h = $b = 0; // see loop
		# copy them to the output in order
		$codepoints = self::utf8_to_codepoints($input);
		// The number of code points never changes; compute it once.
		$codepointcount = count($codepoints);
		// Set of distinct non-basic code points, keyed by code point
		$extended = array();

		foreach ($codepoints as $char) {
			if ($char < 128) {
				// Character is valid ASCII
				// TODO: this should also check if it's valid for a URL
				$output .= chr($char);
				$h++;
			}
			// Check if the character is non-ASCII, but below initial n
			// This never occurs for Punycode, so ignore in coverage
			// @codeCoverageIgnoreStart
			elseif ($char < $n) {
				throw new Exception('Invalid character', 'idna.character_outside_domain', $char);
			}
			// @codeCoverageIgnoreEnd
			else {
				$extended[$char] = true;
			}
		}

		// Ascending list of distinct non-basic code points; each loop pass
		// below takes the smallest one that has not been handled yet.
		$extended = array_keys($extended);
		sort($extended);
		$b = $h;

		# [copy them] followed by a delimiter if b > 0
		if (strlen($output) > 0) {
			$output .= '-';
		}

		# {if the input contains a non-basic code point < n then fail}
		# while h < length(input) do begin
		while ($h < $codepointcount) {
			# let m = the minimum code point >= n in the input
			$m = array_shift($extended);
			# let delta = delta + (m - n) * (h + 1), fail on overflow
			$delta += ($m - $n) * ($h + 1);
			# let n = m
			$n = $m;

			# for each code point c in the input (in order) do begin
			for ($num = 0; $num < $codepointcount; $num++) {
				$c = $codepoints[$num];

				# if c < n then increment delta, fail on overflow
				if ($c < $n) {
					$delta++;
				}
				# if c == n then begin
				elseif ($c === $n) {
					# let q = delta
					$q = $delta;

					# for k = base to infinity in steps of base do begin
					for ($k = self::BOOTSTRAP_BASE; ; $k += self::BOOTSTRAP_BASE) {
						# let t = tmin if k <= bias {+ tmin}, or
						#     tmax if k >= bias + tmax, or k - bias otherwise
						if ($k <= ($bias + self::BOOTSTRAP_TMIN)) {
							$t = self::BOOTSTRAP_TMIN;
						}
						elseif ($k >= ($bias + self::BOOTSTRAP_TMAX)) {
							$t = self::BOOTSTRAP_TMAX;
						}
						else {
							$t = $k - $bias;
						}

						# if q < t then break
						if ($q < $t) {
							break;
						}

						# output the code point for digit t + ((q - t) mod (base - t))
						$digit   = (int) ($t + (($q - $t) % (self::BOOTSTRAP_BASE - $t)));
						$output .= self::digit_to_char($digit);
						# let q = (q - t) div (base - t)
						$q = (int) floor(($q - $t) / (self::BOOTSTRAP_BASE - $t));
					# end
					}

					# output the code point for digit q
					$output .= self::digit_to_char($q);
					# let bias = adapt(delta, h + 1, test h equals b?)
					$bias = self::adapt($delta, $h + 1, $h === $b);
					# let delta = 0
					$delta = 0;
					# increment h
					$h++;
				# end
				}
			# end
			}

			# increment delta and n
			$delta++;
			$n++;
		# end
		}

		return $output;
	}

	/**
	 * Convert a digit to its respective character
	 *
	 * Digits 0-25 map to "a"-"z" and digits 26-35 map to "0"-"9".
	 *
	 * @see https://tools.ietf.org/html/rfc3492#section-5
	 * @throws \Rmccue\Requests\Exception On invalid digit (`idna.invalid_digit`)
	 *
	 * @param int $digit Digit in the range 0-35
	 * @return string Single character corresponding to digit
	 */
	protected static function digit_to_char($digit) {
		// @codeCoverageIgnoreStart
		// As far as I know, this never happens, but still good to be sure.
		if ($digit < 0 || $digit > 35) {
			throw new Exception(sprintf('Invalid digit %d', $digit), 'idna.invalid_digit', $digit);
		}
		// @codeCoverageIgnoreEnd

		$digits = 'abcdefghijklmnopqrstuvwxyz0123456789';
		return substr($digits, $digit, 1);
	}

	/**
	 * Adapt the bias
	 *
	 * Every "div" of the RFC pseudo-code is an integer division; floor()
	 * returns a float in PHP, so each result is cast back to int to make the
	 * return value a real integer as documented.
	 *
	 * @see https://tools.ietf.org/html/rfc3492#section-6.1
	 * @param int $delta
	 * @param int $numpoints
	 * @param bool $firsttime
	 * @return int New bias
	 */
	protected static function adapt($delta, $numpoints, $firsttime) {
		# function adapt(delta,numpoints,firsttime):
		# if firsttime then let delta = delta div damp
		if ($firsttime) {
			$delta = (int) floor($delta / self::BOOTSTRAP_DAMP);
		}
		# else let delta = delta div 2
		else {
			$delta = (int) floor($delta / 2);
		}

		# let delta = delta + (delta div numpoints)
		$delta += (int) floor($delta / $numpoints);
		# let k = 0
		$k = 0;

		# while delta > ((base - tmin) * tmax) div 2 do begin
		$max = (int) floor(((self::BOOTSTRAP_BASE - self::BOOTSTRAP_TMIN) * self::BOOTSTRAP_TMAX) / 2);
		while ($delta > $max) {
			# let delta = delta div (base - tmin)
			$delta = (int) floor($delta / (self::BOOTSTRAP_BASE - self::BOOTSTRAP_TMIN));
			# let k = k + base
			$k += self::BOOTSTRAP_BASE;
		# end
		}

		# return k + (((base - tmin + 1) * delta) div (delta + skew))
		return $k + (int) floor(((self::BOOTSTRAP_BASE - self::BOOTSTRAP_TMIN + 1) * $delta) / ($delta + self::BOOTSTRAP_SKEW));
	}
}
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.