|
1
|
|
|
<?php |
|
2
|
|
|
|
|
3
|
|
|
/**
 * Class to safely store UTF-8 in a Filename
 *
 * Encodes a utf8 string using only the following characters: 0-9 a-z _ . - / [ ] %
 * The characters -./[_ and 0-9a-z in the original string are preserved, "plain".
 * All other characters are represented in a substring that starts
 * with '%'; these are "converted".
 * The transition from converted substrings to plain characters is
 * marked with a ']'
 *
 * @author Christopher Smith <[email protected]>
 * @date 2010-04-02
 */
class SafeFN {

    // 'safe' characters are a superset of $plain, $pre_indicator and $post_indicator
    private static $plain = '-./[_0123456789abcdefghijklmnopqrstuvwxyz'; // these characters aren't converted
    private static $pre_indicator = '%';  // starts every converted substring
    private static $post_indicator = ']'; // marks the return from converted to plain characters

    /**
     * Convert an UTF-8 string to a safe ASCII String
     *
     * conversion process
     *   - if codepoint is a plain or post_indicator character,
     *     - if previous character was "converted", append post_indicator to output, clear "converted" flag
     *     - append ascii byte for character to output
     *     (continue to next character)
     *
     *   - if codepoint is a pre_indicator character,
     *     - append ascii byte for character to output, set "converted" flag
     *     (continue to next character)
     *
     *   (all remaining characters)
     *   - reduce codepoint value for non-printable ASCII characters (0x00 - 0x1f). Space becomes our zero.
     *   - convert reduced value to base36 (0-9a-z)
     *   - append $pre_indicator character followed by base36 string to output, set converted flag
     *     (continue to next character)
     *
     * The UTF-8 to codepoint conversion is delegated to utf8_to_unicode(), which is
     * defined outside this file.
     *
     * @param    string    $filename     a utf8 string, should only include printable characters - not 0x00-0x1f
     *                                   (see validatePrintableUtf8())
     * @return   string    an encoded representation of $filename using only 'safe' ASCII characters
     *
     * @author   Christopher Smith <[email protected]>
     */
    public static function encode($filename) {
        return self::unicodeToSafe(utf8_to_unicode($filename));
    }

    /**
     * Convert a 'safe' encoded ASCII string back to UTF-8
     *
     * The input is lower-cased before decoding.
     *
     * decoding process
     *   - split the string into substrings at any occurrence of pre or post indicator characters
     *   - check the first character of the substring
     *     - if its not a pre_indicator character
     *       - if previous character was converted, skip over post_indicator character
     *       - copy codepoint values of remaining characters to the output array
     *       - clear any converted flag
     *     (continue to next substring)
     *
     *     - else (its a pre_indicator character)
     *       - if string length is 1, copy the pre_indicator character to the output array
     *       (continue to next substring)
     *
     *       - else (string length > 1)
     *         - skip the pre-indicator character and convert remaining string from base36 to base10
     *         - increase codepoint value for non-printable ASCII characters (add 0x20)
     *         - append codepoint to output array
     *       (continue to next substring)
     *
     * The codepoint to UTF-8 conversion is delegated to unicode_to_utf8(), which is
     * defined outside this file.
     *
     * @param    string    $filename     a 'safe' encoded ASCII string,
     * @return   string    decoded utf8 representation of $filename
     *
     * @author   Christopher Smith <[email protected]>
     */
    public static function decode($filename) {
        return unicode_to_utf8(self::safeToUnicode(strtolower($filename)));
    }

    /**
     * Check that a string contains none of the control bytes 0x00 - 0x1f
     *
     * Those bytes cannot be represented by the "codepoint minus 32" scheme used
     * by encode(), so input should be checked with this method first.
     *
     * @param    string    $printable_utf8    the string to check
     * @return   bool      true if no byte in the range 0x00 - 0x1f is present
     */
    public static function validatePrintableUtf8($printable_utf8) {
        // the \x escapes are interpreted by PCRE; the pattern itself contains no raw control bytes
        return preg_match('#[\x00-\x1f]#', $printable_utf8) === 0;
    }

    /**
     * Check that a string consists solely of 'safe' characters
     *
     * Safe characters are the plain characters plus the pre and post indicators.
     *
     * @param    string    $safe    the string to check
     * @return   bool      true if every character of $safe is a safe character
     */
    public static function validateSafe($safe) {
        // the set contains '-', '[' and ']' which are special inside a character class,
        // without quoting the ']' would terminate the class early
        $allowed = preg_quote(self::$plain.self::$post_indicator.self::$pre_indicator, '#');
        return preg_match('#[^'.$allowed.']#', $safe) === 0;
    }

    /**
     * convert an array of unicode codepoints into 'safe_filename' format
     *
     * @param    array  int    $unicode    an array of unicode codepoints
     * @return   string        the unicode represented in 'safe_filename' format
     *
     * @author   Christopher Smith <[email protected]>
     */
    private static function unicodeToSafe($unicode) {

        $safe = '';
        $converted = false;
        $unconverted = self::$plain.self::$post_indicator; // characters copied to the output as they are

        foreach ($unicode as $codepoint) {
            if ($codepoint < 127 && (strpos($unconverted, chr($codepoint)) !== false)) {
                if ($converted) {
                    // close the preceding converted substring before returning to plain characters
                    $safe .= self::$post_indicator;
                    $converted = false;
                }
                $safe .= chr($codepoint);

            } elseif ($codepoint == ord(self::$pre_indicator)) {
                // a literal pre_indicator is written on its own, with no base36 digits after it
                $safe .= self::$pre_indicator;
                $converted = true;
            } else {
                // space (32) becomes zero; codepoints below 32 would give a negative value here,
                // which is why the input is expected to be free of 0x00 - 0x1f
                $safe .= self::$pre_indicator.base_convert((string)($codepoint-32), 10, 36);
                $converted = true;
            }
        }
        // always terminate a trailing converted substring
        if ($converted) {
            $safe .= self::$post_indicator;
        }
        return $safe;
    }

    /**
     * convert a 'safe_filename' string into an array of unicode codepoints
     *
     * @param   string         $safe     a filename in 'safe_filename' format
     * @return  array   int    an array of unicode codepoints
     *
     * @author  Christopher Smith <[email protected]>
     */
    private static function safeToUnicode($safe) {

        $unicode = array();

        // zero-width split in front of every indicator, so each substring starts with the
        // indicator that introduced it (the first substring may start with a plain character)
        $indicators = preg_quote(self::$post_indicator.self::$pre_indicator, '#');
        $split = preg_split('#(?=['.$indicators.'])#', $safe, -1, PREG_SPLIT_NO_EMPTY);

        $converted = false;
        foreach ($split as $sub) {
            $len = strlen($sub);
            if ($sub[0] !== self::$pre_indicator) {
                // plain (unconverted) characters, optionally starting with a post_indicator
                // set initial value to skip any post_indicator
                for ($i = ($converted ? 1 : 0); $i < $len; $i++) {
                    $unicode[] = ord($sub[$i]);
                }
                $converted = false;
            } elseif ($len == 1) {
                // a pre_indicator character in the real data
                $unicode[] = ord($sub);
                $converted = true;
            } else {
                // a single codepoint in base36, adjusted for initial 32 non-printable chars
                $unicode[] = 32 + (int)base_convert(substr($sub, 1), 36, 10);
                $converted = true;
            }
        }

        return $unicode;
    }

}
|
159
|
|
|
|