1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
/**
 * Class to safely store UTF-8 in a Filename
 *
 * Encodes a utf8 string using only the following characters: 0-9 a-z - . / [ _ ] %
 * Characters listed in $plain (0-9, a-z and - . / [ _) in the original string
 * are preserved, "plain".
 * All other characters are represented in a substring that starts
 * with '%' and are "converted".
 * The transition from converted substrings to plain characters is
 * marked with a ']'
 *
 * @author   Christopher Smith <[email protected]>
 * @date     2010-04-02
 */
class SafeFN {

    // 'safe' characters are a superset of $plain, $pre_indicator and $post_indicator
    private static $plain = '-./[_0123456789abcdefghijklmnopqrstuvwxyz'; // these characters aren't converted
    private static $pre_indicator = '%';   // starts every converted substring
    private static $post_indicator = ']';  // marks the switch from converted back to plain

    /**
     * Convert an UTF-8 string to a safe ASCII String
     *
     *  conversion process
     *    - if codepoint is a plain or post_indicator character,
     *      - if previous character was "converted", append post_indicator to output, clear "converted" flag
     *      - append ascii byte for character to output
     *      (continue to next character)
     *
     *    - if codepoint is a pre_indicator character,
     *      - append ascii byte for character to output, set "converted" flag
     *      (continue to next character)
     *
     *    (all remaining characters)
     *    - reduce codepoint value for non-printable ASCII characters (0x00 - 0x1f).  Space becomes our zero.
     *    - convert reduced value to base36 (0-9a-z)
     *    - append $pre_indicator character followed by base36 string to output, set converted flag
     *      (continue to next character)
     *
     * The UTF-8 to codepoint conversion is delegated to utf8_to_unicode(), which is
     * defined outside this file (presumably returns an array of int codepoints).
     *
     * @param    string    $filename     a utf8 string, should only include printable characters - not 0x00-0x1f
     * @return   string    an encoded representation of $filename using only 'safe' ASCII characters
     *
     * @author   Christopher Smith <[email protected]>
     */
    public static function encode($filename) {
        return self::unicodeToSafe(utf8_to_unicode($filename));
    }

    /**
     *  decoding process
     *    - split the string into substrings at any occurrence of pre or post indicator characters
     *    - check the first character of the substring
     *      - if its not a pre_indicator character
     *        - if previous character was converted, skip over post_indicator character
     *        - copy codepoint values of remaining characters to the output array
     *        - clear any converted flag
     *      (continue to next substring)
     *
     *     _ else (its a pre_indicator character)
     *       - if string length is 1, copy the pre_indicator character to the output array
     *       (continue to next substring)
     *
     *       - else (string length > 1)
     *         - skip the pre-indicator character and convert remaining string from base36 to base10
     *         - increase codepoint value for non-printable ASCII characters (add 0x20)
     *         - append codepoint to output array
     *       (continue to next substring)
     *
     * The input is lower-cased before decoding. The codepoint to UTF-8 conversion is
     * delegated to unicode_to_utf8(), which is defined outside this file.
     *
     * @param    string    $filename     a 'safe' encoded ASCII string,
     * @return   string    decoded utf8 representation of $filename
     *
     * @author   Christopher Smith <[email protected]>
     */
    public static function decode($filename) {
        return unicode_to_utf8(self::safeToUnicode(strtolower($filename)));
    }

    /**
     * Check that a string contains no non-printable ASCII control bytes
     *
     * The encoder subtracts 32 from every converted codepoint, so bytes in the
     * range 0x00-0x1f (including NUL) cannot be represented and are rejected here.
     *
     * @param    string    $printable_utf8    the string to check
     * @return   bool      true when no byte in the range 0x00-0x1f is present
     */
    public static function validatePrintableUtf8($printable_utf8) {
        return !preg_match('#[\x00-\x1f]#', $printable_utf8);
    }

    /**
     * Check that a string consists solely of 'safe' characters
     *
     * 'safe' characters are those in $plain plus the pre and post indicators.
     *
     * @param    string    $safe    the string to check
     * @return   bool      true when every character of $safe is a 'safe' character
     */
    public static function validateSafe($safe) {
        // the alphabet contains regex meta characters ('-', '[', ']', '.'); without quoting,
        // the ']' post_indicator would terminate the character class prematurely
        $alphabet = preg_quote(self::$plain.self::$post_indicator.self::$pre_indicator, '#');
        return !preg_match('#[^'.$alphabet.']#', $safe);
    }

    /**
     * convert an array of unicode codepoints into 'safe_filename' format
     *
     * Codepoints below 0x20 are not supported: the offset subtraction below would
     * yield a negative value. Use validatePrintableUtf8() to screen input first.
     *
     * @param    array  int    $unicode    an array of unicode codepoints
     * @return   string        the unicode represented in 'safe_filename' format
     *
     * @author   Christopher Smith <[email protected]>
     */
    private static function unicodeToSafe($unicode) {

        $safe = '';
        $converted = false;

        // loop invariants: characters copied verbatim, and the codepoint of the pre_indicator
        $verbatim = self::$plain.self::$post_indicator;
        $pre_codepoint = ord(self::$pre_indicator);

        foreach ($unicode as $codepoint) {
            if ($codepoint < 127 && (strpos($verbatim, chr($codepoint)) !== false)) {
                // plain character: close a preceding converted substring first
                if ($converted) {
                    $safe .= self::$post_indicator;
                    $converted = false;
                }
                $safe .= chr($codepoint);

            } elseif ($codepoint === $pre_codepoint) {
                // a literal pre_indicator is emitted on its own (no base36 payload)
                $safe .= self::$pre_indicator;
                $converted = true;
            } else {
                // everything else: pre_indicator followed by (codepoint - 32) in base36
                $safe .= self::$pre_indicator.base_convert((string)($codepoint-32), 10, 36);
                $converted = true;
            }
        }
        // always terminate a trailing converted substring
        if ($converted) $safe .= self::$post_indicator;
        return $safe;
    }

    /**
     * convert a 'safe_filename' string into an array of unicode codepoints
     *
     * @param   string         $safe     a filename in 'safe_filename' format
     * @return  array   int    an array of unicode codepoints
     *
     * @author   Christopher Smith <[email protected]>
     */
    private static function safeToUnicode($safe) {

        $unicode = array();

        // split (zero-width) in front of every indicator so each substring starts with one,
        // except possibly the very first substring
        $indicators = preg_quote(self::$post_indicator.self::$pre_indicator, '#');
        $split = preg_split('#(?=['.$indicators.'])#', $safe, -1, PREG_SPLIT_NO_EMPTY);

        $converted = false;
        foreach ($split as $sub) {
            $len = strlen($sub);
            if ($sub[0] !== self::$pre_indicator) {
                // plain (unconverted) characters, optionally starting with a post_indicator
                // set initial value to skip any post_indicator
                for ($i = ($converted ? 1 : 0); $i < $len; $i++) {
                    $unicode[] = ord($sub[$i]);
                }
                $converted = false;
            } elseif ($len === 1) {
                // a pre_indicator character in the real data
                $unicode[] = ord($sub);
                $converted = true;
            } else {
                // a single codepoint in base36, adjusted for initial 32 non-printable chars
                $unicode[] = 32 + (int)base_convert(substr($sub, 1), 36, 10);
                $converted = true;
            }
        }

        return $unicode;
    }

}
159
|
|
|
|