|
1
|
|
|
<?php |
|
2
|
|
|
/* |
|
3
|
|
|
* This file is part of Yolk - Gamer Network's PHP Framework. |
|
4
|
|
|
* |
|
5
|
|
|
* Copyright (c) 2013 Gamer Network Ltd. |
|
6
|
|
|
* |
|
7
|
|
|
* Distributed under the MIT License, a copy of which is available in the |
|
8
|
|
|
* LICENSE file that was bundled with this package, or online at: |
|
9
|
|
|
* https://github.com/gamernetwork/yolk-core |
|
10
|
|
|
*/ |
|
11
|
|
|
|
|
12
|
|
|
namespace yolk\helpers; |
|
13
|
|
|
|
|
14
|
|
|
/**
 * Static collection of string utilities: URL parsing, random strings,
 * case/slug conversion, charset conversion, number formatting and
 * basic XSS filtering.
 */
class StringHelper {

	/**
	 * Helpers cannot be instantiated.
	 */
	private function __construct() {}

	/**
	 * Parse a URL string into an array of components.
	 * Similar to the native parse_url() except that the returned array always contains
	 * the scheme, host, port, user, pass and path keys, and the query component is
	 * replaced with an "options" key containing the decoded query parameters.
	 *
	 * @param  string|array $url       either a URL string or a partial list of url components
	 * @param  array        $defaults  default values for components missing from $url
	 * @return array|boolean  false if a URL string could not be parsed
	 */
	public static function parseURL( $url, $defaults = array() ) {

		if( is_string($url) ) {

			// Parse the raw string first: decoding beforehand would turn encoded
			// delimiters (%3F, %26, %23, %40...) into real ones and corrupt the result.
			$parts = \parse_url($url);

			if( $parts === false )
				return false;

			// Components are returned decoded, as before; the query is left alone
			// here because parse_str() below performs its own decoding.
			foreach( array('host', 'user', 'pass', 'path') as $k ) {
				if( isset($parts[$k]) )
					$parts[$k] = urldecode($parts[$k]);
			}

		}
		else {
			// caller supplied a (partial) list of components
			$parts = $url;
		}

		$result = array();

		// precedence: parsed value, then caller default, then empty string
		foreach( array('scheme', 'host', 'port', 'user', 'pass', 'path') as $k ) {
			if( isset($parts[$k]) )
				$result[$k] = $parts[$k];
			elseif( isset($defaults[$k]) )
				$result[$k] = $defaults[$k];
			else
				$result[$k] = '';
		}

		$result['options'] = array();

		if( isset($parts['query']) )
			parse_str($parts['query'], $result['options']);

		return $result;

	}

	/**
	 * Returns a string of cryptographically strong random hex digits.
	 *
	 * @param  integer $length  length of the desired hex string
	 * @return string  exactly $length hex digits; empty string if $length is not positive
	 * @throws \RuntimeException if strong random bytes could not be generated
	 */
	public static function randomHex( $length = 40 ) {

		$length = (int) $length;

		if( $length <= 0 )
			return '';

		// one byte yields two hex digits; round up so odd lengths can be satisfied
		$bytes = openssl_random_pseudo_bytes((int) ceil($length / 2), $strong);

		if( $bytes === false || !$strong )
			throw new \RuntimeException('Unable to generate cryptographically strong random bytes');

		return substr(bin2hex($bytes), 0, $length);

	}

	/**
	 * Returns a string of the specified length containing only the characters in the $allowed parameter.
	 * This function is not cryptographically strong.
	 *
	 * @param  integer $length   number of characters in the desired string
	 * @param  string  $allowed  the characters allowed to appear in the output
	 * @return string
	 * @throws \InvalidArgumentException if $allowed is empty and $length is positive
	 */
	public static function randomString( $length, $allowed = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' ) {

		if( $length <= 0 )
			return '';

		// split on code points so multi-byte characters in $allowed stay intact
		$chars = preg_split('//u', (string) $allowed, -1, PREG_SPLIT_NO_EMPTY);

		// not valid UTF-8: fall back to treating $allowed as raw bytes
		if( $chars === false )
			$chars = str_split((string) $allowed);

		if( !$chars )
			throw new \InvalidArgumentException('At least one allowed character is required');

		$max = count($chars) - 1;
		$out = '';

		for( $i = 0; $i < $length; $i++ ) {
			$out .= $chars[mt_rand(0, $max)];
		}

		return $out;

	}

	/**
	 * Convert a camel-cased string to lower case with underscores.
	 * Runs of non-alphanumeric characters are collapsed into a single underscore.
	 *
	 * @param  string $str
	 * @return string
	 */
	public static function uncamelise( $str ) {

		// "HTMLParser" => "HTML_Parser": split an acronym from the word that follows it
		$str = preg_replace('/([A-Z]+)([A-Z][a-z])/u', '$1_$2', $str);

		// "fooBar" => "foo_Bar": split a lower-case letter or digit from a following capital
		$str = preg_replace('/([a-z\d])([A-Z])/u', '$1_$2', $str);

		// anything that isn't an ASCII letter or digit becomes a separator
		$str = preg_replace('/[^A-Za-z0-9]+/', '_', $str);

		return mb_strtolower($str);

	}

	/**
	 * Convert a string into a format safe for use in urls.
	 * Converts any accent characters to their equivalent normal characters,
	 * spells out a few common symbols (&, €, £, $), lower-cases the result and
	 * replaces each run of one or more non-alphanumeric characters with a dash.
	 * Leading and trailing dashes are removed.
	 *
	 * @param  string $str  A string to convert to a slug
	 * @return string
	 */
	public static function slugify( $str ) {

		$symbols = array('&' => '-and-', '€' => '-EUR-', '£' => '-GBP-', '$' => '-USD-');

		$slug = strtr(static::removeAccents($str), $symbols);
		$slug = preg_replace('/([^a-z0-9]+)/u', '-', mb_strtolower($slug));

		return trim($slug, '-');

	}

	/**
	 * Converts all accent characters to their ASCII counterparts.
	 *
	 * @param  string $str  A string that might contain accent characters
	 * @return string
	 */
	public static function removeAccents( $str ) {

		// built once and reused across calls
		static $chars = array(
			'ª' => 'a', 'º' => 'o', 'À' => 'A', 'Á' => 'A', 'Â' => 'A', 'Ã' => 'A',
			'Ä' => 'A', 'Å' => 'A', 'Ā' => 'A', 'Ă' => 'A', 'Ą' => 'A', 'à' => 'a',
			'á' => 'a', 'â' => 'a', 'ã' => 'a', 'ä' => 'a', 'å' => 'a', 'ā' => 'a',
			'ă' => 'a', 'ą' => 'a', 'Ç' => 'C', 'Ć' => 'C', 'Ĉ' => 'C', 'Ċ' => 'C',
			'Č' => 'C', 'ç' => 'c', 'ć' => 'c', 'ĉ' => 'c', 'ċ' => 'c', 'č' => 'c',
			'Đ' => 'D', 'Ď' => 'D', 'đ' => 'd', 'ď' => 'd', 'È' => 'E', 'É' => 'E',
			'Ê' => 'E', 'Ë' => 'E', 'Ē' => 'E', 'Ĕ' => 'E', 'Ė' => 'E', 'Ę' => 'E',
			'Ě' => 'E', 'è' => 'e', 'é' => 'e', 'ê' => 'e', 'ë' => 'e', 'ē' => 'e',
			'ĕ' => 'e', 'ė' => 'e', 'ę' => 'e', 'ě' => 'e', 'ƒ' => 'f', 'Ĝ' => 'G',
			'Ğ' => 'G', 'Ġ' => 'G', 'Ģ' => 'G', 'ĝ' => 'g', 'ğ' => 'g', 'ġ' => 'g',
			'ģ' => 'g', 'Ĥ' => 'H', 'Ħ' => 'H', 'ĥ' => 'h', 'ħ' => 'h', 'Ì' => 'I',
			'Í' => 'I', 'Î' => 'I', 'Ï' => 'I', 'Ĩ' => 'I', 'Ī' => 'I', 'Ĭ' => 'I',
			'Į' => 'I', 'İ' => 'I', 'ì' => 'i', 'í' => 'i', 'î' => 'i', 'ï' => 'i',
			'ĩ' => 'i', 'ī' => 'i', 'ĭ' => 'i', 'į' => 'i', 'ı' => 'i', 'Ĵ' => 'J',
			'ĵ' => 'j', 'Ķ' => 'K', 'ķ' => 'k', 'ĸ' => 'k', 'Ĺ' => 'L', 'Ļ' => 'L',
			'Ľ' => 'L', 'Ŀ' => 'L', 'Ł' => 'L', 'ĺ' => 'l', 'ļ' => 'l', 'ľ' => 'l',
			'ŀ' => 'l', 'ł' => 'l', 'Ñ' => 'N', 'Ń' => 'N', 'Ņ' => 'N', 'Ň' => 'N',
			'Ŋ' => 'N', 'ñ' => 'n', 'ń' => 'n', 'ņ' => 'n', 'ň' => 'n', 'ŉ' => 'n',
			'ŋ' => 'n', 'Ò' => 'O', 'Ó' => 'O', 'Ô' => 'O', 'Õ' => 'O', 'Ö' => 'O',
			'Ø' => 'O', 'Ō' => 'O', 'Ŏ' => 'O', 'Ő' => 'O', 'ò' => 'o', 'ó' => 'o',
			'ô' => 'o', 'õ' => 'o', 'ö' => 'o', 'ø' => 'o', 'ō' => 'o', 'ŏ' => 'o',
			'ő' => 'o', 'ð' => 'o', 'Ŕ' => 'R', 'Ŗ' => 'R', 'Ř' => 'R', 'ŕ' => 'r',
			'ŗ' => 'r', 'ř' => 'r', 'Ś' => 'S', 'Ŝ' => 'S', 'Ş' => 'S', 'Š' => 'S',
			'Ș' => 'S', 'ś' => 's', 'ŝ' => 's', 'ş' => 's', 'š' => 's', 'ș' => 's',
			'ſ' => 's', 'Ţ' => 'T', 'Ť' => 'T', 'Ŧ' => 'T', 'Ț' => 'T', 'ţ' => 't',
			'ť' => 't', 'ŧ' => 't', 'ț' => 't', 'Ù' => 'U', 'Ú' => 'U', 'Û' => 'U',
			'Ü' => 'U', 'Ũ' => 'U', 'Ū' => 'U', 'Ŭ' => 'U', 'Ů' => 'U', 'Ű' => 'U',
			'Ų' => 'U', 'ù' => 'u', 'ú' => 'u', 'û' => 'u', 'ü' => 'u', 'ũ' => 'u',
			'ū' => 'u', 'ŭ' => 'u', 'ů' => 'u', 'ű' => 'u', 'ų' => 'u', 'Ŵ' => 'W',
			'ŵ' => 'w', 'Ý' => 'Y', 'Ÿ' => 'Y', 'Ŷ' => 'Y', 'ý' => 'y', 'ÿ' => 'y',
			'ŷ' => 'y', 'Ź' => 'Z', 'Ż' => 'Z', 'Ž' => 'Z', 'ź' => 'z', 'ż' => 'z',
			// multi-letter replacements: ligatures, sharp s and thorn
			'ž' => 'z', 'Æ' => 'AE', 'æ' => 'ae', 'Ĳ' => 'IJ', 'ĳ' => 'ij',
			'Œ' => 'OE', 'œ' => 'oe', 'ß' => 'ss', 'þ' => 'th', 'Þ' => 'TH',
		);

		return strtr($str, $chars);

	}

	/**
	 * Converts a UTF-8 string to Latin-1 with unsupported characters encoded as numeric entities.
	 * Example: I want to turn text like
	 *   hello é β 水
	 * into
	 *   hello é &#946; &#27700;
	 * (with "é" stored as the single Latin-1 byte 0xE9).
	 *
	 * @param  string $str
	 * @return string the converted string.
	 */
	public static function latin1( $str ) {

		// Everything from U+0100 up to the end of the Unicode range has no Latin-1
		// equivalent; the 0x1FFFFF mask keeps all 21 bits of the code point.
		$encoded = mb_encode_numericentity(
			(string) $str,
			array(0x0100, 0x10FFFF, 0, 0x1FFFFF),
			'UTF-8'
		);

		return mb_convert_encoding($encoded, 'ISO-8859-1', 'UTF-8');

	}

	/**
	 * Converts a Latin-1 string to UTF-8 and decodes entities.
	 * Quote entities are left encoded (ENT_NOQUOTES).
	 *
	 * @param  string $str
	 * @return string the converted string.
	 */
	public static function utf8( $str ) {

		$converted = mb_convert_encoding((string) $str, 'UTF-8', 'ISO-8859-1');

		return html_entity_decode($converted, ENT_NOQUOTES, 'UTF-8');

	}

	/**
	 * Return a number with its ordinal suffix (st, nd, rd, th) appended.
	 * Taken from: http://stackoverflow.com/questions/3109978/php-display-number-with-ordinal-suffix
	 *
	 * @param  integer $n
	 * @return string the number cast as a string with the ordinal suffixed.
	 */
	public static function ordinal( $n ) {

		$ends = array('th','st','nd','rd','th','th','th','th','th','th');

		// work on the magnitude so negative numbers don't produce negative indexes
		$abs = abs((int) $n);

		// numbers ending in 11, 12 or 13 take "th" rather than st/nd/rd
		$lastTwo = $abs % 100;
		if( $lastTwo >= 11 && $lastTwo <= 13 )
			return "{$n}th";

		return "{$n}{$ends[$abs % 10]}";

	}

	/**
	 * Convert a number of bytes to a human-friendly string using the largest suitable unit.
	 * Negative values are treated as zero; values beyond the largest unit are expressed in PB.
	 *
	 * @param  integer $bytes      the number of bytes to format
	 * @param  integer $precision  the number of decimal places to format the result to.
	 * @return string
	 */
	public static function sizeFormat( $bytes, $precision = 2 ) {

		$units = array('B', 'KB', 'MB', 'GB', 'TB', 'PB');
		$last  = count($units) - 1;

		$bytes = max($bytes, 0);
		$pow   = 0;

		// repeated division avoids float log() rounding and integer shift overflow
		while( $bytes >= 1024 && $pow < $last ) {
			$bytes /= 1024;
			$pow++;
		}

		return round($bytes, $precision). ' '. $units[$pow];

	}

	/**
	 * Remove XSS vulnerabilities from a string.
	 * Shamelessly ripped from Kohana v2 and then tweaked to remove control characters
	 * and replace the associated regex components with \s instead.
	 * Also added a couple of other tags to the really bad list.
	 * Handles most of the XSS vectors listed at http://ha.ckers.org/xss.html
	 *
	 * NOTE(review): this is a blacklist filter; presumably callers still escape output
	 * for the context it is rendered in - confirm before relying on it alone.
	 *
	 * @param  string|array $str      string, or (nested) array of strings, to clean
	 * @param  string       $charset  character set passed to html_entity_decode()
	 * @return string|array
	 */
	public static function xssClean( $str, $charset = 'UTF-8' ) {

		if( !$str )
			return $str;

		if( is_array($str) ) {
			foreach( $str as $k => $item ) {
				// pass the charset down so nested values are decoded consistently
				$str[$k] = static::xssClean($item, $charset);
			}
			return $str;
		}

		// strip any raw control characters that might interfere with our cleaning
		$str = static::stripControlChars($str);

		// fix and decode entities (handles missing ; terminator)
		// &amp; &lt; and &gt; are double-encoded first so they survive the decode below
		// unchanged, while every other entity is decoded and so exposed to the filters
		$str = str_replace(array('&amp;','&lt;','&gt;'), array('&amp;amp;','&amp;lt;','&amp;gt;'), $str);
		$str = preg_replace('/(&#*\w+)\s+;/u', '$1;', $str);
		$str = preg_replace('/(&#x*[0-9A-F]+);*/iu', '$1;', $str);
		$str = html_entity_decode($str, ENT_COMPAT, $charset);

		// strip any control characters that were sneakily encoded as entities
		$str = static::stripControlChars($str);

		// normalise line endings
		$str = static::normaliseLineEndings($str);

		// remove any attribute starting with "on" or xmlns
		$str = preg_replace('#(?:on[a-z]+|xmlns)\s*=\s*[\'"\s]?[^\'>"]*[\'"\s]?\s?#iu', '', $str);

		// remove javascript: and vbscript: protocols and -moz-binding CSS property
		$str = preg_replace('#([a-z]*)\s*=\s*([`\'"]*)\s*j\s*a\s*v\s*a\s*s\s*c\s*r\s*i\s*p\s*t\s*:#iu', '$1=$2nojavascript...', $str);
		$str = preg_replace('#([a-z]*)\s*=([\'"]*)\s*v\s*b\s*s\s*c\s*r\s*i\s*p\s*t\s*:#iu', '$1=$2novbscript...', $str);
		$str = preg_replace('#([a-z]*)\s*=([\'"]*)\s*-moz-binding\s*:#u', '$1=$2nomozbinding...', $str);

		// only works in IE: <span style="width: expression(alert('XSS!'));"></span>
		$str = preg_replace('#(<[^>]+?)style\s*=\s*[`\'"]*.*?expression\s*\([^>]*+>#isu', '$1>', $str);
		$str = preg_replace('#(<[^>]+?)style\s*=\s*[`\'"]*.*?behaviour\s*\([^>]*+>#isu', '$1>', $str);
		$str = preg_replace('#(<[^>]+?)style\s*=\s*[`\'"]*.*?s\s*c\s*r\s*i\s*p\s*t\s*:*[^>]*+>#isu', '$1>', $str);

		// remove namespaced elements (we do not need them)
		$str = preg_replace('#</*\w+:\w[^>]*+>#iu', '', $str);

		// remove data URIs
		$str = preg_replace("#data:[\w/]+;\w+,[\w\r\n+=/]*#iu", "data: not allowed", $str);

		// remove really unwanted tags; repeat until stable because removing one tag
		// can splice the surrounding text into another (e.g. "<scr<script>ipt>")
		do {
			$old = $str;
			$str = preg_replace('#</*(?:applet|b(?:ase|gsound|link)|body|embed|frame(?:set)?|head|html|i(?:frame|layer)|l(?:ayer|ink)|meta|object|s(?:cript|tyle)|title|xml)[^>]*+>#iu', '', $str);
		}
		while ($old !== $str);

		return $str;

	}

	/**
	 * Remove every control character except newline (10/x0A) carriage return (13/x0D), and horizontal tab (09/x09)
	 *
	 * @param  string|array $str  string, or (nested) array of strings, to strip
	 * @return string|array
	 */
	public static function stripControlChars( $str ) {

		if( is_array($str) ) {
			foreach( $str as $k => $item ) {
				$str[$k] = static::stripControlChars($item);
			}
			return $str;
		}

		// 00-08, 11, 12, 14-31, 127
		// a single global pass is enough: deleting characters cannot create new matches
		return preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/Su', '', $str);

	}

	/**
	 * Ensures that a string has consistent line-endings.
	 * All line-endings are converted to LF, with a maximum of two consecutive.
	 *
	 * @param  string $str
	 * @return string
	 */
	public static function normaliseLineEndings( $str ) {

		// CRLF must be handled before lone CR so it doesn't become two line feeds
		$str = str_replace(array("\r\n", "\r"), "\n", $str);

		return preg_replace("/\n{3,}/", "\n\n", $str);

	}

}
|
331
|
|
|
|
|
332
|
|
|
// EOF |
Sometimes obsolete code just ends up commented out instead of removed. In this case it is better to remove the code once you have checked you do not need it.
The code might also have been commented out for debugging purposes. In this case it is vital that someone uncomments it again or your project may behave in very unexpected ways in production.
This check looks for comments that seem to be mostly valid code and reports them.