<?php

/**
 * @file TextHelper.php
 * @brief This file contains the TextHelper class.
 * @details
 * @author Filippo F. Fadda
 */


//! Classes to help the development
namespace ToolBag\Helper;


/**
 * @brief This helper class contains routines to process text.
 * @details All the methods are static and stateless. Unless stated otherwise they expect UTF-8 encoded strings and
 * rely on the `mbstring`, `iconv` and `pcre` extensions.
 * @nosubgrouping
 */
class TextHelper {

  const SEPARATOR = '::'; //!< Used to separate the ID from the version number.


  /**
   * @brief Converts a string from a charset to another one.
   * @details The default conversion is from `Windows-1252` to `UTF-8`. `Windows-1252` or `CP-1252` is a character
   * encoding of the Latin alphabet, used by default in the legacy components of Microsoft Windows in English and some
   * other Western languages. This character encoding is a superset of `ISO-8859-1`, but it differs from it by using
   * displayable characters rather than control characters in the 80 to 9F (hex) range.
   * @param string $text The input string.
   * @param bool $stripslashes (optional) If `true` strips all the slashes before converting the text.
   * @param string $fromCharset (optional) The origin charset.
   * @param string $toCharset (optional) The target charset.
   * @return string|false The converted string, or `false` when `iconv()` fails.
   * @attention It doesn't matter if the varchar fields of your MySQL tables are encoded in `LATIN1`: if someone ever
   * posted a document from Windows Word containing smart characters, like curly quotes or smart apostrophes, the real
   * charset used is `Windows-1252`.
   * @warning The default origin charset is deliberately `Windows-1252` and not `LATIN1` or `ISO-8859-1`: according to
   * the original author's note, `Windows-1252` text is reported as `ISO-8859-1` by charset detection, so the two can't
   * be told apart reliably. See the bug section.
   * @bug https://bugs.php.net/bug.php?id=64667
   */
  public static function convertCharset($text, $stripslashes = FALSE, $fromCharset = 'Windows-1252', $toCharset = 'UTF-8') {
    if ($stripslashes)
      $text = stripslashes($text);

    return iconv($fromCharset, $toCharset, $text);
  }


  /**
   * @brief Cuts a string to a given number of characters without breaking words.
   * @details When the text is longer than `$length`, room for `$etc` is reserved inside `$length`, so the returned
   * string is never longer than `$length` characters (provided `$etc` itself fits).
   * @param string $text The input string.
   * @param integer $length The number of characters at which the string will be wrapped, ex. 200 characters.
   * @param string $etc The characters you want to append to the end of the text.
   * @param string $charset (optional) The charset used.
   * @param bool $breakWords (optional) If `true` breaks the words to return the exact number of chars.
   * @param bool $middle (optional) Truncates the text removing the middle of the string instead of its end.
   * @return string
   * @warning This function works with UTF-8 strings.
   */
  public static function truncate($text, $length = 200, $etc = ' ...', $charset='UTF-8', $breakWords = FALSE, $middle = FALSE) {
    // Loose comparison on purpose: `0`, `'0'` and `NULL` all mean "no characters at all".
    if ($length == 0)
      return '';

    // The text already fits. The length is measured with the same charset used for cutting.
    if (mb_strlen($text, $charset) <= $length)
      return $text;

    // Reserves room for the trailing (or middle) marker.
    $length -= min($length, mb_strlen($etc, $charset));

    if ($middle) {
      // Integer half: a float offset is deprecated since PHP 8.1.
      $half = (int)($length / 2);

      $head = mb_substr($text, 0, $half, $charset);

      // A zero offset would select the whole string instead of nothing, so the empty tail is handled explicitly.
      $tail = ($half > 0) ? mb_substr($text, -$half, NULL, $charset) : '';

      return $head . $etc . $tail;
    }

    // Takes one extra character, then drops the last (possibly partial) word together with the preceding whitespace.
    if (!$breakWords)
      $text = preg_replace('/\s+?(\S+)?$/u', '', mb_substr($text, 0, $length + 1, $charset));

    return mb_substr($text, 0, $length, $charset) . $etc;
  }


  /**
   * @brief Capitalizes the given string.
   * @details The first character is converted to uppercase, all the remaining ones to lowercase.
   * @param string $text The input string.
   * @param string $charset (optional) The charset used.
   * @return string
   * @warning This function works with UTF-8 strings.
   */
  public static function capitalize($text, $charset = 'UTF-8') {
    $initial = mb_substr($text, 0, 1, $charset);
    $rest = mb_substr($text, 1, NULL, $charset);

    return mb_strtoupper($initial, $charset) . mb_strtolower($rest, $charset);
  }


  /**
   * @brief Removes the content of pre tags, then strips all tags.
   * @param string $text The input string.
   * @return string
   * @throws \RuntimeException When the regular expression engine fails, for example on malformed UTF-8 input. The
   * exception message is the name of the PCRE error constant, ex. `PREG_BAD_UTF8_ERROR`.
   * @warning This function works with UTF-8 strings.
   */
  public static function purge($text) {
    // Removes the content of <pre></pre>.
    $temp = preg_replace('/<(pre)(?:(?!<\/\1).)*?<\/\1>/su', '', $text);

    if ($temp === NULL)
      throw new \RuntimeException(self::pcreErrorName(preg_last_error()));

    // Removes all the HTML tags.
    return strip_tags($temp);
  }


  /**
   * @brief Generates a single word, stripping every `-` from a compound word.
   * @param string $word A compound word.
   * @return string
   * @warning This function works with UTF-8 strings.
   */
  public static function stick($word) {
    // `-` is a single ASCII byte and can never be part of a multi-byte UTF-8 sequence, so a plain replace is safe.
    return str_replace('-', '', $word);
  }


  /**
   * @brief Given a string, returns all the unique contained substrings.
   * @details The substrings are ordered by start position first, then by length. The returned array is a list with
   * consecutive integer keys.
   * @param string $str The input string.
   * @param string $charset (optional) The charset used.
   * @return array
   * @warning This function works with UTF-8 strings.
   */
  public static function substrings($str, $charset = 'UTF-8') {
    $length = mb_strlen($str, $charset);

    $subs = [];
    for ($start = 0; $start < $length; $start++)
      // Never asks for more characters than those left after `$start`, which would only produce duplicates.
      for ($size = 1; $size <= $length - $start; $size++)
        $subs[] = mb_substr($str, $start, $size, $charset);

    // array_unique() keeps the original keys, so the result is re-indexed to get a proper list.
    return array_values(array_unique($subs));
  }


  /**
   * @brief Generates a slug from the provided string.
   * @param string $str The input string.
   * @return string
   * @warning This function receives as input an UTF-8 string and returns an ASCII string. The transliteration of
   * non ASCII characters is delegated to `iconv()`, so it depends on the iconv implementation in use.
   * @see https://en.wikipedia.org/wiki/Slug_(publishing)
   */
  public static function slug($str) {
    // Replaces any character that is not a letter or a number with minus.
    $slug = preg_replace('/[^\pL\d]+/u', '-', $str);

    // Removes the minus character from the begin and the end.
    $slug = trim((string)$slug, '-');

    // Converts the charset from UTF-8 to ASCII. On failure iconv() returns false, which becomes an empty slug.
    $slug = (string)self::convertCharset($slug, FALSE, 'utf-8', 'ASCII//TRANSLIT');

    // Converts the string to lowercase.
    $slug = strtolower($slug);

    // Removes any character that is not a minus or a word character.
    $slug = preg_replace('/[^\-\w]+/', '', $slug);

    // Characters dropped by the previous steps may leave doubled or dangling minus signs behind.
    $slug = preg_replace('/-{2,}/', '-', $slug);

    return trim($slug, '-');
  }


  /**
   * @brief Builds the post url, given its publishing or creation date and its slug.
   * @param int $date Publishing or creation date, as Unix timestamp.
   * @param string $slug The slug of the title.
   * @return string The url of the post in the form `/YYYY/MM/DD/slug`. The date is formatted by `date()`, so it uses
   * the default timezone.
   */
  public static function buildUrl($date, $slug) {
    return date('/Y/m/d/', $date) . $slug;
  }


  /**
   * @brief Replaces all the occurrences but first.
   * @param string $pattern The pattern to search for.
   * @param string $replacement The string used as replacement for the match found. It is inserted literally: group
   * references like `$1` are not expanded.
   * @param string $subject The string to search and replace.
   * @return string
   */
  public static function replaceAllButFirst($pattern, $replacement, $subject) {
    // Counts the matches seen so far during this call only.
    $seen = 0;

    return preg_replace_callback(
      $pattern,
      function($matches) use ($replacement, &$seen) {
        $seen++;

        // The first match is put back untouched.
        return ($seen === 1) ? $matches[0] : $replacement;
      },
      $subject
    );
  }


  /**
   * @brief Prunes the ID of its version number, if any.
   * @details Everything from the first occurrence of TextHelper::SEPARATOR on is discarded. An ID without separator
   * is returned unchanged.
   * @param string $id An UUID followed by a timestamp, like `3e96144b-3ebd-41e4-8a45-78cd9af1671d::1410886811`.
   * @return string Returns just `3e96144b-3ebd-41e4-8a45-78cd9af1671d`.
   */
  public static function unversion($id) {
    // explode() matches the separator as a whole string, while strtok() would split on any single `:`.
    return explode(self::SEPARATOR, (string)$id, 2)[0];
  }


  /**
   * @brief Formats the number using the point as thousands separator.
   * @details The number is rounded to zero decimals, so the decimal separator never appears in the output.
   * @param double $number The input number.
   * @return string
   */
  public static function formatNumber($number) {
    return number_format($number, 0, ",", ".");
  }


  /**
   * @brief Separates the given full name into first name and last name.
   * @details The name is split on whitespace. When there is more than one token, a leading token containing a
   * period is assumed to be a salutation (ex. `Dr.`) and a trailing token containing a period is assumed to be a
   * suffix (ex. `Jr.`). The first remaining token is the first name, whatever is left is the last name.
   * @param string $fullName A person full name.
   * @return array An associative array with the keys `salutation`, `first`, `suffix` and `last`. Missing parts are
   * empty strings.
   */
  public static function splitFullName($fullName) {
    $result = [
      'salutation' => '',
      'first' => '',
      'suffix' => '',
      'last' => ''
    ];

    // Splits on runs of ASCII whitespace, so repeated spaces don't generate empty tokens.
    $tokens = preg_split('/[ \t\r\n]+/', (string)$fullName, -1, PREG_SPLIT_NO_EMPTY);
    $count = count($tokens);

    if ($count === 0)
      return $result;

    $firstIndex = 0;
    $lastIndex = $count - 1;

    // Checks first for period, assume salutation if so. A lonely token is always the first name.
    if ($count > 1 && strpos($tokens[0], '.') !== FALSE) {
      $result['salutation'] = $tokens[0];
      $firstIndex = 1;
    }

    $result['first'] = $tokens[$firstIndex];

    // Checks last for period, assume suffix if so. The first name can't be the suffix at the same time.
    if ($lastIndex > $firstIndex && strpos($tokens[$lastIndex], '.') !== FALSE) {
      $result['suffix'] = $tokens[$lastIndex];
      $lastIndex--;
    }

    // Combines remains into last.
    $result['last'] = implode(' ', array_slice($tokens, $firstIndex + 1, $lastIndex - $firstIndex));

    return $result;
  }


  /**
   * @brief Replaces the MS Word smart characters of a string, then escapes the HTML special characters.
   * @param string $text The text to be sanitized.
   * @return string The sanitized text, safe to be used in an HTML context.
   * @warning The smart characters are matched by their UTF-8 byte sequences, so the input must be an UTF-8 string.
   * Convert `Windows-1252` text first, see convertCharset().
   */
  public static function sanitize($text) {
    $from = [
      "\xe2\x80\x98", // Left single quote.
      "\xe2\x80\x99", // Right single quote.
      "\xe2\x80\x9c", // Left double quote.
      "\xe2\x80\x9d", // Right double quote.
      "\xe2\x80\x94", // Em dash.
      "\xe2\x80\xa6"  // Ellipsis.
    ];

    $to = [
      "'",
      "'",
      '"',
      '"',
      '—', // NOTE(review): this is still an em dash, so the replacement is a no-op. Confirm the intended character.
      '...'
    ];

    // Explicit flags and charset: the defaults of htmlspecialchars() are not the same in every PHP version.
    return htmlspecialchars(str_replace($from, $to, $text), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
  }


  /**
   * @brief Returns the name of the PCRE error constant matching the given error code.
   * @param int $code An error code, as returned by `preg_last_error()`.
   * @return string The constant name, ex. `PREG_BAD_UTF8_ERROR`, or a generic message for unknown codes.
   */
  private static function pcreErrorName($code) {
    $names = [
      PREG_NO_ERROR => 'PREG_NO_ERROR',
      PREG_INTERNAL_ERROR => 'PREG_INTERNAL_ERROR',
      PREG_BACKTRACK_LIMIT_ERROR => 'PREG_BACKTRACK_LIMIT_ERROR',
      PREG_RECURSION_LIMIT_ERROR => 'PREG_RECURSION_LIMIT_ERROR',
      PREG_BAD_UTF8_ERROR => 'PREG_BAD_UTF8_ERROR',
      PREG_BAD_UTF8_OFFSET_ERROR => 'PREG_BAD_UTF8_OFFSET_ERROR',
      PREG_JIT_STACKLIMIT_ERROR => 'PREG_JIT_STACKLIMIT_ERROR'
    ];

    return isset($names[$code]) ? $names[$code] : 'Unknown PCRE error (code ' . $code . ')';
  }

}