<?php

/**
 * @file TextHelper.php
 * @brief This file contains the TextHelper class.
 * @details
 * @author Filippo F. Fadda
 */


//! Classes to help the development
namespace ToolBag\Helper;


/**
 * @brief This helper class contains routines to process text.
 * @details Every routine is a stateless static method. The multibyte routines rely on the `mbstring` extension and
 * the charset routines rely on `iconv`.
 * @nosubgrouping
 */
class TextHelper {

  const SEPARATOR = '::'; //!< Used to separate the ID from the version number.


  /**
   * @brief Converts a string from a charset to another one.
   * @details The default conversion is from `Windows-1252` to `UTF-8`. `Windows-1252` or `CP-1252` is a character
   * encoding of the Latin alphabet, used by default in the legacy components of Microsoft Windows in English and some
   * other Western languages. This character encoding is a superset of `ISO-8859-1`, but it differs from it by using
   * displayable characters rather than control characters in the 80 to 9F (hex) range.
   * @param string $text The input string.
   * @param bool $stripslashes (optional) If `true` strip all the slashes before converting the text.
   * @param string $fromCharset (optional) The origin charset.
   * @param string $toCharset (optional) The target charset.
   * @return string|false The converted string, or `false` when `iconv()` is unable to perform the conversion.
   * @attention It doesn't matter if the varchar fields of your MySQL tables are encoded in `LATIN1`, in fact, if
   * someone ever posted a document from Windows Word containing smart characters, like curly quotes or smart
   * apostrophes, the real charset used is `Windows-1252`.
   * @warning This function doesn't use `LATIN1` or `ISO-8859-1` as default, because `Windows-1251` and `Windows-1252`
   * will only succeed if the entire string consists of high-byte characters in a certain range. That means you'll never
   * get the right conversion because the text will appear as `ISO-8859-1` even if it is `Windows-1252`. See the bug
   * section.
   * @bug https://bugs.php.net/bug.php?id=64667
   */
  public static function convertCharset($text, $stripslashes = FALSE, $fromCharset = 'Windows-1252', $toCharset = 'UTF-8') {
    if ($stripslashes)
      $text = stripslashes($text);

    return iconv($fromCharset, $toCharset, $text);
  }


  /**
   * @brief Cuts a string to a given number of characters without breaking words.
   * @details The returned string, marker included, is never longer than `$length` characters, unless the marker
   * itself is longer than `$length`.
   * @param string $text The input string.
   * @param integer $length The number of characters at which the string will be wrapped, ex. 200 characters.
   * @param string $etc The characters you want append to the end of text.
   * @param string $charset (optional) The charset used.
   * @param bool $breakWords (optional) If `true` breaks the words to return the exact number of chars.
   * @param bool $middle (optional) Truncates the text but remove middle instead the end of the string. In this mode
   * words are always broken.
   * @return string
   * @warning This function works with UTF-8 strings.
   */
  public static function truncate($text, $length = 200, $etc = ' ...', $charset = 'UTF-8', $breakWords = FALSE, $middle = FALSE) {
    $length = (int) $length;

    if ($length <= 0)
      return '';

    // The text already fits: nothing to cut. The same charset is used for every length computation.
    if (mb_strlen($text, $charset) <= $length)
      return $text;

    // Reserves the room needed by the marker, without going below zero.
    $length -= min($length, mb_strlen($etc, $charset));

    if ($middle) {
      // Uses an integer half, so no fractional offset is ever passed to mb_substr().
      $half = (int) floor($length / 2);

      // A negative offset of zero would select the whole text instead of an empty tail.
      if ($half === 0)
        return $etc;

      return mb_substr($text, 0, $half, $charset) . $etc . mb_substr($text, -$half, $half, $charset);
    }

    if (!$breakWords) {
      // Takes one extra character, then drops the last whitespace run with the partial word that follows it.
      $cut = preg_replace('/\s+?(\S+)?$/u', '', mb_substr($text, 0, $length + 1, $charset));

      // preg_replace() returns NULL on failure; in that case falls back to a plain cut.
      if ($cut !== NULL)
        $text = $cut;
    }

    return mb_substr($text, 0, $length, $charset) . $etc;
  }


  /**
   * @brief Capitalizes the given string.
   * @details The first character is converted to uppercase, all the remaining ones to lowercase.
   * @param string $text The input string.
   * @param string $charset (optional) The charset used.
   * @return string
   * @warning This function works with UTF-8 strings.
   */
  public static function capitalize($text, $charset = 'UTF-8') {
    $head = mb_substr($text, 0, 1, $charset);
    $tail = mb_substr($text, 1, mb_strlen($text, $charset), $charset);

    return mb_strtoupper($head, $charset) . mb_strtolower($tail, $charset);
  }


  /**
   * @brief Removes the content of pre tags, than strip all tags.
   * @param string $text The input string.
   * @return string
   * @throws \RuntimeException When the regular expression engine fails, for example on malformed UTF-8. The message
   * is the name of the PCRE error constant.
   * @warning This function works with UTF-8 strings.
   */
  public static function purge($text) {
    // Removes the content of <pre></pre>.
    $temp = preg_replace('/<(pre)(?:(?!<\/\1).)*?<\/\1>/su', '', $text);

    if ($temp === NULL)
      throw new \RuntimeException(self::pcreErrorName(preg_last_error()));

    // Removes all the HTML tags.
    return strip_tags($temp);
  }


  /**
   * @brief Returns the name of the PCRE error constant associated to the given error code.
   * @details Only the `*_ERROR` constants are considered, because other PCRE constants share the same values.
   * @param int $code The value returned by `preg_last_error()`.
   * @return string
   */
  private static function pcreErrorName($code) {
    $constants = get_defined_constants(TRUE);

    foreach ($constants['pcre'] as $name => $value)
      if ($value === $code && substr($name, -6) === '_ERROR')
        return $name;

    return 'PCRE error ' . $code;
  }


  /**
   * @brief Generates a single word, stripping every `-` from a compound word.
   * @param string $word A compound word.
   * @return string
   */
  public static function stick($word) {
    // A literal replacement doesn't need the regular expression engine.
    return str_replace('-', '', $word);
  }


  /**
   * @brief Given a string, returns all the unique contained substrings.
   * @details The substrings are listed by start position, then by length.
   * @param string $str The input string.
   * @param string $charset (optional) The charset used.
   * @return array A list of strings, indexed from zero.
   * @warning This function works with UTF-8 strings.
   */
  public static function substrings($str, $charset = 'UTF-8') {
    $length = mb_strlen($str, $charset);

    $subs = [];
    for ($i = 0; $i < $length; $i++)
      // A substring starting at $i can't be longer than the remaining characters.
      for ($j = 1; $j <= $length - $i; $j++)
        $subs[] = mb_substr($str, $i, $j, $charset);

    // array_unique() preserves the keys, so the result is reindexed.
    return array_values(array_unique($subs));
  }


  /**
   * @brief Generates a slug from the provided string.
   * @param string $str The input string.
   * @return string
   * @warning This function receives as input an UTF-8 string and returns an ASCII string.
   * @see https://en.wikipedia.org/wiki/Slug_(publishing)
   */
  public static function slug($str) {
    // Replaces any character that is not a letter or a number with minus.
    $slug = preg_replace('/[^\pL\d]+/u', '-', $str);

    // Removes the minus character from the begin and the end.
    $slug = trim($slug, '-');

    // Converts the charset from UTF-8 to ASCII.
    $slug = self::convertCharset($slug, FALSE, 'utf-8', 'ASCII//TRANSLIT');

    // Converts the string to lowercase.
    $slug = strtolower($slug);

    // Finally removes any character that is not a letter, a number or a minus.
    return preg_replace('/[^\-\w]+/', '', $slug);
  }


  /**
   * @brief Builds the post url, given its publishing or creation date and its slug.
   * @details The date is formatted as `/Y/m/d/` by `date()`, which uses the default timezone.
   * @param int $date Publishing or creation date, as Unix timestamp.
   * @param string $slug The slug of the title.
   * @return string The complete url of the post.
   */
  public static function buildUrl($date, $slug) {
    return date('/Y/m/d/', $date) . $slug;
  }


  /**
   * @brief Replaces all the occurrences but first.
   * @details The replacement is inserted literally: references to captured groups are not expanded.
   * @param string $pattern The pattern to search for.
   * @param string $replacement The string used as replacement for the match found.
   * @param string $subject The string to search and replace.
   * @return string
   */
  public static function replaceAllButFirst($pattern, $replacement, $subject) {
    // Counts the matches seen during this call only.
    $seen = 0;

    return preg_replace_callback(
      $pattern,
      function($matches) use ($replacement, &$seen) {
        $seen++;
        return ($seen === 1) ? $matches[0] : $replacement;
      },
      $subject
    );
  }


  /**
   * @brief Prunes the ID of its version number, if any.
   * @details The ID is cut at the first occurrence of the whole separator; a single `:` is left untouched.
   * @param string $id An UUID followed by a timestamp, like `3e96144b-3ebd-41e4-8a45-78cd9af1671d::1410886811`.
   * @return string Returns just `3e96144b-3ebd-41e4-8a45-78cd9af1671d`.
   */
  public static function unversion($id) {
    $parts = explode(self::SEPARATOR, (string) $id, 2);

    return $parts[0];
  }


  /**
   * @brief Formats the number replacing the thousand separator with the decimal point.
   * @details The number is rounded to zero decimals.
   * @param double $number The input number.
   * @return string
   */
  public static function formatNumber($number) {
    return number_format($number, 0, ',', '.');
  }


  /**
   * @brief Separates the given full name into first name and last name.
   * @details The name is split on spaces. A first token containing a period is assumed to be a salutation, but only
   * when another token follows it. A last token containing a period is assumed to be a suffix, but only when it comes
   * after the first name. Everything between the first name and the suffix is the last name.
   * @param string $fullName A person full name.
   * @return array An associative array with the keys `salutation`, `first`, `suffix` and `last`. Missing parts are
   * empty strings.
   */
  public static function splitFullName($fullName) {
    $result = ['salutation' => '', 'first' => '', 'suffix' => '', 'last' => ''];

    // Repeated, leading or trailing spaces must not generate empty tokens.
    $tokens = array_values(array_filter(explode(' ', (string) $fullName), 'strlen'));
    $size = count($tokens);

    if ($size === 0)
      return $result;

    $first = 0;
    $end = $size - 1;

    // Checks first for period, assume salutation if so.
    if ($size > 1 && strpos($tokens[0], '.') !== FALSE) {
      $result['salutation'] = $tokens[0];
      $first = 1;
    }

    $result['first'] = $tokens[$first];

    // Checks last for period, assume suffix if so. The first name itself is never taken as suffix.
    if ($end > $first && strpos($tokens[$end], '.') !== FALSE) {
      $result['suffix'] = $tokens[$end];
      $end--;
    }

    // Combines remains into last.
    $result['last'] = implode(' ', array_slice($tokens, $first + 1, $end - $first));

    return $result;
  }


  /**
   * @brief Removes unwanted MS Word smart characters from a string.
   * @details The result is also escaped using `htmlspecialchars()`.
   * @param string $text The text to be sanitized.
   * @return string The sanitized text.
   * @warning The smart characters are searched using their UTF-8 byte sequences.
   */
  public static function sanitize($text) {
    $from = [
      "\xe2\x80\x98", // Left single quote.
      "\xe2\x80\x99", // Right single quote.
      "\xe2\x80\x9c", // Left double quote.
      "\xe2\x80\x9d", // Right double quote.
      "\xe2\x80\x94", // Em dash.
      "\xe2\x80\xa6"  // Ellipsis.
    ];

    $to = [
      "'",
      "'",
      '"',
      '"',
      '—',
      '...'
    ];

    return htmlspecialchars(str_replace($from, $to, $text));
  }

}