1 | <?php |
||
2 | |||
3 | /** |
||
4 | * @file TextHelper.php |
||
5 | * @brief This file contains the TextHelper class. |
||
6 | * @details |
||
7 | * @author Filippo F. Fadda |
||
8 | */ |
||
9 | |||
10 | |||
11 | //! Classes to help the development |
||
12 | namespace ToolBag\Helper; |
||
13 | |||
14 | |||
15 | /** |
||
16 | * @brief This helper class contains routines to process text. |
||
17 | * @nosubgrouping |
||
18 | */ |
||
class TextHelper {

  const SEPARATOR = '::'; //!< Used to separate the ID from the version number.


  /**
   * @brief Converts a string from a charset to another one.
   * @details The default conversion is from `Windows-1252` to `UTF-8`. `Windows-1252` or `CP-1252` is a character
   * encoding of the Latin alphabet, used by default in the legacy components of Microsoft Windows in English and some
   * other Western languages. This character encoding is a superset of `ISO-8859-1`, but it differs from it by using
   * displayable characters rather than control characters in the 80 to 9F (hex) range.
   * @param string $text The input string.
   * @param bool $stripslashes (optional) If `true` strips all the slashes before converting the text.
   * @param string $fromCharset (optional) The origin charset.
   * @param string $toCharset (optional) The target charset.
   * @return string The value returned by `iconv()`, passed through unchanged.
   * @attention It doesn't matter if the varchar fields of your MySQL tables are encoded in `LATIN1`: if someone
   * ever posted a document from Windows Word containing smart characters, like curly quotes or smart apostrophes, the
   * real charset used is `Windows-1252`.
   * @warning This function doesn't use `LATIN1` or `ISO-8859-1` as default, because `Windows-1251` and `Windows-1252`
   * will only succeed if the entire string consists of high-byte characters in a certain range. That means you'll never
   * get the right conversion because the text will appear as `ISO-8859-1` even if it is `Windows-1252`. See the bug
   * section.
   * @bug https://bugs.php.net/bug.php?id=64667
   */
  public static function convertCharset($text, $stripslashes = FALSE, $fromCharset = 'Windows-1252', $toCharset = 'UTF-8') {
    if ($stripslashes)
      $text = stripslashes($text);

    return iconv($fromCharset, $toCharset, $text);
  }


  /**
   * @brief Cuts a string to a given number of characters without breaking words.
   * @details When the text is longer than `$length`, room for `$etc` is reserved inside `$length`, so the marker
   * counts toward the requested size.
   * @param string $text The input string.
   * @param integer $length The number of characters at which the string will be wrapped, ex. 200 characters.
   * A value lower than or equal to zero produces an empty string.
   * @param string $etc The characters you want to append to the end of the text.
   * @param string $charset (optional) The charset used.
   * @param bool $breakWords (optional) If `true` breaks the words to return the exact number of chars.
   * @param bool $middle (optional) Truncates the text removing the middle instead of the end of the string.
   * @return string
   * @warning This function works with UTF-8 strings.
   */
  public static function truncate($text, $length = 200, $etc = ' ...', $charset='UTF-8', $breakWords = FALSE, $middle = FALSE) {
    $length = (int)$length;

    if ($length <= 0)
      return '';

    // The charset is passed here too, so the length check agrees with the cuts made below.
    if (mb_strlen($text, $charset) <= $length)
      return $text;

    // Reserves room for the marker.
    $length -= min($length, mb_strlen($etc, $charset));

    if ($middle) {
      // Integer halves: mb_substr() must not receive fractional offsets.
      $half = (int)($length / 2);

      $head = mb_substr($text, 0, $half, $charset);

      // A negative offset of zero would select the whole string, so an empty budget yields an empty tail.
      $tail = ($half > 0) ? mb_substr($text, -$half, $half, $charset) : '';

      return $head . $etc . $tail;
    }

    if (!$breakWords) {
      // Drops the last, possibly partial, word together with the whitespace that precedes it.
      $cut = preg_replace('/\s+?(\S+)?$/u', '', mb_substr($text, 0, $length + 1, $charset));

      // preg_replace() returns NULL on failure; in that case falls back to a plain cut.
      if (!is_null($cut))
        $text = $cut;
    }

    return mb_substr($text, 0, $length, $charset) . $etc;
  }


  /**
   * @brief Capitalizes the given string.
   * @details The first character is converted to uppercase, all the others to lowercase.
   * @param string $text The input string.
   * @param string $charset (optional) The charset used.
   * @return string
   * @warning This function works with UTF-8 strings.
   */
  public static function capitalize($text, $charset = 'UTF-8') {
    $first = mb_substr($text, 0, 1, $charset);
    $rest = mb_substr($text, 1, mb_strlen($text, $charset), $charset);

    return mb_strtoupper($first, $charset) . mb_strtolower($rest, $charset);
  }


  /**
   * @brief Removes the content of pre tags, then strips all tags.
   * @param string $text The input string.
   * @return string
   * @throws \RuntimeException If the regular expression fails; the message is the name of the PCRE error constant.
   * @warning This function works with UTF-8 strings.
   */
  public static function purge($text) {
    // Removes the content of <pre></pre>.
    $temp = preg_replace('/<(pre)(?:(?!<\/\1).)*?<\/\1>/su', '', $text);

    if (is_null($temp))
      throw new \RuntimeException(self::pcreErrorName(preg_last_error()));

    // Removes all the HTML tags.
    return strip_tags($temp);
  }


  /**
   * @brief Returns the name of the PCRE error constant matching the given error code.
   * @details Only the constants whose name ends with `_ERROR` are considered, because other PCRE constants share the
   * same integer values.
   * @param int $code The value returned by `preg_last_error()`.
   * @return string The constant name, or a generic message when no constant matches.
   */
  private static function pcreErrorName($code) {
    $constants = get_defined_constants(TRUE);

    if (isset($constants['pcre'])) {
      foreach ($constants['pcre'] as $name => $value)
        if ($value === $code && substr($name, -6) === '_ERROR')
          return $name;
    }

    return 'PCRE error code ' . $code;
  }


  /**
   * @brief Generates a single word, stripping every `-` from a compound word.
   * @param string $word A compound word.
   * @return string
   */
  public static function stick($word) {
    // The minus is a single ASCII byte, so a plain byte replacement is safe on UTF-8 text.
    return str_replace('-', '', $word);
  }


  /**
   * @brief Given a string, returns all the unique contained substrings.
   * @param string $str The input string.
   * @param string $charset (optional) The charset used.
   * @return array A list, indexed from zero, ordered by start position and then by length.
   * @warning This function works with UTF-8 strings.
   */
  public static function substrings($str, $charset = 'UTF-8') {
    $length = mb_strlen($str, $charset);

    $subs = [];
    for ($i = 0; $i < $length; $i++)
      // Only the lengths that fit between the start position and the end of the string are generated.
      for ($j = 1; $j <= $length - $i; $j++)
        $subs[] = mb_substr($str, $i, $j, $charset);

    return array_values(array_unique($subs));
  }


  /**
   * @brief Generates a slug from the provided string.
   * @param string $str The input string.
   * @return string
   * @warning This function receives as input an UTF-8 string and returns an ASCII string.
   * @see https://en.wikipedia.org/wiki/Slug_(publishing)
   */
  public static function slug($str) {
    // Replaces any character that is not a letter or a number with minus.
    $slug = (string)preg_replace('/[^\pL\d]+/u', '-', $str);

    // Removes the minus character from the beginning and the end.
    $slug = trim($slug, '-');

    // Converts the charset from UTF-8 to ASCII.
    $slug = (string)self::convertCharset($slug, FALSE, 'utf-8', 'ASCII//TRANSLIT');

    // Converts the string to lowercase.
    $slug = strtolower($slug);

    // Finally removes any character that is not a letter, a number or a minus.
    return preg_replace('/[^\-\w]+/', '', $slug);
  }


  /**
   * @brief Builds the post url, given its publishing or creation date and its slug.
   * @param int $date Publishing or creation date, as a timestamp accepted by `date()`.
   * @param string $slug The slug of the title.
   * @return string The url of the post, in the form `/Y/m/d/slug`.
   */
  public static function buildUrl($date, $slug) {
    return date('/Y/m/d/', $date) . $slug;
  }


  /**
   * @brief Replaces all the occurrences but first.
   * @param string $pattern The pattern to search for.
   * @param string $replacement The string used as replacement for the match found. It is inserted literally.
   * @param string $subject The string to search and replace.
   * @return string
   */
  public static function replaceAllButFirst($pattern, $replacement, $subject) {
    // Tells whether the first match has already been met during this call.
    $seen = FALSE;

    return preg_replace_callback(
      $pattern,
      function($matches) use ($replacement, &$seen) {
        if (!$seen) {
          // The first match is left untouched.
          $seen = TRUE;
          return $matches[0];
        }

        return $replacement;
      },
      $subject
    );
  }


  /**
   * @brief Prunes the ID of its version number, if any.
   * @param string $id An UUID followed by a timestamp, like `3e96144b-3ebd-41e4-8a45-78cd9af1671d::1410886811`.
   * @return string Returns just `3e96144b-3ebd-41e4-8a45-78cd9af1671d`.
   */
  public static function unversion($id) {
    // Splits on the whole separator; a single `:` inside the ID is not a separator.
    $parts = explode(self::SEPARATOR, (string)$id, 2);

    return $parts[0];
  }


  /**
   * @brief Formats the number without decimals, using the point as thousand separator.
   * @param double $number The input number.
   * @return string
   */
  public static function formatNumber($number) {
    return number_format($number, 0, ",", ".");
  }


  /**
   * @brief Separates the given full name into first name and last name.
   * @details A leading word containing a period is taken as salutation, a trailing word containing a period is taken
   * as suffix. The word following the salutation, if any, is the first name; what remains is the last name.
   * @param string $fullName A person full name.
   * @return array An associative array with the keys `salutation`, `first`, `suffix` and `last`. A missing part is
   * an empty string.
   */
  public static function splitFullName($fullName) {
    $result = ['salutation' => '', 'first' => '', 'suffix' => '', 'last' => ''];

    // Splits on any run of whitespace, so leading, trailing or repeated spaces don't produce empty words.
    $parts = preg_split('/\s+/', trim($fullName), -1, PREG_SPLIT_NO_EMPTY);
    $size = count($parts);

    if ($size === 0)
      return $result;

    // Checks first for period, assume salutation if so.
    $firstIndex = 0;
    if (strpos($parts[0], '.') !== FALSE) {
      $result['salutation'] = $parts[0];
      $firstIndex = 1;
    }

    // The salutation may be the only word.
    if ($firstIndex < $size)
      $result['first'] = $parts[$firstIndex];

    // Checks last for period, assume suffix if so. The word used as first name is never taken as suffix too.
    $lastIndex = $size - 1;
    if ($lastIndex > $firstIndex && strpos($parts[$lastIndex], '.') !== FALSE) {
      $result['suffix'] = $parts[$lastIndex];
      $lastIndex--;
    }

    // Combines remains into last.
    if ($lastIndex > $firstIndex)
      $result['last'] = implode(' ', array_slice($parts, $firstIndex + 1, $lastIndex - $firstIndex));

    return $result;
  }


  /**
   * @brief Removes unwanted MS Word smart characters from a string.
   * @param string $text The text to be sanitized.
   * @return string The sanitized text.
   * @warning The smart characters are searched by their UTF-8 byte sequences. The result is passed through
   * `htmlspecialchars()`, so the HTML special characters are escaped as well.
   */
  public static function sanitize($text) {
    $from = [
      "\xe2\x80\x98", // Left single quote.
      "\xe2\x80\x99", // Right single quote.
      "\xe2\x80\x9c", // Left double quote.
      "\xe2\x80\x9d", // Right double quote.
      "\xe2\x80\x94", // Em dash.
      "\xe2\x80\xa6"  // Ellipsis.
    ];

    $to = [
      "'",
      "'",
      '"',
      '"',
      '-', // A plain minus: replacing the em dash with itself would leave it in place.
      '...'
    ];

    return htmlspecialchars(str_replace($from, $to, $text));
  }

}