dedalozzo /
tool-bag
| 1 | <?php |
||
| 2 | |||
| 3 | /** |
||
| 4 | * @file TextHelper.php |
||
| 5 | * @brief This file contains the TextHelper class. |
||
| 6 | * @details |
||
| 7 | * @author Filippo F. Fadda |
||
| 8 | */ |
||
| 9 | |||
| 10 | |||
| 11 | //! Classes to help the development |
||
| 12 | namespace ToolBag\Helper; |
||
| 13 | |||
| 14 | |||
| 15 | /** |
||
| 16 | * @brief This helper class contains routines to process text. |
||
| 17 | * @nosubgrouping |
||
| 18 | */ |
||
class TextHelper {

  const SEPARATOR = '::'; //!< Used to separate the ID from the version number.


  /**
   * @brief Converts a string from a charset to another one.
   * @details The default conversion is from `Windows-1252` to `UTF-8`. `Windows-1252` or `CP-1252` is a character
   * encoding of the Latin alphabet, used by default in the legacy components of Microsoft Windows in English and some
   * other Western languages. This character encoding is a superset of `ISO-8859-1`, but it differs from it by using
   * displayable characters rather than control characters in the 80 to 9F (hex) range.
   * @param string $text The input string.
   * @param bool $stripslashes (optional) If `true` strip all the slashes before converting the text.
   * @param string $fromCharset (optional) The origin charset.
   * @param string $toCharset (optional) The target charset.
   * @return string The converted string, or whatever `iconv()` returns on failure (`false`).
   * @attention Doesn't matter if the varchar fields of your MySQL tables are encoded in `LATIN1`, in fact, if someone
   * ever posted a document from Windows Word containing smart characters, like curly quotes or smart apostrophes, the
   * real charset used is `Windows-1252`.
   * @warning This function doesn't use `LATIN1` or `ISO-8859-1` as default, because `Windows-1251` and `Windows-1252`
   * will only succeed if the entire string consists of high-byte characters in a certain range. That means you'll never
   * get the right conversion because the text will appear as `ISO-8859-1` even if it is `Windows-1252`. See the bug
   * section.
   * @bug https://bugs.php.net/bug.php?id=64667
   */
  public static function convertCharset($text, $stripslashes = FALSE, $fromCharset = 'Windows-1252', $toCharset = 'UTF-8') {
    if ($stripslashes)
      $text = stripslashes($text);

    return iconv($fromCharset, $toCharset, $text);
  }


  /**
   * @brief Cuts a string to a given number of characters without breaking words.
   * @details The returned string, including `$etc`, is never longer than `$length` characters. When `$middle` is
   * `true` the central part of the text is dropped and `$etc` is placed between the kept head and tail, which have
   * the same size (half of the remaining budget, rounded down).
   * @param string $text The input string.
   * @param integer $length The number of characters at which the string will be wrapped, ex. 200 characters.
   * @param string $etc The characters you want append to the end of text.
   * @param string $charset (optional) The charset used.
   * @param bool $breakWords (optional) If `true` breaks the words to return the exact number of chars.
   * @param bool $middle (optional) Truncates the text but remove middle instead the end of the string.
   * @return string
   * @warning The word boundary detection uses a UTF-8 regular expression; if the text is not valid UTF-8 the string
   * is cut at the exact length instead.
   */
  public static function truncate($text, $length = 200, $etc = ' ...', $charset='UTF-8', $breakWords = FALSE, $middle = FALSE) {
    if ($length == 0)
      return '';

    // The length must be measured with the same charset used to cut the text.
    if (mb_strlen($text, $charset) <= $length)
      return $text;

    // Reserves room for the marker, never going below zero.
    $length -= min($length, mb_strlen($etc, $charset));

    if ($middle) {
      // An integer half avoids passing floats to mb_substr().
      $half = (int) floor($length / 2);

      // A negative offset of zero would select the whole text, so the tail must be skipped explicitly.
      if ($half <= 0)
        return $etc;

      return mb_substr($text, 0, $half, $charset) . $etc . mb_substr($text, -$half, $half, $charset);
    }

    if (!$breakWords) {
      // Takes one extra character so that a word ending exactly at the limit is not dropped, then removes the last
      // (possibly partial) word together with the whitespace preceding it.
      $cut = mb_substr($text, 0, $length + 1, $charset);
      $trimmed = preg_replace('/\s+?(\S+)?$/u', '', $cut);

      // preg_replace() returns NULL when the subject is not valid UTF-8: falls back to a plain cut.
      $text = is_null($trimmed) ? $cut : $trimmed;
    }

    return mb_substr($text, 0, $length, $charset) . $etc;
  }


  /**
   * @brief Capitalizes the given string.
   * @details The first character is converted to upper case, all the remaining characters to lower case.
   * @param string $text The input string.
   * @param string $charset (optional) The charset used.
   * @return string
   */
  public static function capitalize($text, $charset = 'UTF-8') {
    $head = mb_substr($text, 0, 1, $charset);
    $tail = mb_substr($text, 1, mb_strlen($text, $charset), $charset);

    return mb_strtoupper($head, $charset) . mb_strtolower($tail, $charset);
  }


  /**
   * @brief Removes the content of pre tags, than strip all tags.
   * @param string $text The input string.
   * @return string
   * @throws \RuntimeException If the regular expression engine fails; the message is the name of the PCRE error.
   * @warning This function works with UTF-8 strings.
   */
  public static function purge($text) {
    // Removes the content of <pre></pre>.
    $temp = preg_replace('/<(pre)(?:(?!<\/\1).)*?<\/\1>/su', '', $text);

    if (is_null($temp))
      throw new \RuntimeException(self::pcreErrorName(preg_last_error()));

    // Removes all the HTML tags.
    return strip_tags($temp);
  }


  /**
   * @brief Returns the name of the PCRE error constant matching the given error code.
   * @details Only the constants whose name ends with `_ERROR` are considered, because other PCRE constants share the
   * same integer values.
   * @param int $code The value returned by `preg_last_error()`.
   * @return string The constant name, ex. `PREG_BAD_UTF8_ERROR`.
   */
  private static function pcreErrorName($code) {
    $constants = get_defined_constants(TRUE);
    $pcre = isset($constants['pcre']) ? $constants['pcre'] : [];

    foreach ($pcre as $name => $value)
      if ($value === $code && substr($name, -6) === '_ERROR')
        return $name;

    return 'PREG_UNKNOWN_ERROR (' . $code . ')';
  }


  /**
   * @brief Generates a single word, stripping every `-` from a compound word.
   * @param string $word A compound word.
   * @return string
   */
  public static function stick($word) {
    // The minus sign is a single byte that never appears inside a UTF-8 multibyte sequence, so a plain byte
    // replacement is safe and doesn't fail on malformed input like a `/u` regular expression would.
    return str_replace('-', '', $word);
  }


  /**
   * @brief Given a string, returns all the unique contained substrings.
   * @param string $str The input string.
   * @param string $charset (optional) The charset used.
   * @return array A list of strings with sequential keys, ordered by start position and then by length.
   */
  public static function substrings($str, $charset = 'UTF-8') {
    $length = mb_strlen($str, $charset);

    $subs = [];
    for ($i = 0; $i < $length; $i++)
      // Stops at the end of the string: longer requests would only repeat the last substring.
      for ($j = 1; $j <= $length - $i; $j++)
        $subs[] = mb_substr($str, $i, $j, $charset);

    return array_values(array_unique($subs));
  }


  /**
   * @brief Generates a slug from the provided string.
   * @param string $str The input string.
   * @return string
   * @warning This function receives as input an UTF-8 string and returns an ASCII string.
   * @see https://en.wikipedia.org/wiki/Slug_(publishing)
   */
  public static function slug($str) {
    // Replaces any character that is not a letter or a number with minus.
    $slug = preg_replace('/[^\pL\d]+/u', '-', $str);

    // Removes the minus character from the begin and the end.
    $slug = trim($slug, '-');

    // Converts the charset from UTF-8 to ASCII, then to lowercase.
    $slug = strtolower(self::convertCharset($slug, FALSE, 'utf-8', 'ASCII//TRANSLIT'));

    // Finally removes any character that is not a letter, a number or a minus.
    return preg_replace('/[^\-\w]+/', '', $slug);
  }


  /**
   * @brief Builds the post url, given its publishing or creation date and its slug.
   * @param int $date Publishing or creation date, as a Unix timestamp.
   * @param string $slug The slug of the title.
   * @return string The url of the post, in the form `/YYYY/MM/DD/slug`.
   */
  public static function buildUrl($date, $slug) {
    return date('/Y/m/d/', $date) . $slug;
  }


  /**
   * @brief Replaces all the occurrences but first.
   * @param string $pattern The pattern to search for.
   * @param string $replacement The string used as replacement for the match found. It is used literally, no
   * backreference is expanded.
   * @param string $subject The string to search and replace.
   * @return string
   */
  public static function replaceAllButFirst($pattern, $replacement, $subject) {
    // Number of matches met so far during this call.
    $seen = 0;

    return preg_replace_callback(
      $pattern,
      function($matches) use ($replacement, &$seen) {
        $seen++;

        // The first match is given back untouched, every other one is replaced.
        return ($seen <= 1) ? $matches[0] : $replacement;
      },
      $subject
    );
  }


  /**
   * @brief Prunes the ID of its version number, if any.
   * @param string $id An UUID followed by a timestamp, like `3e96144b-3ebd-41e4-8a45-78cd9af1671d::1410886811`.
   * @return string Returns just `3e96144b-3ebd-41e4-8a45-78cd9af1671d`. An ID without separator is returned as is.
   */
  public static function unversion($id) {
    // strtok() can't be used here: it treats the separator as a set of characters, so a single `:` would split.
    $pos = strpos($id, self::SEPARATOR);

    return ($pos === FALSE) ? $id : substr($id, 0, $pos);
  }


  /**
   * @brief Formats the number without decimals, using the point as thousands separator.
   * @param double $number The input number.
   * @return string The formatted number, ex. `1.234.567`.
   */
  public static function formatNumber($number) {
    return number_format($number, 0, ",", ".");
  }


  /**
   * @brief Separates the given full name into first name and last name.
   * @details The name is split on whitespace. A leading token containing a period is assumed to be a salutation, a
   * trailing token containing a period is assumed to be a suffix. A token is never used for more than one part.
   * @param string $fullName A person full name.
   * @return array An associative array with the keys `salutation`, `first`, `suffix` and `last`. The missing parts
   * are empty strings.
   */
  public static function splitFullName($fullName) {
    $result = [
      'salutation' => '',
      'first' => '',
      'suffix' => '',
      'last' => ''
    ];

    // Splits on any run of whitespace so that leading, trailing or repeated blanks never yield empty tokens.
    $parts = preg_split('/\s+/', trim($fullName), -1, PREG_SPLIT_NO_EMPTY);
    $size = is_array($parts) ? count($parts) : 0;

    if ($size == 0)
      return $result;

    // Checks first for period, assume salutation if so. A lonely token is always the first name.
    $firstIndex = 0;
    if ($size > 1 && strpos($parts[0], '.') !== FALSE) {
      $result['salutation'] = $parts[0];
      $firstIndex = 1;
    }

    $result['first'] = $parts[$firstIndex];

    // Checks last for period, assume suffix if so, unless the token has been already used as first name.
    $lastIndex = $size - 1;
    if ($lastIndex > $firstIndex && strpos($parts[$lastIndex], '.') !== FALSE) {
      $result['suffix'] = $parts[$lastIndex];
      $lastIndex--;
    }

    // Combines remains into last.
    $result['last'] = implode(' ', array_slice($parts, $firstIndex + 1, $lastIndex - $firstIndex));

    return $result;
  }


  /**
   * @brief Replaces MS Word smart characters with plain ones, then escapes the HTML special characters.
   * @param string $text The text to be sanitized.
   * @return string The sanitized text.
   * @warning The byte sequences searched for are the UTF-8 encodings of the smart characters, so the input is
   * expected to be an UTF-8 string.
   */
  public static function sanitize($text) {
    $from = [
      "\xe2\x80\x98", // Left single quote.
      "\xe2\x80\x99", // Right single quote.
      "\xe2\x80\x9c", // Left double quote.
      "\xe2\x80\x9d", // Right double quote.
      "\xe2\x80\x94", // Em dash.
      "\xe2\x80\xa6"  // Ellipsis.
    ];

    // NOTE(review): the em dash is mapped to an em dash, so it is left in the text; confirm this is intended.
    $to = [
      "'",
      "'",
      '"',
      '"',
      '—',
      '...'
    ];

    // Explicit flags and charset give the same output on every PHP version (the defaults changed in PHP 8.1).
    return htmlspecialchars(str_replace($from, $to, $text), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
  }

}