Issues (9)

src/ToolBag/Helper/TextHelper.php (2 issues)

1
<?php
2
3
/**
4
 * @file TextHelper.php
5
 * @brief This file contains the TextHelper class.
6
 * @details
7
 * @author Filippo F. Fadda
8
 */
9
10
11
//! Helper classes that support development.
12
namespace ToolBag\Helper;
13
14
15
/**
16
 * @brief This helper class contains routines to process text.
17
 * @nosubgrouping
18
 */
19
class TextHelper {

  const SEPARATOR = '::'; //!< Used to separate the ID from the version number.


  /**
   * @brief Converts a string from a charset to another one.
   * @details The default conversion is from `Windows-1252` to `UTF-8`. `Windows-1252` or `CP-1252` is a character
   * encoding of the Latin alphabet, used by default in the legacy components of Microsoft Windows in English and some
   * other Western languages. This character encoding is a superset of `ISO-8859-1`, but it differs from it by using
   * displayable characters rather than control characters in the 80 to 9F (hex) range.
   * @param string $text The input string.
   * @param bool $stripslashes (optional) If `true` strips all the slashes before converting the text.
   * @param string $fromCharset (optional) The origin charset.
   * @param string $toCharset (optional) The target charset.
   * @return string The converted text, exactly as returned by `iconv()` (which yields `false` on failure).
   * @attention It doesn't matter if the varchar fields of your MySQL tables are encoded in `LATIN1`: if someone
   * ever posted a document from Windows Word containing smart characters, like curly quotes or smart apostrophes, the
   * real charset used is `Windows-1252`.
   * @warning This function doesn't use `LATIN1` or `ISO-8859-1` as default, because `Windows-1251` and `Windows-1252`
   * will only succeed if the entire string consists of high-byte characters in a certain range. That means you'll never
   * get the right conversion because the text will appear as `ISO-8859-1` even if it is `Windows-1252`. See the bug
   * section.
   * @bug https://bugs.php.net/bug.php?id=64667
   */
  public static function convertCharset($text, $stripslashes = FALSE, $fromCharset = 'Windows-1252', $toCharset = 'UTF-8') {
    if ($stripslashes)
      $text = stripslashes($text);

    return iconv($fromCharset, $toCharset, $text);
  }


  /**
   * @brief Cuts a string to a given number of characters without breaking words.
   * @details The returned string, marker included, is never longer than `$length` characters, provided the marker
   * itself fits into `$length`.
   * @param string $text The input string.
   * @param integer $length The maximum number of characters of the result, ex. 200 characters.
   * @param string $etc The characters you want to append to the end of the text (or insert in the middle).
   * @param string $charset (optional) The charset used.
   * @param bool $breakWords (optional) If `true` breaks the words to return the exact number of chars.
   * @param bool $middle (optional) Truncates the text removing its middle part instead of its end.
   * @return string The original text when it already fits, an empty string when `$length` is zero or negative.
   * @warning This function works with UTF-8 strings.
   */
  public static function truncate($text, $length = 200, $etc = ' ...', $charset='UTF-8', $breakWords = FALSE, $middle = FALSE) {
    if ($length <= 0)
      return '';

    // The length must be measured with the same charset used to cut the string.
    if (mb_strlen($text, $charset) <= $length)
      return $text;

    // Reserves the room needed by the marker.
    $length -= min($length, mb_strlen($etc, $charset));

    if ($middle) {
      // Integer half: mb_substr() must not receive a fractional offset or length.
      $half = (int)($length / 2);

      $head = mb_substr($text, 0, $half, $charset);

      // A start offset of -0 would select the string from its beginning, so an empty half yields an empty tail.
      $tail = ($half > 0) ? mb_substr($text, -$half, $half, $charset) : '';

      return $head . $etc . $tail;
    }

    // Drops the last, possibly partial, word together with the whitespace preceding it.
    if (!$breakWords)
      $text = preg_replace('/\s+?(\S+)?$/u', '', mb_substr($text, 0, $length + 1, $charset));

    return mb_substr($text, 0, $length, $charset) . $etc;
  }


  /**
   * @brief Capitalizes the given string.
   * @details The first character is converted to uppercase, all the others to lowercase.
   * @param string $text The input string.
   * @param string $charset (optional) The charset used.
   * @return string
   * @warning This function works with UTF-8 strings.
   */
  public static function capitalize($text, $charset = 'UTF-8') {
    $initial = mb_substr($text, 0, 1, $charset);
    $rest = mb_substr($text, 1, mb_strlen($text, $charset), $charset);

    return mb_strtoupper($initial, $charset) . mb_strtolower($rest, $charset);
  }


  /**
   * @brief Removes the content of pre tags, then strips all tags.
   * @param string $text The input string.
   * @return string
   * @throws \RuntimeException When the regular expression engine fails, for example because the input is not valid
   * UTF-8. The exception message is the name of the PCRE error constant.
   * @warning This function works with UTF-8 strings.
   */
  public static function purge($text) {
    // Removes the content of <pre></pre>.
    $withoutPre = preg_replace('/<(pre)(?:(?!<\/\1).)*?<\/\1>/su', '', $text);

    // preg_replace() returns NULL when the engine reports an error.
    if ($withoutPre === NULL)
      throw new \RuntimeException(self::pcreErrorName(preg_last_error()));

    // Removes all the HTML tags.
    return strip_tags($withoutPre);
  }


  /**
   * @brief Returns the name of the PCRE error constant matching the given error code.
   * @details Only the constants whose name ends with `_ERROR` are considered, because many other PCRE constants share
   * the same integer values.
   * @param int $code An error code, as returned by `preg_last_error()`.
   * @return string The constant name, or a generic description when no constant matches.
   */
  private static function pcreErrorName($code) {
    $constants = get_defined_constants(TRUE);
    $pcre = isset($constants['pcre']) ? $constants['pcre'] : [];

    foreach ($pcre as $name => $value)
      if ($value === $code && substr($name, -6) === '_ERROR')
        return $name;

    return 'PCRE error code ' . $code;
  }


  /**
   * @brief Generates a single word, stripping every `-` from a compound word.
   * @param string $word A compound word.
   * @return string
   * @warning This function works with UTF-8 strings.
   */
  public static function stick($word) {
    // `-` is a single byte that never appears inside a UTF-8 multi-byte sequence, so a plain replacement is safe.
    return str_replace('-', '', $word);
  }


  /**
   * @brief Given a string, returns all the unique contained substrings.
   * @param string $str The input string.
   * @param string $charset (optional) The charset used.
   * @return array A list of substrings, ordered by start position and then by length.
   * @warning This function works with UTF-8 strings.
   */
  public static function substrings($str, $charset = 'UTF-8') {
    $length = mb_strlen($str, $charset);

    $subs = [];
    for ($start = 0; $start < $length; $start++) {
      // A substring can't be longer than what remains after its start position.
      $longest = $length - $start;

      for ($size = 1; $size <= $longest; $size++)
        $subs[] = mb_substr($str, $start, $size, $charset);
    }

    // array_unique() preserves the keys, so the result is re-indexed to get a proper list.
    return array_values(array_unique($subs));
  }


  /**
   * @brief Generates a slug from the provided string.
   * @param string $str The input string.
   * @return string
   * @warning This function receives as input an UTF-8 string and returns an ASCII string.
   * @see https://en.wikipedia.org/wiki/Slug_(publishing)
   */
  public static function slug($str) {
    // Replaces any run of characters that are neither letters nor digits with a minus, then removes the minus
    // characters from the beginning and the end.
    $dashed = trim(preg_replace('/[^\pL\d]+/u', '-', $str), '-');

    // Converts the charset from UTF-8 to ASCII, transliterating where possible, and lowercases the result.
    $ascii = strtolower(self::convertCharset($dashed, FALSE, 'utf-8', 'ASCII//TRANSLIT'));

    // Finally removes any character that is not a letter, a number, an underscore or a minus.
    return preg_replace('/[^\-\w]+/', '', $ascii);
  }


  /**
   * @brief Builds the post url, given its publishing or creation date and its slug.
   * @param int $date Publishing or creation date, as a Unix timestamp.
   * @param string $slug The slug of the title.
   * @return string The url of the post, in the form `/Y/m/d/slug`.
   */
  public static function buildUrl($date, $slug) {
    $prefix = date('/Y/m/d/', $date);

    return $prefix.$slug;
  }


  /**
   * @brief Replaces all the occurrences but first.
   * @param string $pattern The pattern to search for.
   * @param string $replacement The string used as replacement for the match found. It is used literally.
   * @param string $subject The string to search and replace.
   * @return string
   */
  public static function replaceAllButFirst($pattern, $replacement, $subject) {
    // Number of matches met so far during this call.
    $seen = 0;

    return preg_replace_callback(
      $pattern,
      function($matches) use ($replacement, &$seen) {
        $seen++;

        // The first match is given back untouched.
        return ($seen === 1) ? $matches[0] : $replacement;
      },
      $subject
    );
  }


  /**
   * @brief Prunes the ID of its version number, if any.
   * @param string $id An UUID followed by a timestamp, like `3e96144b-3ebd-41e4-8a45-78cd9af1671d::1410886811`.
   * @return string Returns just `3e96144b-3ebd-41e4-8a45-78cd9af1671d`. An ID without separator is returned as is.
   */
  public static function unversion($id) {
    // Splits on the whole separator; strtok() would split on any single `:` instead.
    $parts = explode(self::SEPARATOR, (string)$id, 2);

    return $parts[0];
  }


  /**
   * @brief Formats the number without decimals, using `.` as thousands separator.
   * @param double $number The input number.
   * @return string
   */
  public static function formatNumber($number) {
    return number_format($number, 0, ',', '.');
  }


  /**
   * @brief Separates the given full name into salutation, first name, last name and suffix.
   * @details A leading word containing a period is assumed to be a salutation, a trailing word containing a period is
   * assumed to be a suffix. Everything between the first name and the suffix is the last name.
   * @param string $fullName A person full name.
   * @return array An associative array with the keys `salutation`, `first`, `suffix` and `last`. A missing part is an
   * empty string.
   */
  public static function splitFullName($fullName) {
    // Leading, trailing or repeated spaces must not generate empty words.
    $words = array_values(array_filter(explode(' ', (string)$fullName), function($word) {
      return $word !== '';
    }));

    // Checks first for period, assume salutation if so.
    $salutation = '';
    if (!empty($words) && strpos($words[0], '.') !== FALSE)
      $salutation = array_shift($words);

    $first = empty($words) ? '' : array_shift($words);

    // Checks last for period, assume suffix if so. Words already consumed are never evaluated twice.
    $suffix = '';
    if (!empty($words) && strpos($words[count($words) - 1], '.') !== FALSE)
      $suffix = array_pop($words);

    return [
      'salutation' => $salutation,
      'first' => $first,
      'suffix' => $suffix,
      // Combines remains into last.
      'last' => implode(' ', $words)
    ];
  }


  /**
   * @brief Removes unwanted MS Word smart characters from a string and escapes the HTML special characters.
   * @param string $text The text to be sanitized.
   * @return string The sanitized text.
   * @warning The replaced byte sequences are the UTF-8 encodings of the smart characters, so the input is expected
   * to be an UTF-8 string.
   */
  public static function sanitize($text) {
    $from = [
      "\xe2\x80\x98", // Left single quote.
      "\xe2\x80\x99", // Right single quote.
      "\xe2\x80\x9c", // Left double quote.
      "\xe2\x80\x9d", // Right double quote.
      "\xe2\x80\xa6" // Ellipsis.
    ];

    $to = [
      "'",
      "'",
      '"',
      '"',
      '...'
    ];

    // The plain replacements happen before escaping, so the resulting quotes are escaped like any other quote.
    $escaped = htmlspecialchars(str_replace($from, $to, $text));

    // The em dash becomes an HTML entity, so it must be replaced after escaping or its `&` would be escaped too.
    return str_replace("\xe2\x80\x94", '&mdash;', $escaped);
  }

}