Passed
Push — master ( 20ad1a...1e5204 )
by Martijn
03:13 queued 01:48
created
src/Multibyte.php 2 patches
Indentation   +76 added lines, -76 removed lines patch added patch discarded remove patch
@@ -7,91 +7,91 @@
 block discarded – undo
7 7
  */
8 8
 class Multibyte
9 9
 {
10
-    /**
11
-     * Multibyte.php safe version of standard trim() function.
12
-     *
13
-     * @param string $string
14
-     * @return string
15
-     */
16
-    public static function trim($string)
17
-    {
18
-        return mb_ereg_replace('^\s*([\s\S]*?)\s*$', '\1', $string);
19
-    }
10
+	/**
11
+	 * Multibyte.php safe version of standard trim() function.
12
+	 *
13
+	 * @param string $string
14
+	 * @return string
15
+	 */
16
+	public static function trim($string)
17
+	{
18
+		return mb_ereg_replace('^\s*([\s\S]*?)\s*$', '\1', $string);
19
+	}
20 20
 
21
-    /**
22
-     * A cross between mb_split and preg_split, adding the preg_split flags
23
-     * to mb_split.
24
-     *
25
-     * @param string $pattern
26
-     * @param string $string
27
-     * @param int $limit
28
-     * @param int $flags
29
-     * @return array
30
-     */
31
-    public static function split($pattern, $string, $limit = -1, $flags = 0)
32
-    {
33
-        $split_no_empty = (bool)($flags & PREG_SPLIT_NO_EMPTY);
34
-        $offset_capture = (bool)($flags & PREG_SPLIT_OFFSET_CAPTURE);
35
-        $delim_capture = (bool)($flags & PREG_SPLIT_DELIM_CAPTURE);
21
+	/**
22
+	 * A cross between mb_split and preg_split, adding the preg_split flags
23
+	 * to mb_split.
24
+	 *
25
+	 * @param string $pattern
26
+	 * @param string $string
27
+	 * @param int $limit
28
+	 * @param int $flags
29
+	 * @return array
30
+	 */
31
+	public static function split($pattern, $string, $limit = -1, $flags = 0)
32
+	{
33
+		$split_no_empty = (bool)($flags & PREG_SPLIT_NO_EMPTY);
34
+		$offset_capture = (bool)($flags & PREG_SPLIT_OFFSET_CAPTURE);
35
+		$delim_capture = (bool)($flags & PREG_SPLIT_DELIM_CAPTURE);
36 36
 
37
-        $strlen = strlen($string); // bytes!
38
-        mb_ereg_search_init($string);
37
+		$strlen = strlen($string); // bytes!
38
+		mb_ereg_search_init($string);
39 39
 
40
-        $lengths = array();
41
-        $position = 0;
42
-        while (($array = mb_ereg_search_pos($pattern, '')) !== false) {
43
-            // capture split
44
-            $lengths[] = array($array[0] - $position, false, null);
40
+		$lengths = array();
41
+		$position = 0;
42
+		while (($array = mb_ereg_search_pos($pattern, '')) !== false) {
43
+			// capture split
44
+			$lengths[] = array($array[0] - $position, false, null);
45 45
 
46
-            // move position
47
-            $position = $array[0] + $array[1];
46
+			// move position
47
+			$position = $array[0] + $array[1];
48 48
 
49
-            // capture delimiter
50
-            $regs = mb_ereg_search_getregs();
51
-            $lengths[] = array($array[1], true, isset($regs[1]) && $regs[1]);
49
+			// capture delimiter
50
+			$regs = mb_ereg_search_getregs();
51
+			$lengths[] = array($array[1], true, isset($regs[1]) && $regs[1]);
52 52
 
53
-            // Continue on?
54
-            if ($position >= $strlen) {
55
-                break;
56
-            }
57
-        }
53
+			// Continue on?
54
+			if ($position >= $strlen) {
55
+				break;
56
+			}
57
+		}
58 58
 
59
-        // Add last bit, if not ending with split
60
-        $lengths[] = array($strlen - $position, false, null);
59
+		// Add last bit, if not ending with split
60
+		$lengths[] = array($strlen - $position, false, null);
61 61
 
62
-        // Substrings
63
-        $parts = array();
64
-        $position = 0;
65
-        $count = 1;
66
-        foreach ($lengths as $length) {
67
-            $split_empty = $length[0] || !$split_no_empty;
68
-            $is_delimiter = $length[1];
69
-            $is_captured = $length[2];
62
+		// Substrings
63
+		$parts = array();
64
+		$position = 0;
65
+		$count = 1;
66
+		foreach ($lengths as $length) {
67
+			$split_empty = $length[0] || !$split_no_empty;
68
+			$is_delimiter = $length[1];
69
+			$is_captured = $length[2];
70 70
 
71
-            if ($limit > 0
72
-                && !$is_delimiter
73
-                && $split_empty
74
-                && ++$count > $limit) {
75
-                if ($length[0] > 0
76
-                    || $split_empty) {
77
-                    $parts[] = $offset_capture
78
-                        ? array(mb_strcut($string, $position), $position)
79
-                        : mb_strcut($string, $position);
80
-                }
81
-                break;
82
-            } elseif ((!$is_delimiter
83
-                    || ($delim_capture
84
-                        && $is_captured))
85
-                && ($length[0]
86
-                    || $split_empty)) {
87
-                $parts[] = $offset_capture
88
-                    ? array(mb_strcut($string, $position, $length[0]), $position)
89
-                    : mb_strcut($string, $position, $length[0]);
90
-            }
71
+			if ($limit > 0
72
+				&& !$is_delimiter
73
+				&& $split_empty
74
+				&& ++$count > $limit) {
75
+				if ($length[0] > 0
76
+					|| $split_empty) {
77
+					$parts[] = $offset_capture
78
+						? array(mb_strcut($string, $position), $position)
79
+						: mb_strcut($string, $position);
80
+				}
81
+				break;
82
+			} elseif ((!$is_delimiter
83
+					|| ($delim_capture
84
+						&& $is_captured))
85
+				&& ($length[0]
86
+					|| $split_empty)) {
87
+				$parts[] = $offset_capture
88
+					? array(mb_strcut($string, $position, $length[0]), $position)
89
+					: mb_strcut($string, $position, $length[0]);
90
+			}
91 91
 
92
-            $position += $length[0];
93
-        }
92
+			$position += $length[0];
93
+		}
94 94
 
95
-        return $parts;
96
-    }
95
+		return $parts;
96
+	}
97 97
 }
98 98
\ No newline at end of file
Please login to merge, or discard this patch.
Spacing   +3 added lines, -3 removed lines patch added patch discarded remove patch
@@ -30,9 +30,9 @@
 block discarded – undo
30 30
      */
31 31
     public static function split($pattern, $string, $limit = -1, $flags = 0)
32 32
     {
33
-        $split_no_empty = (bool)($flags & PREG_SPLIT_NO_EMPTY);
34
-        $offset_capture = (bool)($flags & PREG_SPLIT_OFFSET_CAPTURE);
35
-        $delim_capture = (bool)($flags & PREG_SPLIT_DELIM_CAPTURE);
33
+        $split_no_empty = (bool) ($flags & PREG_SPLIT_NO_EMPTY);
34
+        $offset_capture = (bool) ($flags & PREG_SPLIT_OFFSET_CAPTURE);
35
+        $delim_capture = (bool) ($flags & PREG_SPLIT_DELIM_CAPTURE);
36 36
 
37 37
         $strlen = strlen($string); // bytes!
38 38
         mb_ereg_search_init($string);
Please login to merge, or discard this patch.
src/Sentence.php 1 patch
Indentation   +375 added lines, -375 removed lines patch added patch discarded remove patch
@@ -17,380 +17,380 @@
 block discarded – undo
17 17
 class Sentence
18 18
 {
19 19
 
20
-    /**
21
-     * Specify this flag with the split method to trim whitespace.
22
-     */
23
-    const SPLIT_TRIM = 0x1;
24
-
25
-    /**
26
-     * List of characters used to terminate sentences.
27
-     *
28
-     * @var string[]
29
-     */
30
-    private $terminals = array('.', '!', '?');
31
-
32
-    /**
33
-     * List of characters used for abbreviations.
34
-     *
35
-     * @var string[]
36
-     */
37
-    private $abbreviators = array('.');
38
-
39
-    /**
40
-     * Breaks a piece of text into lines by linebreak.
41
-     * Eats up any linebreak characters as if one.
42
-     *
43
-     * Multibyte.php safe
44
-     *
45
-     * @param string $text
46
-     * @return string[]
47
-     */
48
-    private static function linebreakSplit($text)
49
-    {
50
-        $lines = array();
51
-        $line = '';
52
-
53
-        foreach (Multibyte::split('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
54
-            $line .= $part;
55
-            if (Multibyte::trim($part) === '') {
56
-                $lines[] = $line;
57
-                $line = '';
58
-            }
59
-        }
60
-        $lines[] = $line;
61
-
62
-        return $lines;
63
-    }
64
-
65
-    /**
66
-     * Replace
67
-     *
68
-     * @staticvar array $chr_map
69
-     * @param string $string
70
-     * @return string
71
-     */
72
-    private static function cleanUnicode($string)
73
-    {
74
-        //https://stackoverflow.com/questions/20025030/convert-all-types-of-smart-quotes-with-php
75
-        static $character_map = array(
76
-            // Windows codepage 1252
77
-            "\xC2\x82" => "'", // U+0082⇒U+201A single low-9 quotation mark
78
-            "\xC2\x84" => '"', // U+0084⇒U+201E double low-9 quotation mark
79
-            "\xC2\x8B" => "'", // U+008B⇒U+2039 single left-pointing angle quotation mark
80
-            "\xC2\x91" => "'", // U+0091⇒U+2018 left single quotation mark
81
-            "\xC2\x92" => "'", // U+0092⇒U+2019 right single quotation mark
82
-            "\xC2\x93" => '"', // U+0093⇒U+201C left double quotation mark
83
-            "\xC2\x94" => '"', // U+0094⇒U+201D right double quotation mark
84
-            "\xC2\x9B" => "'", // U+009B⇒U+203A single right-pointing angle quotation mark
85
-            // Regular Unicode     // U+0022 quotation mark (")
86
-            // U+0027 apostrophe     (')
87
-            "\xC2\xAB" => '"', // U+00AB left-pointing double angle quotation mark
88
-            "\xC2\xBB" => '"', // U+00BB right-pointing double angle quotation mark
89
-            "\xE2\x80\x98" => "'", // U+2018 left single quotation mark
90
-            "\xE2\x80\x99" => "'", // U+2019 right single quotation mark
91
-            "\xE2\x80\x9A" => "'", // U+201A single low-9 quotation mark
92
-            "\xE2\x80\x9B" => "'", // U+201B single high-reversed-9 quotation mark
93
-            "\xE2\x80\x9C" => '"', // U+201C left double quotation mark
94
-            "\xE2\x80\x9D" => '"', // U+201D right double quotation mark
95
-            "\xE2\x80\x9E" => '"', // U+201E double low-9 quotation mark
96
-            "\xE2\x80\x9F" => '"', // U+201F double high-reversed-9 quotation mark
97
-            "\xE2\x80\xB9" => "'", // U+2039 single left-pointing angle quotation mark
98
-            "\xE2\x80\xBA" => "'", // U+203A single right-pointing angle quotation mark
99
-        );
100
-
101
-        $character = array_keys($character_map); // but: for efficiency you should
102
-        $replace = array_values($character_map); // pre-calculate these two arrays
103
-        return str_replace($character, $replace, html_entity_decode($string, ENT_QUOTES, "UTF-8"));
104
-    }
105
-
106
-    /**
107
-     * Splits an array of lines by (consecutive sequences of)
108
-     * terminals, keeping terminals.
109
-     *
110
-     * Multibyte.php safe (atleast for UTF-8)
111
-     *
112
-     * For example:
113
-     *    "There ... is. More!"
114
-     *        ... becomes ...
115
-     *    [ "There ", "...", " is", ".", " More", "!" ]
116
-     *
117
-     * @param string $line
118
-     * @return string[]
119
-     */
120
-    private function punctuationSplit($line)
121
-    {
122
-        $parts = array();
123
-
124
-        $chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY); // This is UTF8 multibyte safe!
125
-        $is_terminal = in_array($chars[0], $this->terminals);
126
-
127
-        $part = '';
128
-        foreach ($chars as $index => $char) {
129
-            if (in_array($char, $this->terminals) !== $is_terminal) {
130
-                $parts[] = $part;
131
-                $part = '';
132
-                $is_terminal = !$is_terminal;
133
-            }
134
-            $part .= $char;
135
-        }
136
-
137
-        if (!empty($part)) {
138
-            $parts[] = $part;
139
-        }
140
-
141
-        return $parts;
142
-    }
143
-
144
-    /**
145
-     * Appends each terminal item after it's preceding
146
-     * non-terminals.
147
-     *
148
-     * Multibyte.php safe (atleast for UTF-8)
149
-     *
150
-     * For example:
151
-     *    [ "There ", "...", " is", ".", " More", "!" ]
152
-     *        ... becomes ...
153
-     *    [ "There ... is.", "More!" ]
154
-     *
155
-     * @param string[] $punctuations
156
-     * @return string[]
157
-     */
158
-    private function punctuationMerge($punctuations)
159
-    {
160
-        $definite_terminals = array_diff($this->terminals, $this->abbreviators);
161
-
162
-        $merges = array();
163
-        $merge = '';
164
-
165
-        foreach ($punctuations as $punctuation) {
166
-            if ($punctuation !== '') {
167
-                $merge .= $punctuation;
168
-                if (mb_strlen($punctuation) === 1
169
-                    && in_array($punctuation, $this->terminals)) {
170
-                    $merges[] = $merge;
171
-                    $merge = '';
172
-                } else {
173
-                    foreach ($definite_terminals as $terminal) {
174
-                        if (mb_strpos($punctuation, $terminal) !== false) {
175
-                            $merges[] = $merge;
176
-                            $merge = '';
177
-                            break;
178
-                        }
179
-                    }
180
-                }
181
-            }
182
-        }
183
-        if (!empty($merge)) {
184
-            $merges[] = $merge;
185
-        }
186
-
187
-        return $merges;
188
-    }
189
-
190
-    /**
191
-     * Looks for capitalized abbreviations & includes them with the following fragment.
192
-     *
193
-     * For example:
194
-     *    [ "Last week, former director of the F.B.I. James B. Comey was fired. Mr. Comey was not available for comment." ]
195
-     *        ... becomes ...
196
-     *    [ "Last week, former director of the F.B.I. James B. Comey was fired." ]
197
-     *  [ "Mr. Comey was not available for comment." ]
198
-     *
199
-     * @param string[] $fragments
200
-     * @return string[]
201
-     */
202
-    private function abbreviationMerge($fragments)
203
-    {
204
-        $return_fragment = array();
205
-
206
-        $previous_string = '';
207
-        $previous_is_abbreviation = false;
208
-        $i = 0;
209
-
210
-        foreach ($fragments as $fragment) {
211
-            $current_string = $fragment;
212
-            $words = mb_split('\s+', Multibyte::trim($fragment));
213
-
214
-            $word_count = count($words);
215
-
216
-            // if last word of fragment starts with a Capital, ends in "." & has less than 3 characters, trigger "is abbreviation"
217
-            $last_word = trim($words[$word_count - 1]);
218
-            $last_is_capital = preg_match('#^\p{Lu}#u', $last_word);
219
-            $last_is_abbreviation = substr(trim($fragment), -1) === '.';
220
-            $is_abbreviation = $last_is_capital > 0
221
-                && $last_is_abbreviation > 0
222
-                && mb_strlen($last_word) <= 3;
223
-
224
-            // merge previous fragment with this
225
-            if ($previous_is_abbreviation === true) {
226
-                $current_string = $previous_string . $current_string;
227
-            }
228
-            $return_fragment[$i] = $current_string;
229
-
230
-            $previous_is_abbreviation = $is_abbreviation;
231
-            $previous_string = $current_string;
232
-            // only increment if this isn't an abbreviation
233
-            if ($is_abbreviation === false) {
234
-                $i++;
235
-            }
236
-        }
237
-        return $return_fragment;
238
-    }
239
-
240
-    /**
241
-     * Merges any part starting with a closing parenthesis ')' to the previous
242
-     * part.
243
-     *
244
-     * @param string[] $parts
245
-     * @return string[]
246
-     */
247
-    private function parenthesesMerge($parts)
248
-    {
249
-        $subsentences = array();
250
-
251
-        foreach ($parts as $part) {
252
-            if ($part[0] === ')') {
253
-                $subsentences[count($subsentences) - 1] .= $part;
254
-            } else {
255
-                $subsentences[] = $part;
256
-            }
257
-        }
258
-
259
-        return $subsentences;
260
-    }
261
-
262
-    /**
263
-     * Looks for closing quotes to include them with the previous statement.
264
-     * "That was very interesting," he said.
265
-     * "That was very interesting."
266
-     *
267
-     * @param string[] $statements
268
-     * @return string[]
269
-     */
270
-    private function closeQuotesMerge($statements)
271
-    {
272
-        $i = 0;
273
-        $previous_statement = "";
274
-        $return = array();
275
-        foreach ($statements as $statement) {
276
-            // detect end quote - if the entire string is a quotation mark, or it's [quote, space, lowercase]
277
-            if (trim($statement) === '"'
278
-                || trim($statement) === "'"
279
-                || (
280
-                    (substr($statement, 0, 1) === '"'
281
-                        || substr($statement, 0, 1) === "'")
282
-                    && substr($statement, 1, 1) === ' '
283
-                    && ctype_lower(substr($statement, 2, 1)) === true
284
-                )
285
-            ) {
286
-                $statement = $previous_statement . $statement;
287
-            } else {
288
-                $i++;
289
-            }
290
-
291
-            $return[$i] = $statement;
292
-            $previous_statement = $statement;
293
-        }
294
-
295
-        return $return;
296
-    }
297
-
298
-    /**
299
-     * Merges items into larger sentences.
300
-     * Multibyte.php safe
301
-     *
302
-     * @param string[] $shorts
303
-     * @return string[]
304
-     */
305
-    private function sentenceMerge($shorts)
306
-    {
307
-        $non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);
308
-
309
-        $sentences = array();
310
-
311
-        $sentence = '';
312
-        $has_words = false;
313
-        $previous_word_ending = null;
314
-        foreach ($shorts as $short) {
315
-            $word_count = count(mb_split('\s+', Multibyte::trim($short)));
316
-            $after_non_abbreviating_terminal = in_array($previous_word_ending, $non_abbreviating_terminals);
317
-
318
-            if ($after_non_abbreviating_terminal
319
-                || ($has_words && $word_count > 1)) {
320
-                $sentences[] = $sentence;
321
-                $sentence = '';
322
-                $has_words = $word_count > 1;
323
-            } else {
324
-                $has_words = ($has_words
325
-                    || $word_count > 1);
326
-            }
327
-
328
-            $sentence .= $short;
329
-            $previous_word_ending = mb_substr($short, -1);
330
-        }
331
-        if (!empty($sentence)) {
332
-            $sentences[] = $sentence;
333
-        }
334
-
335
-        return $sentences;
336
-    }
337
-
338
-    /**
339
-     * Return the sentences sentences detected in the provided text.
340
-     * Set the Sentence::SPLIT_TRIM flag to trim whitespace.
341
-     * @param string $text
342
-     * @param integer $flags
343
-     * @return string[]
344
-     */
345
-    public function split($text, $flags = 0)
346
-    {
347
-        $sentences = array();
348
-
349
-        // clean funny quotes
350
-        $text = self::cleanUnicode($text);
351
-
352
-        // Split
353
-        foreach (self::linebreakSplit($text) as $line) {
354
-            if (Multibyte::trim($line) !== '') {
355
-                $punctuations = $this->punctuationSplit($line);
356
-                $parentheses = $this->parenthesesMerge($punctuations); // also works after punctuationMerge or abbreviationMerge
357
-                $merges = $this->punctuationMerge($parentheses);
358
-                $shorts = $this->abbreviationMerge($merges);
359
-                $quotes = $this->closeQuotesMerge($shorts);
360
-                $sentences = array_merge($sentences, $this->sentenceMerge($quotes));
361
-            }
362
-        }
363
-
364
-        // Post process
365
-        if ($flags & self::SPLIT_TRIM) {
366
-            return self::trimSentences($sentences);
367
-        }
368
-
369
-        return $sentences;
370
-    }
371
-
372
-    /**
373
-     * Multibyte.php trim each string in an array.
374
-     * @param string[] $sentences
375
-     * @return string[]
376
-     */
377
-    private static function trimSentences($sentences)
378
-    {
379
-        $trimmed = array();
380
-        foreach ($sentences as $sentence) {
381
-            $trimmed[] = Multibyte::trim($sentence);
382
-        }
383
-        return $trimmed;
384
-    }
385
-
386
-    /**
387
-     * Return the number of sentences detected in the provided text.
388
-     * @param string $text
389
-     * @return integer
390
-     */
391
-    public function count($text)
392
-    {
393
-        return count($this->split($text));
394
-    }
20
+	/**
21
+	 * Specify this flag with the split method to trim whitespace.
22
+	 */
23
+	const SPLIT_TRIM = 0x1;
24
+
25
+	/**
26
+	 * List of characters used to terminate sentences.
27
+	 *
28
+	 * @var string[]
29
+	 */
30
+	private $terminals = array('.', '!', '?');
31
+
32
+	/**
33
+	 * List of characters used for abbreviations.
34
+	 *
35
+	 * @var string[]
36
+	 */
37
+	private $abbreviators = array('.');
38
+
39
+	/**
40
+	 * Breaks a piece of text into lines by linebreak.
41
+	 * Eats up any linebreak characters as if one.
42
+	 *
43
+	 * Multibyte.php safe
44
+	 *
45
+	 * @param string $text
46
+	 * @return string[]
47
+	 */
48
+	private static function linebreakSplit($text)
49
+	{
50
+		$lines = array();
51
+		$line = '';
52
+
53
+		foreach (Multibyte::split('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
54
+			$line .= $part;
55
+			if (Multibyte::trim($part) === '') {
56
+				$lines[] = $line;
57
+				$line = '';
58
+			}
59
+		}
60
+		$lines[] = $line;
61
+
62
+		return $lines;
63
+	}
64
+
65
+	/**
66
+	 * Replace
67
+	 *
68
+	 * @staticvar array $chr_map
69
+	 * @param string $string
70
+	 * @return string
71
+	 */
72
+	private static function cleanUnicode($string)
73
+	{
74
+		//https://stackoverflow.com/questions/20025030/convert-all-types-of-smart-quotes-with-php
75
+		static $character_map = array(
76
+			// Windows codepage 1252
77
+			"\xC2\x82" => "'", // U+0082⇒U+201A single low-9 quotation mark
78
+			"\xC2\x84" => '"', // U+0084⇒U+201E double low-9 quotation mark
79
+			"\xC2\x8B" => "'", // U+008B⇒U+2039 single left-pointing angle quotation mark
80
+			"\xC2\x91" => "'", // U+0091⇒U+2018 left single quotation mark
81
+			"\xC2\x92" => "'", // U+0092⇒U+2019 right single quotation mark
82
+			"\xC2\x93" => '"', // U+0093⇒U+201C left double quotation mark
83
+			"\xC2\x94" => '"', // U+0094⇒U+201D right double quotation mark
84
+			"\xC2\x9B" => "'", // U+009B⇒U+203A single right-pointing angle quotation mark
85
+			// Regular Unicode     // U+0022 quotation mark (")
86
+			// U+0027 apostrophe     (')
87
+			"\xC2\xAB" => '"', // U+00AB left-pointing double angle quotation mark
88
+			"\xC2\xBB" => '"', // U+00BB right-pointing double angle quotation mark
89
+			"\xE2\x80\x98" => "'", // U+2018 left single quotation mark
90
+			"\xE2\x80\x99" => "'", // U+2019 right single quotation mark
91
+			"\xE2\x80\x9A" => "'", // U+201A single low-9 quotation mark
92
+			"\xE2\x80\x9B" => "'", // U+201B single high-reversed-9 quotation mark
93
+			"\xE2\x80\x9C" => '"', // U+201C left double quotation mark
94
+			"\xE2\x80\x9D" => '"', // U+201D right double quotation mark
95
+			"\xE2\x80\x9E" => '"', // U+201E double low-9 quotation mark
96
+			"\xE2\x80\x9F" => '"', // U+201F double high-reversed-9 quotation mark
97
+			"\xE2\x80\xB9" => "'", // U+2039 single left-pointing angle quotation mark
98
+			"\xE2\x80\xBA" => "'", // U+203A single right-pointing angle quotation mark
99
+		);
100
+
101
+		$character = array_keys($character_map); // but: for efficiency you should
102
+		$replace = array_values($character_map); // pre-calculate these two arrays
103
+		return str_replace($character, $replace, html_entity_decode($string, ENT_QUOTES, "UTF-8"));
104
+	}
105
+
106
+	/**
107
+	 * Splits an array of lines by (consecutive sequences of)
108
+	 * terminals, keeping terminals.
109
+	 *
110
+	 * Multibyte.php safe (atleast for UTF-8)
111
+	 *
112
+	 * For example:
113
+	 *    "There ... is. More!"
114
+	 *        ... becomes ...
115
+	 *    [ "There ", "...", " is", ".", " More", "!" ]
116
+	 *
117
+	 * @param string $line
118
+	 * @return string[]
119
+	 */
120
+	private function punctuationSplit($line)
121
+	{
122
+		$parts = array();
123
+
124
+		$chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY); // This is UTF8 multibyte safe!
125
+		$is_terminal = in_array($chars[0], $this->terminals);
126
+
127
+		$part = '';
128
+		foreach ($chars as $index => $char) {
129
+			if (in_array($char, $this->terminals) !== $is_terminal) {
130
+				$parts[] = $part;
131
+				$part = '';
132
+				$is_terminal = !$is_terminal;
133
+			}
134
+			$part .= $char;
135
+		}
136
+
137
+		if (!empty($part)) {
138
+			$parts[] = $part;
139
+		}
140
+
141
+		return $parts;
142
+	}
143
+
144
+	/**
145
+	 * Appends each terminal item after it's preceding
146
+	 * non-terminals.
147
+	 *
148
+	 * Multibyte.php safe (atleast for UTF-8)
149
+	 *
150
+	 * For example:
151
+	 *    [ "There ", "...", " is", ".", " More", "!" ]
152
+	 *        ... becomes ...
153
+	 *    [ "There ... is.", "More!" ]
154
+	 *
155
+	 * @param string[] $punctuations
156
+	 * @return string[]
157
+	 */
158
+	private function punctuationMerge($punctuations)
159
+	{
160
+		$definite_terminals = array_diff($this->terminals, $this->abbreviators);
161
+
162
+		$merges = array();
163
+		$merge = '';
164
+
165
+		foreach ($punctuations as $punctuation) {
166
+			if ($punctuation !== '') {
167
+				$merge .= $punctuation;
168
+				if (mb_strlen($punctuation) === 1
169
+					&& in_array($punctuation, $this->terminals)) {
170
+					$merges[] = $merge;
171
+					$merge = '';
172
+				} else {
173
+					foreach ($definite_terminals as $terminal) {
174
+						if (mb_strpos($punctuation, $terminal) !== false) {
175
+							$merges[] = $merge;
176
+							$merge = '';
177
+							break;
178
+						}
179
+					}
180
+				}
181
+			}
182
+		}
183
+		if (!empty($merge)) {
184
+			$merges[] = $merge;
185
+		}
186
+
187
+		return $merges;
188
+	}
189
+
190
+	/**
191
+	 * Looks for capitalized abbreviations & includes them with the following fragment.
192
+	 *
193
+	 * For example:
194
+	 *    [ "Last week, former director of the F.B.I. James B. Comey was fired. Mr. Comey was not available for comment." ]
195
+	 *        ... becomes ...
196
+	 *    [ "Last week, former director of the F.B.I. James B. Comey was fired." ]
197
+	 *  [ "Mr. Comey was not available for comment." ]
198
+	 *
199
+	 * @param string[] $fragments
200
+	 * @return string[]
201
+	 */
202
+	private function abbreviationMerge($fragments)
203
+	{
204
+		$return_fragment = array();
205
+
206
+		$previous_string = '';
207
+		$previous_is_abbreviation = false;
208
+		$i = 0;
209
+
210
+		foreach ($fragments as $fragment) {
211
+			$current_string = $fragment;
212
+			$words = mb_split('\s+', Multibyte::trim($fragment));
213
+
214
+			$word_count = count($words);
215
+
216
+			// if last word of fragment starts with a Capital, ends in "." & has less than 3 characters, trigger "is abbreviation"
217
+			$last_word = trim($words[$word_count - 1]);
218
+			$last_is_capital = preg_match('#^\p{Lu}#u', $last_word);
219
+			$last_is_abbreviation = substr(trim($fragment), -1) === '.';
220
+			$is_abbreviation = $last_is_capital > 0
221
+				&& $last_is_abbreviation > 0
222
+				&& mb_strlen($last_word) <= 3;
223
+
224
+			// merge previous fragment with this
225
+			if ($previous_is_abbreviation === true) {
226
+				$current_string = $previous_string . $current_string;
227
+			}
228
+			$return_fragment[$i] = $current_string;
229
+
230
+			$previous_is_abbreviation = $is_abbreviation;
231
+			$previous_string = $current_string;
232
+			// only increment if this isn't an abbreviation
233
+			if ($is_abbreviation === false) {
234
+				$i++;
235
+			}
236
+		}
237
+		return $return_fragment;
238
+	}
239
+
240
+	/**
241
+	 * Merges any part starting with a closing parenthesis ')' to the previous
242
+	 * part.
243
+	 *
244
+	 * @param string[] $parts
245
+	 * @return string[]
246
+	 */
247
+	private function parenthesesMerge($parts)
248
+	{
249
+		$subsentences = array();
250
+
251
+		foreach ($parts as $part) {
252
+			if ($part[0] === ')') {
253
+				$subsentences[count($subsentences) - 1] .= $part;
254
+			} else {
255
+				$subsentences[] = $part;
256
+			}
257
+		}
258
+
259
+		return $subsentences;
260
+	}
261
+
262
+	/**
263
+	 * Looks for closing quotes to include them with the previous statement.
264
+	 * "That was very interesting," he said.
265
+	 * "That was very interesting."
266
+	 *
267
+	 * @param string[] $statements
268
+	 * @return string[]
269
+	 */
270
+	private function closeQuotesMerge($statements)
271
+	{
272
+		$i = 0;
273
+		$previous_statement = "";
274
+		$return = array();
275
+		foreach ($statements as $statement) {
276
+			// detect end quote - if the entire string is a quotation mark, or it's [quote, space, lowercase]
277
+			if (trim($statement) === '"'
278
+				|| trim($statement) === "'"
279
+				|| (
280
+					(substr($statement, 0, 1) === '"'
281
+						|| substr($statement, 0, 1) === "'")
282
+					&& substr($statement, 1, 1) === ' '
283
+					&& ctype_lower(substr($statement, 2, 1)) === true
284
+				)
285
+			) {
286
+				$statement = $previous_statement . $statement;
287
+			} else {
288
+				$i++;
289
+			}
290
+
291
+			$return[$i] = $statement;
292
+			$previous_statement = $statement;
293
+		}
294
+
295
+		return $return;
296
+	}
297
+
298
+	/**
299
+	 * Merges items into larger sentences.
300
+	 * Multibyte.php safe
301
+	 *
302
+	 * @param string[] $shorts
303
+	 * @return string[]
304
+	 */
305
+	private function sentenceMerge($shorts)
306
+	{
307
+		$non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);
308
+
309
+		$sentences = array();
310
+
311
+		$sentence = '';
312
+		$has_words = false;
313
+		$previous_word_ending = null;
314
+		foreach ($shorts as $short) {
315
+			$word_count = count(mb_split('\s+', Multibyte::trim($short)));
316
+			$after_non_abbreviating_terminal = in_array($previous_word_ending, $non_abbreviating_terminals);
317
+
318
+			if ($after_non_abbreviating_terminal
319
+				|| ($has_words && $word_count > 1)) {
320
+				$sentences[] = $sentence;
321
+				$sentence = '';
322
+				$has_words = $word_count > 1;
323
+			} else {
324
+				$has_words = ($has_words
325
+					|| $word_count > 1);
326
+			}
327
+
328
+			$sentence .= $short;
329
+			$previous_word_ending = mb_substr($short, -1);
330
+		}
331
+		if (!empty($sentence)) {
332
+			$sentences[] = $sentence;
333
+		}
334
+
335
+		return $sentences;
336
+	}
337
+
338
+	/**
339
+	 * Return the sentences sentences detected in the provided text.
340
+	 * Set the Sentence::SPLIT_TRIM flag to trim whitespace.
341
+	 * @param string $text
342
+	 * @param integer $flags
343
+	 * @return string[]
344
+	 */
345
+	public function split($text, $flags = 0)
346
+	{
347
+		$sentences = array();
348
+
349
+		// clean funny quotes
350
+		$text = self::cleanUnicode($text);
351
+
352
+		// Split
353
+		foreach (self::linebreakSplit($text) as $line) {
354
+			if (Multibyte::trim($line) !== '') {
355
+				$punctuations = $this->punctuationSplit($line);
356
+				$parentheses = $this->parenthesesMerge($punctuations); // also works after punctuationMerge or abbreviationMerge
357
+				$merges = $this->punctuationMerge($parentheses);
358
+				$shorts = $this->abbreviationMerge($merges);
359
+				$quotes = $this->closeQuotesMerge($shorts);
360
+				$sentences = array_merge($sentences, $this->sentenceMerge($quotes));
361
+			}
362
+		}
363
+
364
+		// Post process
365
+		if ($flags & self::SPLIT_TRIM) {
366
+			return self::trimSentences($sentences);
367
+		}
368
+
369
+		return $sentences;
370
+	}
371
+
372
+	/**
373
+	 * Multibyte.php trim each string in an array.
374
+	 * @param string[] $sentences
375
+	 * @return string[]
376
+	 */
377
+	private static function trimSentences($sentences)
378
+	{
379
+		$trimmed = array();
380
+		foreach ($sentences as $sentence) {
381
+			$trimmed[] = Multibyte::trim($sentence);
382
+		}
383
+		return $trimmed;
384
+	}
385
+
386
+	/**
387
+	 * Return the number of sentences detected in the provided text.
388
+	 * @param string $text
389
+	 * @return integer
390
+	 */
391
+	public function count($text)
392
+	{
393
+		return count($this->split($text));
394
+	}
395 395
 
396 396
 }
Please login to merge, or discard this patch.