Passed
Push — master ( 9ee15e...e4f543 )
by Martijn
01:34
created
classes/Sentence.php 1 patch
Indentation   +463 added lines, -463 removed lines patch added patch discarded remove patch
@@ -15,468 +15,468 @@
 block discarded – undo
15 15
 class Sentence
16 16
 {
17 17
 
18
-    /**
19
-     * Specify this flag with the split method to trim whitespace.
20
-     */
21
-    const SPLIT_TRIM = 0x1;
22
-
23
-    /**
24
-     * List of characters used to terminate sentences.
25
-     *
26
-     * @var string[]
27
-     */
28
-    private $terminals = array('.', '!', '?');
29
-
30
-    /**
31
-     * List of characters used for abbreviations.
32
-     *
33
-     * @var string[]
34
-     */
35
-    private $abbreviators = array('.');
36
-
37
-    /**
38
-     * Multibyte safe version of standard trim() function.
39
-     *
40
-     * @param string $string
41
-     * @return string
42
-     */
43
-    private static function mbTrim($string)
44
-    {
45
-        return mb_ereg_replace('^\s*([\s\S]*?)\s*$', '\1', $string);
46
-    }
47
-
48
-    /**
49
-     * A cross between mb_split and preg_split, adding the preg_split flags
50
-     * to mb_split.
51
-     *
52
-     * @param string $pattern
53
-     * @param string $string
54
-     * @param int $limit
55
-     * @param int $flags
56
-     * @return array
57
-     */
58
-    private static function mbSplit($pattern, $string, $limit = -1, $flags = 0)
59
-    {
60
-        $split_no_empty = (bool)($flags & PREG_SPLIT_NO_EMPTY);
61
-        $offset_capture = (bool)($flags & PREG_SPLIT_OFFSET_CAPTURE);
62
-        $delim_capture = (bool)($flags & PREG_SPLIT_DELIM_CAPTURE);
63
-
64
-        $strlen = strlen($string); // bytes!
65
-        mb_ereg_search_init($string);
66
-
67
-        $lengths = array();
68
-        $position = 0;
69
-        while (($array = mb_ereg_search_pos($pattern, '')) !== false) {
70
-            // capture split
71
-            $lengths[] = array($array[0] - $position, false, null);
72
-
73
-            // move position
74
-            $position = $array[0] + $array[1];
75
-
76
-            // capture delimiter
77
-            $regs = mb_ereg_search_getregs();
78
-            $lengths[] = array($array[1], true, isset($regs[1]) && $regs[1]);
79
-
80
-            // Continue on?
81
-            if ($position >= $strlen) {
82
-                break;
83
-            }
84
-        }
85
-
86
-        // Add last bit, if not ending with split
87
-        $lengths[] = array($strlen - $position, false, null);
88
-
89
-        // Substrings
90
-        $parts = array();
91
-        $position = 0;
92
-        $count = 1;
93
-        foreach ($lengths as $length) {
94
-            $split_empty = $length[0] || !$split_no_empty;
95
-            $is_delimiter = $length[1];
96
-            $is_captured = $length[2];
97
-
98
-            if ($limit > 0
99
-                && !$is_delimiter
100
-                && $split_empty
101
-                && ++$count > $limit) {
102
-                if ($length[0] > 0
103
-                    || $split_empty) {
104
-                    $parts[] = $offset_capture
105
-                        ? array(mb_strcut($string, $position), $position)
106
-                        : mb_strcut($string, $position);
107
-                }
108
-                break;
109
-            } elseif ((!$is_delimiter
110
-                    || ($delim_capture
111
-                        && $is_captured))
112
-                && ($length[0]
113
-                    || $split_empty)) {
114
-                $parts[] = $offset_capture
115
-                    ? array(mb_strcut($string, $position, $length[0]), $position)
116
-                    : mb_strcut($string, $position, $length[0]);
117
-            }
118
-
119
-            $position += $length[0];
120
-        }
121
-
122
-        return $parts;
123
-    }
124
-
125
-    /**
126
-     * Breaks a piece of text into lines by linebreak.
127
-     * Eats up any linebreak characters as if one.
128
-     *
129
-     * Multibyte safe
130
-     *
131
-     * @param string $text
132
-     * @return string[]
133
-     */
134
-    private static function linebreakSplit($text)
135
-    {
136
-        $lines = array();
137
-        $line = '';
138
-
139
-        foreach (self::mbSplit('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
140
-            $line .= $part;
141
-            if (self::mbTrim($part) === '') {
142
-                $lines[] = $line;
143
-                $line = '';
144
-            }
145
-        }
146
-        $lines[] = $line;
147
-
148
-        return $lines;
149
-    }
150
-
151
-    /**
152
-     * Replace
153
-     *
154
-     * @staticvar array $chr_map
155
-     * @param string $string
156
-     * @return string
157
-     */
158
-    private static function cleanUnicode($string)
159
-    {
160
-        //https://stackoverflow.com/questions/20025030/convert-all-types-of-smart-quotes-with-php
161
-        static $character_map = array(
162
-            // Windows codepage 1252
163
-            "\xC2\x82" => "'", // U+0082⇒U+201A single low-9 quotation mark
164
-            "\xC2\x84" => '"', // U+0084⇒U+201E double low-9 quotation mark
165
-            "\xC2\x8B" => "'", // U+008B⇒U+2039 single left-pointing angle quotation mark
166
-            "\xC2\x91" => "'", // U+0091⇒U+2018 left single quotation mark
167
-            "\xC2\x92" => "'", // U+0092⇒U+2019 right single quotation mark
168
-            "\xC2\x93" => '"', // U+0093⇒U+201C left double quotation mark
169
-            "\xC2\x94" => '"', // U+0094⇒U+201D right double quotation mark
170
-            "\xC2\x9B" => "'", // U+009B⇒U+203A single right-pointing angle quotation mark
171
-            // Regular Unicode     // U+0022 quotation mark (")
172
-            // U+0027 apostrophe     (')
173
-            "\xC2\xAB" => '"', // U+00AB left-pointing double angle quotation mark
174
-            "\xC2\xBB" => '"', // U+00BB right-pointing double angle quotation mark
175
-            "\xE2\x80\x98" => "'", // U+2018 left single quotation mark
176
-            "\xE2\x80\x99" => "'", // U+2019 right single quotation mark
177
-            "\xE2\x80\x9A" => "'", // U+201A single low-9 quotation mark
178
-            "\xE2\x80\x9B" => "'", // U+201B single high-reversed-9 quotation mark
179
-            "\xE2\x80\x9C" => '"', // U+201C left double quotation mark
180
-            "\xE2\x80\x9D" => '"', // U+201D right double quotation mark
181
-            "\xE2\x80\x9E" => '"', // U+201E double low-9 quotation mark
182
-            "\xE2\x80\x9F" => '"', // U+201F double high-reversed-9 quotation mark
183
-            "\xE2\x80\xB9" => "'", // U+2039 single left-pointing angle quotation mark
184
-            "\xE2\x80\xBA" => "'", // U+203A single right-pointing angle quotation mark
185
-        );
186
-
187
-        $character = array_keys($character_map); // but: for efficiency you should
188
-        $replace = array_values($character_map); // pre-calculate these two arrays
189
-        return str_replace($character, $replace, html_entity_decode($string, ENT_QUOTES, "UTF-8"));
190
-    }
191
-
192
-    /**
193
-     * Splits an array of lines by (consecutive sequences of)
194
-     * terminals, keeping terminals.
195
-     *
196
-     * Multibyte safe (atleast for UTF-8)
197
-     *
198
-     * For example:
199
-     *    "There ... is. More!"
200
-     *        ... becomes ...
201
-     *    [ "There ", "...", " is", ".", " More", "!" ]
202
-     *
203
-     * @param string $line
204
-     * @return string[]
205
-     */
206
-    private function punctuationSplit($line)
207
-    {
208
-        $parts = array();
209
-
210
-        $chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY); // This is UTF8 multibyte safe!
211
-        $is_terminal = in_array($chars[0], $this->terminals);
212
-
213
-        $part = '';
214
-        foreach ($chars as $index => $char) {
215
-            if (in_array($char, $this->terminals) !== $is_terminal) {
216
-                $parts[] = $part;
217
-                $part = '';
218
-                $is_terminal = !$is_terminal;
219
-            }
220
-            $part .= $char;
221
-        }
222
-
223
-        if (!empty($part)) {
224
-            $parts[] = $part;
225
-        }
226
-
227
-        return $parts;
228
-    }
229
-
230
-    /**
231
-     * Appends each terminal item after it's preceding
232
-     * non-terminals.
233
-     *
234
-     * Multibyte safe (atleast for UTF-8)
235
-     *
236
-     * For example:
237
-     *    [ "There ", "...", " is", ".", " More", "!" ]
238
-     *        ... becomes ...
239
-     *    [ "There ... is.", "More!" ]
240
-     *
241
-     * @param string[] $punctuations
242
-     * @return string[]
243
-     */
244
-    private function punctuationMerge($punctuations)
245
-    {
246
-        $definite_terminals = array_diff($this->terminals, $this->abbreviators);
247
-
248
-        $merges = array();
249
-        $merge = '';
250
-
251
-        foreach ($punctuations as $punctuation) {
252
-            if ($punctuation !== '') {
253
-                $merge .= $punctuation;
254
-                if (mb_strlen($punctuation) === 1
255
-                    && in_array($punctuation, $this->terminals)) {
256
-                    $merges[] = $merge;
257
-                    $merge = '';
258
-                } else {
259
-                    foreach ($definite_terminals as $terminal) {
260
-                        if (mb_strpos($punctuation, $terminal) !== false) {
261
-                            $merges[] = $merge;
262
-                            $merge = '';
263
-                            break;
264
-                        }
265
-                    }
266
-                }
267
-            }
268
-        }
269
-        if (!empty($merge)) {
270
-            $merges[] = $merge;
271
-        }
272
-
273
-        return $merges;
274
-    }
275
-
276
-    /**
277
-     * Looks for capitalized abbreviations & includes them with the following fragment.
278
-     *
279
-     * For example:
280
-     *    [ "Last week, former director of the F.B.I. James B. Comey was fired. Mr. Comey was not available for comment." ]
281
-     *        ... becomes ...
282
-     *    [ "Last week, former director of the F.B.I. James B. Comey was fired." ]
283
-     *  [ "Mr. Comey was not available for comment." ]
284
-     *
285
-     * @param string[] $fragments
286
-     * @return string[]
287
-     */
288
-    private function abbreviationMerge($fragments)
289
-    {
290
-        $return_fragment = array();
291
-
292
-        $previous_string = '';
293
-        $previous_is_abbreviation = false;
294
-        $i = 0;
295
-
296
-        foreach ($fragments as $fragment) {
297
-            $current_string = $fragment;
298
-            $words = mb_split('\s+', self::mbTrim($fragment));
299
-
300
-            $word_count = count($words);
301
-
302
-            // if last word of fragment starts with a Capital, ends in "." & has less than 3 characters, trigger "is abbreviation"
303
-            $last_word = trim($words[$word_count - 1]);
304
-            $last_is_capital = preg_match('#^\p{Lu}#u', $last_word);
305
-            $last_is_abbreviation = substr(trim($fragment), -1) === '.';
306
-            $is_abbreviation = $last_is_capital > 0
307
-                && $last_is_abbreviation > 0
308
-                && mb_strlen($last_word) <= 3;
309
-
310
-            // merge previous fragment with this
311
-            if ($previous_is_abbreviation === true) {
312
-                $current_string = $previous_string . $current_string;
313
-            }
314
-            $return_fragment[$i] = $current_string;
315
-
316
-            $previous_is_abbreviation = $is_abbreviation;
317
-            $previous_string = $current_string;
318
-            // only increment if this isn't an abbreviation
319
-            if ($is_abbreviation === false) {
320
-                $i++;
321
-            }
322
-        }
323
-        return $return_fragment;
324
-    }
325
-
326
-    /**
327
-     * Merges any part starting with a closing parenthesis ')' to the previous
328
-     * part.
329
-     *
330
-     * @param string[] $parts
331
-     * @return string[]
332
-     */
333
-    private function parenthesesMerge($parts)
334
-    {
335
-        $subsentences = array();
336
-
337
-        foreach ($parts as $part) {
338
-            if ($part[0] === ')') {
339
-                $subsentences[count($subsentences) - 1] .= $part;
340
-            } else {
341
-                $subsentences[] = $part;
342
-            }
343
-        }
344
-
345
-        return $subsentences;
346
-    }
347
-
348
-    /**
349
-     * Looks for closing quotes to include them with the previous statement.
350
-     * "That was very interesting," he said.
351
-     * "That was very interesting."
352
-     *
353
-     * @param string[] $statements
354
-     * @return string[]
355
-     */
356
-    private function closeQuotesMerge($statements)
357
-    {
358
-        $i = 0;
359
-        $previous_statement = "";
360
-        $return = array();
361
-        foreach ($statements as $statement) {
362
-            // detect end quote - if the entire string is a quotation mark, or it's [quote, space, lowercase]
363
-            if (trim($statement) === '"'
364
-                || trim($statement) === "'"
365
-                || (
366
-                    (substr($statement, 0, 1) === '"'
367
-                        || substr($statement, 0, 1) === "'")
368
-                    && substr($statement, 1, 1) === ' '
369
-                    && ctype_lower(substr($statement, 2, 1)) === true
370
-                )
371
-            ) {
372
-                $statement = $previous_statement . $statement;
373
-            } else {
374
-                $i++;
375
-            }
376
-
377
-            $return[$i] = $statement;
378
-            $previous_statement = $statement;
379
-        }
380
-
381
-        return $return;
382
-    }
383
-
384
-    /**
385
-     * Merges items into larger sentences.
386
-     * Multibyte safe
387
-     *
388
-     * @param string[] $shorts
389
-     * @return string[]
390
-     */
391
-    private function sentenceMerge($shorts)
392
-    {
393
-        $non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);
394
-
395
-        $sentences = array();
396
-
397
-        $sentence = '';
398
-        $has_words = false;
399
-        $previous_word_ending = null;
400
-        foreach ($shorts as $short) {
401
-            $word_count = count(mb_split('\s+', self::mbTrim($short)));
402
-            $after_non_abbreviating_terminal = in_array($previous_word_ending, $non_abbreviating_terminals);
403
-
404
-            if ($after_non_abbreviating_terminal
405
-                || ($has_words && $word_count > 1)) {
406
-                $sentences[] = $sentence;
407
-                $sentence = '';
408
-                $has_words = $word_count > 1;
409
-            } else {
410
-                $has_words = ($has_words
411
-                    || $word_count > 1);
412
-            }
413
-
414
-            $sentence .= $short;
415
-            $previous_word_ending = mb_substr($short, -1);
416
-        }
417
-        if (!empty($sentence)) {
418
-            $sentences[] = $sentence;
419
-        }
420
-
421
-        return $sentences;
422
-    }
423
-
424
-    /**
425
-     * Return the sentences sentences detected in the provided text.
426
-     * Set the Sentence::SPLIT_TRIM flag to trim whitespace.
427
-     * @param string $text
428
-     * @param integer $flags
429
-     * @return string[]
430
-     */
431
-    public function split($text, $flags = 0)
432
-    {
433
-        $sentences = array();
434
-
435
-        // clean funny quotes
436
-        $text = self::cleanUnicode($text);
437
-
438
-        // Split
439
-        foreach (self::linebreakSplit($text) as $line) {
440
-            if (self::mbTrim($line) !== '') {
441
-                $punctuations = $this->punctuationSplit($line);
442
-                $parentheses = $this->parenthesesMerge($punctuations); // also works after punctuationMerge or abbreviationMerge
443
-                $merges = $this->punctuationMerge($parentheses);
444
-                $shorts = $this->abbreviationMerge($merges);
445
-                $quotes = $this->closeQuotesMerge($shorts);
446
-                $sentences = array_merge($sentences, $this->sentenceMerge($quotes));
447
-            }
448
-        }
449
-
450
-        // Post process
451
-        if ($flags & self::SPLIT_TRIM) {
452
-            return self::trimSentences($sentences);
453
-        }
454
-
455
-        return $sentences;
456
-    }
457
-
458
-    /**
459
-     * Multibyte trim each string in an array.
460
-     * @param string[] $sentences
461
-     * @return string[]
462
-     */
463
-    private static function trimSentences($sentences)
464
-    {
465
-        $trimmed = array();
466
-        foreach ($sentences as $sentence) {
467
-            $trimmed[] = self::mbTrim($sentence);
468
-        }
469
-        return $trimmed;
470
-    }
471
-
472
-    /**
473
-     * Return the number of sentences detected in the provided text.
474
-     * @param string $text
475
-     * @return integer
476
-     */
477
-    public function count($text)
478
-    {
479
-        return count($this->split($text));
480
-    }
18
+	/**
19
+	 * Specify this flag with the split method to trim whitespace.
20
+	 */
21
+	const SPLIT_TRIM = 0x1;
22
+
23
+	/**
24
+	 * List of characters used to terminate sentences.
25
+	 *
26
+	 * @var string[]
27
+	 */
28
+	private $terminals = array('.', '!', '?');
29
+
30
+	/**
31
+	 * List of characters used for abbreviations.
32
+	 *
33
+	 * @var string[]
34
+	 */
35
+	private $abbreviators = array('.');
36
+
37
+	/**
38
+	 * Multibyte safe version of standard trim() function.
39
+	 *
40
+	 * @param string $string
41
+	 * @return string
42
+	 */
43
+	private static function mbTrim($string)
44
+	{
45
+		return mb_ereg_replace('^\s*([\s\S]*?)\s*$', '\1', $string);
46
+	}
47
+
48
+	/**
49
+	 * A cross between mb_split and preg_split, adding the preg_split flags
50
+	 * to mb_split.
51
+	 *
52
+	 * @param string $pattern
53
+	 * @param string $string
54
+	 * @param int $limit
55
+	 * @param int $flags
56
+	 * @return array
57
+	 */
58
+	private static function mbSplit($pattern, $string, $limit = -1, $flags = 0)
59
+	{
60
+		$split_no_empty = (bool)($flags & PREG_SPLIT_NO_EMPTY);
61
+		$offset_capture = (bool)($flags & PREG_SPLIT_OFFSET_CAPTURE);
62
+		$delim_capture = (bool)($flags & PREG_SPLIT_DELIM_CAPTURE);
63
+
64
+		$strlen = strlen($string); // bytes!
65
+		mb_ereg_search_init($string);
66
+
67
+		$lengths = array();
68
+		$position = 0;
69
+		while (($array = mb_ereg_search_pos($pattern, '')) !== false) {
70
+			// capture split
71
+			$lengths[] = array($array[0] - $position, false, null);
72
+
73
+			// move position
74
+			$position = $array[0] + $array[1];
75
+
76
+			// capture delimiter
77
+			$regs = mb_ereg_search_getregs();
78
+			$lengths[] = array($array[1], true, isset($regs[1]) && $regs[1]);
79
+
80
+			// Continue on?
81
+			if ($position >= $strlen) {
82
+				break;
83
+			}
84
+		}
85
+
86
+		// Add last bit, if not ending with split
87
+		$lengths[] = array($strlen - $position, false, null);
88
+
89
+		// Substrings
90
+		$parts = array();
91
+		$position = 0;
92
+		$count = 1;
93
+		foreach ($lengths as $length) {
94
+			$split_empty = $length[0] || !$split_no_empty;
95
+			$is_delimiter = $length[1];
96
+			$is_captured = $length[2];
97
+
98
+			if ($limit > 0
99
+				&& !$is_delimiter
100
+				&& $split_empty
101
+				&& ++$count > $limit) {
102
+				if ($length[0] > 0
103
+					|| $split_empty) {
104
+					$parts[] = $offset_capture
105
+						? array(mb_strcut($string, $position), $position)
106
+						: mb_strcut($string, $position);
107
+				}
108
+				break;
109
+			} elseif ((!$is_delimiter
110
+					|| ($delim_capture
111
+						&& $is_captured))
112
+				&& ($length[0]
113
+					|| $split_empty)) {
114
+				$parts[] = $offset_capture
115
+					? array(mb_strcut($string, $position, $length[0]), $position)
116
+					: mb_strcut($string, $position, $length[0]);
117
+			}
118
+
119
+			$position += $length[0];
120
+		}
121
+
122
+		return $parts;
123
+	}
124
+
125
+	/**
126
+	 * Breaks a piece of text into lines by linebreak.
127
+	 * Eats up any linebreak characters as if one.
128
+	 *
129
+	 * Multibyte safe
130
+	 *
131
+	 * @param string $text
132
+	 * @return string[]
133
+	 */
134
+	private static function linebreakSplit($text)
135
+	{
136
+		$lines = array();
137
+		$line = '';
138
+
139
+		foreach (self::mbSplit('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
140
+			$line .= $part;
141
+			if (self::mbTrim($part) === '') {
142
+				$lines[] = $line;
143
+				$line = '';
144
+			}
145
+		}
146
+		$lines[] = $line;
147
+
148
+		return $lines;
149
+	}
150
+
151
+	/**
152
+	 * Decode HTML entities and replace typographic (smart) quote characters with plain ASCII quotes.
153
+	 *
154
+	 * @staticvar array $character_map
155
+	 * @param string $string
156
+	 * @return string
157
+	 */
158
+	private static function cleanUnicode($string)
159
+	{
160
+		//https://stackoverflow.com/questions/20025030/convert-all-types-of-smart-quotes-with-php
161
+		static $character_map = array(
162
+			// Windows codepage 1252
163
+			"\xC2\x82" => "'", // U+0082⇒U+201A single low-9 quotation mark
164
+			"\xC2\x84" => '"', // U+0084⇒U+201E double low-9 quotation mark
165
+			"\xC2\x8B" => "'", // U+008B⇒U+2039 single left-pointing angle quotation mark
166
+			"\xC2\x91" => "'", // U+0091⇒U+2018 left single quotation mark
167
+			"\xC2\x92" => "'", // U+0092⇒U+2019 right single quotation mark
168
+			"\xC2\x93" => '"', // U+0093⇒U+201C left double quotation mark
169
+			"\xC2\x94" => '"', // U+0094⇒U+201D right double quotation mark
170
+			"\xC2\x9B" => "'", // U+009B⇒U+203A single right-pointing angle quotation mark
171
+			// Regular Unicode     // U+0022 quotation mark (")
172
+			// U+0027 apostrophe     (')
173
+			"\xC2\xAB" => '"', // U+00AB left-pointing double angle quotation mark
174
+			"\xC2\xBB" => '"', // U+00BB right-pointing double angle quotation mark
175
+			"\xE2\x80\x98" => "'", // U+2018 left single quotation mark
176
+			"\xE2\x80\x99" => "'", // U+2019 right single quotation mark
177
+			"\xE2\x80\x9A" => "'", // U+201A single low-9 quotation mark
178
+			"\xE2\x80\x9B" => "'", // U+201B single high-reversed-9 quotation mark
179
+			"\xE2\x80\x9C" => '"', // U+201C left double quotation mark
180
+			"\xE2\x80\x9D" => '"', // U+201D right double quotation mark
181
+			"\xE2\x80\x9E" => '"', // U+201E double low-9 quotation mark
182
+			"\xE2\x80\x9F" => '"', // U+201F double high-reversed-9 quotation mark
183
+			"\xE2\x80\xB9" => "'", // U+2039 single left-pointing angle quotation mark
184
+			"\xE2\x80\xBA" => "'", // U+203A single right-pointing angle quotation mark
185
+		);
186
+
187
+		$character = array_keys($character_map); // but: for efficiency you should
188
+		$replace = array_values($character_map); // pre-calculate these two arrays
189
+		return str_replace($character, $replace, html_entity_decode($string, ENT_QUOTES, "UTF-8"));
190
+	}
191
+
192
+	/**
193
+	 * Splits a line by (consecutive sequences of)
194
+	 * terminals, keeping terminals.
195
+	 *
196
+	 * Multibyte safe (at least for UTF-8)
197
+	 *
198
+	 * For example:
199
+	 *    "There ... is. More!"
200
+	 *        ... becomes ...
201
+	 *    [ "There ", "...", " is", ".", " More", "!" ]
202
+	 *
203
+	 * @param string $line
204
+	 * @return string[]
205
+	 */
206
+	private function punctuationSplit($line)
207
+	{
208
+		$parts = array();
209
+
210
+		$chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY); // This is UTF8 multibyte safe!
211
+		$is_terminal = in_array($chars[0], $this->terminals);
212
+
213
+		$part = '';
214
+		foreach ($chars as $index => $char) {
215
+			if (in_array($char, $this->terminals) !== $is_terminal) {
216
+				$parts[] = $part;
217
+				$part = '';
218
+				$is_terminal = !$is_terminal;
219
+			}
220
+			$part .= $char;
221
+		}
222
+
223
+		if (!empty($part)) {
224
+			$parts[] = $part;
225
+		}
226
+
227
+		return $parts;
228
+	}
229
+
230
+	/**
231
+	 * Appends each terminal item after its preceding
232
+	 * non-terminals.
233
+	 *
234
+	 * Multibyte safe (at least for UTF-8)
235
+	 *
236
+	 * For example:
237
+	 *    [ "There ", "...", " is", ".", " More", "!" ]
238
+	 *        ... becomes ...
239
+	 *    [ "There ... is.", "More!" ]
240
+	 *
241
+	 * @param string[] $punctuations
242
+	 * @return string[]
243
+	 */
244
+	private function punctuationMerge($punctuations)
245
+	{
246
+		$definite_terminals = array_diff($this->terminals, $this->abbreviators);
247
+
248
+		$merges = array();
249
+		$merge = '';
250
+
251
+		foreach ($punctuations as $punctuation) {
252
+			if ($punctuation !== '') {
253
+				$merge .= $punctuation;
254
+				if (mb_strlen($punctuation) === 1
255
+					&& in_array($punctuation, $this->terminals)) {
256
+					$merges[] = $merge;
257
+					$merge = '';
258
+				} else {
259
+					foreach ($definite_terminals as $terminal) {
260
+						if (mb_strpos($punctuation, $terminal) !== false) {
261
+							$merges[] = $merge;
262
+							$merge = '';
263
+							break;
264
+						}
265
+					}
266
+				}
267
+			}
268
+		}
269
+		if (!empty($merge)) {
270
+			$merges[] = $merge;
271
+		}
272
+
273
+		return $merges;
274
+	}
275
+
276
+	/**
277
+	 * Looks for capitalized abbreviations & includes them with the following fragment.
278
+	 *
279
+	 * For example:
280
+	 *    [ "Last week, former director of the F.B.I. James B. Comey was fired. Mr. Comey was not available for comment." ]
281
+	 *        ... becomes ...
282
+	 *    [ "Last week, former director of the F.B.I. James B. Comey was fired." ]
283
+	 *  [ "Mr. Comey was not available for comment." ]
284
+	 *
285
+	 * @param string[] $fragments
286
+	 * @return string[]
287
+	 */
288
+	private function abbreviationMerge($fragments)
289
+	{
290
+		$return_fragment = array();
291
+
292
+		$previous_string = '';
293
+		$previous_is_abbreviation = false;
294
+		$i = 0;
295
+
296
+		foreach ($fragments as $fragment) {
297
+			$current_string = $fragment;
298
+			$words = mb_split('\s+', self::mbTrim($fragment));
299
+
300
+			$word_count = count($words);
301
+
302
+			// if last word of fragment starts with a capital letter, the fragment ends in "." & the word has at most 3 characters, trigger "is abbreviation"
303
+			$last_word = trim($words[$word_count - 1]);
304
+			$last_is_capital = preg_match('#^\p{Lu}#u', $last_word);
305
+			$last_is_abbreviation = substr(trim($fragment), -1) === '.';
306
+			$is_abbreviation = $last_is_capital > 0
307
+				&& $last_is_abbreviation > 0
308
+				&& mb_strlen($last_word) <= 3;
309
+
310
+			// merge previous fragment with this
311
+			if ($previous_is_abbreviation === true) {
312
+				$current_string = $previous_string . $current_string;
313
+			}
314
+			$return_fragment[$i] = $current_string;
315
+
316
+			$previous_is_abbreviation = $is_abbreviation;
317
+			$previous_string = $current_string;
318
+			// only increment if this isn't an abbreviation
319
+			if ($is_abbreviation === false) {
320
+				$i++;
321
+			}
322
+		}
323
+		return $return_fragment;
324
+	}
325
+
326
+	/**
327
+	 * Merges any part starting with a closing parenthesis ')' to the previous
328
+	 * part.
329
+	 *
330
+	 * @param string[] $parts
331
+	 * @return string[]
332
+	 */
333
+	private function parenthesesMerge($parts)
334
+	{
335
+		$subsentences = array();
336
+
337
+		foreach ($parts as $part) {
338
+			if ($part[0] === ')') {
339
+				$subsentences[count($subsentences) - 1] .= $part;
340
+			} else {
341
+				$subsentences[] = $part;
342
+			}
343
+		}
344
+
345
+		return $subsentences;
346
+	}
347
+
348
+	/**
349
+	 * Looks for closing quotes to include them with the previous statement.
350
+	 * "That was very interesting," he said.
351
+	 * "That was very interesting."
352
+	 *
353
+	 * @param string[] $statements
354
+	 * @return string[]
355
+	 */
356
+	private function closeQuotesMerge($statements)
357
+	{
358
+		$i = 0;
359
+		$previous_statement = "";
360
+		$return = array();
361
+		foreach ($statements as $statement) {
362
+			// detect end quote - if the entire string is a quotation mark, or it's [quote, space, lowercase]
363
+			if (trim($statement) === '"'
364
+				|| trim($statement) === "'"
365
+				|| (
366
+					(substr($statement, 0, 1) === '"'
367
+						|| substr($statement, 0, 1) === "'")
368
+					&& substr($statement, 1, 1) === ' '
369
+					&& ctype_lower(substr($statement, 2, 1)) === true
370
+				)
371
+			) {
372
+				$statement = $previous_statement . $statement;
373
+			} else {
374
+				$i++;
375
+			}
376
+
377
+			$return[$i] = $statement;
378
+			$previous_statement = $statement;
379
+		}
380
+
381
+		return $return;
382
+	}
383
+
384
+	/**
385
+	 * Merges items into larger sentences.
386
+	 * Multibyte safe
387
+	 *
388
+	 * @param string[] $shorts
389
+	 * @return string[]
390
+	 */
391
+	private function sentenceMerge($shorts)
392
+	{
393
+		$non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);
394
+
395
+		$sentences = array();
396
+
397
+		$sentence = '';
398
+		$has_words = false;
399
+		$previous_word_ending = null;
400
+		foreach ($shorts as $short) {
401
+			$word_count = count(mb_split('\s+', self::mbTrim($short)));
402
+			$after_non_abbreviating_terminal = in_array($previous_word_ending, $non_abbreviating_terminals);
403
+
404
+			if ($after_non_abbreviating_terminal
405
+				|| ($has_words && $word_count > 1)) {
406
+				$sentences[] = $sentence;
407
+				$sentence = '';
408
+				$has_words = $word_count > 1;
409
+			} else {
410
+				$has_words = ($has_words
411
+					|| $word_count > 1);
412
+			}
413
+
414
+			$sentence .= $short;
415
+			$previous_word_ending = mb_substr($short, -1);
416
+		}
417
+		if (!empty($sentence)) {
418
+			$sentences[] = $sentence;
419
+		}
420
+
421
+		return $sentences;
422
+	}
423
+
424
+	/**
425
+	 * Return the sentences detected in the provided text.
426
+	 * Set the Sentence::SPLIT_TRIM flag to trim whitespace.
427
+	 * @param string $text
428
+	 * @param integer $flags
429
+	 * @return string[]
430
+	 */
431
+	public function split($text, $flags = 0)
432
+	{
433
+		$sentences = array();
434
+
435
+		// clean funny quotes
436
+		$text = self::cleanUnicode($text);
437
+
438
+		// Split
439
+		foreach (self::linebreakSplit($text) as $line) {
440
+			if (self::mbTrim($line) !== '') {
441
+				$punctuations = $this->punctuationSplit($line);
442
+				$parentheses = $this->parenthesesMerge($punctuations); // also works after punctuationMerge or abbreviationMerge
443
+				$merges = $this->punctuationMerge($parentheses);
444
+				$shorts = $this->abbreviationMerge($merges);
445
+				$quotes = $this->closeQuotesMerge($shorts);
446
+				$sentences = array_merge($sentences, $this->sentenceMerge($quotes));
447
+			}
448
+		}
449
+
450
+		// Post process
451
+		if ($flags & self::SPLIT_TRIM) {
452
+			return self::trimSentences($sentences);
453
+		}
454
+
455
+		return $sentences;
456
+	}
457
+
458
+	/**
459
+	 * Multibyte trim each string in an array.
460
+	 * @param string[] $sentences
461
+	 * @return string[]
462
+	 */
463
+	private static function trimSentences($sentences)
464
+	{
465
+		$trimmed = array();
466
+		foreach ($sentences as $sentence) {
467
+			$trimmed[] = self::mbTrim($sentence);
468
+		}
469
+		return $trimmed;
470
+	}
471
+
472
+	/**
473
+	 * Return the number of sentences detected in the provided text.
474
+	 * @param string $text
475
+	 * @return integer
476
+	 */
477
+	public function count($text)
478
+	{
479
+		return count($this->split($text));
480
+	}
481 481
 
482 482
 }
Please login to merge, or discard this patch.