Passed
Push — master ( bd113f...e772b1 )
by Martijn
02:09
created
src/Sentence.php 1 patch
Indentation   +430 added lines, -430 removed lines patch added patch discarded remove patch
@@ -16,435 +16,435 @@
 block discarded – undo
16 16
  */
17 17
 class Sentence
18 18
 {
19
-    /**
20
-     * Specify this flag with the split method to trim whitespace.
21
-     */
22
-    const SPLIT_TRIM = 0x1;
23
-
24
-    /**
25
-     * List of characters used to terminate sentences.
26
-     *
27
-     * @var string[]
28
-     */
29
-    private $terminals = ['.', '!', '?'];
30
-
31
-    /**
32
-     * List of characters used for abbreviations.
33
-     *
34
-     * @var string[]
35
-     */
36
-    private $abbreviators = ['.'];
37
-
38
-    /**
39
-     * List of float numbers in the text
40
-     *
41
-     * @var string[]
42
-     */
43
-    private $replacements = [];
44
-
45
-    /**
46
-     * Generate an in-text replacement code for the specified index
47
-     *
48
-     * @param string $index
49
-     *
50
-     * @return string
51
-     */
52
-    private function getReplaceCode(string $index)
53
-    {
54
-        return 0x02 . $index . 0x03;
55
-    }
56
-
57
-    /**
58
-     * Clean floating point numbers by replace them with an in-text index
59
-     *
60
-     * @param string $text
61
-     *
62
-     * @return string
63
-     */
64
-    private function replaceFloatNumbers(string $text)
65
-    {
66
-        preg_match_all('!\d+(?:\.\d+)?!', $text, $matches, PREG_OFFSET_CAPTURE);
67
-
68
-        $this->replacements = [];
69
-        $index = 0;
70
-        foreach (array_reverse($matches[0]) as $match) {
71
-            $number = $match[0];
72
-            $offset = $match[1];
73
-            $code = $this->getReplaceCode($index);
74
-
75
-            $this->replacements[$index] = $number;
76
-
77
-            $text = substr_replace($text, $code, $offset, mb_strlen($number));
78
-
79
-            ++$index;
80
-        }
81
-
82
-        return $text;
83
-    }
84
-
85
-    /**
86
-     * Restore any stored replacements
87
-     *
88
-     * @param string[] $text
89
-     *
90
-     * @return string[]
91
-     */
92
-    private function restoreReplacements($text)
93
-    {
94
-        return array_map(function($value) {
95
-            foreach ($this->replacements as $index => $number) {
96
-                $code = $this->getReplaceCode($index);
97
-                $value = str_replace($code, $number, $value);
98
-            }
99
-            return $value;
100
-        }, $text);
101
-    }
102
-
103
-    /**
104
-     * Breaks a piece of text into lines by linebreak.
105
-     * Eats up any linebreak characters as if one.
106
-     *
107
-     * Multibyte.php safe
108
-     *
109
-     * @param string $text
110
-     * @return string[]
111
-     */
112
-    private static function linebreakSplit($text)
113
-    {
114
-        $lines = [];
115
-        $line = '';
116
-
117
-        foreach (Multibyte::split('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
118
-            $line .= $part;
119
-            if (Multibyte::trim($part) === '') {
120
-                $lines[] = $line;
121
-                $line = '';
122
-            }
123
-        }
124
-        $lines[] = $line;
125
-
126
-        return $lines;
127
-    }
128
-
129
-    /**
130
-     * Splits an array of lines by (consecutive sequences of)
131
-     * terminals, keeping terminals.
132
-     *
133
-     * Multibyte.php safe (atleast for UTF-8)
134
-     *
135
-     * For example:
136
-     *    "There ... is. More!"
137
-     *        ... becomes ...
138
-     *    [ "There ", "...", " is", ".", " More", "!" ]
139
-     *
140
-     * @param string $line
141
-     * @return string[]
142
-     */
143
-    private function punctuationSplit($line)
144
-    {
145
-        $parts = [];
146
-
147
-        $chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY); // This is UTF8 multibyte safe!
148
-        $is_terminal = in_array($chars[0], $this->terminals);
149
-
150
-        $part = '';
151
-        foreach ($chars as $index => $char) {
152
-            if (in_array($char, $this->terminals) !== $is_terminal) {
153
-                $parts[] = $part;
154
-                $part = '';
155
-                $is_terminal = !$is_terminal;
156
-            }
157
-            $part .= $char;
158
-        }
159
-
160
-        if (!empty($part)) {
161
-            $parts[] = $part;
162
-        }
163
-
164
-        return $parts;
165
-    }
166
-
167
-    /**
168
-     * Appends each terminal item after it's preceding
169
-     * non-terminals.
170
-     *
171
-     * Multibyte.php safe (atleast for UTF-8)
172
-     *
173
-     * For example:
174
-     *    [ "There ", "...", " is", ".", " More", "!" ]
175
-     *        ... becomes ...
176
-     *    [ "There ... is.", "More!" ]
177
-     *
178
-     * @param string[] $punctuations
179
-     * @return string[]
180
-     */
181
-    private function punctuationMerge($punctuations)
182
-    {
183
-        $definite_terminals = array_diff($this->terminals, $this->abbreviators);
184
-
185
-        $merges = [];
186
-        $merge = '';
187
-
188
-        $filtered = array_filter($punctuations, function ($p) {
189
-            return $p !== '';
190
-        });
191
-
192
-        foreach ($filtered as $punctuation) {
193
-            $merge .= $punctuation;
194
-            if (mb_strlen($punctuation) === 1
195
-                && in_array($punctuation, $this->terminals)) {
196
-                $merges[] = $merge;
197
-                $merge = '';
198
-            } else {
199
-                foreach ($definite_terminals as $terminal) {
200
-                    if (mb_strpos($punctuation, $terminal) !== false) {
201
-                        $merges[] = $merge;
202
-                        $merge = '';
203
-                        break;
204
-                    }
205
-                }
206
-            }
207
-        }
208
-        if (!empty($merge)) {
209
-            $merges[] = $merge;
210
-        }
211
-
212
-        return $merges;
213
-    }
214
-
215
-    /**
216
-     * Looks for capitalized abbreviations & includes them with the following fragment.
217
-     *
218
-     * For example:
219
-     *    [ "Last week, former director of the F.B.I. James B. Comey was fired. Mr. Comey was not available for comment." ]
220
-     *        ... becomes ...
221
-     *    [ "Last week, former director of the F.B.I. James B. Comey was fired." ]
222
-     *  [ "Mr. Comey was not available for comment." ]
223
-     *
224
-     * @param string[] $fragments
225
-     * @return string[]
226
-     */
227
-    private function abbreviationMerge($fragments)
228
-    {
229
-        $return_fragment = [];
230
-
231
-        $previous_fragment = '';
232
-        $previous_is_abbreviation = false;
233
-        $i = 0;
234
-        foreach ($fragments as $fragment) {
235
-            $is_abbreviation = self::isAbreviation($fragment);
236
-
237
-            // merge previous fragment with this
238
-            if ($previous_is_abbreviation) {
239
-                $fragment = $previous_fragment . $fragment;
240
-            }
241
-            $return_fragment[$i] = $fragment;
242
-
243
-            $previous_is_abbreviation = $is_abbreviation;
244
-            $previous_fragment = $fragment;
245
-
246
-            // only increment if this isn't an abbreviation
247
-            if (!$is_abbreviation) {
248
-                $i++;
249
-            }
250
-        }
251
-        return $return_fragment;
252
-    }
253
-
254
-    /**
255
-     * Check if the last word of fragment starts with a Capital, ends in "." & has less than 3 characters.
256
-     *
257
-     * @param $fragment
258
-     * @return bool
259
-     */
260
-    private static function isAbreviation($fragment)
261
-    {
262
-        $words = mb_split('\s+', Multibyte::trim($fragment));
263
-
264
-        $word_count = count($words);
265
-
266
-        $last_word = Multibyte::trim($words[$word_count - 1]);
267
-        $last_is_capital = preg_match('#^\p{Lu}#u', $last_word);
268
-        $last_is_abbreviation = mb_substr(Multibyte::trim($fragment), -1) === '.';
269
-
270
-        return $last_is_capital > 0
271
-            && $last_is_abbreviation > 0
272
-            && mb_strlen($last_word) <= 3;
273
-    }
274
-
275
-    /**
276
-     * Merges any part starting with a closing parenthesis ')' to the previous
277
-     * part.
278
-     *
279
-     * @param string[] $parts
280
-     * @return string[]
281
-     */
282
-    private function parenthesesMerge($parts)
283
-    {
284
-        $subsentences = [];
285
-
286
-        foreach ($parts as $part) {
287
-            if ($part[0] === ')') {
288
-                $subsentences[count($subsentences) - 1] .= $part;
289
-            } else {
290
-                $subsentences[] = $part;
291
-            }
292
-        }
293
-
294
-        return $subsentences;
295
-    }
296
-
297
-    /**
298
-     * Looks for closing quotes to include them with the previous statement.
299
-     * "That was very interesting," he said.
300
-     * "That was very interesting."
301
-     *
302
-     * @param string[] $statements
303
-     * @return string[]
304
-     */
305
-    private function closeQuotesMerge($statements)
306
-    {
307
-        $i = 0;
308
-        $previous_statement = '';
309
-        $return = [];
310
-        foreach ($statements as $statement) {
311
-            if (self::isEndQuote($statement)) {
312
-                $statement = $previous_statement . $statement;
313
-            } else {
314
-                $i++;
315
-            }
316
-
317
-            $return[$i] = $statement;
318
-            $previous_statement = $statement;
319
-        }
320
-
321
-        return $return;
322
-    }
323
-
324
-    /**
325
-     * Check if the entire string is a quotation mark or quote, then space, then lowercase.
326
-     *
327
-     * @param $statement
328
-     * @return bool
329
-     */
330
-    private static function isEndQuote($statement)
331
-    {
332
-        $trimmed = Multibyte::trim($statement);
333
-        $first = mb_substr($statement, 0, 1);
334
-
335
-        return in_array($trimmed, ['"', '\''])
336
-            || (
337
-                in_array($first, ['"', '\''])
338
-                && mb_substr($statement, 1, 1) === ' '
339
-                && ctype_lower(mb_substr($statement, 2, 1)) === true
340
-            );
341
-    }
342
-
343
-    /**
344
-     * Merges items into larger sentences.
345
-     * Multibyte.php safe
346
-     *
347
-     * @param string[] $shorts
348
-     * @return string[]
349
-     */
350
-    private function sentenceMerge($shorts)
351
-    {
352
-        $non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);
353
-
354
-        $sentences = [];
355
-
356
-        $sentence = '';
357
-        $has_words = false;
358
-        $previous_word_ending = null;
359
-        foreach ($shorts as $short) {
360
-            $word_count = count(mb_split('\s+', Multibyte::trim($short)));
361
-            $after_non_abbreviating_terminal = in_array($previous_word_ending, $non_abbreviating_terminals);
362
-
363
-            if ($after_non_abbreviating_terminal
364
-                || ($has_words && $word_count > 1)) {
365
-
366
-                $sentences[] = $sentence;
367
-
368
-                $sentence = '';
369
-                $has_words = false;
370
-            }
371
-
372
-            $has_words = $has_words
373
-                || $word_count > 1;
374
-
375
-            $sentence .= $short;
376
-            $previous_word_ending = mb_substr($short, -1);
377
-        }
378
-
379
-        if (!empty($sentence)) {
380
-            $sentences[] = $sentence;
381
-        }
382
-
383
-        return $sentences;
384
-    }
385
-
386
-    /**
387
-     * Return the sentences sentences detected in the provided text.
388
-     * Set the Sentence::SPLIT_TRIM flag to trim whitespace.
389
-     * @param string $text
390
-     * @param integer $flags
391
-     * @return string[]
392
-     */
393
-    public function split($text, $flags = 0)
394
-    {
395
-        static $pipeline = [
396
-            'replaceFloatNumbers',
397
-            'punctuationSplit',
398
-            'parenthesesMerge', // also works after punctuationMerge or abbreviationMerge
399
-            'punctuationMerge',
400
-            'abbreviationMerge',
401
-            'closeQuotesMerge',
402
-            'sentenceMerge',
403
-            'restoreReplacements'
404
-        ];
405
-
406
-        // clean funny quotes
407
-        $text = Multibyte::cleanUnicode($text);
408
-
409
-        // Split
410
-        $sentences = [];
411
-        foreach (self::linebreakSplit($text) as $input) {
412
-            if (Multibyte::trim($input) !== '') {
413
-                foreach ($pipeline as $method) {
414
-                    $input = $this->$method($input);
415
-                }
416
-                $sentences = array_merge($sentences, $input);
417
-            }
418
-        }
419
-
420
-        // Post process
421
-        if ($flags & self::SPLIT_TRIM) {
422
-            return self::trimSentences($sentences);
423
-        }
424
-
425
-        return $sentences;
426
-    }
427
-
428
-    /**
429
-     * Multibyte.php trim each string in an array.
430
-     * @param string[] $sentences
431
-     * @return string[]
432
-     */
433
-    private static function trimSentences($sentences)
434
-    {
435
-        return array_map(function ($sentence) {
436
-            return Multibyte::trim($sentence);
437
-        }, $sentences);
438
-    }
439
-
440
-    /**
441
-     * Return the number of sentences detected in the provided text.
442
-     * @param string $text
443
-     * @return integer
444
-     */
445
-    public function count($text)
446
-    {
447
-        return count($this->split($text));
448
-    }
19
+	/**
20
+	 * Specify this flag with the split method to trim whitespace.
21
+	 */
22
+	const SPLIT_TRIM = 0x1;
23
+
24
+	/**
25
+	 * List of characters used to terminate sentences.
26
+	 *
27
+	 * @var string[]
28
+	 */
29
+	private $terminals = ['.', '!', '?'];
30
+
31
+	/**
32
+	 * List of characters used for abbreviations.
33
+	 *
34
+	 * @var string[]
35
+	 */
36
+	private $abbreviators = ['.'];
37
+
38
+	/**
39
+	 * List of float numbers in the text
40
+	 *
41
+	 * @var string[]
42
+	 */
43
+	private $replacements = [];
44
+
45
+	/**
46
+	 * Generate an in-text replacement code for the specified index
47
+	 *
48
+	 * @param string $index
49
+	 *
50
+	 * @return string
51
+	 */
52
+	private function getReplaceCode(string $index)
53
+	{
54
+		return 0x02 . $index . 0x03;
55
+	}
56
+
57
+	/**
58
+	 * Clean floating point numbers by replace them with an in-text index
59
+	 *
60
+	 * @param string $text
61
+	 *
62
+	 * @return string
63
+	 */
64
+	private function replaceFloatNumbers(string $text)
65
+	{
66
+		preg_match_all('!\d+(?:\.\d+)?!', $text, $matches, PREG_OFFSET_CAPTURE);
67
+
68
+		$this->replacements = [];
69
+		$index = 0;
70
+		foreach (array_reverse($matches[0]) as $match) {
71
+			$number = $match[0];
72
+			$offset = $match[1];
73
+			$code = $this->getReplaceCode($index);
74
+
75
+			$this->replacements[$index] = $number;
76
+
77
+			$text = substr_replace($text, $code, $offset, mb_strlen($number));
78
+
79
+			++$index;
80
+		}
81
+
82
+		return $text;
83
+	}
84
+
85
+	/**
86
+	 * Restore any stored replacements
87
+	 *
88
+	 * @param string[] $text
89
+	 *
90
+	 * @return string[]
91
+	 */
92
+	private function restoreReplacements($text)
93
+	{
94
+		return array_map(function($value) {
95
+			foreach ($this->replacements as $index => $number) {
96
+				$code = $this->getReplaceCode($index);
97
+				$value = str_replace($code, $number, $value);
98
+			}
99
+			return $value;
100
+		}, $text);
101
+	}
102
+
103
+	/**
104
+	 * Breaks a piece of text into lines by linebreak.
105
+	 * Eats up any linebreak characters as if one.
106
+	 *
107
+	 * Multibyte.php safe
108
+	 *
109
+	 * @param string $text
110
+	 * @return string[]
111
+	 */
112
+	private static function linebreakSplit($text)
113
+	{
114
+		$lines = [];
115
+		$line = '';
116
+
117
+		foreach (Multibyte::split('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
118
+			$line .= $part;
119
+			if (Multibyte::trim($part) === '') {
120
+				$lines[] = $line;
121
+				$line = '';
122
+			}
123
+		}
124
+		$lines[] = $line;
125
+
126
+		return $lines;
127
+	}
128
+
129
+	/**
130
+	 * Splits an array of lines by (consecutive sequences of)
131
+	 * terminals, keeping terminals.
132
+	 *
133
+	 * Multibyte.php safe (atleast for UTF-8)
134
+	 *
135
+	 * For example:
136
+	 *    "There ... is. More!"
137
+	 *        ... becomes ...
138
+	 *    [ "There ", "...", " is", ".", " More", "!" ]
139
+	 *
140
+	 * @param string $line
141
+	 * @return string[]
142
+	 */
143
+	private function punctuationSplit($line)
144
+	{
145
+		$parts = [];
146
+
147
+		$chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY); // This is UTF8 multibyte safe!
148
+		$is_terminal = in_array($chars[0], $this->terminals);
149
+
150
+		$part = '';
151
+		foreach ($chars as $index => $char) {
152
+			if (in_array($char, $this->terminals) !== $is_terminal) {
153
+				$parts[] = $part;
154
+				$part = '';
155
+				$is_terminal = !$is_terminal;
156
+			}
157
+			$part .= $char;
158
+		}
159
+
160
+		if (!empty($part)) {
161
+			$parts[] = $part;
162
+		}
163
+
164
+		return $parts;
165
+	}
166
+
167
+	/**
168
+	 * Appends each terminal item after it's preceding
169
+	 * non-terminals.
170
+	 *
171
+	 * Multibyte.php safe (atleast for UTF-8)
172
+	 *
173
+	 * For example:
174
+	 *    [ "There ", "...", " is", ".", " More", "!" ]
175
+	 *        ... becomes ...
176
+	 *    [ "There ... is.", "More!" ]
177
+	 *
178
+	 * @param string[] $punctuations
179
+	 * @return string[]
180
+	 */
181
+	private function punctuationMerge($punctuations)
182
+	{
183
+		$definite_terminals = array_diff($this->terminals, $this->abbreviators);
184
+
185
+		$merges = [];
186
+		$merge = '';
187
+
188
+		$filtered = array_filter($punctuations, function ($p) {
189
+			return $p !== '';
190
+		});
191
+
192
+		foreach ($filtered as $punctuation) {
193
+			$merge .= $punctuation;
194
+			if (mb_strlen($punctuation) === 1
195
+				&& in_array($punctuation, $this->terminals)) {
196
+				$merges[] = $merge;
197
+				$merge = '';
198
+			} else {
199
+				foreach ($definite_terminals as $terminal) {
200
+					if (mb_strpos($punctuation, $terminal) !== false) {
201
+						$merges[] = $merge;
202
+						$merge = '';
203
+						break;
204
+					}
205
+				}
206
+			}
207
+		}
208
+		if (!empty($merge)) {
209
+			$merges[] = $merge;
210
+		}
211
+
212
+		return $merges;
213
+	}
214
+
215
+	/**
216
+	 * Looks for capitalized abbreviations & includes them with the following fragment.
217
+	 *
218
+	 * For example:
219
+	 *    [ "Last week, former director of the F.B.I. James B. Comey was fired. Mr. Comey was not available for comment." ]
220
+	 *        ... becomes ...
221
+	 *    [ "Last week, former director of the F.B.I. James B. Comey was fired." ]
222
+	 *  [ "Mr. Comey was not available for comment." ]
223
+	 *
224
+	 * @param string[] $fragments
225
+	 * @return string[]
226
+	 */
227
+	private function abbreviationMerge($fragments)
228
+	{
229
+		$return_fragment = [];
230
+
231
+		$previous_fragment = '';
232
+		$previous_is_abbreviation = false;
233
+		$i = 0;
234
+		foreach ($fragments as $fragment) {
235
+			$is_abbreviation = self::isAbreviation($fragment);
236
+
237
+			// merge previous fragment with this
238
+			if ($previous_is_abbreviation) {
239
+				$fragment = $previous_fragment . $fragment;
240
+			}
241
+			$return_fragment[$i] = $fragment;
242
+
243
+			$previous_is_abbreviation = $is_abbreviation;
244
+			$previous_fragment = $fragment;
245
+
246
+			// only increment if this isn't an abbreviation
247
+			if (!$is_abbreviation) {
248
+				$i++;
249
+			}
250
+		}
251
+		return $return_fragment;
252
+	}
253
+
254
+	/**
255
+	 * Check if the last word of fragment starts with a Capital, ends in "." & has less than 3 characters.
256
+	 *
257
+	 * @param $fragment
258
+	 * @return bool
259
+	 */
260
+	private static function isAbreviation($fragment)
261
+	{
262
+		$words = mb_split('\s+', Multibyte::trim($fragment));
263
+
264
+		$word_count = count($words);
265
+
266
+		$last_word = Multibyte::trim($words[$word_count - 1]);
267
+		$last_is_capital = preg_match('#^\p{Lu}#u', $last_word);
268
+		$last_is_abbreviation = mb_substr(Multibyte::trim($fragment), -1) === '.';
269
+
270
+		return $last_is_capital > 0
271
+			&& $last_is_abbreviation > 0
272
+			&& mb_strlen($last_word) <= 3;
273
+	}
274
+
275
+	/**
276
+	 * Merges any part starting with a closing parenthesis ')' to the previous
277
+	 * part.
278
+	 *
279
+	 * @param string[] $parts
280
+	 * @return string[]
281
+	 */
282
+	private function parenthesesMerge($parts)
283
+	{
284
+		$subsentences = [];
285
+
286
+		foreach ($parts as $part) {
287
+			if ($part[0] === ')') {
288
+				$subsentences[count($subsentences) - 1] .= $part;
289
+			} else {
290
+				$subsentences[] = $part;
291
+			}
292
+		}
293
+
294
+		return $subsentences;
295
+	}
296
+
297
+	/**
298
+	 * Looks for closing quotes to include them with the previous statement.
299
+	 * "That was very interesting," he said.
300
+	 * "That was very interesting."
301
+	 *
302
+	 * @param string[] $statements
303
+	 * @return string[]
304
+	 */
305
+	private function closeQuotesMerge($statements)
306
+	{
307
+		$i = 0;
308
+		$previous_statement = '';
309
+		$return = [];
310
+		foreach ($statements as $statement) {
311
+			if (self::isEndQuote($statement)) {
312
+				$statement = $previous_statement . $statement;
313
+			} else {
314
+				$i++;
315
+			}
316
+
317
+			$return[$i] = $statement;
318
+			$previous_statement = $statement;
319
+		}
320
+
321
+		return $return;
322
+	}
323
+
324
+	/**
325
+	 * Check if the entire string is a quotation mark or quote, then space, then lowercase.
326
+	 *
327
+	 * @param $statement
328
+	 * @return bool
329
+	 */
330
+	private static function isEndQuote($statement)
331
+	{
332
+		$trimmed = Multibyte::trim($statement);
333
+		$first = mb_substr($statement, 0, 1);
334
+
335
+		return in_array($trimmed, ['"', '\''])
336
+			|| (
337
+				in_array($first, ['"', '\''])
338
+				&& mb_substr($statement, 1, 1) === ' '
339
+				&& ctype_lower(mb_substr($statement, 2, 1)) === true
340
+			);
341
+	}
342
+
343
+	/**
344
+	 * Merges items into larger sentences.
345
+	 * Multibyte.php safe
346
+	 *
347
+	 * @param string[] $shorts
348
+	 * @return string[]
349
+	 */
350
+	private function sentenceMerge($shorts)
351
+	{
352
+		$non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);
353
+
354
+		$sentences = [];
355
+
356
+		$sentence = '';
357
+		$has_words = false;
358
+		$previous_word_ending = null;
359
+		foreach ($shorts as $short) {
360
+			$word_count = count(mb_split('\s+', Multibyte::trim($short)));
361
+			$after_non_abbreviating_terminal = in_array($previous_word_ending, $non_abbreviating_terminals);
362
+
363
+			if ($after_non_abbreviating_terminal
364
+				|| ($has_words && $word_count > 1)) {
365
+
366
+				$sentences[] = $sentence;
367
+
368
+				$sentence = '';
369
+				$has_words = false;
370
+			}
371
+
372
+			$has_words = $has_words
373
+				|| $word_count > 1;
374
+
375
+			$sentence .= $short;
376
+			$previous_word_ending = mb_substr($short, -1);
377
+		}
378
+
379
+		if (!empty($sentence)) {
380
+			$sentences[] = $sentence;
381
+		}
382
+
383
+		return $sentences;
384
+	}
385
+
386
+	/**
387
+	 * Return the sentences sentences detected in the provided text.
388
+	 * Set the Sentence::SPLIT_TRIM flag to trim whitespace.
389
+	 * @param string $text
390
+	 * @param integer $flags
391
+	 * @return string[]
392
+	 */
393
+	public function split($text, $flags = 0)
394
+	{
395
+		static $pipeline = [
396
+			'replaceFloatNumbers',
397
+			'punctuationSplit',
398
+			'parenthesesMerge', // also works after punctuationMerge or abbreviationMerge
399
+			'punctuationMerge',
400
+			'abbreviationMerge',
401
+			'closeQuotesMerge',
402
+			'sentenceMerge',
403
+			'restoreReplacements'
404
+		];
405
+
406
+		// clean funny quotes
407
+		$text = Multibyte::cleanUnicode($text);
408
+
409
+		// Split
410
+		$sentences = [];
411
+		foreach (self::linebreakSplit($text) as $input) {
412
+			if (Multibyte::trim($input) !== '') {
413
+				foreach ($pipeline as $method) {
414
+					$input = $this->$method($input);
415
+				}
416
+				$sentences = array_merge($sentences, $input);
417
+			}
418
+		}
419
+
420
+		// Post process
421
+		if ($flags & self::SPLIT_TRIM) {
422
+			return self::trimSentences($sentences);
423
+		}
424
+
425
+		return $sentences;
426
+	}
427
+
428
+	/**
429
+	 * Multibyte.php trim each string in an array.
430
+	 * @param string[] $sentences
431
+	 * @return string[]
432
+	 */
433
+	private static function trimSentences($sentences)
434
+	{
435
+		return array_map(function ($sentence) {
436
+			return Multibyte::trim($sentence);
437
+		}, $sentences);
438
+	}
439
+
440
+	/**
441
+	 * Return the number of sentences detected in the provided text.
442
+	 * @param string $text
443
+	 * @return integer
444
+	 */
445
+	public function count($text)
446
+	{
447
+		return count($this->split($text));
448
+	}
449 449
 
450 450
 }
Please login to merge, or discard this patch.