Passed
Pull Request — master (#21)
by
unknown
01:40
created
src/Sentence.php 1 patch
Indentation   +455 added lines, -455 removed lines patch added patch discarded remove patch
@@ -17,460 +17,460 @@
 block discarded – undo
17 17
 class Sentence
18 18
 {
19 19
 
20
-    /**
21
-     * Specify this flag with the split method to trim whitespace.
22
-     */
23
-    const SPLIT_TRIM = 0x1;
24
-
25
-    /**
26
-     * List of characters used to terminate sentences.
27
-     *
28
-     * @var string[]
29
-     */
30
-    private $terminals = ['.', '!', '?'];
31
-
32
-    /**
33
-     * List of characters used for abbreviations.
34
-     *
35
-     * @var string[]
36
-     */
37
-    private $abbreviators = ['.'];
38
-
39
-    /**
40
-     * List of replacements in the text.
41
-     *
42
-     * @var string[]
43
-     */
44
-    private $replacements = [];
45
-
46
-    /**
47
-     * Generate an in-text replacement code for the specified index
48
-     *
49
-     * @param int $index
50
-     *
51
-     * @return string
52
-     */
53
-    private function getReplaceCode(int $index)
54
-    {
55
-        return 0x02 . $index . 0x03;
56
-    }
57
-
58
-    /**
59
-     * Clean floating point numbers by replace them with an in-text index
60
-     *
61
-     * @param string $text
62
-     *
63
-     * @return string
64
-     */
65
-    private function replaceFloatNumbers(string $text)
66
-    {
67
-        preg_match_all('!\d+(?:\.\d+)?!', $text, $matches, PREG_OFFSET_CAPTURE);
68
-
69
-        $this->replacements = [];
70
-        $index = 0;
71
-        foreach (array_reverse($matches[0]) as $match) {
72
-            $number = $match[0];
73
-            $offset = $match[1];
74
-            $code = $this->getReplaceCode($index);
75
-
76
-            $this->replacements[$index] = $number;
77
-
78
-            $text = (string)substr_replace($text, $code, $offset, mb_strlen($number));
79
-
80
-            ++$index;
81
-        }
82
-
83
-        return $text;
84
-    }
85
-
86
-    /**
87
-     * Restore any stored replacements
88
-     *
89
-     * @param string[] $text
90
-     *
91
-     * @return string[]
92
-     */
93
-    private function restoreReplacements($text)
94
-    {
95
-        return array_map(function ($value) {
96
-            foreach ($this->replacements as $index => $number) {
97
-                $code = $this->getReplaceCode($index);
98
-                $value = str_replace($code, $number, $value);
99
-            }
100
-
101
-            return $value;
102
-        }, $text);
103
-    }
104
-
105
-    /**
106
-     * Breaks a piece of text into lines by linebreak.
107
-     * Eats up any linebreak characters as if one.
108
-     *
109
-     * Multibyte.php safe
110
-     *
111
-     * @param string $text
112
-     *
113
-     * @return string[]
114
-     */
115
-    private static function linebreakSplit($text)
116
-    {
117
-        $lines = [];
118
-        $line = '';
119
-
120
-        foreach (Multibyte::split('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
121
-            $line .= $part;
122
-            if (Multibyte::trim($part) === '') {
123
-                $lines[] = $line;
124
-                $line = '';
125
-            }
126
-        }
127
-        $lines[] = $line;
128
-
129
-        return $lines;
130
-    }
131
-
132
-    /**
133
-     * Splits an array of lines by (consecutive sequences of)
134
-     * terminals, keeping terminals.
135
-     *
136
-     * Multibyte.php safe (atleast for UTF-8)
137
-     *
138
-     * For example:
139
-     *    "There ... is. More!"
140
-     *        ... becomes ...
141
-     *    [ "There ", "...", " is", ".", " More", "!" ]
142
-     *
143
-     * @param string $line
144
-     *
145
-     * @return string[]
146
-     */
147
-    private function punctuationSplit($line)
148
-    {
149
-        $parts = [];
150
-
151
-        $chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY); // This is UTF8 multibyte safe!
152
-        //add space after each terminal because every sentence ends with terminal and whitespace
153
-        $terminals = array_map(function($terminal)
154
-            {
155
-                return sprintf('%s ', $terminal);
156
-            },
157
-            $this->terminals
158
-        );
159
-        $is_terminal = in_array($chars[0], $terminals);
160
-
161
-        $part = '';
162
-        foreach ($chars as $char) {
163
-            if (in_array($char, $terminals) !== $is_terminal) {
164
-                $parts[] = $part;
165
-                $part = '';
166
-                $is_terminal = !$is_terminal;
167
-            }
168
-            $part .= $char;
169
-        }
170
-
171
-        if (!empty($part)) {
172
-            $parts[] = $part;
173
-        }
174
-
175
-        return $parts;
176
-    }
177
-
178
-
179
-    /**
180
-     * Appends each terminal item after it's preceding
181
-     * non-terminals.
182
-     *
183
-     * Multibyte.php safe (atleast for UTF-8)
184
-     *
185
-     * For example:
186
-     *    [ "There ", "...", " is", ".", " More", "!" ]
187
-     *        ... becomes ...
188
-     *    [ "There ... is.", "More!" ]
189
-     *
190
-     * @param string[] $punctuations
191
-     *
192
-     * @return string[]
193
-     */
194
-    private function punctuationMerge($punctuations)
195
-    {
196
-        $definite_terminals = array_diff($this->terminals, $this->abbreviators);
197
-
198
-        $merges = [];
199
-        $merge = '';
200
-
201
-        $filtered = array_filter($punctuations, function ($p) {
202
-            return $p !== '';
203
-        });
204
-
205
-        foreach ($filtered as $punctuation) {
206
-            $merge .= $punctuation;
207
-            if (mb_strlen($punctuation) === 1
208
-                && in_array($punctuation, $this->terminals)) {
209
-                $merges[] = $merge;
210
-                $merge = '';
211
-            } else {
212
-                foreach ($definite_terminals as $terminal) {
213
-                    if (mb_strpos($punctuation, $terminal) !== false) {
214
-                        $merges[] = $merge;
215
-                        $merge = '';
216
-                        break;
217
-                    }
218
-                }
219
-            }
220
-        }
221
-        if (!empty($merge)) {
222
-            $merges[] = $merge;
223
-        }
224
-
225
-        return $merges;
226
-    }
227
-
228
-    /**
229
-     * Looks for capitalized abbreviations & includes them with the following fragment.
230
-     *
231
-     * For example:
232
-     *    [ "Last week, former director of the F.B.I. James B. Comey was fired. Mr. Comey was not available for comment." ]
233
-     *        ... becomes ...
234
-     *    [ "Last week, former director of the F.B.I. James B. Comey was fired." ]
235
-     *  [ "Mr. Comey was not available for comment." ]
236
-     *
237
-     * @param string[] $fragments
238
-     *
239
-     * @return string[]
240
-     */
241
-    private function abbreviationMerge($fragments)
242
-    {
243
-        $return_fragment = [];
244
-
245
-        $previous_fragment = '';
246
-        $previous_is_abbreviation = false;
247
-        $i = 0;
248
-        foreach ($fragments as $fragment) {
249
-            $is_abbreviation = self::isAbreviation($fragment);
250
-
251
-            // merge previous fragment with this
252
-            if ($previous_is_abbreviation) {
253
-                $fragment = $previous_fragment . $fragment;
254
-            }
255
-            $return_fragment[$i] = $fragment;
256
-
257
-            $previous_is_abbreviation = $is_abbreviation;
258
-            $previous_fragment = $fragment;
259
-
260
-            // only increment if this isn't an abbreviation
261
-            if (!$is_abbreviation) {
262
-                $i++;
263
-            }
264
-        }
265
-
266
-        return $return_fragment;
267
-    }
268
-
269
-    /**
270
-     * Check if the last word of fragment starts with a Capital, ends in "." & has less than 3 characters.
271
-     *
272
-     * @param $fragment
273
-     *
274
-     * @return bool
275
-     */
276
-    private static function isAbreviation($fragment)
277
-    {
278
-        $words = mb_split('\s+', Multibyte::trim($fragment));
279
-
280
-        $word_count = count($words);
281
-
282
-        $last_word = Multibyte::trim($words[$word_count - 1]);
283
-        $last_is_capital = preg_match('#^\p{Lu}#u', $last_word);
284
-        $last_is_abbreviation = mb_substr(Multibyte::trim($fragment), -1) === '.';
285
-
286
-        return $last_is_capital > 0
287
-            && $last_is_abbreviation > 0
288
-            && mb_strlen($last_word) <= 3;
289
-    }
290
-
291
-    /**
292
-     * Merges any part starting with a closing parenthesis ')' to the previous
293
-     * part.
294
-     *
295
-     * @param string[] $parts
296
-     *
297
-     * @return string[]
298
-     */
299
-    private function parenthesesMerge($parts)
300
-    {
301
-        $subsentences = [];
302
-
303
-        foreach ($parts as $part) {
304
-            if ($part[0] === ')' && !empty($subsentences)) {
305
-                $subsentences[count($subsentences) - 1] .= $part;
306
-            } else {
307
-                $subsentences[] = $part;
308
-            }
309
-        }
310
-
311
-        return $subsentences;
312
-    }
313
-
314
-    /**
315
-     * Looks for closing quotes to include them with the previous statement.
316
-     * "That was very interesting," he said.
317
-     * "That was very interesting."
318
-     *
319
-     * @param string[] $statements
320
-     *
321
-     * @return string[]
322
-     */
323
-    private function closeQuotesMerge($statements)
324
-    {
325
-        $i = 0;
326
-        $previous_statement = '';
327
-        $return = [];
328
-        foreach ($statements as $statement) {
329
-            if (self::isEndQuote($statement)) {
330
-                $statement = $previous_statement . $statement;
331
-            } else {
332
-                $i++;
333
-            }
334
-
335
-            $return[$i] = $statement;
336
-            $previous_statement = $statement;
337
-        }
338
-
339
-        return $return;
340
-    }
341
-
342
-    /**
343
-     * Check if the entire string is a quotation mark or quote, then space, then lowercase.
344
-     *
345
-     * @param $statement
346
-     *
347
-     * @return bool
348
-     */
349
-    private static function isEndQuote($statement)
350
-    {
351
-        $trimmed = Multibyte::trim($statement);
352
-        $first = mb_substr($statement, 0, 1);
353
-
354
-        return in_array($trimmed, ['"', '\''])
355
-            || (
356
-                in_array($first, ['"', '\''])
357
-                && mb_substr($statement, 1, 1) === ' '
358
-                && ctype_lower(mb_substr($statement, 2, 1)) === true
359
-            );
360
-    }
361
-
362
-    /**
363
-     * Merges items into larger sentences.
364
-     * Multibyte.php safe
365
-     *
366
-     * @param string[] $shorts
367
-     *
368
-     * @return string[]
369
-     */
370
-    private function sentenceMerge($shorts)
371
-    {
372
-        $non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);
373
-
374
-        $sentences = [];
375
-
376
-        $sentence = '';
377
-        $has_words = false;
378
-        $previous_word_ending = null;
379
-        foreach ($shorts as $short) {
380
-            $word_count = count(mb_split('\s+', Multibyte::trim($short)));
381
-            $after_non_abbreviating_terminal = in_array($previous_word_ending, $non_abbreviating_terminals);
382
-
383
-            if ($after_non_abbreviating_terminal
384
-                || ($has_words && $word_count > 1)) {
385
-
386
-                $sentences[] = $sentence;
387
-
388
-                $sentence = '';
389
-                $has_words = false;
390
-            }
391
-
392
-            $has_words = $has_words
393
-                || $word_count > 1;
394
-
395
-            $sentence .= $short;
396
-            $previous_word_ending = mb_substr($short, -1);
397
-        }
398
-
399
-        if (!empty($sentence)) {
400
-            $sentences[] = $sentence;
401
-        }
402
-
403
-        return $sentences;
404
-    }
405
-
406
-    /**
407
-     * Return the sentences sentences detected in the provided text.
408
-     * Set the Sentence::SPLIT_TRIM flag to trim whitespace.
409
-     *
410
-     * @param string  $text
411
-     * @param integer $flags
412
-     *
413
-     * @return string[]
414
-     */
415
-    public function split($text, $flags = 0)
416
-    {
417
-        static $pipeline = [
418
-            'replaceFloatNumbers',
419
-            'punctuationSplit',
420
-            'parenthesesMerge', // also works after punctuationMerge or abbreviationMerge
421
-            'punctuationMerge',
422
-            'abbreviationMerge',
423
-            'closeQuotesMerge',
424
-            'sentenceMerge',
425
-            'restoreReplacements',
426
-        ];
427
-
428
-        // clean funny quotes
429
-        $text = Multibyte::cleanUnicode($text);
430
-
431
-        // Split
432
-        $sentences = [];
433
-        foreach (self::linebreakSplit($text) as $input) {
434
-            if (Multibyte::trim($input) !== '') {
435
-                foreach ($pipeline as $method) {
436
-                    $input = $this->$method($input);
437
-                }
438
-                $sentences = array_merge($sentences, $input);
439
-            }
440
-        }
441
-
442
-        // Post process
443
-        if ($flags & self::SPLIT_TRIM) {
444
-            return self::trimSentences($sentences);
445
-        }
446
-
447
-        return $sentences;
448
-    }
449
-
450
-    /**
451
-     * Multibyte.php trim each string in an array.
452
-     *
453
-     * @param string[] $sentences
454
-     *
455
-     * @return string[]
456
-     */
457
-    private static function trimSentences($sentences)
458
-    {
459
-        return array_map(function ($sentence) {
460
-            return Multibyte::trim($sentence);
461
-        }, $sentences);
462
-    }
463
-
464
-    /**
465
-     * Return the number of sentences detected in the provided text.
466
-     *
467
-     * @param string $text
468
-     *
469
-     * @return integer
470
-     */
471
-    public function count($text)
472
-    {
473
-        return count($this->split($text));
474
-    }
20
+	/**
21
+	 * Specify this flag with the split method to trim whitespace.
22
+	 */
23
+	const SPLIT_TRIM = 0x1;
24
+
25
+	/**
26
+	 * List of characters used to terminate sentences.
27
+	 *
28
+	 * @var string[]
29
+	 */
30
+	private $terminals = ['.', '!', '?'];
31
+
32
+	/**
33
+	 * List of characters used for abbreviations.
34
+	 *
35
+	 * @var string[]
36
+	 */
37
+	private $abbreviators = ['.'];
38
+
39
+	/**
40
+	 * List of replacements in the text.
41
+	 *
42
+	 * @var string[]
43
+	 */
44
+	private $replacements = [];
45
+
46
+	/**
47
+	 * Generate an in-text replacement code for the specified index
48
+	 *
49
+	 * @param int $index
50
+	 *
51
+	 * @return string
52
+	 */
53
+	private function getReplaceCode(int $index)
54
+	{
55
+		return 0x02 . $index . 0x03;
56
+	}
57
+
58
+	/**
59
+	 * Clean floating point numbers by replace them with an in-text index
60
+	 *
61
+	 * @param string $text
62
+	 *
63
+	 * @return string
64
+	 */
65
+	private function replaceFloatNumbers(string $text)
66
+	{
67
+		preg_match_all('!\d+(?:\.\d+)?!', $text, $matches, PREG_OFFSET_CAPTURE);
68
+
69
+		$this->replacements = [];
70
+		$index = 0;
71
+		foreach (array_reverse($matches[0]) as $match) {
72
+			$number = $match[0];
73
+			$offset = $match[1];
74
+			$code = $this->getReplaceCode($index);
75
+
76
+			$this->replacements[$index] = $number;
77
+
78
+			$text = (string)substr_replace($text, $code, $offset, mb_strlen($number));
79
+
80
+			++$index;
81
+		}
82
+
83
+		return $text;
84
+	}
85
+
86
+	/**
87
+	 * Restore any stored replacements
88
+	 *
89
+	 * @param string[] $text
90
+	 *
91
+	 * @return string[]
92
+	 */
93
+	private function restoreReplacements($text)
94
+	{
95
+		return array_map(function ($value) {
96
+			foreach ($this->replacements as $index => $number) {
97
+				$code = $this->getReplaceCode($index);
98
+				$value = str_replace($code, $number, $value);
99
+			}
100
+
101
+			return $value;
102
+		}, $text);
103
+	}
104
+
105
+	/**
106
+	 * Breaks a piece of text into lines by linebreak.
107
+	 * Eats up any linebreak characters as if one.
108
+	 *
109
+	 * Multibyte.php safe
110
+	 *
111
+	 * @param string $text
112
+	 *
113
+	 * @return string[]
114
+	 */
115
+	private static function linebreakSplit($text)
116
+	{
117
+		$lines = [];
118
+		$line = '';
119
+
120
+		foreach (Multibyte::split('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
121
+			$line .= $part;
122
+			if (Multibyte::trim($part) === '') {
123
+				$lines[] = $line;
124
+				$line = '';
125
+			}
126
+		}
127
+		$lines[] = $line;
128
+
129
+		return $lines;
130
+	}
131
+
132
+	/**
133
+	 * Splits an array of lines by (consecutive sequences of)
134
+	 * terminals, keeping terminals.
135
+	 *
136
+	 * Multibyte.php safe (atleast for UTF-8)
137
+	 *
138
+	 * For example:
139
+	 *    "There ... is. More!"
140
+	 *        ... becomes ...
141
+	 *    [ "There ", "...", " is", ".", " More", "!" ]
142
+	 *
143
+	 * @param string $line
144
+	 *
145
+	 * @return string[]
146
+	 */
147
+	private function punctuationSplit($line)
148
+	{
149
+		$parts = [];
150
+
151
+		$chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY); // This is UTF8 multibyte safe!
152
+		//add space after each terminal because every sentence ends with terminal and whitespace
153
+		$terminals = array_map(function($terminal)
154
+			{
155
+				return sprintf('%s ', $terminal);
156
+			},
157
+			$this->terminals
158
+		);
159
+		$is_terminal = in_array($chars[0], $terminals);
160
+
161
+		$part = '';
162
+		foreach ($chars as $char) {
163
+			if (in_array($char, $terminals) !== $is_terminal) {
164
+				$parts[] = $part;
165
+				$part = '';
166
+				$is_terminal = !$is_terminal;
167
+			}
168
+			$part .= $char;
169
+		}
170
+
171
+		if (!empty($part)) {
172
+			$parts[] = $part;
173
+		}
174
+
175
+		return $parts;
176
+	}
177
+
178
+
179
+	/**
180
+	 * Appends each terminal item after it's preceding
181
+	 * non-terminals.
182
+	 *
183
+	 * Multibyte.php safe (atleast for UTF-8)
184
+	 *
185
+	 * For example:
186
+	 *    [ "There ", "...", " is", ".", " More", "!" ]
187
+	 *        ... becomes ...
188
+	 *    [ "There ... is.", "More!" ]
189
+	 *
190
+	 * @param string[] $punctuations
191
+	 *
192
+	 * @return string[]
193
+	 */
194
+	private function punctuationMerge($punctuations)
195
+	{
196
+		$definite_terminals = array_diff($this->terminals, $this->abbreviators);
197
+
198
+		$merges = [];
199
+		$merge = '';
200
+
201
+		$filtered = array_filter($punctuations, function ($p) {
202
+			return $p !== '';
203
+		});
204
+
205
+		foreach ($filtered as $punctuation) {
206
+			$merge .= $punctuation;
207
+			if (mb_strlen($punctuation) === 1
208
+				&& in_array($punctuation, $this->terminals)) {
209
+				$merges[] = $merge;
210
+				$merge = '';
211
+			} else {
212
+				foreach ($definite_terminals as $terminal) {
213
+					if (mb_strpos($punctuation, $terminal) !== false) {
214
+						$merges[] = $merge;
215
+						$merge = '';
216
+						break;
217
+					}
218
+				}
219
+			}
220
+		}
221
+		if (!empty($merge)) {
222
+			$merges[] = $merge;
223
+		}
224
+
225
+		return $merges;
226
+	}
227
+
228
+	/**
229
+	 * Looks for capitalized abbreviations & includes them with the following fragment.
230
+	 *
231
+	 * For example:
232
+	 *    [ "Last week, former director of the F.B.I. James B. Comey was fired. Mr. Comey was not available for comment." ]
233
+	 *        ... becomes ...
234
+	 *    [ "Last week, former director of the F.B.I. James B. Comey was fired." ]
235
+	 *  [ "Mr. Comey was not available for comment." ]
236
+	 *
237
+	 * @param string[] $fragments
238
+	 *
239
+	 * @return string[]
240
+	 */
241
+	private function abbreviationMerge($fragments)
242
+	{
243
+		$return_fragment = [];
244
+
245
+		$previous_fragment = '';
246
+		$previous_is_abbreviation = false;
247
+		$i = 0;
248
+		foreach ($fragments as $fragment) {
249
+			$is_abbreviation = self::isAbreviation($fragment);
250
+
251
+			// merge previous fragment with this
252
+			if ($previous_is_abbreviation) {
253
+				$fragment = $previous_fragment . $fragment;
254
+			}
255
+			$return_fragment[$i] = $fragment;
256
+
257
+			$previous_is_abbreviation = $is_abbreviation;
258
+			$previous_fragment = $fragment;
259
+
260
+			// only increment if this isn't an abbreviation
261
+			if (!$is_abbreviation) {
262
+				$i++;
263
+			}
264
+		}
265
+
266
+		return $return_fragment;
267
+	}
268
+
269
+	/**
270
+	 * Check if the last word of fragment starts with a Capital, ends in "." & has less than 3 characters.
271
+	 *
272
+	 * @param $fragment
273
+	 *
274
+	 * @return bool
275
+	 */
276
+	private static function isAbreviation($fragment)
277
+	{
278
+		$words = mb_split('\s+', Multibyte::trim($fragment));
279
+
280
+		$word_count = count($words);
281
+
282
+		$last_word = Multibyte::trim($words[$word_count - 1]);
283
+		$last_is_capital = preg_match('#^\p{Lu}#u', $last_word);
284
+		$last_is_abbreviation = mb_substr(Multibyte::trim($fragment), -1) === '.';
285
+
286
+		return $last_is_capital > 0
287
+			&& $last_is_abbreviation > 0
288
+			&& mb_strlen($last_word) <= 3;
289
+	}
290
+
291
+	/**
292
+	 * Merges any part starting with a closing parenthesis ')' to the previous
293
+	 * part.
294
+	 *
295
+	 * @param string[] $parts
296
+	 *
297
+	 * @return string[]
298
+	 */
299
+	private function parenthesesMerge($parts)
300
+	{
301
+		$subsentences = [];
302
+
303
+		foreach ($parts as $part) {
304
+			if ($part[0] === ')' && !empty($subsentences)) {
305
+				$subsentences[count($subsentences) - 1] .= $part;
306
+			} else {
307
+				$subsentences[] = $part;
308
+			}
309
+		}
310
+
311
+		return $subsentences;
312
+	}
313
+
314
+	/**
315
+	 * Looks for closing quotes to include them with the previous statement.
316
+	 * "That was very interesting," he said.
317
+	 * "That was very interesting."
318
+	 *
319
+	 * @param string[] $statements
320
+	 *
321
+	 * @return string[]
322
+	 */
323
+	private function closeQuotesMerge($statements)
324
+	{
325
+		$i = 0;
326
+		$previous_statement = '';
327
+		$return = [];
328
+		foreach ($statements as $statement) {
329
+			if (self::isEndQuote($statement)) {
330
+				$statement = $previous_statement . $statement;
331
+			} else {
332
+				$i++;
333
+			}
334
+
335
+			$return[$i] = $statement;
336
+			$previous_statement = $statement;
337
+		}
338
+
339
+		return $return;
340
+	}
341
+
342
+	/**
343
+	 * Check if the entire string is a quotation mark or quote, then space, then lowercase.
344
+	 *
345
+	 * @param $statement
346
+	 *
347
+	 * @return bool
348
+	 */
349
+	private static function isEndQuote($statement)
350
+	{
351
+		$trimmed = Multibyte::trim($statement);
352
+		$first = mb_substr($statement, 0, 1);
353
+
354
+		return in_array($trimmed, ['"', '\''])
355
+			|| (
356
+				in_array($first, ['"', '\''])
357
+				&& mb_substr($statement, 1, 1) === ' '
358
+				&& ctype_lower(mb_substr($statement, 2, 1)) === true
359
+			);
360
+	}
361
+
362
+	/**
363
+	 * Merges items into larger sentences.
364
+	 * Multibyte.php safe
365
+	 *
366
+	 * @param string[] $shorts
367
+	 *
368
+	 * @return string[]
369
+	 */
370
+	private function sentenceMerge($shorts)
371
+	{
372
+		$non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);
373
+
374
+		$sentences = [];
375
+
376
+		$sentence = '';
377
+		$has_words = false;
378
+		$previous_word_ending = null;
379
+		foreach ($shorts as $short) {
380
+			$word_count = count(mb_split('\s+', Multibyte::trim($short)));
381
+			$after_non_abbreviating_terminal = in_array($previous_word_ending, $non_abbreviating_terminals);
382
+
383
+			if ($after_non_abbreviating_terminal
384
+				|| ($has_words && $word_count > 1)) {
385
+
386
+				$sentences[] = $sentence;
387
+
388
+				$sentence = '';
389
+				$has_words = false;
390
+			}
391
+
392
+			$has_words = $has_words
393
+				|| $word_count > 1;
394
+
395
+			$sentence .= $short;
396
+			$previous_word_ending = mb_substr($short, -1);
397
+		}
398
+
399
+		if (!empty($sentence)) {
400
+			$sentences[] = $sentence;
401
+		}
402
+
403
+		return $sentences;
404
+	}
405
+
406
+	/**
407
+	 * Return the sentences sentences detected in the provided text.
408
+	 * Set the Sentence::SPLIT_TRIM flag to trim whitespace.
409
+	 *
410
+	 * @param string  $text
411
+	 * @param integer $flags
412
+	 *
413
+	 * @return string[]
414
+	 */
415
+	public function split($text, $flags = 0)
416
+	{
417
+		static $pipeline = [
418
+			'replaceFloatNumbers',
419
+			'punctuationSplit',
420
+			'parenthesesMerge', // also works after punctuationMerge or abbreviationMerge
421
+			'punctuationMerge',
422
+			'abbreviationMerge',
423
+			'closeQuotesMerge',
424
+			'sentenceMerge',
425
+			'restoreReplacements',
426
+		];
427
+
428
+		// clean funny quotes
429
+		$text = Multibyte::cleanUnicode($text);
430
+
431
+		// Split
432
+		$sentences = [];
433
+		foreach (self::linebreakSplit($text) as $input) {
434
+			if (Multibyte::trim($input) !== '') {
435
+				foreach ($pipeline as $method) {
436
+					$input = $this->$method($input);
437
+				}
438
+				$sentences = array_merge($sentences, $input);
439
+			}
440
+		}
441
+
442
+		// Post process
443
+		if ($flags & self::SPLIT_TRIM) {
444
+			return self::trimSentences($sentences);
445
+		}
446
+
447
+		return $sentences;
448
+	}
449
+
450
+	/**
451
+	 * Multibyte.php trim each string in an array.
452
+	 *
453
+	 * @param string[] $sentences
454
+	 *
455
+	 * @return string[]
456
+	 */
457
+	private static function trimSentences($sentences)
458
+	{
459
+		return array_map(function ($sentence) {
460
+			return Multibyte::trim($sentence);
461
+		}, $sentences);
462
+	}
463
+
464
+	/**
465
+	 * Return the number of sentences detected in the provided text.
466
+	 *
467
+	 * @param string $text
468
+	 *
469
+	 * @return integer
470
+	 */
471
+	public function count($text)
472
+	{
473
+		return count($this->split($text));
474
+	}
475 475
 
476 476
 }
Please login to merge, or discard this patch.