Passed
Push — master ( 20b79e...0831e5 )
by Martijn
01:32
created
classes/Sentence.php 2 patches
Indentation   +433 added lines, -433 removed lines patch added patch discarded remove patch
@@ -15,438 +15,438 @@
 block discarded – undo
15 15
 class Sentence
16 16
 {
17 17
 
18
-    /**
19
-     * Specify this flag with the split method to trim whitespace.
20
-     */
21
-    const SPLIT_TRIM = 0x1;
22
-
23
-    /**
24
-     * List of characters used to terminate sentences.
25
-     *
26
-     * @var string[]
27
-     */
28
-    private $terminals = array('.', '!', '?');
29
-
30
-    /**
31
-     * List of characters used for abbreviations.
32
-     *
33
-     * @var string[]
34
-     */
35
-    private $abbreviators = array('.');
36
-
37
-    /**
38
-     * Multibyte safe version of standard trim() function.
39
-     *
40
-     * @param string $string
41
-     * @return string
42
-     */
43
-    private static function mbTrim($string)
44
-    {
45
-        return mb_ereg_replace('^\s*([\s\S]*?)\s*$', '\1', $string);
46
-    }
47
-
48
-    /**
49
-     * A cross between mb_split and preg_split, adding the preg_split flags
50
-     * to mb_split.
51
-     *
52
-     * @param string $pattern
53
-     * @param string $string
54
-     * @param int $limit
55
-     * @param int $flags
56
-     * @return array
57
-     */
58
-    private static function mbSplit($pattern, $string, $limit = -1, $flags = 0)
59
-    {
60
-        $split_no_empty = (bool)($flags & PREG_SPLIT_NO_EMPTY);
61
-        $offset_capture = (bool)($flags & PREG_SPLIT_OFFSET_CAPTURE);
62
-        $delim_capture = (bool)($flags & PREG_SPLIT_DELIM_CAPTURE);
63
-
64
-        $strlen = strlen($string); // bytes!
65
-        mb_ereg_search_init($string);
66
-
67
-        $lengths = array();
68
-        $position = 0;
69
-        while (($array = mb_ereg_search_pos($pattern, '')) !== false) {
70
-            // capture split
71
-            $lengths[] = array($array[0] - $position, false, null);
72
-
73
-            // move position
74
-            $position = $array[0] + $array[1];
75
-
76
-            // capture delimiter
77
-            $regs = mb_ereg_search_getregs();
78
-            $lengths[] = array($array[1], true, isset($regs[1]) && $regs[1]);
79
-
80
-            // Continue on?
81
-            if ($position >= $strlen) {
82
-                break;
83
-            }
84
-        }
85
-
86
-        // Add last bit, if not ending with split
87
-        $lengths[] = array($strlen - $position, false, null);
88
-
89
-        // Substrings
90
-        $parts = array();
91
-        $position = 0;
92
-        $count = 1;
93
-        foreach ($lengths as $length) {
94
-            $split_empty = ($length[0] || !$split_no_empty);
95
-            $is_delimiter = $length[1];
96
-            $is_captured = $length[2];
97
-
98
-            if ($limit > 0 && !$is_delimiter && $split_empty && ++$count > $limit) {
99
-                if ($length[0] > 0 || $split_empty) {
100
-                    $parts[] = $offset_capture ? array(mb_strcut($string, $position), $position) : mb_strcut($string, $position);
101
-                }
102
-                break;
103
-            } elseif ((!$is_delimiter || ($delim_capture && $is_captured)) && ($length[0] || $split_empty)) {
104
-                $parts[] = $offset_capture ? array(mb_strcut($string, $position, $length[0]), $position) : mb_strcut($string, $position, $length[0]);
105
-            }
106
-
107
-            $position += $length[0];
108
-        }
109
-
110
-        return $parts;
111
-    }
112
-
113
-    /**
114
-     * Breaks a piece of text into lines by linebreak.
115
-     * Eats up any linebreak characters as if one.
116
-     *
117
-     * Multibyte safe
118
-     *
119
-     * @param string $text
120
-     * @return string[]
121
-     */
122
-    private static function linebreakSplit($text)
123
-    {
124
-        $lines = array();
125
-        $line = '';
126
-
127
-        foreach (self::mbSplit('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
128
-            $line .= $part;
129
-            if (self::mbTrim($part) === '') {
130
-                $lines[] = $line;
131
-                $line = '';
132
-            }
133
-        }
134
-        $lines[] = $line;
135
-
136
-        return $lines;
137
-    }
138
-
139
-    /**
140
-     * Decodes HTML entities, then replaces typographic ("smart") quotes with plain ASCII quotes.
141
-     *
142
-     * @staticvar array $chr_map
143
-     * @param string $string
144
-     * @return string
145
-     */
146
-    private static function cleanUnicode($string)
147
-    {
148
-        //https://stackoverflow.com/questions/20025030/convert-all-types-of-smart-quotes-with-php
149
-        static $character_map = array(
150
-            // Windows codepage 1252
151
-            "\xC2\x82" => "'", // U+0082⇒U+201A single low-9 quotation mark
152
-            "\xC2\x84" => '"', // U+0084⇒U+201E double low-9 quotation mark
153
-            "\xC2\x8B" => "'", // U+008B⇒U+2039 single left-pointing angle quotation mark
154
-            "\xC2\x91" => "'", // U+0091⇒U+2018 left single quotation mark
155
-            "\xC2\x92" => "'", // U+0092⇒U+2019 right single quotation mark
156
-            "\xC2\x93" => '"', // U+0093⇒U+201C left double quotation mark
157
-            "\xC2\x94" => '"', // U+0094⇒U+201D right double quotation mark
158
-            "\xC2\x9B" => "'", // U+009B⇒U+203A single right-pointing angle quotation mark
159
-            // Regular Unicode     // U+0022 quotation mark (")
160
-            // U+0027 apostrophe     (')
161
-            "\xC2\xAB" => '"', // U+00AB left-pointing double angle quotation mark
162
-            "\xC2\xBB" => '"', // U+00BB right-pointing double angle quotation mark
163
-            "\xE2\x80\x98" => "'", // U+2018 left single quotation mark
164
-            "\xE2\x80\x99" => "'", // U+2019 right single quotation mark
165
-            "\xE2\x80\x9A" => "'", // U+201A single low-9 quotation mark
166
-            "\xE2\x80\x9B" => "'", // U+201B single high-reversed-9 quotation mark
167
-            "\xE2\x80\x9C" => '"', // U+201C left double quotation mark
168
-            "\xE2\x80\x9D" => '"', // U+201D right double quotation mark
169
-            "\xE2\x80\x9E" => '"', // U+201E double low-9 quotation mark
170
-            "\xE2\x80\x9F" => '"', // U+201F double high-reversed-9 quotation mark
171
-            "\xE2\x80\xB9" => "'", // U+2039 single left-pointing angle quotation mark
172
-            "\xE2\x80\xBA" => "'", // U+203A single right-pointing angle quotation mark
173
-        );
174
-
175
-        $character = array_keys($character_map); // but: for efficiency you should
176
-        $replace = array_values($character_map); // pre-calculate these two arrays
177
-        return str_replace($character, $replace, html_entity_decode($string, ENT_QUOTES, "UTF-8"));
178
-    }
179
-
180
-    /**
181
-     * Splits an array of lines by (consecutive sequences of)
182
-     * terminals, keeping terminals.
183
-     *
184
-     * Multibyte safe (at least for UTF-8)
185
-     *
186
-     * For example:
187
-     *    "There ... is. More!"
188
-     *        ... becomes ...
189
-     *    [ "There ", "...", " is", ".", " More", "!" ]
190
-     *
191
-     * @param string $line
192
-     * @return string[]
193
-     */
194
-    private function punctuationSplit($line)
195
-    {
196
-        $parts = array();
197
-
198
-        $chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY); // This is UTF8 multibyte safe!
199
-        $is_terminal = in_array($chars[0], $this->terminals);
200
-
201
-        $part = '';
202
-        foreach ($chars as $index => $char) {
203
-            if (in_array($char, $this->terminals) !== $is_terminal) {
204
-                $parts[] = $part;
205
-                $part = '';
206
-                $is_terminal = !$is_terminal;
207
-            }
208
-            $part .= $char;
209
-        }
210
-
211
-        if (!empty($part)) {
212
-            $parts[] = $part;
213
-        }
214
-
215
-        return $parts;
216
-    }
217
-
218
-    /**
219
-     * Appends each terminal item after its preceding
220
-     * non-terminals.
221
-     *
222
-     * Multibyte safe (at least for UTF-8)
223
-     *
224
-     * For example:
225
-     *    [ "There ", "...", " is", ".", " More", "!" ]
226
-     *        ... becomes ...
227
-     *    [ "There ... is.", "More!" ]
228
-     *
229
-     * @param string[] $punctuations
230
-     * @return string[]
231
-     */
232
-    private function punctuationMerge($punctuations)
233
-    {
234
-        $definite_terminals = array_diff($this->terminals, $this->abbreviators);
235
-
236
-        $merges = array();
237
-        $merge = '';
238
-
239
-        foreach ($punctuations as $punctuation) {
240
-            if ($punctuation !== '') {
241
-                $merge .= $punctuation;
242
-                if (mb_strlen($punctuation) === 1 && in_array($punctuation, $this->terminals)) {
243
-                    $merges[] = $merge;
244
-                    $merge = '';
245
-                } else {
246
-                    foreach ($definite_terminals as $terminal) {
247
-                        if (mb_strpos($punctuation, $terminal) !== false) {
248
-                            $merges[] = $merge;
249
-                            $merge = '';
250
-                            break;
251
-                        }
252
-                    }
253
-                }
254
-            }
255
-        }
256
-        if (!empty($merge)) {
257
-            $merges[] = $merge;
258
-        }
259
-
260
-        return $merges;
261
-    }
262
-
263
-    /**
264
-     * Looks for capitalized abbreviations & includes them with the following fragment.
265
-     *
266
-     * For example:
267
-     *    [ "Last week, former director of the F.B.I. James B. Comey was fired. Mr. Comey was not available for comment." ]
268
-     *        ... becomes ...
269
-     *    [ "Last week, former director of the F.B.I. James B. Comey was fired." ]
270
-     *  [ "Mr. Comey was not available for comment." ]
271
-     *
272
-     * @param string[] $fragments
273
-     * @return string[]
274
-     */
275
-    private function abbreviationMerge($fragments)
276
-    {
277
-        $return_fragment = array();
278
-
279
-        $previous_string = '';
280
-        $previous_is_abbreviation = false;
281
-        $i = 0;
282
-
283
-        foreach ($fragments as $fragment) {
284
-            $current_string = $fragment;
285
-            $words = mb_split('\s+', self::mbTrim($fragment));
286
-
287
-            $word_count = count($words);
288
-
289
-            // if the last word of the fragment starts with a capital letter, the fragment ends in "." and the word has at most 3 characters, trigger "is abbreviation"
290
-            $last_word = trim($words[$word_count - 1]);
291
-            $last_is_capital = preg_match('#^\p{Lu}#u', $last_word);
292
-            $last_is_abbreviation = substr(trim($fragment), -1) == '.';
293
-            $is_abbreviation = $last_is_capital > 0 && $last_is_abbreviation > 0 && mb_strlen($last_word) <= 3;
294
-
295
-            // merge previous fragment with this
296
-            if ($previous_is_abbreviation === true) {
297
-                $current_string = $previous_string . $current_string;
298
-            }
299
-            $return_fragment[$i] = $current_string;
300
-
301
-            $previous_is_abbreviation = $is_abbreviation;
302
-            $previous_string = $current_string;
303
-            // only increment if this isn't an abbreviation
304
-            if ($is_abbreviation === false) {
305
-                $i++;
306
-            }
307
-        }
308
-        return $return_fragment;
309
-    }
310
-
311
-    /**
312
-     * Merges any part starting with a closing parenthesis ')' to the previous
313
-     * part.
314
-     *
315
-     * @param string[] $parts
316
-     * @return string[]
317
-     */
318
-    private function parenthesesMerge($parts)
319
-    {
320
-        $subsentences = array();
321
-
322
-        foreach ($parts as $part) {
323
-            if ($part[0] === ')') {
324
-                $subsentences[count($subsentences) - 1] .= $part;
325
-            } else {
326
-                $subsentences[] = $part;
327
-            }
328
-        }
329
-
330
-        return $subsentences;
331
-    }
332
-
333
-    /**
334
-     * Looks for closing quotes to include them with the previous statement.
335
-     * "That was very interesting," he said.
336
-     * "That was very interesting."
337
-     *
338
-     * @param string[] $statements
339
-     * @return string[]
340
-     */
341
-    private function closeQuotesMerge($statements)
342
-    {
343
-        $i = 0;
344
-        $previous_statement = "";
345
-        $return = array();
346
-        foreach ($statements as $statement) {
347
-            // detect end quote - if the entire string is a quotation mark, or it's [quote, space, lowercase]
348
-            if (trim($statement) == '"' || trim($statement) == "'" ||
349
-                (
350
-                    (substr($statement, 0, 1) === '"' || substr($statement, 0, 1) === "'")
351
-                    and substr($statement, 1, 1) === ' '
352
-                    and ctype_lower(substr($statement, 2, 1)) === true
353
-                )
354
-            ) {
355
-                $statement = $previous_statement . $statement;
356
-            } else {
357
-                $i++;
358
-            }
359
-
360
-            $return[$i] = $statement;
361
-            $previous_statement = $statement;
362
-        }
363
-
364
-        return $return;
365
-    }
366
-
367
-    /**
368
-     * Merges items into larger sentences.
369
-     * Multibyte safe
370
-     *
371
-     * @param string[] $shorts
372
-     * @return string[]
373
-     */
374
-    private function sentenceMerge($shorts)
375
-    {
376
-        $non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);
377
-
378
-        $sentences = array();
379
-
380
-        $sentence = '';
381
-        $has_words = false;
382
-        $previous_word_ending = null;
383
-        foreach ($shorts as $short) {
384
-            $word_count = count(mb_split('\s+', self::mbTrim($short)));
385
-            $after_non_abbreviating_terminal = in_array($previous_word_ending, $non_abbreviating_terminals);
386
-
387
-            if ($after_non_abbreviating_terminal || ($has_words && $word_count > 1)) {
388
-                $sentences[] = $sentence;
389
-                $sentence = '';
390
-                $has_words = $word_count > 1;
391
-            } else {
392
-                $has_words = ($has_words || $word_count > 1);
393
-            }
394
-
395
-            $sentence .= $short;
396
-            $previous_word_ending = mb_substr($short, -1);
397
-        }
398
-        if (!empty($sentence)) {
399
-            $sentences[] = $sentence;
400
-        }
401
-
402
-        return $sentences;
403
-    }
404
-
405
-    /**
406
-     * Return the sentences detected in the provided text.
407
-     * Set the Sentence::SPLIT_TRIM flag to trim whitespace.
408
-     * @param string $text
409
-     * @param integer $flags
410
-     * @return string[]
411
-     */
412
-    public function split($text, $flags = 0)
413
-    {
414
-        $sentences = array();
415
-
416
-        // clean funny quotes
417
-        $text = self::cleanUnicode($text);
418
-
419
-        // Split
420
-        foreach (self::linebreakSplit($text) as $line) {
421
-            if (self::mbTrim($line) !== '') {
422
-                $punctuations = $this->punctuationSplit($line);
423
-                $parentheses = $this->parenthesesMerge($punctuations); // also works after punctuationMerge or abbreviationMerge
424
-                $merges = $this->punctuationMerge($parentheses);
425
-                $shorts = $this->abbreviationMerge($merges);
426
-                $quotes = $this->closeQuotesMerge($shorts);
427
-                $sentences = array_merge($sentences, $this->sentenceMerge($quotes));
428
-            }
429
-        }
430
-
431
-        // Post process
432
-        if ($flags & self::SPLIT_TRIM) {
433
-            foreach ($sentences as &$sentence) {
434
-                $sentence = self::mbTrim($sentence);
435
-            }
436
-            unset($sentence);
437
-        }
438
-
439
-        return $sentences;
440
-    }
441
-
442
-    /**
443
-     * Return the number of sentences detected in the provided text.
444
-     * @param string $text
445
-     * @return integer
446
-     */
447
-    public function count($text)
448
-    {
449
-        return count($this->split($text));
450
-    }
18
+	/**
19
+	 * Specify this flag with the split method to trim whitespace.
20
+	 */
21
+	const SPLIT_TRIM = 0x1;
22
+
23
+	/**
24
+	 * List of characters used to terminate sentences.
25
+	 *
26
+	 * @var string[]
27
+	 */
28
+	private $terminals = array('.', '!', '?');
29
+
30
+	/**
31
+	 * List of characters used for abbreviations.
32
+	 *
33
+	 * @var string[]
34
+	 */
35
+	private $abbreviators = array('.');
36
+
37
+	/**
38
+	 * Multibyte safe version of standard trim() function.
39
+	 *
40
+	 * @param string $string
41
+	 * @return string
42
+	 */
43
+	private static function mbTrim($string)
44
+	{
45
+		return mb_ereg_replace('^\s*([\s\S]*?)\s*$', '\1', $string);
46
+	}
47
+
48
+	/**
49
+	 * A cross between mb_split and preg_split, adding the preg_split flags
50
+	 * to mb_split.
51
+	 *
52
+	 * @param string $pattern
53
+	 * @param string $string
54
+	 * @param int $limit
55
+	 * @param int $flags
56
+	 * @return array
57
+	 */
58
+	private static function mbSplit($pattern, $string, $limit = -1, $flags = 0)
59
+	{
60
+		$split_no_empty = (bool)($flags & PREG_SPLIT_NO_EMPTY);
61
+		$offset_capture = (bool)($flags & PREG_SPLIT_OFFSET_CAPTURE);
62
+		$delim_capture = (bool)($flags & PREG_SPLIT_DELIM_CAPTURE);
63
+
64
+		$strlen = strlen($string); // bytes!
65
+		mb_ereg_search_init($string);
66
+
67
+		$lengths = array();
68
+		$position = 0;
69
+		while (($array = mb_ereg_search_pos($pattern, '')) !== false) {
70
+			// capture split
71
+			$lengths[] = array($array[0] - $position, false, null);
72
+
73
+			// move position
74
+			$position = $array[0] + $array[1];
75
+
76
+			// capture delimiter
77
+			$regs = mb_ereg_search_getregs();
78
+			$lengths[] = array($array[1], true, isset($regs[1]) && $regs[1]);
79
+
80
+			// Continue on?
81
+			if ($position >= $strlen) {
82
+				break;
83
+			}
84
+		}
85
+
86
+		// Add last bit, if not ending with split
87
+		$lengths[] = array($strlen - $position, false, null);
88
+
89
+		// Substrings
90
+		$parts = array();
91
+		$position = 0;
92
+		$count = 1;
93
+		foreach ($lengths as $length) {
94
+			$split_empty = ($length[0] || !$split_no_empty);
95
+			$is_delimiter = $length[1];
96
+			$is_captured = $length[2];
97
+
98
+			if ($limit > 0 && !$is_delimiter && $split_empty && ++$count > $limit) {
99
+				if ($length[0] > 0 || $split_empty) {
100
+					$parts[] = $offset_capture ? array(mb_strcut($string, $position), $position) : mb_strcut($string, $position);
101
+				}
102
+				break;
103
+			} elseif ((!$is_delimiter || ($delim_capture && $is_captured)) && ($length[0] || $split_empty)) {
104
+				$parts[] = $offset_capture ? array(mb_strcut($string, $position, $length[0]), $position) : mb_strcut($string, $position, $length[0]);
105
+			}
106
+
107
+			$position += $length[0];
108
+		}
109
+
110
+		return $parts;
111
+	}
112
+
113
+	/**
114
+	 * Breaks a piece of text into lines by linebreak.
115
+	 * Eats up any linebreak characters as if one.
116
+	 *
117
+	 * Multibyte safe
118
+	 *
119
+	 * @param string $text
120
+	 * @return string[]
121
+	 */
122
+	private static function linebreakSplit($text)
123
+	{
124
+		$lines = array();
125
+		$line = '';
126
+
127
+		foreach (self::mbSplit('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
128
+			$line .= $part;
129
+			if (self::mbTrim($part) === '') {
130
+				$lines[] = $line;
131
+				$line = '';
132
+			}
133
+		}
134
+		$lines[] = $line;
135
+
136
+		return $lines;
137
+	}
138
+
139
+	/**
140
+	 * Decodes HTML entities, then replaces typographic ("smart") quotes with plain ASCII quotes.
141
+	 *
142
+	 * @staticvar array $chr_map
143
+	 * @param string $string
144
+	 * @return string
145
+	 */
146
+	private static function cleanUnicode($string)
147
+	{
148
+		//https://stackoverflow.com/questions/20025030/convert-all-types-of-smart-quotes-with-php
149
+		static $character_map = array(
150
+			// Windows codepage 1252
151
+			"\xC2\x82" => "'", // U+0082⇒U+201A single low-9 quotation mark
152
+			"\xC2\x84" => '"', // U+0084⇒U+201E double low-9 quotation mark
153
+			"\xC2\x8B" => "'", // U+008B⇒U+2039 single left-pointing angle quotation mark
154
+			"\xC2\x91" => "'", // U+0091⇒U+2018 left single quotation mark
155
+			"\xC2\x92" => "'", // U+0092⇒U+2019 right single quotation mark
156
+			"\xC2\x93" => '"', // U+0093⇒U+201C left double quotation mark
157
+			"\xC2\x94" => '"', // U+0094⇒U+201D right double quotation mark
158
+			"\xC2\x9B" => "'", // U+009B⇒U+203A single right-pointing angle quotation mark
159
+			// Regular Unicode     // U+0022 quotation mark (")
160
+			// U+0027 apostrophe     (')
161
+			"\xC2\xAB" => '"', // U+00AB left-pointing double angle quotation mark
162
+			"\xC2\xBB" => '"', // U+00BB right-pointing double angle quotation mark
163
+			"\xE2\x80\x98" => "'", // U+2018 left single quotation mark
164
+			"\xE2\x80\x99" => "'", // U+2019 right single quotation mark
165
+			"\xE2\x80\x9A" => "'", // U+201A single low-9 quotation mark
166
+			"\xE2\x80\x9B" => "'", // U+201B single high-reversed-9 quotation mark
167
+			"\xE2\x80\x9C" => '"', // U+201C left double quotation mark
168
+			"\xE2\x80\x9D" => '"', // U+201D right double quotation mark
169
+			"\xE2\x80\x9E" => '"', // U+201E double low-9 quotation mark
170
+			"\xE2\x80\x9F" => '"', // U+201F double high-reversed-9 quotation mark
171
+			"\xE2\x80\xB9" => "'", // U+2039 single left-pointing angle quotation mark
172
+			"\xE2\x80\xBA" => "'", // U+203A single right-pointing angle quotation mark
173
+		);
174
+
175
+		$character = array_keys($character_map); // but: for efficiency you should
176
+		$replace = array_values($character_map); // pre-calculate these two arrays
177
+		return str_replace($character, $replace, html_entity_decode($string, ENT_QUOTES, "UTF-8"));
178
+	}
179
+
180
+	/**
181
+	 * Splits an array of lines by (consecutive sequences of)
182
+	 * terminals, keeping terminals.
183
+	 *
184
+	 * Multibyte safe (at least for UTF-8)
185
+	 *
186
+	 * For example:
187
+	 *    "There ... is. More!"
188
+	 *        ... becomes ...
189
+	 *    [ "There ", "...", " is", ".", " More", "!" ]
190
+	 *
191
+	 * @param string $line
192
+	 * @return string[]
193
+	 */
194
+	private function punctuationSplit($line)
195
+	{
196
+		$parts = array();
197
+
198
+		$chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY); // This is UTF8 multibyte safe!
199
+		$is_terminal = in_array($chars[0], $this->terminals);
200
+
201
+		$part = '';
202
+		foreach ($chars as $index => $char) {
203
+			if (in_array($char, $this->terminals) !== $is_terminal) {
204
+				$parts[] = $part;
205
+				$part = '';
206
+				$is_terminal = !$is_terminal;
207
+			}
208
+			$part .= $char;
209
+		}
210
+
211
+		if (!empty($part)) {
212
+			$parts[] = $part;
213
+		}
214
+
215
+		return $parts;
216
+	}
217
+
218
+	/**
219
+	 * Appends each terminal item after its preceding
220
+	 * non-terminals.
221
+	 *
222
+	 * Multibyte safe (at least for UTF-8)
223
+	 *
224
+	 * For example:
225
+	 *    [ "There ", "...", " is", ".", " More", "!" ]
226
+	 *        ... becomes ...
227
+	 *    [ "There ... is.", "More!" ]
228
+	 *
229
+	 * @param string[] $punctuations
230
+	 * @return string[]
231
+	 */
232
+	private function punctuationMerge($punctuations)
233
+	{
234
+		$definite_terminals = array_diff($this->terminals, $this->abbreviators);
235
+
236
+		$merges = array();
237
+		$merge = '';
238
+
239
+		foreach ($punctuations as $punctuation) {
240
+			if ($punctuation !== '') {
241
+				$merge .= $punctuation;
242
+				if (mb_strlen($punctuation) === 1 && in_array($punctuation, $this->terminals)) {
243
+					$merges[] = $merge;
244
+					$merge = '';
245
+				} else {
246
+					foreach ($definite_terminals as $terminal) {
247
+						if (mb_strpos($punctuation, $terminal) !== false) {
248
+							$merges[] = $merge;
249
+							$merge = '';
250
+							break;
251
+						}
252
+					}
253
+				}
254
+			}
255
+		}
256
+		if (!empty($merge)) {
257
+			$merges[] = $merge;
258
+		}
259
+
260
+		return $merges;
261
+	}
262
+
263
+	/**
264
+	 * Looks for capitalized abbreviations & includes them with the following fragment.
265
+	 *
266
+	 * For example:
267
+	 *    [ "Last week, former director of the F.B.I. James B. Comey was fired. Mr. Comey was not available for comment." ]
268
+	 *        ... becomes ...
269
+	 *    [ "Last week, former director of the F.B.I. James B. Comey was fired." ]
270
+	 *  [ "Mr. Comey was not available for comment." ]
271
+	 *
272
+	 * @param string[] $fragments
273
+	 * @return string[]
274
+	 */
275
+	private function abbreviationMerge($fragments)
276
+	{
277
+		$return_fragment = array();
278
+
279
+		$previous_string = '';
280
+		$previous_is_abbreviation = false;
281
+		$i = 0;
282
+
283
+		foreach ($fragments as $fragment) {
284
+			$current_string = $fragment;
285
+			$words = mb_split('\s+', self::mbTrim($fragment));
286
+
287
+			$word_count = count($words);
288
+
289
+			// if the last word of the fragment starts with a capital letter, the fragment ends in "." and the word has at most 3 characters, trigger "is abbreviation"
290
+			$last_word = trim($words[$word_count - 1]);
291
+			$last_is_capital = preg_match('#^\p{Lu}#u', $last_word);
292
+			$last_is_abbreviation = substr(trim($fragment), -1) == '.';
293
+			$is_abbreviation = $last_is_capital > 0 && $last_is_abbreviation > 0 && mb_strlen($last_word) <= 3;
294
+
295
+			// merge previous fragment with this
296
+			if ($previous_is_abbreviation === true) {
297
+				$current_string = $previous_string . $current_string;
298
+			}
299
+			$return_fragment[$i] = $current_string;
300
+
301
+			$previous_is_abbreviation = $is_abbreviation;
302
+			$previous_string = $current_string;
303
+			// only increment if this isn't an abbreviation
304
+			if ($is_abbreviation === false) {
305
+				$i++;
306
+			}
307
+		}
308
+		return $return_fragment;
309
+	}
310
+
311
+	/**
312
+	 * Merges any part starting with a closing parenthesis ')' to the previous
313
+	 * part.
314
+	 *
315
+	 * @param string[] $parts
316
+	 * @return string[]
317
+	 */
318
+	private function parenthesesMerge($parts)
319
+	{
320
+		$subsentences = array();
321
+
322
+		foreach ($parts as $part) {
323
+			if ($part[0] === ')') {
324
+				$subsentences[count($subsentences) - 1] .= $part;
325
+			} else {
326
+				$subsentences[] = $part;
327
+			}
328
+		}
329
+
330
+		return $subsentences;
331
+	}
332
+
333
+	/**
334
+	 * Looks for closing quotes to include them with the previous statement.
335
+	 * "That was very interesting," he said.
336
+	 * "That was very interesting."
337
+	 *
338
+	 * @param string[] $statements
339
+	 * @return string[]
340
+	 */
341
+	private function closeQuotesMerge($statements)
342
+	{
343
+		$i = 0;
344
+		$previous_statement = "";
345
+		$return = array();
346
+		foreach ($statements as $statement) {
347
+			// detect end quote - if the entire string is a quotation mark, or it's [quote, space, lowercase]
348
+			if (trim($statement) == '"' || trim($statement) == "'" ||
349
+				(
350
+					(substr($statement, 0, 1) === '"' || substr($statement, 0, 1) === "'")
351
+					and substr($statement, 1, 1) === ' '
352
+					and ctype_lower(substr($statement, 2, 1)) === true
353
+				)
354
+			) {
355
+				$statement = $previous_statement . $statement;
356
+			} else {
357
+				$i++;
358
+			}
359
+
360
+			$return[$i] = $statement;
361
+			$previous_statement = $statement;
362
+		}
363
+
364
+		return $return;
365
+	}
366
+
367
+	/**
368
+	 * Merges items into larger sentences.
369
+	 * Multibyte safe
370
+	 *
371
+	 * @param string[] $shorts
372
+	 * @return string[]
373
+	 */
374
+	private function sentenceMerge($shorts)
375
+	{
376
+		$non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);
377
+
378
+		$sentences = array();
379
+
380
+		$sentence = '';
381
+		$has_words = false;
382
+		$previous_word_ending = null;
383
+		foreach ($shorts as $short) {
384
+			$word_count = count(mb_split('\s+', self::mbTrim($short)));
385
+			$after_non_abbreviating_terminal = in_array($previous_word_ending, $non_abbreviating_terminals);
386
+
387
+			if ($after_non_abbreviating_terminal || ($has_words && $word_count > 1)) {
388
+				$sentences[] = $sentence;
389
+				$sentence = '';
390
+				$has_words = $word_count > 1;
391
+			} else {
392
+				$has_words = ($has_words || $word_count > 1);
393
+			}
394
+
395
+			$sentence .= $short;
396
+			$previous_word_ending = mb_substr($short, -1);
397
+		}
398
+		if (!empty($sentence)) {
399
+			$sentences[] = $sentence;
400
+		}
401
+
402
+		return $sentences;
403
+	}
404
+
405
+	/**
406
+	 * Return the sentences detected in the provided text.
407
+	 * Set the Sentence::SPLIT_TRIM flag to trim whitespace.
408
+	 * @param string $text
409
+	 * @param integer $flags
410
+	 * @return string[]
411
+	 */
412
+	public function split($text, $flags = 0)
413
+	{
414
+		$sentences = array();
415
+
416
+		// clean funny quotes
417
+		$text = self::cleanUnicode($text);
418
+
419
+		// Split
420
+		foreach (self::linebreakSplit($text) as $line) {
421
+			if (self::mbTrim($line) !== '') {
422
+				$punctuations = $this->punctuationSplit($line);
423
+				$parentheses = $this->parenthesesMerge($punctuations); // also works after punctuationMerge or abbreviationMerge
424
+				$merges = $this->punctuationMerge($parentheses);
425
+				$shorts = $this->abbreviationMerge($merges);
426
+				$quotes = $this->closeQuotesMerge($shorts);
427
+				$sentences = array_merge($sentences, $this->sentenceMerge($quotes));
428
+			}
429
+		}
430
+
431
+		// Post process
432
+		if ($flags & self::SPLIT_TRIM) {
433
+			foreach ($sentences as &$sentence) {
434
+				$sentence = self::mbTrim($sentence);
435
+			}
436
+			unset($sentence);
437
+		}
438
+
439
+		return $sentences;
440
+	}
441
+
442
+	/**
443
+	 * Return the number of sentences detected in the provided text.
444
+	 * @param string $text
445
+	 * @return integer
446
+	 */
447
+	public function count($text)
448
+	{
449
+		return count($this->split($text));
450
+	}
451 451
 
452 452
 }
Please login to merge, or discard this patch.
Spacing   +3 added lines, -3 removed lines patch added patch discarded remove patch
@@ -57,9 +57,9 @@
 block discarded – undo
57 57
      */
58 58
     private static function mbSplit($pattern, $string, $limit = -1, $flags = 0)
59 59
     {
60
-        $split_no_empty = (bool)($flags & PREG_SPLIT_NO_EMPTY);
61
-        $offset_capture = (bool)($flags & PREG_SPLIT_OFFSET_CAPTURE);
62
-        $delim_capture = (bool)($flags & PREG_SPLIT_DELIM_CAPTURE);
60
+        $split_no_empty = (bool) ($flags & PREG_SPLIT_NO_EMPTY);
61
+        $offset_capture = (bool) ($flags & PREG_SPLIT_OFFSET_CAPTURE);
62
+        $delim_capture = (bool) ($flags & PREG_SPLIT_DELIM_CAPTURE);
63 63
 
64 64
         $strlen = strlen($string); // bytes!
65 65
         mb_ereg_search_init($string);
Please login to merge, or discard this patch.