Passed
Push — master ( 16305b...fb2011 )
by Martijn
01:51
created

Sentence::abbreviationMerge()   C

Complexity

Conditions 7
Paths 9

Size

Total Lines 38
Code Lines 23

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 7
eloc 23
nc 9
nop 1
dl 0
loc 38
rs 6.7272
c 0
b 0
f 0
1
<?php

/**
 * Segments sentences.
 * Clipping may not be perfect.
 * Sentence count should be VERY close to the truth.
 *
 * Multibyte safe (at least for UTF-8), but rules based on germanic
 * language structure (English, Dutch, German). Should work for most
 * latin-alphabet languages.
 *
 * @author Martijn van der Lee (@vanderlee)
 * @author @marktaw
 */
class Sentence {

	/**
	 * Specify this flag with the split method to trim whitespace.
	 */
	const SPLIT_TRIM = 0x1;

	/**
	 * List of characters used to terminate sentences.
	 * @var array
	 */
	private $terminals = array('.', '!', '?');

	/**
	 * List of characters used for abbreviations.
	 * @var array
	 */
	private $abbreviators = array('.');

	/**
	 * Multibyte safe version of standard trim() function.
	 *
	 * @param string $string
	 * @return string
	 */
	private static function mbTrim($string)
	{
		return mb_ereg_replace('^\s*([\s\S]*?)\s*$', '\1', $string);
	}

	/**
	 * A cross between mb_split and preg_split, adding the preg_split flags
	 * to mb_split.
	 *
	 * Supports PREG_SPLIT_NO_EMPTY, PREG_SPLIT_DELIM_CAPTURE and
	 * PREG_SPLIT_OFFSET_CAPTURE. All offsets and lengths are in bytes.
	 *
	 * @param string $pattern mb_ereg pattern to split on
	 * @param string $string text to split
	 * @param int $limit maximum number of parts; values <= 0 mean "no limit"
	 * @param int $flags bitmask of PREG_SPLIT_* flags
	 * @return array
	 */
	private static function mbSplit($pattern, $string, $limit = -1, $flags = 0)
	{
		$strlen = strlen($string);  // bytes!
		mb_ereg_search_init($string);

		// Pass 1: record the text as alternating runs. Each entry is
		// array(byte length, is delimiter, delimiter was captured).
		$lengths = array();
		$position = 0;
		while (($array = mb_ereg_search_pos($pattern, '')) !== false) {
			// text between the previous delimiter and this one
			$lengths[] = array($array[0] - $position, false, null);

			// move position past the delimiter
			$position = $array[0] + $array[1];

			// the delimiter itself
			$regs = mb_ereg_search_getregs();
			$lengths[] = array($array[1], true, isset($regs[1]) && $regs[1]);

			// stop once the whole string has been consumed
			if ($position >= $strlen) {
				break;
			}
		}

		// Add last bit, if not ending with split
		$lengths[] = array($strlen - $position, false, null);

		// Pass 2: cut the substrings, applying limit and flags.
		$keep_empty = !($flags & PREG_SPLIT_NO_EMPTY);
		$with_offset = (bool) ($flags & PREG_SPLIT_OFFSET_CAPTURE);
		$with_delimiters = (bool) ($flags & PREG_SPLIT_DELIM_CAPTURE);

		$parts = array();
		$position = 0;
		$count = 1;
		foreach ($lengths as $length) {
			$is_delimiter = $length[1];
			$is_captured = $length[2];
			$is_reportable = $length[0] || $keep_empty;

			// $count is only advanced for reportable non-delimiter parts.
			if ($limit > 0 && !$is_delimiter && $is_reportable && ++$count > $limit) {
				// limit reached: the remainder of the string becomes the final part
				$rest = mb_strcut($string, $position);
				$parts[] = $with_offset ? array($rest, $position) : $rest;
				break;
			} elseif ((!$is_delimiter || ($with_delimiters && $is_captured)) && $is_reportable) {
				$piece = mb_strcut($string, $position, $length[0]);
				$parts[] = $with_offset ? array($piece, $position) : $piece;
			}

			$position += $length[0];
		}

		return $parts;
	}

	/**
	 * Breaks a piece of text into lines by linebreak.
	 * Eats up any linebreak characters as if one.
	 *
	 * Each returned line keeps its trailing linebreak characters. The final
	 * entry is whatever follows the last linebreak (possibly an empty string).
	 *
	 * Multibyte safe
	 *
	 * @param string $text
	 * @return array
	 */
	private static function linebreakSplit($text)
	{
		$lines = array();
		$line = '';

		foreach (self::mbSplit('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
			$line .= $part;
			// a whitespace-only part (such as the captured linebreak run) closes the line
			if (self::mbTrim($part) === '') {
				$lines[] = $line;
				$line = '';
			}
		}
		$lines[] = $line;

		return $lines;
	}

	/**
	 * Decodes HTML entities and replaces typographic ("smart") quotes with
	 * plain ASCII single or double quotes.
	 *
	 * @staticvar array $character_map
	 * @param string $string
	 * @return string
	 */
	private static function cleanUnicode($string)
	{
		//https://stackoverflow.com/questions/20025030/convert-all-types-of-smart-quotes-with-php
		static $character_map = array(
			// Windows codepage 1252
			"\xC2\x82" => "'", // U+0082⇒U+201A single low-9 quotation mark
			"\xC2\x84" => '"', // U+0084⇒U+201E double low-9 quotation mark
			"\xC2\x8B" => "'", // U+008B⇒U+2039 single left-pointing angle quotation mark
			"\xC2\x91" => "'", // U+0091⇒U+2018 left single quotation mark
			"\xC2\x92" => "'", // U+0092⇒U+2019 right single quotation mark
			"\xC2\x93" => '"', // U+0093⇒U+201C left double quotation mark
			"\xC2\x94" => '"', // U+0094⇒U+201D right double quotation mark
			"\xC2\x9B" => "'", // U+009B⇒U+203A single right-pointing angle quotation mark
			// Regular Unicode     // U+0022 quotation mark (")
			// U+0027 apostrophe     (')
			"\xC2\xAB" => '"', // U+00AB left-pointing double angle quotation mark
			"\xC2\xBB" => '"', // U+00BB right-pointing double angle quotation mark
			"\xE2\x80\x98" => "'", // U+2018 left single quotation mark
			"\xE2\x80\x99" => "'", // U+2019 right single quotation mark
			"\xE2\x80\x9A" => "'", // U+201A single low-9 quotation mark
			"\xE2\x80\x9B" => "'", // U+201B single high-reversed-9 quotation mark
			"\xE2\x80\x9C" => '"', // U+201C left double quotation mark
			"\xE2\x80\x9D" => '"', // U+201D right double quotation mark
			"\xE2\x80\x9E" => '"', // U+201E double low-9 quotation mark
			"\xE2\x80\x9F" => '"', // U+201F double high-reversed-9 quotation mark
			"\xE2\x80\xB9" => "'", // U+2039 single left-pointing angle quotation mark
			"\xE2\x80\xBA" => "'", // U+203A single right-pointing angle quotation mark
		);

		// The search/replace lists never change, so build them only once.
		static $search = null;
		static $replace = null;
		if ($search === null) {
			$search = array_keys($character_map);
			$replace = array_values($character_map);
		}

		return str_replace($search, $replace, html_entity_decode($string, ENT_QUOTES, "UTF-8"));
	}

	/**
	 * Splits a line by (consecutive sequences of)
	 * terminals, keeping terminals.
	 *
	 * Multibyte safe (at least for UTF-8)
	 *
	 * For example:
	 * 	"There ... is. More!"
	 * 		... becomes ...
	 * 	[ "There ", "...", " is", ".", " More", "!" ]
	 *
	 * @param string $line
	 * @return array
	 */
	private function punctuationSplit($line)
	{
		$chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY); // This is UTF8 multibyte safe!

		// preg_split() yields false when $line is not valid UTF-8; keep the
		// line as one part instead of dropping its text.
		if ($chars === false) {
			return $line === '' ? array() : array($line);
		}
		if (count($chars) === 0) {
			return array();
		}

		$parts = array();
		$part = '';
		$is_terminal = in_array($chars[0], $this->terminals, true);

		foreach ($chars as $char) {
			// switching between terminal and non-terminal characters starts a new part
			if (in_array($char, $this->terminals, true) !== $is_terminal) {
				$parts[] = $part;
				$part = '';
				$is_terminal = !$is_terminal;
			}
			$part .= $char;
		}

		// Compare against '' explicitly: empty() would also discard a trailing "0".
		if ($part !== '') {
			$parts[] = $part;
		}

		return $parts;
	}

	/**
	 * Appends each terminal item after its preceding
	 * non-terminals.
	 *
	 * Multibyte safe (at least for UTF-8)
	 *
	 * For example:
	 * 	[ "There ", "...", " is", ".", " More", "!" ]
	 * 		... becomes ...
	 * 	[ "There ... is.", " More!" ]
	 *
	 * @param array $punctuations
	 * @return array
	 */
	private function punctuationMerge($punctuations)
	{
		// terminals that can never be part of an abbreviation
		$definite_terminals = array_diff($this->terminals, $this->abbreviators);

		$merges = array();
		$merge = '';

		foreach ($punctuations as $punctuation) {
			if ($punctuation === '') {
				continue;
			}

			$merge .= $punctuation;

			if (mb_strlen($punctuation) === 1 && in_array($punctuation, $this->terminals, true)) {
				// a single terminal character closes the current merge
				$merges[] = $merge;
				$merge = '';
				continue;
			}

			// a longer run only closes the merge when it contains a definite terminal
			foreach ($definite_terminals as $terminal) {
				if (mb_strpos($punctuation, $terminal) !== false) {
					$merges[] = $merge;
					$merge = '';
					break;
				}
			}
		}

		// Compare against '' explicitly: empty() would also discard a trailing "0".
		if ($merge !== '') {
			$merges[] = $merge;
		}

		return $merges;
	}

	/**
	 * Looks for capitalized abbreviations & includes them with the following fragment.
	 *
	 * A fragment counts as ending in an abbreviation when its last word starts
	 * with an uppercase letter, is at most 3 characters long and the trimmed
	 * fragment ends in ".".
	 *
	 * For example:
	 * 	[ "Last week, former director of the F.B.I. James B. Comey was fired. Mr. Comey was not available for comment." ]
	 * 		... becomes ...
	 * 	[ "Last week, former director of the F.B.I. James B. Comey was fired." ]
	 *  [ "Mr. Comey was not available for comment." ]
	 *
	 * @param array $fragments
	 * @return array
	 */
	private function abbreviationMerge($fragments)
	{
		$return_fragment = array();

		$previous_string = '';
		$previous_is_abbreviation = false;
		$i = 0;

		foreach ($fragments as $fragment) {
			$current_string = $fragment;
			$words = mb_split('\s+', self::mbTrim($fragment));
			$last_word = trim($words[count($words) - 1]);

			$starts_with_capital = preg_match('#^\p{Lu}#u', $last_word) === 1;
			$ends_with_period = substr(trim($fragment), -1) === '.';
			$is_abbreviation = $starts_with_capital && $ends_with_period && mb_strlen($last_word) <= 3;

			// merge previous fragment with this
			if ($previous_is_abbreviation) {
				$current_string = $previous_string . $current_string;
			}
			$return_fragment[$i] = $current_string;

			$previous_is_abbreviation = $is_abbreviation;
			$previous_string = $current_string;

			// only move to the next slot if this isn't an abbreviation, so the
			// following fragment overwrites this slot with the merged text
			if (!$is_abbreviation) {
				$i++;
			}
		}

		return $return_fragment;
	}

	/**
	 * Merges any part starting with a closing parenthesis ')' to the previous
	 * part.
	 *
	 * A part starting with ')' that has no previous part is kept as-is.
	 *
	 * @param array $parts
	 * @return array
	 */
	private function parenthesesMerge($parts)
	{
		$subsentences = array();

		foreach ($parts as $part) {
			$last = count($subsentences) - 1;
			if ($last >= 0 && $part !== '' && $part[0] === ')') {
				$subsentences[$last] .= $part;
			} else {
				$subsentences[] = $part;
			}
		}

		return $subsentences;
	}

	/**
	 * Looks for closing quotes to include them with the previous statement.
	 *
	 * For example:
	 * 	"That was very interesting," he said.
	 * 	"That was very interesting."
	 *
	 * A statement is treated as a closing quote when it is nothing but a quote
	 * character (ignoring surrounding whitespace), or when it starts with a
	 * quote followed by a space and a lowercase ASCII letter.
	 *
	 * @param array $statements
	 * @return array
	 */
	private function closeQuotesMerge($statements)
	{
		$merged = array();

		foreach ($statements as $statement) {
			$trimmed = trim($statement);
			$first = substr($statement, 0, 1);

			$is_closing_quote = $trimmed === '"' || $trimmed === "'" ||
					(
					($first === '"' || $first === "'")
					&& substr($statement, 1, 1) === " "
					&& ctype_lower(substr($statement, 2, 1))
					);

			$last = count($merged) - 1;
			if ($is_closing_quote && $last >= 0) {
				$merged[$last] .= $statement;
			} else {
				$merged[] = $statement;
			}
		}

		return $merged;
	}

	/**
	 * Merges items into larger sentences.
	 *
	 * A new sentence starts after an item ending in a non-abbreviating
	 * terminal, or when a multi-word item follows a sentence that already
	 * contains a multi-word item.
	 *
	 * Multibyte safe
	 *
	 * @param array $shorts
	 * @return array
	 */
	private function sentenceMerge($shorts)
	{
		$non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);

		$sentences = array();

		$sentence = '';
		$has_words = false;
		$previous_word_ending = null;
		foreach ($shorts as $short) {
			$word_count = count(mb_split('\s+', self::mbTrim($short)));
			$is_multi_word = $word_count > 1;
			$after_non_abbreviating_terminal = in_array($previous_word_ending, $non_abbreviating_terminals, true);

			if ($after_non_abbreviating_terminal || ($has_words && $is_multi_word)) {
				$sentences[] = $sentence;
				$sentence = '';
				$has_words = $is_multi_word;
			} else {
				$has_words = ($has_words || $is_multi_word);
			}

			$sentence .= $short;
			$previous_word_ending = mb_substr($short, -1);
		}

		// Compare against '' explicitly: empty() would also discard a sentence of "0".
		if ($sentence !== '') {
			$sentences[] = $sentence;
		}

		return $sentences;
	}

	/**
	 * Return the sentences detected in the provided text.
	 * Set the Sentence::SPLIT_TRIM flag to trim whitespace.
	 *
	 * Note that HTML entities in the text are decoded and typographic quotes
	 * are replaced with plain ASCII quotes before splitting.
	 *
	 * @param string $text
	 * @param integer $flags
	 * @return array
	 */
	public function split($text, $flags = 0)
	{
		$sentences = array();

		// clean funny quotes
		$text = self::cleanUnicode($text);

		// Split
		foreach (self::linebreakSplit($text) as $line) {
			if (self::mbTrim($line) !== '') {
				$punctuations = $this->punctuationSplit($line);
				$parentheses = $this->parenthesesMerge($punctuations); // also works after punctuationMerge or abbreviationMerge
				$merges = $this->punctuationMerge($parentheses);
				$shorts = $this->abbreviationMerge($merges);
				$quotes = $this->closeQuotesMerge($shorts);
				$sentences = array_merge($sentences, $this->sentenceMerge($quotes));
			}
		}

		// Post process
		if ($flags & self::SPLIT_TRIM) {
			foreach ($sentences as &$sentence) {
				$sentence = self::mbTrim($sentence);
			}
			unset($sentence);
		}

		return $sentences;
	}

	/**
	 * Return the number of sentences detected in the provided text.
	 *
	 * @param string $text
	 * @return integer
	 */
	public function count($text)
	{
		return count($this->split($text));
	}

}