Passed
Push — master ( 538637...3febfa )
by Martijn
02:31
created

Sentence::closeQuotesMerge()   A

Complexity

Conditions 3
Paths 3

Size

Total Lines 17
Code Lines 11

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
eloc 11
c 1
b 0
f 0
dl 0
loc 17
rs 9.9
cc 3
nc 3
nop 1
1
<?php
2
3
namespace Vanderlee\Sentence;
4
5
/**
6
 * Segments sentences.
7
 * Clipping may not be perfect.
8
 * Sentence count should be VERY close to the truth.
9
 *
10
 * Multibyte safe (at least for UTF-8), but rules based on Germanic
11
 * language structure (English, Dutch, German). Should work for most
12
 * latin-alphabet languages.
13
 *
14
 * @author Martijn van der Lee (@vanderlee)
15
 * @author @marktaw
16
 */
17
class Sentence
{

    /**
     * Specify this flag with the split method to trim whitespace.
     */
    const SPLIT_TRIM = 0x1;

    /**
     * List of characters used to terminate sentences.
     *
     * @var string[]
     */
    private $terminals = ['.', '!', '?'];

    /**
     * List of characters used for abbreviations.
     *
     * @var string[]
     */
    private $abbreviators = ['.'];

    /**
     * Numbers taken out of the line currently being processed, keyed by the
     * index used in their in-text replacement code.
     *
     * @var string[]
     */
    private $replacements = [];

    /**
     * Generate an in-text replacement code for the specified index.
     *
     * The code is the index wrapped in the control characters STX (0x02) and
     * ETX (0x03). The delimiters must be real characters, not the integers
     * 2 and 3: a purely numeric code such as "213" can also occur inside a
     * number that is being restored, which corrupts the restored text.
     *
     * @param int $index
     *
     * @return string
     */
    private function getReplaceCode($index)
    {
        return "\x02" . $index . "\x03";
    }

    /**
     * Clean floating point numbers by replacing them with an in-text code.
     *
     * Every run of digits, optionally followed by a dot and more digits, is
     * swapped for a replacement code so that the dot inside a number is not
     * seen as a terminal. The removed numbers are stored in
     * $this->replacements (any previously stored numbers are discarded).
     *
     * @param string $text
     *
     * @return string
     */
    private function replaceFloatNumbers($text)
    {
        $matches = [];
        preg_match_all('!\d+(?:\.\d+)?!', $text, $matches, PREG_OFFSET_CAPTURE);

        $this->replacements = [];
        if (empty($matches[0])) {
            return $text;
        }

        // Work from the last match to the first so the byte offsets of the
        // matches still to be processed remain valid while the text changes.
        // array_reverse() renumbers the list, so $index counts up from 0.
        foreach (array_reverse($matches[0]) as $index => $match) {
            $number = $match[0];
            $offset = $match[1];

            $this->replacements[$index] = $number;

            // Offsets reported by PCRE are in bytes and substr_replace() is
            // byte based, so the length has to be the byte length as well.
            $text = (string)substr_replace($text, $this->getReplaceCode($index), $offset, strlen($number));
        }

        return $text;
    }

    /**
     * Restore any stored replacements.
     *
     * All codes are substituted in a single pass per string, so text that
     * was just restored is never scanned again for other codes.
     *
     * @param string[] $text
     *
     * @return string[]
     */
    private function restoreReplacements($text)
    {
        if (empty($this->replacements)) {
            return $text;
        }

        $codes = [];
        foreach ($this->replacements as $index => $number) {
            $codes[$this->getReplaceCode($index)] = $number;
        }

        return array_map(function ($value) use ($codes) {
            return strtr($value, $codes);
        }, $text);
    }

    /**
     * Breaks a piece of text into lines by linebreak.
     * Eats up any linebreak characters as if one.
     *
     * Each returned line keeps the linebreak characters that ended it; the
     * text after the last linebreak (possibly empty) is the final item.
     *
     * @param string $text
     *
     * @return string[]
     */
    private static function linebreakSplit($text)
    {
        $lines = [];
        $line = '';

        foreach (Multibyte::split('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
            $line .= $part;
            // A part that trims down to nothing closes the current line.
            if (Multibyte::trim($part) === '') {
                $lines[] = $line;
                $line = '';
            }
        }
        $lines[] = $line;

        return $lines;
    }

    /**
     * Splits a line by (consecutive sequences of) terminals, keeping
     * terminals.
     *
     * Multibyte safe (at least for UTF-8)
     *
     * For example:
     *    "There ... is. More!"
     *        ... becomes ...
     *    [ "There ", "...", " is", ".", " More", "!" ]
     *
     * An empty line, or a line that cannot be split into UTF-8 characters,
     * yields an empty list.
     *
     * @param string $line
     *
     * @return string[]
     */
    private function punctuationSplit($line)
    {
        $chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY); // This is UTF8 multibyte safe!
        if (empty($chars)) {
            return [];
        }

        $parts = [];
        $part = '';
        $is_terminal = in_array($chars[0], $this->terminals, true);

        foreach ($chars as $char) {
            // Start a new part whenever we cross between terminal and
            // non-terminal characters.
            if (in_array($char, $this->terminals, true) !== $is_terminal) {
                $parts[] = $part;
                $part = '';
                $is_terminal = !$is_terminal;
            }
            $part .= $char;
        }

        // Compare against '' explicitly: empty() would also drop a part "0".
        if ($part !== '') {
            $parts[] = $part;
        }

        return $parts;
    }

    /**
     * Appends each terminal item after its preceding non-terminals.
     *
     * Multibyte safe (at least for UTF-8)
     *
     * For example:
     *    [ "There ", "...", " is", ".", " More", "!" ]
     *        ... becomes ...
     *    [ "There ... is.", " More!" ]
     *
     * A merge is closed by a single terminal character, or by any item that
     * contains a terminal which is not also an abbreviator.
     *
     * @param string[] $punctuations
     *
     * @return string[]
     */
    private function punctuationMerge($punctuations)
    {
        $definite_terminals = array_diff($this->terminals, $this->abbreviators);

        $merges = [];
        $merge = '';

        foreach ($punctuations as $punctuation) {
            if ($punctuation === '') {
                continue;
            }

            $merge .= $punctuation;

            $closes = mb_strlen($punctuation) === 1
                && in_array($punctuation, $this->terminals, true);

            if (!$closes) {
                foreach ($definite_terminals as $terminal) {
                    if (mb_strpos($punctuation, $terminal) !== false) {
                        $closes = true;
                        break;
                    }
                }
            }

            if ($closes) {
                $merges[] = $merge;
                $merge = '';
            }
        }

        // Compare against '' explicitly: empty() would also drop a merge "0".
        if ($merge !== '') {
            $merges[] = $merge;
        }

        return $merges;
    }

    /**
     * Looks for capitalized abbreviations & includes them with the following fragment.
     *
     * For example:
     *    [ "Last week, former director of the F.B.I. James B. Comey was fired. Mr. Comey was not available for comment." ]
     *        ... becomes ...
     *    [ "Last week, former director of the F.B.I. James B. Comey was fired." ]
     *  [ "Mr. Comey was not available for comment." ]
     *
     * @param string[] $fragments
     *
     * @return string[]
     */
    private function abbreviationMerge($fragments)
    {
        $merged = [];

        // Fragments ending in an abbreviation pile up here until a fragment
        // that does not end in one completes the item.
        $pending = '';
        foreach ($fragments as $fragment) {
            $pending .= $fragment;

            if (!self::isAbbreviation($fragment)) {
                $merged[] = $pending;
                $pending = '';
            }
        }

        // The input ended on an abbreviation; keep what was collected.
        if ($pending !== '') {
            $merged[] = $pending;
        }

        return $merged;
    }

    /**
     * Check if the last word of fragment starts with a Capital, the fragment
     * ends in "." & the last word has at most 3 characters.
     *
     * @param string $fragment
     *
     * @return bool
     */
    private static function isAbbreviation($fragment)
    {
        $trimmed = Multibyte::trim($fragment);
        $words = mb_split('\s+', $trimmed);
        $last_word = Multibyte::trim($words[count($words) - 1]);

        return preg_match('#^\p{Lu}#u', $last_word) === 1
            && mb_substr($trimmed, -1) === '.'
            && mb_strlen($last_word) <= 3;
    }

    /**
     * Merges any part starting with a closing parenthesis ')' to the previous
     * part.
     *
     * @param string[] $parts
     *
     * @return string[]
     */
    private function parenthesesMerge($parts)
    {
        $subSentences = [];

        foreach ($parts as $part) {
            // isset() keeps an empty part from raising a string offset warning.
            $closes_parenthesis = isset($part[0]) && $part[0] === ')';

            if ($closes_parenthesis && !empty($subSentences)) {
                $subSentences[count($subSentences) - 1] .= $part;
            } else {
                $subSentences[] = $part;
            }
        }

        return $subSentences;
    }

    /**
     * Looks for closing quotes to include them with the previous statement.
     * "That was very interesting," he said.
     * "That was very interesting."
     *
     * @param string[] $statements
     *
     * @return string[]
     */
    private function closeQuotesMerge($statements)
    {
        $merged = [];

        foreach ($statements as $statement) {
            $last = count($merged) - 1;

            if ($last >= 0 && self::isEndQuote($statement)) {
                $merged[$last] .= $statement;
            } else {
                $merged[] = $statement;
            }
        }

        return $merged;
    }

    /**
     * Check if the entire string is a quotation mark or quote, then space, then lowercase.
     *
     * True when the trimmed statement is just a single or double quote, or
     * when the statement starts with such a quote followed by one space and
     * a lowercase letter (any Unicode lowercase letter, not only a-z).
     *
     * @param string $statement
     *
     * @return bool
     */
    private static function isEndQuote($statement)
    {
        $quotes = ['"', '\''];

        if (in_array(Multibyte::trim($statement), $quotes, true)) {
            return true;
        }

        return in_array(mb_substr($statement, 0, 1), $quotes, true)
            && mb_substr($statement, 1, 1) === ' '
            && preg_match('#^\p{Ll}#u', mb_substr($statement, 2, 1)) === 1;
    }

    /**
     * Merges items into larger sentences.
     *
     * A new sentence is started when the previous item ended in a terminal
     * that is not an abbreviator, or when the sentence collected so far
     * already has an item of more than one word and the current item has
     * more than one word as well.
     *
     * @param string[] $shorts
     *
     * @return string[]
     */
    private function sentenceMerge($shorts)
    {
        $non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);

        $sentences = [];

        $sentence = '';
        $has_words = false;
        $previous_word_ending = null;
        foreach ($shorts as $short) {
            $word_count = count(mb_split('\s+', Multibyte::trim($short)));
            $after_non_abbreviating_terminal = in_array($previous_word_ending, $non_abbreviating_terminals, true);

            if ($after_non_abbreviating_terminal
                || ($has_words && $word_count > 1)) {
                $sentences[] = $sentence;

                $sentence = '';
                $has_words = false;
            }

            $has_words = $has_words
                || $word_count > 1;

            $sentence .= $short;
            $previous_word_ending = mb_substr($short, -1);
        }

        // Compare against '' explicitly: empty() would also drop a sentence "0".
        if ($sentence !== '') {
            $sentences[] = $sentence;
        }

        return $sentences;
    }

    /**
     * Return the sentences detected in the provided text.
     * Set the Sentence::SPLIT_TRIM flag to trim whitespace.
     *
     * @param string   $text
     * @param integer  $flags
     * @param string[] $pipeline Names of the methods of this class to run,
     *                           in order, on every non-blank line; each one
     *                           receives the result of the one before it.
     *                           The default pipeline is used when empty.
     *
     * @return string[]
     */
    public function split($text, $flags = 0, $pipeline = [])
    {
        if (empty($pipeline)) {
            $pipeline = [
                'replaceFloatNumbers',
                'punctuationSplit',
                'parenthesesMerge', // also works after punctuationMerge or abbreviationMerge
                'punctuationMerge',
                'abbreviationMerge',
                'closeQuotesMerge',
                'sentenceMerge',
                'restoreReplacements',
            ];
        }

        // clean funny quotes
        $text = Multibyte::cleanUnicode($text);

        // Split
        $sentences = [];
        foreach (self::linebreakSplit($text) as $input) {
            if (Multibyte::trim($input) === '') {
                continue;
            }

            foreach ($pipeline as $method) {
                $input = $this->$method($input);
            }
            $sentences = array_merge($sentences, $input);
        }

        // Post process
        if ($flags & self::SPLIT_TRIM) {
            return self::trimSentences($sentences);
        }

        return $sentences;
    }

    /**
     * Trim each string in an array using Multibyte::trim().
     *
     * @param string[] $sentences
     *
     * @return string[]
     */
    private static function trimSentences($sentences)
    {
        return array_map(function ($sentence) {
            return Multibyte::trim($sentence);
        }, $sentences);
    }

    /**
     * Return the number of sentences detected in the provided text.
     *
     * @param string $text
     *
     * @return integer
     */
    public function count($text)
    {
        return count($this->split($text));
    }

}
472