Passed
Push — master ( bd113f...e772b1 )
by Martijn
02:09
created

Sentence::getReplaceCode()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 1
c 0
b 0
f 0
dl 0
loc 3
rs 10
cc 1
nc 1
nop 1
1
<?php
2
3
namespace Vanderlee\Sentence;
4
5
/**
6
 * Segments sentences.
7
 * Clipping may not be perfect.
8
 * Sentence count should be VERY close to the truth.
9
 *
10
 * Multibyte safe (at least for UTF-8), but rules based on Germanic
11
 * language structure (English, Dutch, German). Should work for most
12
 * latin-alphabet languages.
13
 *
14
 * @author Martijn van der Lee (@vanderlee)
15
 * @author @marktaw
16
 */
17
class Sentence
18
{
19
    /**
20
     * Specify this flag with the split method to trim whitespace.
21
     */
22
    const SPLIT_TRIM = 0x1;
23
24
    /**
25
     * List of characters used to terminate sentences.
26
     *
27
     * @var string[]
28
     */
29
    private $terminals = ['.', '!', '?'];
30
31
    /**
32
     * List of characters used for abbreviations.
33
     *
34
     * @var string[]
35
     */
36
    private $abbreviators = ['.'];
37
38
    /**
39
     * Numbers taken out of the text, keyed by the index used in their in-text replacement code
40
     *
41
     * @var string[]
42
     */
43
    private $replacements = [];
44
45
    /**
     * Generate an in-text replacement code for the specified index.
     *
     * The code is the index wrapped in the control characters STX (0x02) and
     * ETX (0x03), so a code can never be mistaken for digits that are part of
     * the text itself.
     *
     * @param string $index Index of the replacement the code stands for.
     *
     * @return string
     */
    private function getReplaceCode(string $index)
    {
        // Use string escapes: the integer literals 0x02 and 0x03 would be
        // concatenated as the decimal digits "2" and "3".
        return "\x02" . $index . "\x03";
    }
56
57
    /**
     * Replace every integer or decimal number in the text with an in-text
     * replacement code and remember the original number.
     *
     * The removed numbers are stored in $this->replacements, keyed by the
     * index that is embedded in the code produced by getReplaceCode(). Any
     * previously stored replacements are discarded.
     *
     * @param string $text
     *
     * @return string The text with all numbers replaced by their codes.
     */
    private function replaceFloatNumbers(string $text)
    {
        $this->replacements = [];

        // Nothing to do when there are no numbers (0) or matching failed (false).
        if (!preg_match_all('!\d+(?:\.\d+)?!', $text, $matches, PREG_OFFSET_CAPTURE)) {
            return $text;
        }

        // Work from the last match to the first so the byte offsets of the
        // matches that still have to be replaced stay valid.
        foreach (array_reverse($matches[0]) as $index => $match) {
            $number = $match[0];
            $offset = $match[1];

            $this->replacements[$index] = $number;

            // PREG_OFFSET_CAPTURE offsets and substr_replace() both count
            // bytes, so the length must be a byte length as well.
            $text = substr_replace(
                $text,
                $this->getReplaceCode((string) $index),
                $offset,
                strlen($number)
            );
        }

        // substr_replace() is declared as string|array; the subject is a string here.
        return (string) $text;
    }
84
85
    /**
     * Put the stored numbers back in place of their in-text codes.
     *
     * Every stored replacement is applied, in stored order, to every string
     * in the list. Array keys of the input are kept.
     *
     * @param string[] $text List of text fragments containing replacement codes.
     *
     * @return string[] The fragments with the codes replaced by the numbers.
     */
    private function restoreReplacements($text)
    {
        // Build the code/number pairs once; they are the same for every fragment.
        $pairs = [];
        foreach ($this->replacements as $index => $number) {
            $pairs[] = [$this->getReplaceCode($index), $number];
        }

        $restored = [];
        foreach ($text as $key => $fragment) {
            foreach ($pairs as $pair) {
                $fragment = str_replace($pair[0], $pair[1], $fragment);
            }
            $restored[$key] = $fragment;
        }

        return $restored;
    }
102
103
    /**
     * Cut a text into lines on runs of linebreak characters.
     *
     * The text is split on the pattern ([\r\n]+) with the delimiters captured,
     * so each linebreak run stays attached to the end of the line before it.
     * A line is closed whenever a piece trims to an empty string. The
     * remainder after the last break is always appended, even when empty.
     *
     * @param string $text
     * @return string[]
     */
    private static function linebreakSplit($text)
    {
        $pieces = Multibyte::split('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE);

        $result = [];
        $buffer = '';
        foreach ($pieces as $piece) {
            $buffer .= $piece;

            // Pieces with visible content keep accumulating.
            if (Multibyte::trim($piece) !== '') {
                continue;
            }

            // A whitespace-only piece (the captured linebreak run) ends the line.
            $result[] = $buffer;
            $buffer = '';
        }
        $result[] = $buffer;

        return $result;
    }
128
129
    /**
     * Splits a line into alternating runs of non-terminal characters and
     * (consecutive sequences of) terminals, keeping the terminals.
     *
     * Multibyte safe (at least for UTF-8).
     *
     * For example:
     *    "There ... is. More!"
     *        ... becomes ...
     *    [ "There ", "...", " is", ".", " More", "!" ]
     *
     * An empty line gives an empty list. A line that cannot be split into
     * UTF-8 characters is returned as a single, unsplit part.
     *
     * @param string $line
     * @return string[]
     */
    private function punctuationSplit($line)
    {
        $chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY); // This is UTF8 multibyte safe!

        // preg_split() gives false on invalid UTF-8 and [] on an empty line;
        // in both cases there is no first character to inspect.
        if (empty($chars)) {
            return ((string) $line === '') ? [] : [$line];
        }

        $parts = [];
        $part = '';
        $is_terminal = in_array($chars[0], $this->terminals, true);

        foreach ($chars as $char) {
            // Start a new part each time we cross between terminals and non-terminals.
            if (in_array($char, $this->terminals, true) !== $is_terminal) {
                $parts[] = $part;
                $part = '';
                $is_terminal = !$is_terminal;
            }
            $part .= $char;
        }

        // At least one character was processed, so the last part is never
        // empty. Do not test it with empty(): that would drop a part "0".
        $parts[] = $part;

        return $parts;
    }
166
167
    /**
     * Appends each terminal item after its preceding non-terminals.
     *
     * A merged item is closed after a part that is exactly one terminal
     * character, or after a part containing a terminal that is not also an
     * abbreviator. Empty parts are ignored. Leading whitespace of a part is
     * kept.
     *
     * Multibyte safe (at least for UTF-8).
     *
     * For example:
     *    [ "There ", "...", " is", ".", " More", "!" ]
     *        ... becomes ...
     *    [ "There ... is.", " More!" ]
     *
     * @param string[] $punctuations
     * @return string[]
     */
    private function punctuationMerge($punctuations)
    {
        $definite_terminals = array_diff($this->terminals, $this->abbreviators);

        $merges = [];
        $merge = '';

        foreach ($punctuations as $punctuation) {
            if ($punctuation === '') {
                continue;
            }

            $merge .= $punctuation;

            $closes = mb_strlen($punctuation) === 1
                && in_array($punctuation, $this->terminals, true);

            if (!$closes) {
                foreach ($definite_terminals as $terminal) {
                    if (mb_strpos($punctuation, $terminal) !== false) {
                        $closes = true;
                        break;
                    }
                }
            }

            if ($closes) {
                $merges[] = $merge;
                $merge = '';
            }
        }

        // Compare with '' rather than empty(), which would drop a trailing "0".
        if ($merge !== '') {
            $merges[] = $merge;
        }

        return $merges;
    }
214
215
    /**
     * Glues fragments that end in a capitalized abbreviation onto the
     * fragment that follows them.
     *
     * For example:
     *    [ "Last week, former director of the F.B.I. James B. Comey was fired. Mr. Comey was not available for comment." ]
     *        ... becomes ...
     *    [ "Last week, former director of the F.B.I. James B. Comey was fired." ]
     *    [ "Mr. Comey was not available for comment." ]
     *
     * Each fragment is tested on its own (before merging). A trailing
     * abbreviation with nothing after it is still returned.
     *
     * @param string[] $fragments
     * @return string[]
     */
    private function abbreviationMerge($fragments)
    {
        $merged = [];
        $buffer = '';
        $pending = false;

        foreach ($fragments as $fragment) {
            // The buffer is empty unless the previous fragment was an abbreviation.
            $buffer .= $fragment;
            $pending = self::isAbreviation($fragment);

            if (!$pending) {
                $merged[] = $buffer;
                $buffer = '';
            }
        }

        // The input ended on an abbreviation: emit what was collected so far.
        if ($pending) {
            $merged[] = $buffer;
        }

        return $merged;
    }
253
254
    /**
     * Tell whether a fragment ends in what looks like a capitalized abbreviation.
     *
     * True when the trimmed fragment ends in ".", and its last
     * whitespace-separated word starts with an uppercase letter and is at
     * most 3 characters long (the trailing "." included).
     *
     * @param string $fragment
     * @return bool
     */
    private static function isAbreviation($fragment)
    {
        $trimmed = Multibyte::trim($fragment);

        if (mb_substr($trimmed, -1) !== '.') {
            return false;
        }

        $words = mb_split('\s+', $trimmed);
        $final = Multibyte::trim($words[count($words) - 1]);

        if (mb_strlen($final) > 3) {
            return false;
        }

        return preg_match('#^\p{Lu}#u', $final) > 0;
    }
274
275
    /**
     * Merges any part starting with a closing parenthesis ')' to the previous
     * part.
     *
     * A part starting with ')' that has no previous part to attach to is kept
     * as a part of its own. The result is always a plain list.
     *
     * @param string[] $parts
     * @return string[]
     */
    private function parenthesesMerge($parts)
    {
        $subsentences = [];

        foreach ($parts as $part) {
            $last = count($subsentences) - 1;

            // Only merge when there is a previous part; otherwise we would
            // write to index -1. Check for '' before reading offset 0.
            if ($last >= 0 && $part !== '' && $part[0] === ')') {
                $subsentences[$last] .= $part;
            } else {
                $subsentences[] = $part;
            }
        }

        return $subsentences;
    }
296
297
    /**
     * Attaches closing-quote statements to the statement before them.
     *
     * For example, the closing quote and what follows it stay with:
     * "That was very interesting," he said.
     * "That was very interesting."
     *
     * Note: a statement that is not a closing quote always opens a new slot,
     * so the returned array is keyed from 1 unless the very first statement
     * is itself a closing quote.
     *
     * @param string[] $statements
     * @return string[]
     */
    private function closeQuotesMerge($statements)
    {
        $merged = [];
        $slot = 0;
        $current = '';

        foreach ($statements as $statement) {
            if (self::isEndQuote($statement)) {
                // Extend the statement in the current slot.
                $current .= $statement;
            } else {
                // Open the next slot for a fresh statement.
                ++$slot;
                $current = $statement;
            }

            $merged[$slot] = $current;
        }

        return $merged;
    }
323
324
    /**
     * Check if the statement closes a quotation.
     *
     * True when the trimmed statement is only a quotation mark (" or '), or
     * when the statement starts with a quotation mark, then a space, then a
     * lowercase letter. The lowercase test is Unicode aware, so it also
     * recognizes non-ASCII letters such as "é".
     *
     * @param string $statement
     * @return bool
     */
    private static function isEndQuote($statement)
    {
        $quotes = ['"', '\''];

        if (in_array(Multibyte::trim($statement), $quotes, true)) {
            return true;
        }

        // ctype_lower() works on bytes in the current locale and never matches
        // a multibyte character; \p{Ll} with the /u modifier does.
        return in_array(mb_substr($statement, 0, 1), $quotes, true)
            && mb_substr($statement, 1, 1) === ' '
            && preg_match('/^\p{Ll}/u', mb_substr($statement, 2, 1)) === 1;
    }
342
343
    /**
     * Merges items into larger sentences.
     * Multibyte safe.
     *
     * A new sentence is started before an item when the previous item ended
     * in a terminal that is not also an abbreviator, or when the sentence
     * collected so far already contains a multi-word item and the current
     * item consists of more than one word as well.
     *
     * @param string[] $shorts
     * @return string[]
     */
    private function sentenceMerge($shorts)
    {
        $non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);

        $sentences = [];

        $sentence = '';
        $has_words = false;
        $previous_word_ending = null;
        foreach ($shorts as $short) {
            $word_count = count(mb_split('\s+', Multibyte::trim($short)));
            $after_non_abbreviating_terminal = in_array(
                $previous_word_ending,
                $non_abbreviating_terminals,
                true
            );

            if ($after_non_abbreviating_terminal
                || ($has_words && $word_count > 1)) {
                $sentences[] = $sentence;

                $sentence = '';
                $has_words = false;
            }

            $has_words = $has_words
                || $word_count > 1;

            $sentence .= $short;
            $previous_word_ending = mb_substr($short, -1);
        }

        // Compare with '' rather than empty(), which would drop a final sentence "0".
        if ($sentence !== '') {
            $sentences[] = $sentence;
        }

        return $sentences;
    }
385
386
    /**
     * Return the sentences detected in the provided text.
     *
     * The text is cleaned, cut into lines, and every non-blank line is run
     * through the processing stages in order. The sentences of all lines are
     * collected into one list.
     *
     * Set the Sentence::SPLIT_TRIM flag to trim whitespace from each sentence.
     *
     * @param string $text
     * @param int $flags Bitmask; only Sentence::SPLIT_TRIM is checked.
     * @return string[]
     */
    public function split($text, $flags = 0)
    {
        // Names of the methods each line passes through, in this order.
        $stages = [
            'replaceFloatNumbers',
            'punctuationSplit',
            'parenthesesMerge', // also works after punctuationMerge or abbreviationMerge
            'punctuationMerge',
            'abbreviationMerge',
            'closeQuotesMerge',
            'sentenceMerge',
            'restoreReplacements'
        ];

        // clean funny quotes
        $cleaned = Multibyte::cleanUnicode($text);

        $sentences = [];
        foreach (self::linebreakSplit($cleaned) as $line) {
            // Blank lines do not contribute sentences.
            if (Multibyte::trim($line) === '') {
                continue;
            }

            $result = $line;
            foreach ($stages as $stage) {
                $result = $this->{$stage}($result);
            }

            $sentences = array_merge($sentences, $result);
        }

        return ($flags & self::SPLIT_TRIM)
            ? self::trimSentences($sentences)
            : $sentences;
    }
427
428
    /**
     * Apply Multibyte::trim() to each string in an array, keeping the keys.
     *
     * @param string[] $sentences
     * @return string[]
     */
    private static function trimSentences($sentences)
    {
        $trimmed = [];
        foreach ($sentences as $key => $sentence) {
            $trimmed[$key] = Multibyte::trim($sentence);
        }

        return $trimmed;
    }
439
440
    /**
     * Return the number of sentences detected in the provided text.
     *
     * This is the size of the list returned by split() without any flags.
     *
     * @param string $text
     * @return int
     */
    public function count($text)
    {
        $sentences = $this->split($text);

        return count($sentences);
    }
449
450
}
451