Passed
Pull Request — master (#16)
by Umut
02:46
created

Sentence::split()   A

Complexity

Conditions 5
Paths 8

Size

Total Lines 33
Code Lines 19

Duplication

Lines 0
Ratio 0 %

Importance

Changes 2
Bugs 0 Features 0
Metric Value
eloc 19
c 2
b 0
f 0
dl 0
loc 33
rs 9.3222
cc 5
nc 8
nop 2
1
<?php
2
3
namespace Vanderlee\Sentence;
4
5
/**
 * Segments sentences.
 * Clipping may not be perfect.
 * Sentence count should be VERY close to the truth.
 *
 * Multibyte safe (at least for UTF-8), but rules based on Germanic
 * language structure (English, Dutch, German). Should work for most
 * Latin-alphabet languages.
 *
 * @author Martijn van der Lee (@vanderlee)
 * @author @marktaw
 */
class Sentence
{

    /**
     * Specify this flag with the split method to trim whitespace.
     */
    const SPLIT_TRIM = 0x1;

    /**
     * List of characters used to terminate sentences.
     *
     * @var string[]
     */
    private $terminals = ['.', '!', '?'];

    /**
     * List of characters used for abbreviations.
     *
     * @var string[]
     */
    private $abbreviators = ['.'];

    /**
     * Numbers found in the line currently being processed, keyed by the
     * md5 placeholder that temporarily replaces them (placeholder => number).
     *
     * @var string[]
     */
    private $floatNumbers = [];

    /**
     * Replace every (floating point) number by its md5 hash, so the decimal
     * point is not mistaken for a sentence terminal by the later stages.
     *
     * The replacement is done in a single pass over the text. Replacing one
     * number at a time would also rewrite digits inside placeholders that
     * were inserted earlier, which makes them impossible to revert.
     *
     * @param string $text
     *
     * @return string
     */
    private function floatNumberClean(string $text)
    {
        // Placeholders only have to live until floatNumberRevert() runs for
        // this same line, so start over instead of growing without bound.
        $this->floatNumbers = [];

        $cleaned = preg_replace_callback('!\d+(?:\.\d+)?!', function ($match) {
            $hash = md5($match[0]);
            $this->floatNumbers[$hash] = $match[0];

            return $hash;
        }, $text);

        // preg_replace_callback() yields null on a PCRE error; keep the text as is then.
        return $cleaned === null ? $text : $cleaned;
    }

    /**
     * Put the original numbers back in place of their md5 placeholders.
     *
     * @param string[] $text
     *
     * @return string[]
     */
    private function floatNumberRevert($text)
    {
        $placeholders = $this->floatNumbers;

        // Nothing to revert. This also avoids strtr() with an empty
        // replacement array, which returns false before PHP 8.0.
        if ($placeholders === []) {
            return $text;
        }

        return array_map(static function ($value) use ($placeholders) {
            // strtr() substitutes all placeholders in one pass.
            return strtr($value, $placeholders);
        }, $text);
    }

    /**
     * Breaks a piece of text into lines by linebreak.
     * Eats up any linebreak characters as if one.
     *
     * Multibyte safe
     *
     * @param string $text
     * @return string[]
     */
    private static function linebreakSplit($text)
    {
        $lines = [];
        $line = '';

        // The delimiter is captured, so the linebreak runs are returned as parts too.
        foreach (Multibyte::split('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
            $line .= $part;

            // A part that trims to nothing closes the current line.
            if (Multibyte::trim($part) === '') {
                $lines[] = $line;
                $line = '';
            }
        }
        $lines[] = $line;

        return $lines;
    }

    /**
     * Splits a line by (consecutive sequences of) terminals, keeping
     * the terminals.
     *
     * Multibyte safe (at least for UTF-8)
     *
     * For example:
     *    "There ... is. More!"
     *        ... becomes ...
     *    [ "There ", "...", " is", ".", " More", "!" ]
     *
     * @param string $line
     * @return string[] Empty when the line is empty or cannot be split as UTF-8.
     */
    private function punctuationSplit($line)
    {
        $chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY); // This is UTF8 multibyte safe!

        // Empty input gives [], a failed split gives false; neither has a first character.
        if (empty($chars)) {
            return [];
        }

        $parts = [];
        $part = '';
        $is_terminal = in_array($chars[0], $this->terminals, true);

        foreach ($chars as $char) {
            $char_is_terminal = in_array($char, $this->terminals, true);

            // Switching between terminal and non-terminal characters starts a new part.
            if ($char_is_terminal !== $is_terminal) {
                $parts[] = $part;
                $part = '';
                $is_terminal = $char_is_terminal;
            }
            $part .= $char;
        }

        // Compare against '' explicitly: empty() would also discard a part that is "0".
        if ($part !== '') {
            $parts[] = $part;
        }

        return $parts;
    }

    /**
     * Appends each terminal item after its preceding
     * non-terminals.
     *
     * Multibyte safe (at least for UTF-8)
     *
     * For example:
     *    [ "There ", "...", " is", ".", " More", "!" ]
     *        ... becomes ...
     *    [ "There ... is.", " More!" ]
     *
     * @param string[] $punctuations
     * @return string[]
     */
    private function punctuationMerge($punctuations)
    {
        // Terminals that can never be part of an abbreviation.
        $definite_terminals = array_diff($this->terminals, $this->abbreviators);

        $merges = [];
        $merge = '';

        foreach ($punctuations as $punctuation) {
            if ($punctuation === '') {
                continue;
            }

            $merge .= $punctuation;

            // A single terminal character always closes the current merge.
            $closes = mb_strlen($punctuation) === 1
                && in_array($punctuation, $this->terminals, true);

            if (!$closes) {
                // Otherwise only a definite terminal anywhere in the item closes it.
                foreach ($definite_terminals as $terminal) {
                    if (mb_strpos($punctuation, $terminal) !== false) {
                        $closes = true;
                        break;
                    }
                }
            }

            if ($closes) {
                $merges[] = $merge;
                $merge = '';
            }
        }

        // Compare against '' explicitly: empty() would also discard a remainder that is "0".
        if ($merge !== '') {
            $merges[] = $merge;
        }

        return $merges;
    }

    /**
     * Looks for capitalized abbreviations & includes them with the following fragment.
     *
     * For example:
     *    [ "Last week, former director of the F.B.I. James B. Comey was fired. Mr. Comey was not available for comment." ]
     *        ... becomes ...
     *    [ "Last week, former director of the F.B.I. James B. Comey was fired." ]
     *    [ "Mr. Comey was not available for comment." ]
     *
     * @param string[] $fragments
     * @return string[]
     */
    private function abbreviationMerge($fragments)
    {
        $return_fragment = [];

        $previous_fragment = '';
        $previous_is_abbreviation = false;
        $i = 0;
        foreach ($fragments as $fragment) {
            $is_abbreviation = self::isAbreviation($fragment);

            // merge previous fragment with this
            if ($previous_is_abbreviation) {
                $fragment = $previous_fragment . $fragment;
            }
            // While the index is not advanced this overwrites the slot that
            // held the (shorter) fragment ending in an abbreviation.
            $return_fragment[$i] = $fragment;

            $previous_is_abbreviation = $is_abbreviation;
            $previous_fragment = $fragment;

            // only increment if this isn't an abbreviation
            if (!$is_abbreviation) {
                $i++;
            }
        }

        return $return_fragment;
    }

    /**
     * Check if the last word of the fragment starts with a capital, the
     * fragment ends in "." and that last word is at most 3 characters long
     * (the period included).
     *
     * @param string $fragment
     * @return bool
     */
    private static function isAbreviation($fragment)
    {
        $trimmed = Multibyte::trim($fragment);
        $words = mb_split('\s+', $trimmed);

        $last_word = Multibyte::trim($words[count($words) - 1]);

        $starts_with_capital = preg_match('#^\p{Lu}#u', $last_word) === 1;
        $ends_with_period = mb_substr($trimmed, -1) === '.';

        return $starts_with_capital
            && $ends_with_period
            && mb_strlen($last_word) <= 3;
    }

    /**
     * Merges any part starting with a closing parenthesis ')' to the previous
     * part.
     *
     * @param string[] $parts
     * @return string[]
     */
    private function parenthesesMerge($parts)
    {
        $subsentences = [];

        foreach ($parts as $part) {
            $last = count($subsentences) - 1;

            // Without a previous part there is nothing to attach to; keep the
            // part on its own instead of writing to a non-existent index -1.
            if ($last >= 0 && strncmp($part, ')', 1) === 0) {
                $subsentences[$last] .= $part;
            } else {
                $subsentences[] = $part;
            }
        }

        return $subsentences;
    }

    /**
     * Looks for closing quotes to include them with the previous statement.
     * "That was very interesting," he said.
     * "That was very interesting."
     *
     * @param string[] $statements
     * @return string[]
     */
    private function closeQuotesMerge($statements)
    {
        $merged = [];

        foreach ($statements as $statement) {
            $last = count($merged) - 1;

            if ($last >= 0 && self::isEndQuote($statement)) {
                // The closing quote belongs to the statement before it.
                $merged[$last] .= $statement;
            } else {
                $merged[] = $statement;
            }
        }

        return $merged;
    }

    /**
     * Check if the entire string is a quotation mark, or starts with a
     * quotation mark, then a space, then a lowercase letter.
     *
     * @param string $statement
     * @return bool
     */
    private static function isEndQuote($statement)
    {
        if (in_array(Multibyte::trim($statement), ['"', '\''], true)) {
            return true;
        }

        // \p{Ll} also recognizes non-ASCII lowercase letters, which the
        // byte-oriented ctype_lower() does not.
        return preg_match('/^["\'] \p{Ll}/u', $statement) === 1;
    }

    /**
     * Merges items into larger sentences.
     * Multibyte safe
     *
     * @param string[] $shorts
     * @return string[]
     */
    private function sentenceMerge($shorts)
    {
        $non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);

        $sentences = [];

        $sentence = '';
        $has_words = false;
        $previous_word_ending = null;
        foreach ($shorts as $short) {
            $word_count = count(mb_split('\s+', Multibyte::trim($short)));
            $after_non_abbreviating_terminal = in_array(
                $previous_word_ending,
                $non_abbreviating_terminals,
                true
            );

            // Close the running sentence after "!" or "?", or when it already
            // holds a multi-word item and another multi-word item arrives.
            if ($after_non_abbreviating_terminal
                || ($has_words && $word_count > 1)) {
                $sentences[] = $sentence;

                $sentence = '';
                $has_words = false;
            }

            $has_words = $has_words
                || $word_count > 1;

            $sentence .= $short;
            $previous_word_ending = mb_substr($short, -1);
        }

        // Compare against '' explicitly: empty() would also discard a sentence that is "0".
        if ($sentence !== '') {
            $sentences[] = $sentence;
        }

        return $sentences;
    }

    /**
     * Return the sentences detected in the provided text.
     * Set the Sentence::SPLIT_TRIM flag to trim whitespace.
     *
     * @param string $text
     * @param integer $flags
     * @return string[]
     */
    public function split($text, $flags = 0)
    {
        // Each stage receives the output of the previous one; the first
        // takes a string, all later ones pass arrays of strings along.
        static $pipeline = [
            'floatNumberClean',
            'punctuationSplit',
            'parenthesesMerge', // also works after punctuationMerge or abbreviationMerge
            'punctuationMerge',
            'abbreviationMerge',
            'closeQuotesMerge',
            'sentenceMerge',
            'floatNumberRevert'
        ];

        // clean funny quotes
        $text = Multibyte::cleanUnicode($text);

        // Split
        $sentences = [];
        foreach (self::linebreakSplit($text) as $input) {
            // Lines that trim to nothing are skipped.
            if (Multibyte::trim($input) !== '') {
                foreach ($pipeline as $method) {
                    $input = $this->$method($input);
                }
                $sentences = array_merge($sentences, $input);
            }
        }

        // Post process
        if ($flags & self::SPLIT_TRIM) {
            return self::trimSentences($sentences);
        }

        return $sentences;
    }

    /**
     * Multibyte trim each string in an array.
     *
     * @param string[] $sentences
     * @return string[]
     */
    private static function trimSentences($sentences)
    {
        return array_map(static function ($sentence) {
            return Multibyte::trim($sentence);
        }, $sentences);
    }

    /**
     * Return the number of sentences detected in the provided text.
     *
     * @param string $text
     * @return integer
     */
    public function count($text)
    {
        return count($this->split($text));
    }

}
432