Passed
Push — master ( d7563a...f1d8ce )
by Martijn
01:40
created

Sentence::split()   A

Complexity

Conditions 5
Paths 8

Size

Total Lines 31
Code Lines 17

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 17
dl 0
loc 31
rs 9.3888
c 0
b 0
f 0
cc 5
nc 8
nop 2
1
<?php
2
3
namespace Vanderlee\Sentence;
4
5
/**
 * Segments text into sentences.
 * Clipping may not be perfect.
 * Sentence count should be VERY close to the truth.
 *
 * Multibyte safe (at least for UTF-8), but the rules are based on Germanic
 * language structure (English, Dutch, German). Should work for most
 * Latin-alphabet languages.
 *
 * @author Martijn van der Lee (@vanderlee)
 * @author @marktaw
 */
class Sentence
{
    /**
     * Specify this flag with the split method to trim whitespace.
     */
    const SPLIT_TRIM = 0x1;

    /**
     * List of characters used to terminate sentences.
     *
     * @var string[]
     */
    private $terminals = ['.', '!', '?'];

    /**
     * List of characters used for abbreviations.
     *
     * @var string[]
     */
    private $abbreviators = ['.'];

    /**
     * Breaks a piece of text into lines by linebreak.
     * A run of consecutive linebreak characters is treated as one break and
     * stays attached to the end of the line it terminates.
     *
     * The final element is whatever followed the last break and may be an
     * empty string; callers are expected to skip blank lines.
     *
     * @param string $text
     * @return string[]
     */
    private static function linebreakSplit($text)
    {
        $lines = [];
        $line = '';

        foreach (Multibyte::split('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
            $line .= $part;

            // A part that is empty after trimming (the captured linebreak run,
            // or a whitespace-only stretch of text) closes the current line.
            if (Multibyte::trim($part) === '') {
                $lines[] = $line;
                $line = '';
            }
        }
        $lines[] = $line;

        return $lines;
    }

    /**
     * Splits a line into alternating runs of non-terminal and terminal
     * characters, keeping the terminals.
     *
     * Multibyte safe (at least for UTF-8).
     *
     * For example:
     *    "There ... is. More!"
     *        ... becomes ...
     *    [ "There ", "...", " is", ".", " More", "!" ]
     *
     * @param string $line
     * @return string[] Empty when the line is empty or cannot be split as UTF-8.
     */
    private function punctuationSplit($line)
    {
        // The empty pattern with the "u" modifier splits between UTF-8 characters.
        $chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY);
        if ($chars === false || $chars === []) {
            return [];
        }

        $parts = [];
        $part = '';
        $is_terminal = in_array($chars[0], $this->terminals, true);

        foreach ($chars as $char) {
            // Start a new part whenever we cross a terminal/non-terminal boundary.
            if (in_array($char, $this->terminals, true) !== $is_terminal) {
                $parts[] = $part;
                $part = '';
                $is_terminal = !$is_terminal;
            }
            $part .= $char;
        }

        // Compare against '' rather than using empty(): empty('0') is true and
        // would silently drop a trailing "0" (e.g. the end of "Version 2.0").
        if ($part !== '') {
            $parts[] = $part;
        }

        return $parts;
    }

    /**
     * Appends each terminal item to its preceding non-terminals.
     *
     * A merged item is closed after a part that is exactly one terminal
     * character, or after any part containing a terminal that is not also an
     * abbreviator. Empty parts are skipped.
     *
     * Multibyte safe (at least for UTF-8).
     *
     * For example:
     *    [ "There ", "...", " is", ".", " More", "!" ]
     *        ... becomes ...
     *    [ "There ... is.", " More!" ]
     *
     * @param string[] $punctuations
     * @return string[]
     */
    private function punctuationMerge($punctuations)
    {
        $definite_terminals = array_diff($this->terminals, $this->abbreviators);

        $merges = [];
        $merge = '';

        foreach ($punctuations as $punctuation) {
            if ($punctuation === '') {
                continue;
            }

            $merge .= $punctuation;

            $closes = mb_strlen($punctuation) === 1
                && in_array($punctuation, $this->terminals, true);

            if (!$closes) {
                foreach ($definite_terminals as $terminal) {
                    if (mb_strpos($punctuation, $terminal) !== false) {
                        $closes = true;
                        break;
                    }
                }
            }

            if ($closes) {
                $merges[] = $merge;
                $merge = '';
            }
        }

        // Not empty(): a leftover of "0" is real text and must be kept.
        if ($merge !== '') {
            $merges[] = $merge;
        }

        return $merges;
    }

    /**
     * Looks for capitalized abbreviations & joins them with the following fragment.
     *
     * For example:
     *    [ "Mr.", " Comey was not available for comment." ]
     *        ... becomes ...
     *    [ "Mr. Comey was not available for comment." ]
     *
     * @param string[] $fragments
     * @return string[]
     */
    private function abbreviationMerge($fragments)
    {
        $return_fragment = [];

        $previous_fragment = '';
        $previous_is_abbreviation = false;
        $i = 0;

        foreach ($fragments as $fragment) {
            // Decide on the fragment as given, before anything is prepended to it.
            $is_abbreviation = self::isAbreviation($fragment);

            // Merge the previous fragment with this one.
            if ($previous_is_abbreviation) {
                $fragment = $previous_fragment . $fragment;
            }
            $return_fragment[$i] = $fragment;

            $previous_is_abbreviation = $is_abbreviation;
            $previous_fragment = $fragment;

            // Only move to the next slot if this isn't an abbreviation; otherwise
            // the slot is overwritten by the merged fragment on the next pass.
            if (!$is_abbreviation) {
                $i++;
            }
        }

        return $return_fragment;
    }

    /**
     * Check if the last word of the fragment starts with a capital, the
     * fragment ends in "." and the last word is at most 3 characters long.
     *
     * @param string $fragment
     * @return bool
     */
    private static function isAbreviation($fragment)
    {
        $trimmed = Multibyte::trim($fragment);
        $words = mb_split('\s+', $trimmed);
        $last_word = Multibyte::trim($words[count($words) - 1]);

        return preg_match('#^\p{Lu}#u', $last_word) === 1
            && mb_substr($trimmed, -1) === '.'
            && mb_strlen($last_word) <= 3;
    }

    /**
     * Merges any part starting with a closing parenthesis ')' to the previous
     * part. A leading part that starts with ')' has nothing to attach to and
     * is kept as its own part.
     *
     * @param string[] $parts
     * @return string[]
     */
    private function parenthesesMerge($parts)
    {
        $subsentences = [];

        foreach ($parts as $part) {
            $last = count($subsentences) - 1;

            // ')' is a single byte in UTF-8, so byte indexing is safe here; the
            // extra guards avoid reading offset 0 of '' and writing to index -1.
            if ($last >= 0 && $part !== '' && $part[0] === ')') {
                $subsentences[$last] .= $part;
            } else {
                $subsentences[] = $part;
            }
        }

        return $subsentences;
    }

    /**
     * Looks for closing quotes to include them with the previous statement.
     * "That was very interesting," he said.
     * "That was very interesting."
     *
     * @param string[] $statements
     * @return string[]
     */
    private function closeQuotesMerge($statements)
    {
        $merged = [];

        foreach ($statements as $statement) {
            $last = count($merged) - 1;

            if ($last >= 0 && self::isEndQuote($statement)) {
                $merged[$last] .= $statement;
            } else {
                $merged[] = $statement;
            }
        }

        return $merged;
    }

    /**
     * Check if the entire (trimmed) string is a quotation mark, or if it
     * starts with a quotation mark, then a space, then a lowercase letter.
     *
     * @param string $statement
     * @return bool
     */
    private static function isEndQuote($statement)
    {
        $quotes = ['"', '\''];

        if (in_array(Multibyte::trim($statement), $quotes, true)) {
            return true;
        }

        // \p{Ll} instead of ctype_lower(): the latter is byte based and never
        // matches a multibyte lowercase letter such as "é".
        return in_array(mb_substr($statement, 0, 1), $quotes, true)
            && mb_substr($statement, 1, 1) === ' '
            && preg_match('#^\p{Ll}#u', mb_substr($statement, 2, 1)) === 1;
    }

    /**
     * Merges items into larger sentences.
     *
     * A new sentence is started when the previous item ended in a terminal
     * that is not also an abbreviator, or when the sentence being built
     * already contains a multi-word item and the current item is multi-word
     * as well.
     *
     * @param string[] $shorts
     * @return string[]
     */
    private function sentenceMerge($shorts)
    {
        $non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);

        $sentences = [];

        $sentence = '';
        $has_words = false;
        $previous_word_ending = null;

        foreach ($shorts as $short) {
            $word_count = count(mb_split('\s+', Multibyte::trim($short)));
            $is_multi_word = $word_count > 1;
            $after_non_abbreviating_terminal = in_array(
                $previous_word_ending,
                $non_abbreviating_terminals,
                true
            );

            if ($after_non_abbreviating_terminal || ($has_words && $is_multi_word)) {
                $sentences[] = $sentence;

                $sentence = '';
                $has_words = false;
            }

            $has_words = $has_words || $is_multi_word;

            $sentence .= $short;
            $previous_word_ending = mb_substr($short, -1);
        }

        // Not empty(): a remaining sentence of "0" must not be discarded.
        if ($sentence !== '') {
            $sentences[] = $sentence;
        }

        return $sentences;
    }

    /**
     * Return the sentences detected in the provided text.
     * Set the Sentence::SPLIT_TRIM flag to trim whitespace.
     *
     * @param string $text
     * @param int $flags
     * @return string[]
     */
    public function split($text, $flags = 0)
    {
        // Each step receives the output of the previous one.
        static $pipeline = [
            'punctuationSplit',
            'parenthesesMerge', // also works after punctuationMerge or abbreviationMerge
            'punctuationMerge',
            'abbreviationMerge',
            'closeQuotesMerge',
            'sentenceMerge',
        ];

        // clean funny quotes
        $text = Multibyte::cleanUnicode($text);

        // Split
        $sentences = [];
        foreach (self::linebreakSplit($text) as $input) {
            if (Multibyte::trim($input) === '') {
                continue;
            }

            foreach ($pipeline as $method) {
                $input = $this->$method($input);
            }

            // Append directly instead of array_merge() so the accumulated
            // result is not copied for every line.
            foreach ($input as $sentence) {
                $sentences[] = $sentence;
            }
        }

        // Post process
        if ($flags & self::SPLIT_TRIM) {
            return self::trimSentences($sentences);
        }

        return $sentences;
    }

    /**
     * Multibyte trim each string in an array.
     *
     * @param string[] $sentences
     * @return string[]
     */
    private static function trimSentences($sentences)
    {
        return array_map(function ($sentence) {
            return Multibyte::trim($sentence);
        }, $sentences);
    }

    /**
     * Return the number of sentences detected in the provided text.
     *
     * @param string $text
     * @return int
     */
    public function count($text)
    {
        return count($this->split($text));
    }
}
385