Passed
Push — master ( e85d9b...d7563a )
by Martijn
01:34
created

Sentence   B

Complexity

Total Complexity 46

Size/Duplication

Total Lines 340
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
wmc 46
eloc 133
dl 0
loc 340
rs 8.72
c 0
b 0
f 0

10 Methods

Rating   Name   Duplication   Size   Complexity  
A linebreakSplit() 0 15 3
A count() 0 3 1
B abbreviationMerge() 0 36 6
A split() 0 31 5
A trimSentences() 0 5 1
B punctuationMerge() 0 30 8
A parenthesesMerge() 0 13 3
B sentenceMerge() 0 31 7
A punctuationSplit() 0 22 4
B closeQuotesMerge() 0 26 8

How to fix   Complexity   

Complex Class

Complex classes like Sentence often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use Sentence, and based on these observations, apply Extract Interface, too.

1
<?php
2
3
namespace Vanderlee\Sentence;
4
5
/**
6
 * Segments sentences.
7
 * Clipping may not be perfect.
8
 * Sentence count should be VERY close to the truth.
9
 *
10
 * Multibyte.php safe (at least for UTF-8), but rules based on Germanic
11
 * language structure (English, Dutch, German). Should work for most
12
 * latin-alphabet languages.
13
 *
14
 * @author Martijn van der Lee (@vanderlee)
15
 * @author @marktaw
16
 */
17
class Sentence
{

    /**
     * Specify this flag with the split method to trim whitespace.
     */
    const SPLIT_TRIM = 0x1;

    /**
     * List of characters used to terminate sentences.
     *
     * @var string[]
     */
    private $terminals = ['.', '!', '?'];

    /**
     * List of characters used for abbreviations.
     *
     * @var string[]
     */
    private $abbreviators = ['.'];

    /**
     * Breaks a piece of text into lines by linebreak.
     * A run of consecutive linebreak characters is treated as a single break
     * and stays attached to the end of the line it terminates.
     *
     * NOTE(review): relies on Multibyte::split() accepting preg_split()-style
     * arguments and returning the captured delimiters as separate parts;
     * confirm against Multibyte.
     *
     * @param string $text
     * @return string[] Lines; the last entry may be an empty string.
     */
    private static function linebreakSplit($text)
    {
        $lines = [];
        $line = '';

        foreach (Multibyte::split('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
            $line .= $part;

            // A part that trims down to nothing closes the current line.
            if (Multibyte::trim($part) === '') {
                $lines[] = $line;
                $line = '';
            }
        }

        // Whatever follows the last break (possibly nothing) is the final line.
        $lines[] = $line;

        return $lines;
    }

    /**
     * Splits a line into alternating runs of terminal and non-terminal
     * characters, keeping the terminals.
     *
     * Multibyte safe (at least for UTF-8).
     *
     * For example:
     *    "There ... is. More!"
     *        ... becomes ...
     *    [ "There ", "...", " is", ".", " More", "!" ]
     *
     * @param string $line
     * @return string[] Empty when $line is empty or cannot be split.
     */
    private function punctuationSplit($line)
    {
        // The "u" modifier makes the empty pattern split per UTF-8 character
        // instead of per byte.
        $chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY);
        if (empty($chars)) {
            // Nothing to split: empty input, or preg_split() failed and
            // returned false (e.g. for malformed UTF-8).
            return [];
        }

        $parts = [];
        $part = '';
        $is_terminal = in_array($chars[0], $this->terminals, true);

        foreach ($chars as $char) {
            // Start a new part whenever we cross a terminal/non-terminal boundary.
            if (in_array($char, $this->terminals, true) !== $is_terminal) {
                $parts[] = $part;
                $part = '';
                $is_terminal = !$is_terminal;
            }
            $part .= $char;
        }

        // Compare with '' rather than using empty(): empty('0') is true, which
        // would silently drop a trailing "0" (e.g. the end of "Version 1.0").
        if ($part !== '') {
            $parts[] = $part;
        }

        return $parts;
    }

    /**
     * Appends each terminal item after its preceding non-terminals.
     * Whitespace is left untouched.
     *
     * A fragment ends the current merge when it is a single terminal
     * character, or when it contains a terminal that is not also an
     * abbreviator (so "..." does not end a merge, but "?!" does).
     *
     * Multibyte safe (at least for UTF-8).
     *
     * For example:
     *    [ "There ", "...", " is", ".", " More", "!" ]
     *        ... becomes ...
     *    [ "There ... is.", " More!" ]
     *
     * @param string[] $punctuations
     * @return string[]
     */
    private function punctuationMerge($punctuations)
    {
        $definite_terminals = array_diff($this->terminals, $this->abbreviators);

        $merges = [];
        $merge = '';

        foreach ($punctuations as $punctuation) {
            if ($punctuation === '') {
                continue;
            }

            $merge .= $punctuation;

            $ends_merge = mb_strlen($punctuation) === 1
                && in_array($punctuation, $this->terminals, true);

            if (!$ends_merge) {
                foreach ($definite_terminals as $terminal) {
                    if (mb_strpos($punctuation, $terminal) !== false) {
                        $ends_merge = true;
                        break;
                    }
                }
            }

            if ($ends_merge) {
                $merges[] = $merge;
                $merge = '';
            }
        }

        // Not empty(): a leftover fragment of exactly "0" must be kept.
        if ($merge !== '') {
            $merges[] = $merge;
        }

        return $merges;
    }

    /**
     * Looks for capitalized abbreviations & includes them with the following fragment.
     *
     * A fragment counts as ending in an abbreviation when its last word starts
     * with an uppercase letter, is at most 3 characters long (including the
     * period) and the fragment ends in ".".
     *
     * For example:
     *    [ "Mr.", " Comey left.", " He was busy." ]
     *        ... becomes ...
     *    [ "Mr. Comey left.", " He was busy." ]
     *
     * @param string[] $fragments
     * @return string[]
     */
    private function abbreviationMerge($fragments)
    {
        $merged = [];

        // Text carried over from preceding fragments that ended in an abbreviation.
        $pending = '';

        foreach ($fragments as $fragment) {
            $words = mb_split('\s+', Multibyte::trim($fragment));
            $last_word = trim($words[count($words) - 1]);

            $is_abbreviation = preg_match('#^\p{Lu}#u', $last_word) === 1
                && substr(trim($fragment), -1) === '.'
                && mb_strlen($last_word) <= 3;

            $pending .= $fragment;

            // Only close the entry once the fragment does not end in an abbreviation.
            if (!$is_abbreviation) {
                $merged[] = $pending;
                $pending = '';
            }
        }

        // Input ended on an abbreviation: keep what was collected so far.
        if ($pending !== '') {
            $merged[] = $pending;
        }

        return $merged;
    }

    /**
     * Merges any part starting with a closing parenthesis ')' to the previous
     * part.
     *
     * A leading ')' part with no predecessor is kept as its own entry.
     *
     * @param string[] $parts
     * @return string[]
     */
    private function parenthesesMerge($parts)
    {
        $subsentences = [];
        $last = -1;

        foreach ($parts as $part) {
            // Guard both the string offset (empty part) and the array offset
            // (no previous part to attach to).
            if ($last >= 0 && $part !== '' && $part[0] === ')') {
                $subsentences[$last] .= $part;
            } else {
                $subsentences[] = $part;
                $last++;
            }
        }

        return $subsentences;
    }

    /**
     * Looks for closing quotes to include them with the previous statement.
     * "That was very interesting," he said.
     * "That was very interesting."
     *
     * A statement is treated as a closing quote when it consists solely of a
     * quotation mark (ignoring surrounding whitespace), or when it starts with
     * a quotation mark, a space and a lowercase letter.
     *
     * @param string[] $statements
     * @return string[] Zero-based list of merged statements.
     */
    private function closeQuotesMerge($statements)
    {
        $merged = [];
        $last = -1;

        foreach ($statements as $statement) {
            $trimmed = trim($statement);

            // \p{Ll} with the "u" modifier also recognises non-ASCII lowercase
            // letters, which a byte-wise ctype_lower() check would miss.
            $is_close_quote = $trimmed === '"'
                || $trimmed === "'"
                || preg_match('#^["\'] \p{Ll}#u', $statement) === 1;

            if ($is_close_quote && $last >= 0) {
                $merged[$last] .= $statement;
            } else {
                $merged[] = $statement;
                $last++;
            }
        }

        return $merged;
    }

    /**
     * Merges items into larger sentences.
     *
     * A new sentence starts when the previous item ended in a terminal that is
     * not also an abbreviator, or when the sentence collected so far already
     * holds a multi-word item and the current item is multi-word as well.
     *
     * @param string[] $shorts
     * @return string[]
     */
    private function sentenceMerge($shorts)
    {
        $non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);

        $sentences = [];

        $sentence = '';
        $has_words = false;
        $previous_word_ending = null;

        foreach ($shorts as $short) {
            $is_multi_word = count(mb_split('\s+', Multibyte::trim($short))) > 1;
            $after_non_abbreviating_terminal = in_array(
                $previous_word_ending,
                $non_abbreviating_terminals,
                true
            );

            if ($after_non_abbreviating_terminal || ($has_words && $is_multi_word)) {
                $sentences[] = $sentence;
                $sentence = '';
                $has_words = $is_multi_word;
            } else {
                $has_words = $has_words || $is_multi_word;
            }

            $sentence .= $short;
            $previous_word_ending = mb_substr($short, -1);
        }

        // Not empty(): a final sentence of exactly "0" must be kept.
        if ($sentence !== '') {
            $sentences[] = $sentence;
        }

        return $sentences;
    }

    /**
     * Return the sentences detected in the provided text.
     * Set the Sentence::SPLIT_TRIM flag to trim whitespace.
     *
     * @param string $text
     * @param integer $flags
     * @return string[]
     */
    public function split($text, $flags = 0)
    {
        // Each step receives the output of the previous one.
        static $pipeline = [
            'punctuationSplit',
            'parenthesesMerge', // also works after punctuationMerge or abbreviationMerge
            'punctuationMerge',
            'abbreviationMerge',
            'closeQuotesMerge',
            'sentenceMerge',
        ];

        // clean funny quotes
        $text = Multibyte::cleanUnicode($text);

        // Split; lines that trim down to nothing are skipped.
        $sentences = [];
        foreach (self::linebreakSplit($text) as $input) {
            if (Multibyte::trim($input) !== '') {
                foreach ($pipeline as $method) {
                    $input = $this->$method($input);
                }
                $sentences = array_merge($sentences, $input);
            }
        }

        // Post process
        if ($flags & self::SPLIT_TRIM) {
            return self::trimSentences($sentences);
        }

        return $sentences;
    }

    /**
     * Applies Multibyte::trim() to each string in an array.
     *
     * @param string[] $sentences
     * @return string[]
     */
    private static function trimSentences($sentences)
    {
        return array_map(function ($sentence) {
            return Multibyte::trim($sentence);
        }, $sentences);
    }

    /**
     * Return the number of sentences detected in the provided text.
     *
     * @param string $text
     * @return integer
     */
    public function count($text)
    {
        return count($this->split($text));
    }

}
360