Sentence::sentenceMerge() - Code Metrics - Inspection of "Refactor cleanUnicode" - vanderlee/php-sentence - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( e85d9b...d7563a )

by Martijn

created 2019-04-01 11:08 UTC

Sentence::sentenceMerge() B

↳ Parent: Sentence

Complexity

Conditions	7
Paths	8

Size

Total Lines	31
Code Lines	21

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
eloc	21
dl	0
loc	31
rs	8.6506
c	0
b	0
f	0
cc	7
nc	8
nop	1

<?php

namespace Vanderlee\Sentence;

/**
 * Segments sentences.
 * Clipping may not be perfect.
 * Sentence count should be VERY close to the truth.
 *
 * Multibyte safe (at least for UTF-8), but the rules are based on Germanic
 * language structure (English, Dutch, German). Should work for most
 * Latin-alphabet languages.
 *
 * @author Martijn van der Lee (@vanderlee)
 * @author @marktaw
 */
class Sentence
{

    /**
     * Specify this flag with the split method to trim whitespace.
     */
    const SPLIT_TRIM = 0x1;

    /**
     * List of characters used to terminate sentences.
     *
     * @var string[]
     */
    private $terminals = ['.', '!', '?'];

    /**
     * List of characters used for abbreviations.
     *
     * @var string[]
     */
    private $abbreviators = ['.'];

    /**
     * Breaks a piece of text into lines by linebreak.
     * A run of consecutive linebreak characters is treated as one break and
     * stays attached to the end of the line it terminates.
     *
     * Relies on Multibyte::split() (presumably a multibyte-aware preg_split
     * wrapper; verify against the Multibyte class).
     *
     * @param string $text
     * @return string[] The lines; the last entry may be an empty string.
     */
    private static function linebreakSplit($text)
    {
        $lines = [];
        $line = '';

        foreach (Multibyte::split('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
            $line .= $part;
            // A part that trims down to nothing (such as the captured
            // linebreak delimiter) closes the current line.
            if (Multibyte::trim($part) === '') {
                $lines[] = $line;
                $line = '';
            }
        }
        $lines[] = $line;

        return $lines;
    }

    /**
     * Splits a line into alternating runs of terminal and non-terminal
     * characters, keeping the terminals.
     *
     * Multibyte safe (at least for UTF-8).
     *
     * For example:
     *    "There ... is. More!"
     *        ... becomes ...
     *    [ "There ", "...", " is", ".", " More", "!" ]
     *
     * @param string $line
     * @return string[] Empty when the line is empty or cannot be split as UTF-8.
     */
    private function punctuationSplit($line)
    {
        // The /u modifier splits on UTF-8 characters rather than on bytes.
        $chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY);
        if (empty($chars)) {
            // Nothing to split: empty line, or preg_split() returned false.
            return [];
        }

        $parts = [];
        $part = '';
        $is_terminal = in_array($chars[0], $this->terminals, true);

        foreach ($chars as $char) {
            // Switching between terminal and non-terminal closes the run.
            if (in_array($char, $this->terminals, true) !== $is_terminal) {
                $parts[] = $part;
                $part = '';
                $is_terminal = !$is_terminal;
            }
            $part .= $char;
        }

        // Compare with '' instead of using empty(): empty("0") is true, which
        // would silently drop a trailing run consisting of just "0".
        if ($part !== '') {
            $parts[] = $part;
        }

        return $parts;
    }

    /**
     * Appends each terminal item to its preceding non-terminals.
     *
     * A fragment is closed after an item that is exactly one terminal
     * character, or after any item containing a terminal that is not also an
     * abbreviator (by default '!' or '?'). Runs such as "..." therefore do
     * not close a fragment.
     *
     * Multibyte safe (at least for UTF-8).
     *
     * For example:
     *    [ "There ", "...", " is", ".", " More", "!" ]
     *        ... becomes ...
     *    [ "There ... is.", " More!" ]
     *
     * @param string[] $punctuations
     * @return string[]
     */
    private function punctuationMerge($punctuations)
    {
        $definite_terminals = array_diff($this->terminals, $this->abbreviators);

        $merges = [];
        $merge = '';

        foreach ($punctuations as $punctuation) {
            if ($punctuation === '') {
                continue;
            }

            $merge .= $punctuation;

            $closes_fragment = mb_strlen($punctuation) === 1
                && in_array($punctuation, $this->terminals, true);
            if (!$closes_fragment) {
                foreach ($definite_terminals as $terminal) {
                    if (mb_strpos($punctuation, $terminal) !== false) {
                        $closes_fragment = true;
                        break;
                    }
                }
            }

            if ($closes_fragment) {
                $merges[] = $merge;
                $merge = '';
            }
        }

        // Strict comparison so that a trailing "0" is not discarded.
        if ($merge !== '') {
            $merges[] = $merge;
        }

        return $merges;
    }

    /**
     * Looks for capitalized abbreviations and joins them with the following
     * fragment.
     *
     * A fragment counts as ending in an abbreviation when its last word
     * starts with an uppercase letter, is at most 3 characters long and the
     * fragment ends in a period (e.g. "Mr.", "B.").
     *
     * For example:
     *    [ "Mr.", " Comey was not available for comment." ]
     *        ... becomes ...
     *    [ "Mr. Comey was not available for comment." ]
     *
     * @param string[] $fragments
     * @return string[]
     */
    private function abbreviationMerge($fragments)
    {
        $merged = [];
        $previous_is_abbreviation = false;

        foreach ($fragments as $fragment) {
            $words = mb_split('\s+', Multibyte::trim($fragment));
            $last_word = trim($words[count($words) - 1]);

            $starts_with_capital = preg_match('#^\p{Lu}#u', $last_word) === 1;
            $ends_with_period = substr(trim($fragment), -1) === '.';
            $is_abbreviation = $starts_with_capital
                && $ends_with_period
                && mb_strlen($last_word) <= 3;

            // A fragment following an abbreviation continues the same entry.
            if ($previous_is_abbreviation && !empty($merged)) {
                $merged[count($merged) - 1] .= $fragment;
            } else {
                $merged[] = $fragment;
            }

            $previous_is_abbreviation = $is_abbreviation;
        }

        return $merged;
    }

    /**
     * Merges any part starting with a closing parenthesis ')' into the
     * previous part.
     *
     * A leading part that starts with ')' has no predecessor and is kept as
     * a part of its own.
     *
     * @param string[] $parts
     * @return string[]
     */
    private function parenthesesMerge($parts)
    {
        $subsentences = [];

        foreach ($parts as $part) {
            $last = count($subsentences) - 1;

            // Only merge when there is a previous part; otherwise index -1
            // would be written, raising a warning and leaving a stray key.
            if ($last >= 0 && $part !== '' && $part[0] === ')') {
                $subsentences[$last] .= $part;
            } else {
                $subsentences[] = $part;
            }
        }

        return $subsentences;
    }

    /**
     * Looks for closing quotes to include them with the previous statement.
     * "That was very interesting," he said.
     * "That was very interesting."
     *
     * A statement is treated as a closing quote when it is nothing but a
     * quotation mark, or when it starts with a quotation mark followed by a
     * space and a lowercase (ASCII) letter.
     *
     * @param string[] $statements
     * @return string[]
     */
    private function closeQuotesMerge($statements)
    {
        $merged = [];

        foreach ($statements as $statement) {
            $trimmed = trim($statement);
            $first = substr($statement, 0, 1);

            $is_closing_quote = $trimmed === '"'
                || $trimmed === "'"
                || (
                    ($first === '"' || $first === "'")
                    && substr($statement, 1, 1) === ' '
                    && ctype_lower(substr($statement, 2, 1)) === true
                );

            // A closing quote with nothing before it simply starts the list.
            if ($is_closing_quote && !empty($merged)) {
                $merged[count($merged) - 1] .= $statement;
            } else {
                $merged[] = $statement;
            }
        }

        return $merged;
    }

    /**
     * Merges items into larger sentences.
     *
     * A new sentence starts when the previous item ended in a terminal that
     * is not an abbreviator (by default '!' or '?'), or when the sentence
     * being built already contains a multi-word item and the current item
     * is multi-word as well.
     *
     * @param string[] $shorts
     * @return string[]
     */
    private function sentenceMerge($shorts)
    {
        $non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);

        $sentences = [];

        $sentence = '';
        $has_words = false;
        $previous_word_ending = null;
        foreach ($shorts as $short) {
            $is_multi_word = count(mb_split('\s+', Multibyte::trim($short))) > 1;
            $after_non_abbreviating_terminal = in_array(
                $previous_word_ending,
                $non_abbreviating_terminals,
                true
            );

            if ($after_non_abbreviating_terminal
                || ($has_words && $is_multi_word)) {
                $sentences[] = $sentence;
                $sentence = '';
                $has_words = $is_multi_word;
            } else {
                $has_words = $has_words || $is_multi_word;
            }

            $sentence .= $short;
            $previous_word_ending = mb_substr($short, -1);
        }

        // Strict comparison so that a final sentence of "0" is not discarded.
        if ($sentence !== '') {
            $sentences[] = $sentence;
        }

        return $sentences;
    }

    /**
     * Return the sentences detected in the provided text.
     * Set the Sentence::SPLIT_TRIM flag to trim whitespace.
     *
     * Every non-blank line of the text is run through the processing
     * pipeline independently, so a sentence never spans a linebreak.
     *
     * @param string $text
     * @param int $flags Bitmask; only Sentence::SPLIT_TRIM is inspected.
     * @return string[]
     */
    public function split($text, $flags = 0)
    {
        // Each step receives the result of the previous one; the first step
        // takes a single line, every later step takes an array of strings.
        static $pipeline = [
            'punctuationSplit',
            'parenthesesMerge', // also works after punctuationMerge or abbreviationMerge
            'punctuationMerge',
            'abbreviationMerge',
            'closeQuotesMerge',
            'sentenceMerge',
        ];

        // clean funny quotes
        $text = Multibyte::cleanUnicode($text);

        // Split
        $sentences = [];
        foreach (self::linebreakSplit($text) as $input) {
            if (Multibyte::trim($input) === '') {
                continue;
            }

            foreach ($pipeline as $method) {
                $input = $this->$method($input);
            }

            // Append in place rather than re-copying the whole result with
            // array_merge() for every line.
            foreach ($input as $sentence) {
                $sentences[] = $sentence;
            }
        }

        // Post process
        if ($flags & self::SPLIT_TRIM) {
            return self::trimSentences($sentences);
        }

        return $sentences;
    }

    /**
     * Applies Multibyte::trim() to each string in an array.
     *
     * @param string[] $sentences
     * @return string[]
     */
    private static function trimSentences($sentences)
    {
        return array_map(function ($sentence) {
            return Multibyte::trim($sentence);
        }, $sentences);
    }

    /**
     * Return the number of sentences detected in the provided text.
     *
     * @param string $text
     * @return int
     */
    public function count($text)
    {
        return count($this->split($text));
    }

}


1			<?php
2
3			namespace Vanderlee\Sentence;
4
5			/**
6			* Segments sentences.
7			* Clipping may not be perfect.
8			* Sentence count should be VERY close to the truth.
9			*
10			* Multibyte.php safe (atleast for UTF-8), but rules based on germanic
11			* language stucture (English, Dutch, German). Should work for most
12			* latin-alphabet languages.
13			*
14			* @author Martijn van der Lee (@vanderlee)
15			* @author @marktaw
16			*/
17			class Sentence
18			{
19
20			/**
21			* Specify this flag with the split method to trim whitespace.
22			*/
23			const SPLIT_TRIM = 0x1;
24
25			/**
26			* List of characters used to terminate sentences.
27			*
28			* @var string[]
29			*/
30			private $terminals = ['.', '!', '?'];
31
32			/**
33			* List of characters used for abbreviations.
34			*
35			* @var string[]
36			*/
37			private $abbreviators = ['.'];
38
39			/**
40			* Breaks a piece of text into lines by linebreak.
41			* Eats up any linebreak characters as if one.
42			*
43			* Multibyte.php safe
44			*
45			* @param string $text
46			* @return string[]
47			*/
48			private static function linebreakSplit($text)
49			{
50			$lines = [];
51			$line = '';
52
53			foreach (Multibyte::split('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
54			$line .= $part;
55			if (Multibyte::trim($part) === '') {
56			$lines[] = $line;
57			$line = '';
58			}
59			}
60			$lines[] = $line;
61
62			return $lines;
63			}
64
65			/**
66			* Splits an array of lines by (consecutive sequences of)
67			* terminals, keeping terminals.
68			*
69			* Multibyte.php safe (atleast for UTF-8)
70			*
71			* For example:
72			* "There ... is. More!"
73			* ... becomes ...
74			* [ "There ", "...", " is", ".", " More", "!" ]
75			*
76			* @param string $line
77			* @return string[]
78			*/
79			private function punctuationSplit($line)
80			{
81			$parts = [];
82
83			$chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY); // This is UTF8 multibyte safe!
84			$is_terminal = in_array($chars[0], $this->terminals);
85
86			$part = '';
87			foreach ($chars as $index => $char) {
88			if (in_array($char, $this->terminals) !== $is_terminal) {
89			$parts[] = $part;
90			$part = '';
91			$is_terminal = !$is_terminal;
92			}
93			$part .= $char;
94			}
95
96			if (!empty($part)) {
97			$parts[] = $part;
98			}
99
100			return $parts;
101			}
102
103			/**
104			* Appends each terminal item after it's preceding
105			* non-terminals.
106			*
107			* Multibyte.php safe (atleast for UTF-8)
108			*
109			* For example:
110			* [ "There ", "...", " is", ".", " More", "!" ]
111			* ... becomes ...
112			* [ "There ... is.", "More!" ]
113			*
114			* @param string[] $punctuations
115			* @return string[]
116			*/
117			private function punctuationMerge($punctuations)
118			{
119			$definite_terminals = array_diff($this->terminals, $this->abbreviators);
120
121			$merges = [];
122			$merge = '';
123
124			foreach ($punctuations as $punctuation) {
125			if ($punctuation !== '') {
126			$merge .= $punctuation;
127			if (mb_strlen($punctuation) === 1
128			&& in_array($punctuation, $this->terminals)) {
129			$merges[] = $merge;
130			$merge = '';
131			} else {
132			foreach ($definite_terminals as $terminal) {
133			if (mb_strpos($punctuation, $terminal) !== false) {
134			$merges[] = $merge;
135			$merge = '';
136			break;
137			}
138			}
139			}
140			}
141			}
142			if (!empty($merge)) {
143			$merges[] = $merge;
144			}
145
146			return $merges;
147			}
148
149			/**
150			* Looks for capitalized abbreviations & includes them with the following fragment.
151			*
152			* For example:
153			* [ "Last week, former director of the F.B.I. James B. Comey was fired. Mr. Comey was not available for comment." ]
154			* ... becomes ...
155			* [ "Last week, former director of the F.B.I. James B. Comey was fired." ]
156			* [ "Mr. Comey was not available for comment." ]
157			*
158			* @param string[] $fragments
159			* @return string[]
160			*/
161			private function abbreviationMerge($fragments)
162			{
163			$return_fragment = [];
164
165			$previous_string = '';
166			$previous_is_abbreviation = false;
167			$i = 0;
168
169			foreach ($fragments as $fragment) {
170			$current_string = $fragment;
171			$words = mb_split('\s+', Multibyte::trim($fragment));
172
173			$word_count = count($words);
174
175			// if last word of fragment starts with a Capital, ends in "." & has less than 3 characters, trigger "is abbreviation"
176			$last_word = trim($words[$word_count - 1]);
177			$last_is_capital = preg_match('#^\p{Lu}#u', $last_word);
178			$last_is_abbreviation = substr(trim($fragment), -1) === '.';
179			$is_abbreviation = $last_is_capital > 0
180			&& $last_is_abbreviation > 0
181			&& mb_strlen($last_word) <= 3;
182
183			// merge previous fragment with this
184			if ($previous_is_abbreviation === true) {
185			$current_string = $previous_string . $current_string;
186			}
187			$return_fragment[$i] = $current_string;
188
189			$previous_is_abbreviation = $is_abbreviation;
190			$previous_string = $current_string;
191			// only increment if this isn't an abbreviation
192			if ($is_abbreviation === false) {
193			$i++;
194			}
195			}
196			return $return_fragment;
197			}
198
199			/**
200			* Merges any part starting with a closing parenthesis ')' to the previous
201			* part.
202			*
203			* @param string[] $parts
204			* @return string[]
205			*/
206			private function parenthesesMerge($parts)
207			{
208			$subsentences = [];
209
210			foreach ($parts as $part) {
211			if ($part[0] === ')') {
212			$subsentences[count($subsentences) - 1] .= $part;
213			} else {
214			$subsentences[] = $part;
215			}
216			}
217
218			return $subsentences;
219			}
220
221			/**
222			* Looks for closing quotes to include them with the previous statement.
223			* "That was very interesting," he said.
224			* "That was very interesting."
225			*
226			* @param string[] $statements
227			* @return string[]
228			*/
229			private function closeQuotesMerge($statements)
230			{
231			$i = 0;
232			$previous_statement = "";
233			$return = [];
234			foreach ($statements as $statement) {
235			// detect end quote - if the entire string is a quotation mark, or it's [quote, space, lowercase]
236			if (trim($statement) === '"'
237			\|\| trim($statement) === "'"
238			\|\| (
239			(substr($statement, 0, 1) === '"'
240			\|\| substr($statement, 0, 1) === "'")
241			&& substr($statement, 1, 1) === ' '
242			&& ctype_lower(substr($statement, 2, 1)) === true
243			)
244			) {
245			$statement = $previous_statement . $statement;
246			} else {
247			$i++;
248			}
249
250			$return[$i] = $statement;
251			$previous_statement = $statement;
252			}
253
254			return $return;
255			}
256
257			/**
258			* Merges items into larger sentences.
259			* Multibyte.php safe
260			*
261			* @param string[] $shorts
262			* @return string[]
263			*/
264			private function sentenceMerge($shorts)
265			{
266			$non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);
267
268			$sentences = [];
269
270			$sentence = '';
271			$has_words = false;
272			$previous_word_ending = null;
273			foreach ($shorts as $short) {
274			$word_count = count(mb_split('\s+', Multibyte::trim($short)));
275			$after_non_abbreviating_terminal = in_array($previous_word_ending, $non_abbreviating_terminals);
276
277			if ($after_non_abbreviating_terminal
278			\|\| ($has_words && $word_count > 1)) {
279			$sentences[] = $sentence;
280			$sentence = '';
281			$has_words = $word_count > 1;
282			} else {
283			$has_words = ($has_words
284			\|\| $word_count > 1);
285			}
286
287			$sentence .= $short;
288			$previous_word_ending = mb_substr($short, -1);
289			}
290			if (!empty($sentence)) {
291			$sentences[] = $sentence;
292			}
293
294			return $sentences;
295			}
296
297			/**
298			* Return the sentences sentences detected in the provided text.
299			* Set the Sentence::SPLIT_TRIM flag to trim whitespace.
300			* @param string $text
301			* @param integer $flags
302			* @return string[]
303			*/
304			public function split($text, $flags = 0)
305			{
306			static $pipeline = [
307			'punctuationSplit',
308			'parenthesesMerge', // also works after punctuationMerge or abbreviationMerge
309			'punctuationMerge',
310			'abbreviationMerge',
311			'closeQuotesMerge',
312			'sentenceMerge',
313			];
314
315			// clean funny quotes
316			$text = Multibyte::cleanUnicode($text);
317
318			// Split
319			$sentences = [];
320			foreach (self::linebreakSplit($text) as $input) {
321			if (Multibyte::trim($input) !== '') {
322			foreach ($pipeline as $method) {
323			$input = $this->$method($input);
324			}
325			$sentences = array_merge($sentences, $input);
326			}
327			}
328
329			// Post process
330			if ($flags & self::SPLIT_TRIM) {
331			return self::trimSentences($sentences);
332			}
333
334			return $sentences;
335			}
336
337			/**
338			* Multibyte.php trim each string in an array.
339			* @param string[] $sentences
340			* @return string[]
341			*/
342			private static function trimSentences($sentences)
343			{
344			return array_map(function($sentence) {
345			return Multibyte::trim($sentence);
346			}, $sentences);
347			}
348
349			/**
350			* Return the number of sentences detected in the provided text.
351			* @param string $text
352			* @return integer
353			*/
354			public function count($text)
355			{
356			return count($this->split($text));
357			}
358
359			}
360

vanderlee / php-sentence

Push — master ( e85d9b...d7563a )

Sentence::sentenceMerge() B

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like