Sentence::isAbreviation() - Code Metrics - Inspection of "Eliminate recursive replacement" - vanderlee/php-sentence - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( bd113f...e772b1 )

by Martijn

created 2022-03-23 13:16 UTC

Sentence::isAbreviation() A

↳ Parent: Sentence

Complexity

Conditions	3
Paths	3

Size

Total Lines	13
Code Lines	8

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
eloc	8
dl	0
loc	13
rs	10
c	0
b	0
f	0
cc	3
nc	3
nop	1

<?php

namespace Vanderlee\Sentence;

/**
 * Segments text into sentences.
 * Clipping may not be perfect.
 * Sentence count should be VERY close to the truth.
 *
 * Multibyte safe (at least for UTF-8), but the rules are based on Germanic
 * language structure (English, Dutch, German). Should work for most
 * Latin-alphabet languages.
 *
 * The work is done by a pipeline of small private steps (see split()):
 * numbers are masked, the line is cut at terminal punctuation, and the
 * resulting fragments are merged back together by a series of heuristics
 * before the masked numbers are restored.
 *
 * @author Martijn van der Lee (@vanderlee)
 * @author @marktaw
 */
class Sentence
{
    /**
     * Specify this flag with the split method to trim whitespace.
     */
    const SPLIT_TRIM = 0x1;

    /**
     * List of characters used to terminate sentences.
     *
     * @var string[]
     */
    private $terminals = ['.', '!', '?'];

    /**
     * List of characters used for abbreviations.
     *
     * @var string[]
     */
    private $abbreviators = ['.'];

    /**
     * Numbers (integers and decimals) masked out of the line currently being
     * processed, keyed by the index embedded in their placeholder code.
     *
     * @var string[]
     */
    private $replacements = [];

    /**
     * Generate the in-text placeholder code for the specified index.
     *
     * The index is wrapped in the STX (0x02) and ETX (0x03) control
     * characters. These must be written as string escapes: concatenating the
     * integer literals 0x02 and 0x03 would yield the decimal digits "2" and
     * "3", making a placeholder indistinguishable from ordinary digits and
     * allowing one placeholder to be a substring of another.
     *
     * @param string $index Numeric index of the replacement (integers are
     *                      coerced to string).
     *
     * @return string
     */
    private function getReplaceCode(string $index): string
    {
        return "\x02" . $index . "\x03";
    }

    /**
     * Mask integer and decimal numbers by replacing each with a placeholder
     * code, so the "." inside a decimal is not seen as a sentence terminal.
     *
     * The masked numbers are stored in $this->replacements, replacing any
     * previously stored set.
     *
     * @param string $text
     *
     * @return string The text with every number replaced by its placeholder.
     */
    private function replaceFloatNumbers(string $text): string
    {
        preg_match_all('!\d+(?:\.\d+)?!', $text, $matches, PREG_OFFSET_CAPTURE);

        $this->replacements = [];

        // Walk the matches back to front so the byte offsets of the matches
        // still to be processed are not shifted by earlier substitutions.
        // array_reverse() re-indexes the list, so $index counts up from 0.
        foreach (array_reverse($matches[0]) as $index => $match) {
            list($number, $offset) = $match;

            $this->replacements[$index] = $number;

            // PREG_OFFSET_CAPTURE reports byte offsets and substr_replace()
            // works on bytes, so the length must be in bytes as well.
            $text = substr_replace($text, $this->getReplaceCode((string)$index), $offset, strlen($number));
        }

        return $text;
    }

    /**
     * Restore the numbers masked by replaceFloatNumbers().
     *
     * All placeholders are substituted in a single pass with strtr(), which
     * never rescans text it has already replaced. Sequential str_replace()
     * calls could re-replace a freshly restored number that happens to look
     * like a later placeholder.
     *
     * @param string[] $text
     *
     * @return string[]
     */
    private function restoreReplacements($text)
    {
        if (empty($this->replacements)) {
            return $text;
        }

        $map = [];
        foreach ($this->replacements as $index => $number) {
            $map[$this->getReplaceCode((string)$index)] = $number;
        }

        return array_map(function ($value) use ($map) {
            return strtr($value, $map);
        }, $text);
    }

    /**
     * Breaks a piece of text into lines by linebreak.
     * Each run of consecutive linebreak characters is kept at the end of the
     * line it terminates.
     *
     * Relies on the project's Multibyte helper for splitting and trimming.
     *
     * @param string $text
     * @return string[] The lines; the last entry may be an empty string.
     */
    private static function linebreakSplit($text)
    {
        $lines = [];
        $line = '';

        foreach (Multibyte::split('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
            $line .= $part;
            // A part that trims to nothing (such as a captured linebreak run)
            // closes the current line.
            if (Multibyte::trim($part) === '') {
                $lines[] = $line;
                $line = '';
            }
        }
        $lines[] = $line;

        return $lines;
    }

    /**
     * Splits a line into alternating runs of non-terminal and terminal
     * characters, keeping the terminals.
     *
     * Multibyte safe (at least for UTF-8)
     *
     * For example:
     *    "There ... is. More!"
     *        ... becomes ...
     *    [ "There ", "...", " is", ".", " More", "!" ]
     *
     * @param string $line
     * @return string[] Empty when the line is empty or cannot be split.
     */
    private function punctuationSplit($line)
    {
        $chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY); // This is UTF-8 multibyte safe!

        // preg_split() yields false on failure (e.g. malformed UTF-8) and an
        // empty list for an empty line; neither has a first character.
        if (empty($chars)) {
            return [];
        }

        $parts = [];
        $part = '';
        $is_terminal = in_array($chars[0], $this->terminals, true);

        foreach ($chars as $char) {
            // Start a new part whenever we cross between terminal and
            // non-terminal characters.
            if (in_array($char, $this->terminals, true) !== $is_terminal) {
                $parts[] = $part;
                $part = '';
                $is_terminal = !$is_terminal;
            }
            $part .= $char;
        }

        // At least one character was appended, so the final part is never empty.
        $parts[] = $part;

        return $parts;
    }

    /**
     * Appends each terminal item after its preceding non-terminals.
     *
     * A fragment is closed after a single terminal character, or after any
     * part containing a terminal that cannot be an abbreviator ("!" or "?"
     * with the default settings). A run such as "..." does not close it.
     *
     * Multibyte safe (at least for UTF-8)
     *
     * For example:
     *    [ "There ", "...", " is", ".", " More", "!" ]
     *        ... becomes ...
     *    [ "There ... is.", " More!" ]
     *
     * @param string[] $punctuations
     * @return string[]
     */
    private function punctuationMerge($punctuations)
    {
        $definite_terminals = array_diff($this->terminals, $this->abbreviators);

        $merges = [];
        $merge = '';

        foreach ($punctuations as $punctuation) {
            if ($punctuation === '') {
                continue;
            }

            $merge .= $punctuation;

            $closes = mb_strlen($punctuation) === 1
                && in_array($punctuation, $this->terminals, true);

            if (!$closes) {
                foreach ($definite_terminals as $terminal) {
                    if (mb_strpos($punctuation, $terminal) !== false) {
                        $closes = true;
                        break;
                    }
                }
            }

            if ($closes) {
                $merges[] = $merge;
                $merge = '';
            }
        }

        if ($merge !== '') {
            $merges[] = $merge;
        }

        return $merges;
    }

    /**
     * Merges fragments that end in a short capitalized abbreviation with the
     * fragment(s) that follow them.
     *
     * For example:
     *    [ "Mr.", " Comey was not available for comment." ]
     *        ... becomes ...
     *    [ "Mr. Comey was not available for comment." ]
     *
     * @param string[] $fragments
     * @return string[]
     */
    private function abbreviationMerge($fragments)
    {
        $return_fragment = [];

        $previous_fragment = '';
        $previous_is_abbreviation = false;
        $i = 0;
        foreach ($fragments as $fragment) {
            // Decide on the fragment as received, before anything is prepended.
            $is_abbreviation = self::isAbreviation($fragment);

            // The previous fragment ended in an abbreviation: glue it on.
            if ($previous_is_abbreviation) {
                $fragment = $previous_fragment . $fragment;
            }
            $return_fragment[$i] = $fragment;

            $previous_is_abbreviation = $is_abbreviation;
            $previous_fragment = $fragment;

            // Stay on the same slot while the fragment ends in an
            // abbreviation, so the next fragment overwrites it merged.
            if (!$is_abbreviation) {
                $i++;
            }
        }

        return $return_fragment;
    }

    /**
     * Check if the fragment ends in an abbreviation: its last word starts
     * with an uppercase letter, the fragment ends in "." and the last word
     * is at most 3 characters long (counting any trailing period).
     *
     * @param string $fragment
     * @return bool
     */
    private static function isAbreviation($fragment)
    {
        $trimmed = Multibyte::trim($fragment);
        $words = mb_split('\s+', $trimmed);

        $last_word = Multibyte::trim($words[count($words) - 1]);

        return preg_match('#^\p{Lu}#u', $last_word) === 1
            && mb_substr($trimmed, -1) === '.'
            && mb_strlen($last_word) <= 3;
    }

    /**
     * Merges any part starting with a closing parenthesis ')' to the previous
     * part. A leading ')' with no previous part is kept as its own part.
     *
     * @param string[] $parts
     * @return string[]
     */
    private function parenthesesMerge($parts)
    {
        $subsentences = [];

        foreach ($parts as $part) {
            $last = count($subsentences) - 1;

            // Only merge when there is a previous part to merge into.
            if ($last >= 0 && $part !== '' && $part[0] === ')') {
                $subsentences[$last] .= $part;
            } else {
                $subsentences[] = $part;
            }
        }

        return $subsentences;
    }

    /**
     * Looks for closing quotes to include them with the previous statement.
     * "That was very interesting," he said.
     * "That was very interesting."
     *
     * The returned array is not guaranteed to be keyed from 0.
     *
     * @param string[] $statements
     * @return string[]
     */
    private function closeQuotesMerge($statements)
    {
        $i = 0;
        $previous_statement = '';
        $return = [];
        foreach ($statements as $statement) {
            if (self::isEndQuote($statement)) {
                // Overwrite the previous slot with the merged statement.
                $statement = $previous_statement . $statement;
            } else {
                $i++;
            }

            $return[$i] = $statement;
            $previous_statement = $statement;
        }

        return $return;
    }

    /**
     * Check if the statement is just a quotation mark, or starts with a
     * quotation mark followed by a space and a lowercase letter.
     *
     * The lowercase test uses the Unicode property \p{Ll} so that non-ASCII
     * lowercase letters are recognised too.
     *
     * @param string $statement
     * @return bool
     */
    private static function isEndQuote($statement)
    {
        $quotes = ['"', '\''];

        if (in_array(Multibyte::trim($statement), $quotes)) {
            return true;
        }

        return in_array(mb_substr($statement, 0, 1), $quotes, true)
            && mb_substr($statement, 1, 1) === ' '
            && preg_match('/^\p{Ll}$/u', mb_substr($statement, 2, 1)) === 1;
    }

    /**
     * Merges items into larger sentences.
     *
     * A new sentence is started when the previous item ended in a terminal
     * that cannot be an abbreviator, or when the sentence being built
     * already has a multi-word item and the current item is multi-word too.
     *
     * @param string[] $shorts
     * @return string[]
     */
    private function sentenceMerge($shorts)
    {
        $non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);

        $sentences = [];

        $sentence = '';
        $has_words = false;
        $previous_word_ending = null;
        foreach ($shorts as $short) {
            $word_count = count(mb_split('\s+', Multibyte::trim($short)));
            $after_non_abbreviating_terminal = in_array($previous_word_ending, $non_abbreviating_terminals, true);

            if ($after_non_abbreviating_terminal || ($has_words && $word_count > 1)) {
                $sentences[] = $sentence;

                $sentence = '';
                $has_words = false;
            }

            $has_words = $has_words || $word_count > 1;

            $sentence .= $short;
            $previous_word_ending = mb_substr($short, -1);
        }

        if ($sentence !== '') {
            $sentences[] = $sentence;
        }

        return $sentences;
    }

    /**
     * Return the sentences detected in the provided text.
     * Set the Sentence::SPLIT_TRIM flag to trim whitespace.
     *
     * @param string $text
     * @param integer $flags
     * @return string[]
     */
    public function split($text, $flags = 0)
    {
        // Each step receives the output of the previous one: the first two
        // take a string, the remaining ones take a list of fragments.
        static $pipeline = [
            'replaceFloatNumbers',
            'punctuationSplit',
            'parenthesesMerge', // also works after punctuationMerge or abbreviationMerge
            'punctuationMerge',
            'abbreviationMerge',
            'closeQuotesMerge',
            'sentenceMerge',
            'restoreReplacements'
        ];

        // clean funny quotes
        $text = Multibyte::cleanUnicode($text);

        // Split, one line at a time; blank lines produce no sentences.
        $sentences = [];
        foreach (self::linebreakSplit($text) as $input) {
            if (Multibyte::trim($input) === '') {
                continue;
            }

            foreach ($pipeline as $method) {
                $input = $this->$method($input);
            }
            $sentences = array_merge($sentences, $input);
        }

        // Post process
        if ($flags & self::SPLIT_TRIM) {
            return self::trimSentences($sentences);
        }

        return $sentences;
    }

    /**
     * Trim each string in an array using the project's Multibyte helper.
     *
     * @param string[] $sentences
     * @return string[]
     */
    private static function trimSentences($sentences)
    {
        return array_map(function ($sentence) {
            return Multibyte::trim($sentence);
        }, $sentences);
    }

    /**
     * Return the number of sentences detected in the provided text.
     *
     * @param string $text
     * @return integer
     */
    public function count($text)
    {
        return count($this->split($text));
    }

}


1			<?php
2
3			namespace Vanderlee\Sentence;
4
5			/**
6			* Segments sentences.
7			* Clipping may not be perfect.
8			* Sentence count should be VERY close to the truth.
9			*
10			* Multibyte.php safe (atleast for UTF-8), but rules based on germanic
11			* language stucture (English, Dutch, German). Should work for most
12			* latin-alphabet languages.
13			*
14			* @author Martijn van der Lee (@vanderlee)
15			* @author @marktaw
16			*/
17			class Sentence
18			{
19			/**
20			* Specify this flag with the split method to trim whitespace.
21			*/
22			const SPLIT_TRIM = 0x1;
23
24			/**
25			* List of characters used to terminate sentences.
26			*
27			* @var string[]
28			*/
29			private $terminals = ['.', '!', '?'];
30
31			/**
32			* List of characters used for abbreviations.
33			*
34			* @var string[]
35			*/
36			private $abbreviators = ['.'];
37
38			/**
39			* List of float numbers in the text
40			*
41			* @var string[]
42			*/
43			private $replacements = [];
44
45			/**
46			* Generate an in-text replacement code for the specified index
47			*
48			* @param string $index
49			*
50			* @return string
51			*/
52			private function getReplaceCode(string $index)
53			{
54			return 0x02 . $index . 0x03;
55			}
56
57			/**
58			* Clean floating point numbers by replace them with an in-text index
59			*
60			* @param string $text
61			*
62			* @return string
63			*/
64			private function replaceFloatNumbers(string $text)
65			{
66			preg_match_all('!\d+(?:\.\d+)?!', $text, $matches, PREG_OFFSET_CAPTURE);
67
68			$this->replacements = [];
69			$index = 0;
70			foreach (array_reverse($matches[0]) as $match) {
71			$number = $match[0];
72			$offset = $match[1];
73			$code = $this->getReplaceCode($index);
74
75			$this->replacements[$index] = $number;
76
77			$text = substr_replace($text, $code, $offset, mb_strlen($number));
78
79			++$index;
80			}
81
82			return $text;
			Scrutinizer issue (0 ignored issues) – Bug / Best Practice, introduced 2022-03-23 13:18 UTC: The expression `return $text` could also return the type `array`, which is incompatible with the documented return type `string`.
83			}
84
85			/**
86			* Restore any stored replacements
87			*
88			* @param string[] $text
89			*
90			* @return string[]
91			*/
92			private function restoreReplacements($text)
93			{
94			return array_map(function($value) {
95			foreach ($this->replacements as $index => $number) {
96			$code = $this->getReplaceCode($index);
97			$value = str_replace($code, $number, $value);
98			}
99			return $value;
100			}, $text);
101			}
102
103			/**
104			* Breaks a piece of text into lines by linebreak.
105			* Eats up any linebreak characters as if one.
106			*
107			* Multibyte.php safe
108			*
109			* @param string $text
110			* @return string[]
111			*/
112			private static function linebreakSplit($text)
113			{
114			$lines = [];
115			$line = '';
116
117			foreach (Multibyte::split('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
118			$line .= $part;
119			if (Multibyte::trim($part) === '') {
120			$lines[] = $line;
121			$line = '';
122			}
123			}
124			$lines[] = $line;
125
126			return $lines;
127			}
128
129			/**
130			* Splits an array of lines by (consecutive sequences of)
131			* terminals, keeping terminals.
132			*
133			* Multibyte.php safe (atleast for UTF-8)
134			*
135			* For example:
136			* "There ... is. More!"
137			* ... becomes ...
138			* [ "There ", "...", " is", ".", " More", "!" ]
139			*
140			* @param string $line
141			* @return string[]
142			*/
143			private function punctuationSplit($line)
144			{
145			$parts = [];
146
147			$chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY); // This is UTF8 multibyte safe!
148			$is_terminal = in_array($chars[0], $this->terminals);
149
150			$part = '';
151			foreach ($chars as $index => $char) {
152			if (in_array($char, $this->terminals) !== $is_terminal) {
153			$parts[] = $part;
154			$part = '';
155			$is_terminal = !$is_terminal;
156			}
157			$part .= $char;
158			}
159
160			if (!empty($part)) {
161			$parts[] = $part;
162			}
163
164			return $parts;
165			}
166
167			/**
168			* Appends each terminal item after it's preceding
169			* non-terminals.
170			*
171			* Multibyte.php safe (atleast for UTF-8)
172			*
173			* For example:
174			* [ "There ", "...", " is", ".", " More", "!" ]
175			* ... becomes ...
176			* [ "There ... is.", "More!" ]
177			*
178			* @param string[] $punctuations
179			* @return string[]
180			*/
181			private function punctuationMerge($punctuations)
182			{
183			$definite_terminals = array_diff($this->terminals, $this->abbreviators);
184
185			$merges = [];
186			$merge = '';
187
188			$filtered = array_filter($punctuations, function ($p) {
189			return $p !== '';
190			});
191
192			foreach ($filtered as $punctuation) {
193			$merge .= $punctuation;
194			if (mb_strlen($punctuation) === 1
195			&& in_array($punctuation, $this->terminals)) {
196			$merges[] = $merge;
197			$merge = '';
198			} else {
199			foreach ($definite_terminals as $terminal) {
200			if (mb_strpos($punctuation, $terminal) !== false) {
201			$merges[] = $merge;
202			$merge = '';
203			break;
204			}
205			}
206			}
207			}
208			if (!empty($merge)) {
209			$merges[] = $merge;
210			}
211
212			return $merges;
213			}
214
215			/**
216			* Looks for capitalized abbreviations & includes them with the following fragment.
217			*
218			* For example:
219			* [ "Last week, former director of the F.B.I. James B. Comey was fired. Mr. Comey was not available for comment." ]
220			* ... becomes ...
221			* [ "Last week, former director of the F.B.I. James B. Comey was fired." ]
222			* [ "Mr. Comey was not available for comment." ]
223			*
224			* @param string[] $fragments
225			* @return string[]
226			*/
227			private function abbreviationMerge($fragments)
228			{
229			$return_fragment = [];
230
231			$previous_fragment = '';
232			$previous_is_abbreviation = false;
233			$i = 0;
234			foreach ($fragments as $fragment) {
235			$is_abbreviation = self::isAbreviation($fragment);
236
237			// merge previous fragment with this
238			if ($previous_is_abbreviation) {
239			$fragment = $previous_fragment . $fragment;
240			}
241			$return_fragment[$i] = $fragment;
242
243			$previous_is_abbreviation = $is_abbreviation;
244			$previous_fragment = $fragment;
245
246			// only increment if this isn't an abbreviation
247			if (!$is_abbreviation) {
248			$i++;
249			}
250			}
251			return $return_fragment;
252			}
253
254			/**
255			* Check if the last word of fragment starts with a Capital, ends in "." & has less than 3 characters.
256			*
257			* @param $fragment
258			* @return bool
259			*/
260			private static function isAbreviation($fragment)
261			{
262			$words = mb_split('\s+', Multibyte::trim($fragment));
263
264			$word_count = count($words);
265
266			$last_word = Multibyte::trim($words[$word_count - 1]);
267			$last_is_capital = preg_match('#^\p{Lu}#u', $last_word);
268			$last_is_abbreviation = mb_substr(Multibyte::trim($fragment), -1) === '.';
269
270			return $last_is_capital > 0
271			&& $last_is_abbreviation > 0
272			&& mb_strlen($last_word) <= 3;
273			}
274
275			/**
276			* Merges any part starting with a closing parenthesis ')' to the previous
277			* part.
278			*
279			* @param string[] $parts
280			* @return string[]
281			*/
282			private function parenthesesMerge($parts)
283			{
284			$subsentences = [];
285
286			foreach ($parts as $part) {
287			if ($part[0] === ')') {
288			$subsentences[count($subsentences) - 1] .= $part;
289			} else {
290			$subsentences[] = $part;
291			}
292			}
293
294			return $subsentences;
295			}
296
297			/**
298			* Looks for closing quotes to include them with the previous statement.
299			* "That was very interesting," he said.
300			* "That was very interesting."
301			*
302			* @param string[] $statements
303			* @return string[]
304			*/
305			private function closeQuotesMerge($statements)
306			{
307			$i = 0;
308			$previous_statement = '';
309			$return = [];
310			foreach ($statements as $statement) {
311			if (self::isEndQuote($statement)) {
312			$statement = $previous_statement . $statement;
313			} else {
314			$i++;
315			}
316
317			$return[$i] = $statement;
318			$previous_statement = $statement;
319			}
320
321			return $return;
322			}
323
324			/**
325			* Check if the entire string is a quotation mark or quote, then space, then lowercase.
326			*
327			* @param $statement
328			* @return bool
329			*/
330			private static function isEndQuote($statement)
331			{
332			$trimmed = Multibyte::trim($statement);
333			$first = mb_substr($statement, 0, 1);
334
335			return in_array($trimmed, ['"', '\''])
336			|| (
337			in_array($first, ['"', '\''])
338			&& mb_substr($statement, 1, 1) === ' '
339			&& ctype_lower(mb_substr($statement, 2, 1)) === true
340			);
341			}
342
343			/**
344			* Merges items into larger sentences.
345			* Multibyte.php safe
346			*
347			* @param string[] $shorts
348			* @return string[]
349			*/
350			private function sentenceMerge($shorts)
351			{
352			$non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);
353
354			$sentences = [];
355
356			$sentence = '';
357			$has_words = false;
358			$previous_word_ending = null;
359			foreach ($shorts as $short) {
360			$word_count = count(mb_split('\s+', Multibyte::trim($short)));
361			$after_non_abbreviating_terminal = in_array($previous_word_ending, $non_abbreviating_terminals);
362
363			if ($after_non_abbreviating_terminal
364			|| ($has_words && $word_count > 1)) {
365
366			$sentences[] = $sentence;
367
368			$sentence = '';
369			$has_words = false;
370			}
371
372			$has_words = $has_words
373			|| $word_count > 1;
374
375			$sentence .= $short;
376			$previous_word_ending = mb_substr($short, -1);
377			}
378
379			if (!empty($sentence)) {
380			$sentences[] = $sentence;
381			}
382
383			return $sentences;
384			}
385
386			/**
387			* Return the sentences sentences detected in the provided text.
388			* Set the Sentence::SPLIT_TRIM flag to trim whitespace.
389			* @param string $text
390			* @param integer $flags
391			* @return string[]
392			*/
393			public function split($text, $flags = 0)
394			{
395			static $pipeline = [
396			'replaceFloatNumbers',
397			'punctuationSplit',
398			'parenthesesMerge', // also works after punctuationMerge or abbreviationMerge
399			'punctuationMerge',
400			'abbreviationMerge',
401			'closeQuotesMerge',
402			'sentenceMerge',
403			'restoreReplacements'
404			];
405
406			// clean funny quotes
407			$text = Multibyte::cleanUnicode($text);
408
409			// Split
410			$sentences = [];
411			foreach (self::linebreakSplit($text) as $input) {
412			if (Multibyte::trim($input) !== '') {
413			foreach ($pipeline as $method) {
414			$input = $this->$method($input);
415			}
416			$sentences = array_merge($sentences, $input);
417			}
418			}
419
420			// Post process
421			if ($flags & self::SPLIT_TRIM) {
422			return self::trimSentences($sentences);
423			}
424
425			return $sentences;
426			}
427
428			/**
429			* Multibyte.php trim each string in an array.
430			* @param string[] $sentences
431			* @return string[]
432			*/
433			private static function trimSentences($sentences)
434			{
435			return array_map(function ($sentence) {
436			return Multibyte::trim($sentence);
437			}, $sentences);
438			}
439
440			/**
441			* Return the number of sentences detected in the provided text.
442			* @param string $text
443			* @return integer
444			*/
445			public function count($text)
446			{
447			return count($this->split($text));
448			}
449
450			}
451

vanderlee / php-sentence

Push — master ( bd113f...e772b1 )

Sentence::isAbreviation() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like