Sentence::count() - Code Metrics - Inspection of "Fix some longstanding minor issues" - vanderlee/php-sentence - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( 538637...3febfa )

by Martijn

created 2024-04-11 09:45 UTC

Sentence::count() A

↳ Parent: Sentence

Complexity

Conditions	1
Paths	1

Size

Total Lines	3
Code Lines	1

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
eloc	1
dl	0
loc	3
rs	10
c	0
b	0
f	0
cc	1
nc	1
nop	1

<?php

namespace Vanderlee\Sentence;

/**
 * Segments text into sentences.
 * Clipping may not be perfect.
 * Sentence count should be VERY close to the truth.
 *
 * Multibyte safe (at least for UTF-8), but the rules are based on germanic
 * language structure (English, Dutch, German). Should work for most
 * latin-alphabet languages.
 *
 * @author Martijn van der Lee (@vanderlee)
 * @author @marktaw
 */
class Sentence
{

    /**
     * Specify this flag with the split method to trim whitespace.
     */
    const SPLIT_TRIM = 0x1;

    /**
     * List of characters used to terminate sentences.
     *
     * @var string[]
     */
    private $terminals = ['.', '!', '?'];

    /**
     * List of characters used for abbreviations.
     *
     * @var string[]
     */
    private $abbreviators = ['.'];

    /**
     * Numbers taken out of the line currently being processed, keyed by the
     * index that is embedded in their in-text replacement code.
     *
     * @var string[]
     */
    private $replacements = [];

    /**
     * Generate an in-text replacement code for the specified index.
     *
     * The index is wrapped in the STX (0x02) and ETX (0x03) control
     * characters. The delimiters must be real characters and not the integer
     * literals 0x02/0x03: concatenating the integers yields plain digit
     * strings such as "203", which collide with restored numbers and with
     * other codes (code 20 "2203" contains code 0 "203").
     *
     * @param int $index
     *
     * @return string
     */
    private function getReplaceCode($index)
    {
        return "\x02" . $index . "\x03";
    }

    /**
     * Hide (floating point) numbers by replacing each with an in-text code,
     * so that a decimal point is not mistaken for a sentence terminal.
     * The removed numbers are stored for restoreReplacements().
     *
     * @param string $text
     *
     * @return string
     */
    private function replaceFloatNumbers($text)
    {
        $matches = [];
        preg_match_all('!\d+(?:\.\d+)?!', $text, $matches, PREG_OFFSET_CAPTURE);

        $this->replacements = [];
        $index = 0;

        // Work from the last match to the first, so the byte offsets of the
        // matches that still have to be replaced remain valid.
        foreach (array_reverse($matches[0]) as $match) {
            $number = $match[0];
            $offset = $match[1];

            $this->replacements[$index] = $number;

            // PREG_OFFSET_CAPTURE reports byte offsets and substr_replace()
            // works on bytes, so the length must be a byte length as well.
            $text = (string)substr_replace($text, $this->getReplaceCode($index), $offset, strlen($number));

            ++$index;
        }

        return $text;
    }

    /**
     * Restore any stored replacements.
     *
     * @param string[]|string $text Sentences (or a single string) containing replacement codes.
     *
     * @return string[]|string Same shape as the input, keys preserved.
     */
    private function restoreReplacements($text)
    {
        if (empty($this->replacements)) {
            return $text;
        }

        $codes = [];
        $numbers = [];
        foreach ($this->replacements as $index => $number) {
            $codes[] = $this->getReplaceCode($index);
            $numbers[] = $number;
        }

        // str_replace() accepts an array subject and keeps its keys.
        return str_replace($codes, $numbers, $text);
    }

    /**
     * Breaks a piece of text into lines by linebreak.
     * Eats up any linebreak characters as if one.
     *
     * Multibyte safe
     *
     * @param string $text
     *
     * @return string[]
     */
    private static function linebreakSplit($text)
    {
        $lines = [];
        $line = '';

        foreach (Multibyte::split('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
            $line .= $part;
            // A whitespace-only part (such as the captured linebreaks) closes the current line.
            if (Multibyte::trim($part) === '') {
                $lines[] = $line;
                $line = '';
            }
        }
        $lines[] = $line;

        return $lines;
    }

    /**
     * Splits a line by (consecutive sequences of) terminals, keeping the
     * terminals.
     *
     * Multibyte safe (at least for UTF-8)
     *
     * For example:
     *    "There ... is. More!"
     *        ... becomes ...
     *    [ "There ", "...", " is", ".", " More", "!" ]
     *
     * @param string $line
     *
     * @return string[] Empty when the line is empty or cannot be split as UTF-8.
     */
    private function punctuationSplit($line)
    {
        $chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY); // This is UTF8 multibyte safe!
        if (empty($chars)) {
            // preg_split() yields false on failure and [] for an empty line.
            return [];
        }

        $parts = [];
        $part = '';
        $is_terminal = in_array($chars[0], $this->terminals, true);

        foreach ($chars as $char) {
            // Start a new part whenever we cross between terminal and non-terminal characters.
            if (in_array($char, $this->terminals, true) !== $is_terminal) {
                $parts[] = $part;
                $part = '';
                $is_terminal = !$is_terminal;
            }
            $part .= $char;
        }

        // Compare against '' explicitly; empty() would also drop a part equal to "0".
        if ($part !== '') {
            $parts[] = $part;
        }

        return $parts;
    }

    /**
     * Appends each terminal item after its preceding non-terminals.
     *
     * Multibyte safe (at least for UTF-8)
     *
     * For example:
     *    [ "There ", "...", " is", ".", " More", "!" ]
     *        ... becomes ...
     *    [ "There ... is.", " More!" ]
     *
     * @param string[] $punctuations
     *
     * @return string[]
     */
    private function punctuationMerge($punctuations)
    {
        // Terminals that can never be part of an abbreviation.
        $definite_terminals = array_diff($this->terminals, $this->abbreviators);

        $merges = [];
        $merge = '';

        foreach ($punctuations as $punctuation) {
            if ($punctuation === '') {
                continue;
            }

            $merge .= $punctuation;

            if (mb_strlen($punctuation) === 1
                && in_array($punctuation, $this->terminals, true)) {
                // A single terminal character closes the current merge.
                $merges[] = $merge;
                $merge = '';
                continue;
            }

            // Otherwise only a terminal that cannot abbreviate closes the merge.
            foreach ($definite_terminals as $terminal) {
                if (mb_strpos($punctuation, $terminal) !== false) {
                    $merges[] = $merge;
                    $merge = '';
                    break;
                }
            }
        }

        // Compare against '' explicitly; empty() would also drop a merge equal to "0".
        if ($merge !== '') {
            $merges[] = $merge;
        }

        return $merges;
    }

    /**
     * Looks for capitalized abbreviations & includes them with the following fragment.
     *
     * For example:
     *    [ "Last week, former director of the F.", "B.", "I.", " James B.", " Comey was fired." ]
     *        ... becomes ...
     *    [ "Last week, former director of the F.B.I. James B. Comey was fired." ]
     *
     * @param string[] $fragments
     *
     * @return string[]
     */
    private function abbreviationMerge($fragments)
    {
        $return_fragment = [];

        $previous_fragment = '';
        $previous_is_abbreviation = false;
        $i = 0;
        foreach ($fragments as $fragment) {
            $is_abbreviation = self::isAbbreviation($fragment);

            // merge previous fragment with this
            if ($previous_is_abbreviation) {
                $fragment = $previous_fragment . $fragment;
            }
            $return_fragment[$i] = $fragment;

            $previous_is_abbreviation = $is_abbreviation;
            $previous_fragment = $fragment;

            // only increment if this isn't an abbreviation; otherwise the
            // next (merged) fragment overwrites this slot
            if (!$is_abbreviation) {
                $i++;
            }
        }

        return $return_fragment;
    }

    /**
     * Check if the fragment ends in "." and its last word starts with a
     * capital and is at most 3 characters long (terminal included).
     *
     * @param string $fragment
     *
     * @return bool
     */
    private static function isAbbreviation($fragment)
    {
        $trimmed = Multibyte::trim($fragment);
        $words = mb_split('\s+', $trimmed);

        $last_word = Multibyte::trim($words[count($words) - 1]);

        return preg_match('#^\p{Lu}#u', $last_word) > 0
            && mb_substr($trimmed, -1) === '.'
            && mb_strlen($last_word) <= 3;
    }

    /**
     * Merges any part starting with a closing parenthesis ')' to the previous
     * part.
     *
     * @param string[] $parts
     *
     * @return string[]
     */
    private function parenthesesMerge($parts)
    {
        $subSentences = [];

        foreach ($parts as $part) {
            // strpos() instead of $part[0]: no warning on an empty part.
            if (strpos($part, ')') === 0 && !empty($subSentences)) {
                $subSentences[count($subSentences) - 1] .= $part;
            } else {
                $subSentences[] = $part;
            }
        }

        return $subSentences;
    }

    /**
     * Looks for closing quotes to include them with the previous statement.
     * "That was very interesting," he said.
     * "That was very interesting."
     *
     * The keys of the result are not guaranteed to start at 0.
     *
     * @param string[] $statements
     *
     * @return string[]
     */
    private function closeQuotesMerge($statements)
    {
        $i = 0;
        $previous_statement = '';
        $return = [];
        foreach ($statements as $statement) {
            if (self::isEndQuote($statement)) {
                // Glue onto the previous statement and overwrite its slot.
                $statement = $previous_statement . $statement;
            } else {
                $i++;
            }

            $return[$i] = $statement;
            $previous_statement = $statement;
        }

        return $return;
    }

    /**
     * Check if the entire string is a quotation mark or quote, then space, then lowercase.
     *
     * The lowercase test uses the Unicode property \p{Ll}, so non-ASCII
     * lowercase letters are recognised as well.
     *
     * @param string $statement
     *
     * @return bool
     */
    private static function isEndQuote($statement)
    {
        $quotes = ['"', '\''];

        if (in_array(Multibyte::trim($statement), $quotes, true)) {
            return true;
        }

        return in_array(mb_substr($statement, 0, 1), $quotes, true)
            && mb_substr($statement, 1, 1) === ' '
            && preg_match('#^\p{Ll}#u', mb_substr($statement, 2, 1)) === 1;
    }

    /**
     * Merges items into larger sentences.
     * Multibyte safe
     *
     * @param string[] $shorts
     *
     * @return string[]
     */
    private function sentenceMerge($shorts)
    {
        $non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);

        $sentences = [];

        $sentence = '';
        $has_words = false;
        $previous_word_ending = null;
        foreach ($shorts as $short) {
            $word_count = count(mb_split('\s+', Multibyte::trim($short)));
            $after_non_abbreviating_terminal = in_array($previous_word_ending, $non_abbreviating_terminals, true);

            // Close the running sentence after a definite terminal, or when
            // both it and the next item consist of multiple words.
            if ($after_non_abbreviating_terminal
                || ($has_words && $word_count > 1)) {

                $sentences[] = $sentence;

                $sentence = '';
                $has_words = false;
            }

            $has_words = $has_words
                || $word_count > 1;

            $sentence .= $short;
            $previous_word_ending = mb_substr($short, -1);
        }

        // Compare against '' explicitly; empty() would also drop a sentence equal to "0".
        if ($sentence !== '') {
            $sentences[] = $sentence;
        }

        return $sentences;
    }

    /**
     * Return the sentences detected in the provided text.
     * Set the Sentence::SPLIT_TRIM flag to trim whitespace.
     *
     * @param string   $text
     * @param integer  $flags
     * @param string[] $pipeline Names of the methods of this class to apply, in order, to
     *                           every non-blank line. Leave empty to use the default pipeline.
     *
     * @return string[]
     */
    public function split($text, $flags = 0, $pipeline = [])
    {
        if (empty($pipeline)) {
            $pipeline = [
                'replaceFloatNumbers',
                'punctuationSplit',
                'parenthesesMerge', // also works after punctuationMerge or abbreviationMerge
                'punctuationMerge',
                'abbreviationMerge',
                'closeQuotesMerge',
                'sentenceMerge',
                'restoreReplacements',
            ];
        }

        // clean funny quotes
        $text = Multibyte::cleanUnicode($text);

        // Split
        $sentences = [];
        foreach (self::linebreakSplit($text) as $input) {
            if (Multibyte::trim($input) !== '') {
                // Each step receives the output of the previous one.
                foreach ($pipeline as $method) {
                    $input = $this->$method($input);
                }
                $sentences = array_merge($sentences, $input);
            }
        }

        // Post process
        if ($flags & self::SPLIT_TRIM) {
            return self::trimSentences($sentences);
        }

        return $sentences;
    }

    /**
     * Multibyte trim each string in an array.
     *
     * @param string[] $sentences
     *
     * @return string[]
     */
    private static function trimSentences($sentences)
    {
        return array_map(function ($sentence) {
            return Multibyte::trim($sentence);
        }, $sentences);
    }

    /**
     * Return the number of sentences detected in the provided text.
     *
     * @param string $text
     *
     * @return integer
     */
    public function count($text)
    {
        return count($this->split($text));
    }

}


1			<?php
2
3			namespace Vanderlee\Sentence;
4
5			/**
6			* Segments sentences.
7			* Clipping may not be perfect.
8			* Sentence count should be VERY close to the truth.
9			*
10			* Multibyte.php safe (at least for UTF-8), but rules based on germanic
11			* language structure (English, Dutch, German). Should work for most
12			* latin-alphabet languages.
13			*
14			* @author Martijn van der Lee (@vanderlee)
15			* @author @marktaw
16			*/
17			class Sentence
18			{
19
20			/**
21			* Specify this flag with the split method to trim whitespace.
22			*/
23			const SPLIT_TRIM = 0x1;
24
25			/**
26			* List of characters used to terminate sentences.
27			*
28			* @var string[]
29			*/
30			private $terminals = ['.', '!', '?'];
31
32			/**
33			* List of characters used for abbreviations.
34			*
35			* @var string[]
36			*/
37			private $abbreviators = ['.'];
38
39			/**
40			* List of replacements in the text.
41			*
42			* @var string[]
43			*/
44			private $replacements = [];
45
46			/**
47			* Generate an in-text replacement code for the specified index
48			*
49			* @param int $index
50			*
51			* @return string
52			*/
53			private function getReplaceCode($index)
54			{
55			return 0x02 . $index . 0x03;
56			}
57
58			/**
59			* Clean floating point numbers by replace them with an in-text index
60			*
61			* @param string $text
62			*
63			* @return string
64			*/
65			private function replaceFloatNumbers($text)
66			{
67			$matches = array();
68			preg_match_all('!\d+(?:\.\d+)?!', $text, $matches, PREG_OFFSET_CAPTURE);
69
70			$this->replacements = [];
71			$index = 0;
72			foreach (array_reverse($matches[0]) as $match) {
73			$number = $match[0];
74			$offset = $match[1];
75			$code = $this->getReplaceCode($index);
76
77			$this->replacements[$index] = $number;
78
79			$text = (string)substr_replace($text, $code, $offset, mb_strlen($number));
80
81			++$index;
82			}
83
84			return $text;
85			}
86
87			/**
88			* Restore any stored replacements
89			*
90			* @param string[] $text
91			*
92			* @return string[]
93			*/
94			private function restoreReplacements($text)
95			{
96			return array_map(function ($value) {
97			foreach ($this->replacements as $index => $number) {
98			$code = $this->getReplaceCode($index);
99			$value = str_replace($code, $number, $value);
100			}
101
102			return $value;
103			}, $text);
104			}
105
106			/**
107			* Breaks a piece of text into lines by linebreak.
108			* Eats up any linebreak characters as if one.
109			*
110			* Multibyte.php safe
111			*
112			* @param string $text
113			*
114			* @return string[]
115			*/
116			private static function linebreakSplit($text)
117			{
118			$lines = [];
119			$line = '';
120
121			foreach (Multibyte::split('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
122			$line .= $part;
123			if (Multibyte::trim($part) === '') {
124			$lines[] = $line;
125			$line = '';
126			}
127			}
128			$lines[] = $line;
129
130			return $lines;
131			}
132
133			/**
134			* Splits an array of lines by (consecutive sequences of)
135			* terminals, keeping terminals.
136			*
137			* Multibyte.php safe (at least for UTF-8)
138			*
139			* For example:
140			* "There ... is. More!"
141			* ... becomes ...
142			* [ "There ", "...", " is", ".", " More", "!" ]
143			*
144			* @param string $line
145			*
146			* @return string[]
147			*/
148			private function punctuationSplit($line)
149			{
150			$parts = [];
151
152			$chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY); // This is UTF8 multibyte safe!
153			$is_terminal = in_array($chars[0], $this->terminals);
154
155			$part = '';
156			foreach ($chars as $char) {
157			if (in_array($char, $this->terminals) !== $is_terminal) {
158			$parts[] = $part;
159			$part = '';
160			$is_terminal = !$is_terminal;
161			}
162			$part .= $char;
163			}
164
165			if (!empty($part)) {
166			$parts[] = $part;
167			}
168
169			return $parts;
170			}
171
172			/**
173			* Appends each terminal item after it's preceding
174			* non-terminals.
175			*
176			* Multibyte.php safe (at least for UTF-8)
177			*
178			* For example:
179			* [ "There ", "...", " is", ".", " More", "!" ]
180			* ... becomes ...
181			* [ "There ... is.", "More!" ]
182			*
183			* @param string[] $punctuations
184			*
185			* @return string[]
186			*/
187			private function punctuationMerge($punctuations)
188			{
189			$definite_terminals = array_diff($this->terminals, $this->abbreviators);
190
191			$merges = [];
192			$merge = '';
193
194			$filtered = array_filter($punctuations, function ($p) {
195			return $p !== '';
196			});
197
198			foreach ($filtered as $punctuation) {
199			$merge .= $punctuation;
200			if (mb_strlen($punctuation) === 1
201			&& in_array($punctuation, $this->terminals)) {
202			$merges[] = $merge;
203			$merge = '';
204			} else {
205			foreach ($definite_terminals as $terminal) {
206			if (mb_strpos($punctuation, $terminal) !== false) {
207			$merges[] = $merge;
208			$merge = '';
209			break;
210			}
211			}
212			}
213			}
214			if (!empty($merge)) {
215			$merges[] = $merge;
216			}
217
218			return $merges;
219			}
220
221			/**
222			* Looks for capitalized abbreviations & includes them with the following fragment.
223			*
224			* For example:
225			* [ "Last week, former director of the F.B.I. James B. Comey was fired. Mr. Comey was not available for comment." ]
226			* ... becomes ...
227			* [ "Last week, former director of the F.B.I. James B. Comey was fired." ]
228			* [ "Mr. Comey was not available for comment." ]
229			*
230			* @param string[] $fragments
231			*
232			* @return string[]
233			*/
234			private function abbreviationMerge($fragments)
235			{
236			$return_fragment = [];
237
238			$previous_fragment = '';
239			$previous_is_abbreviation = false;
240			$i = 0;
241			foreach ($fragments as $fragment) {
242			$is_abbreviation = self::isAbbreviation($fragment);
243
244			// merge previous fragment with this
245			if ($previous_is_abbreviation) {
246			$fragment = $previous_fragment . $fragment;
247			}
248			$return_fragment[$i] = $fragment;
249
250			$previous_is_abbreviation = $is_abbreviation;
251			$previous_fragment = $fragment;
252
253			// only increment if this isn't an abbreviation
254			if (!$is_abbreviation) {
255			$i++;
256			}
257			}
258
259			return $return_fragment;
260			}
261
262			/**
263			* Check if the last word of fragment starts with a Capital, ends in "." & has less than 3 characters.
264			*
265			* @param $fragment
266			*
267			* @return bool
268			*/
269			private static function isAbbreviation($fragment)
270			{
271			$words = mb_split('\s+', Multibyte::trim($fragment));
272
273			$word_count = count($words);
274
275			$last_word = Multibyte::trim($words[$word_count - 1]);
276			$last_is_capital = preg_match('#^\p{Lu}#u', $last_word);
277			$last_is_abbreviation = mb_substr(Multibyte::trim($fragment), -1) === '.';
278
279			return $last_is_capital > 0
280			&& $last_is_abbreviation > 0
281			&& mb_strlen($last_word) <= 3;
282			}
283
284			/**
285			* Merges any part starting with a closing parenthesis ')' to the previous
286			* part.
287			*
288			* @param string[] $parts
289			*
290			* @return string[]
291			*/
292			private function parenthesesMerge($parts)
293			{
294			$subSentences = [];
295
296			foreach ($parts as $part) {
297			if ($part[0] === ')' && !empty($subSentences)) {
298			$subSentences[count($subSentences) - 1] .= $part;
299			} else {
300			$subSentences[] = $part;
301			}
302			}
303
304			return $subSentences;
305			}
306
307			/**
308			* Looks for closing quotes to include them with the previous statement.
309			* "That was very interesting," he said.
310			* "That was very interesting."
311			*
312			* @param string[] $statements
313			*
314			* @return string[]
315			*/
316			private function closeQuotesMerge($statements)
317			{
318			$i = 0;
319			$previous_statement = '';
320			$return = [];
321			foreach ($statements as $statement) {
322			if (self::isEndQuote($statement)) {
323			$statement = $previous_statement . $statement;
324			} else {
325			$i++;
326			}
327
328			$return[$i] = $statement;
329			$previous_statement = $statement;
330			}
331
332			return $return;
333			}
334
335			/**
336			* Check if the entire string is a quotation mark or quote, then space, then lowercase.
337			*
338			* @param $statement
339			*
340			* @return bool
341			*/
342			private static function isEndQuote($statement)
343			{
344			$trimmed = Multibyte::trim($statement);
345			$first = mb_substr($statement, 0, 1);
346
347			return in_array($trimmed, ['"', '\''])
348			|| (
349			in_array($first, ['"', '\''])
350			&& mb_substr($statement, 1, 1) === ' '
351			&& ctype_lower(mb_substr($statement, 2, 1)) === true
352			);
353			}
354
355			/**
356			* Merges items into larger sentences.
357			* Multibyte.php safe
358			*
359			* @param string[] $shorts
360			*
361			* @return string[]
362			*/
363			private function sentenceMerge($shorts)
364			{
365			$non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);
366
367			$sentences = [];
368
369			$sentence = '';
370			$has_words = false;
371			$previous_word_ending = null;
372			foreach ($shorts as $short) {
373			$word_count = count(mb_split('\s+', Multibyte::trim($short)));
374			$after_non_abbreviating_terminal = in_array($previous_word_ending, $non_abbreviating_terminals);
375
376			if ($after_non_abbreviating_terminal
377			|| ($has_words && $word_count > 1)) {
378
379			$sentences[] = $sentence;
380
381			$sentence = '';
382			$has_words = false;
383			}
384
385			$has_words = $has_words
386			|| $word_count > 1;
387
388			$sentence .= $short;
389			$previous_word_ending = mb_substr($short, -1);
390			}
391
392			if (!empty($sentence)) {
393			$sentences[] = $sentence;
394			}
395
396			return $sentences;
397			}
398
399			/**
400			* Return the sentences detected in the provided text.
401			* Set the Sentence::SPLIT_TRIM flag to trim whitespace.
402			*
403			* @param string $text
404			* @param integer $flags
405			*
406			* @return string[]
407			*/
408			public function split($text, $flags = 0, $pipeline = [])
409			{
410			if (empty($pipeline)) {
411			static $pipeline = [
412			'replaceFloatNumbers',
413			'punctuationSplit',
414			'parenthesesMerge', // also works after punctuationMerge or abbreviationMerge
415			'punctuationMerge',
416			'abbreviationMerge',
417			'closeQuotesMerge',
418			'sentenceMerge',
419			'restoreReplacements',
420			];
421			}
422
423			// clean funny quotes
424			$text = Multibyte::cleanUnicode($text);
425
426			// Split
427			$sentences = [];
428			foreach (self::linebreakSplit($text) as $input) {
429			if (Multibyte::trim($input) !== '') {
430			foreach ($pipeline as $method) {
431			$input = $this->$method($input);
432			}
433			$sentences = array_merge($sentences, $input);
434			}
435			}
436
437			// Post process
438			if ($flags & self::SPLIT_TRIM) {
439			return self::trimSentences($sentences);
440			}
441
442			return $sentences;
443			}
444
445			/**
446			* Multibyte.php trim each string in an array.
447			*
448			* @param string[] $sentences
449			*
450			* @return string[]
451			*/
452			private static function trimSentences($sentences)
453			{
454			return array_map(function ($sentence) {
455			return Multibyte::trim($sentence);
456			}, $sentences);
457			}
458
459			/**
460			* Return the number of sentences detected in the provided text.
461			*
462			* @param string $text
463			*
464			* @return integer
465			*/
466			public function count($text)
467			{
468			return count($this->split($text));
469			}
470
471			}
472

vanderlee / php-sentence

Push — master ( 538637...3febfa )

Sentence::count() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like