Sentence::floatNumberClean() - Code Metrics - Inspection of "Add float number sense" - vanderlee/php-sentence - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — master (#16)

by Umut

created 2021-10-26 23:41 UTC

Sentence::floatNumberClean() A

↳ Parent: Sentence

Complexity

Conditions	2
Paths	2

Size

Total Lines	11
Code Lines	5

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
eloc	5
c	0
b	0
f	0
dl	0
loc	11
rs	10
cc	2
nc	2
nop	1

<?php

namespace Vanderlee\Sentence;

/**
 * Segments sentences.
 * Clipping may not be perfect.
 * Sentence count should be VERY close to the truth.
 *
 * Multibyte.php safe (at least for UTF-8), but rules based on Germanic
 * language structure (English, Dutch, German). Should work for most
 * Latin-alphabet languages.
 *
 * @author Martijn van der Lee (@vanderlee)
 * @author @marktaw
 */
class Sentence
{

    /**
     * Specify this flag with the split method to trim whitespace.
     */
    const SPLIT_TRIM = 0x1;

    /**
     * List of characters used to terminate sentences.
     *
     * @var string[]
     */
    private $terminals = ['.', '!', '?'];

    /**
     * List of characters used for abbreviations.
     *
     * @var string[]
     */
    private $abbreviators = ['.'];

    /**
     * Numbers hidden from the line that is currently being processed,
     * keyed by the md5 token that stands in for them.
     *
     * Rebuilt by every floatNumberClean() call and consumed by
     * floatNumberRevert().
     *
     * @var string[]
     */
    private $floatNumbers = [];

    /**
     * Hide numbers (integers and decimals) by replacing each occurrence with
     * its md5 hash, so that a decimal point is not seen as a terminal.
     *
     * All occurrences are substituted in a single pass. Replacing number by
     * number over the whole text would also rewrite digits inside hashes that
     * were inserted earlier, leaving tokens that can no longer be reverted.
     *
     * @param string $text
     *
     * @return string
     */
    private function floatNumberClean(string $text): string
    {
        $tokens = [];

        $cleaned = preg_replace_callback(
            '!\d+(?:\.\d+)?!',
            function ($match) use (&$tokens) {
                $token = md5($match[0]);
                $tokens[$token] = $match[0];

                return $token;
            },
            $text
        );

        if ($cleaned === null) {
            // PCRE failure: leave the text untouched and register no tokens.
            $this->floatNumbers = [];

            return $text;
        }

        $this->floatNumbers = $tokens;

        return $cleaned;
    }

    /**
     * Put the numbers hidden by floatNumberClean() back into each string.
     *
     * @param string[] $text
     *
     * @return string[]
     */
    private function floatNumberRevert(array $text): array
    {
        $tokens = $this->floatNumbers;
        if ($tokens === []) {
            return $text;
        }

        // strtr() swaps all tokens in one pass; restored digits are never rescanned.
        return array_map(static function ($value) use ($tokens) {
            return strtr($value, $tokens);
        }, $text);
    }

    /**
     * Breaks a piece of text into lines by linebreak.
     * Eats up any linebreak characters as if one.
     *
     * Every captured run of linebreaks stays attached to the end of the line
     * it closes. The remainder after the last linebreak is always appended,
     * even when it is empty.
     *
     * Multibyte.php safe
     *
     * @param string $text
     * @return string[]
     */
    private static function linebreakSplit($text)
    {
        $lines = [];
        $line = '';

        foreach (Multibyte::split('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
            $line .= $part;
            // A part that trims down to nothing closes the current line.
            if (Multibyte::trim($part) === '') {
                $lines[] = $line;
                $line = '';
            }
        }
        $lines[] = $line;

        return $lines;
    }

    /**
     * Splits a line into alternating runs of non-terminal and terminal
     * characters, keeping the terminals.
     *
     * Multibyte.php safe (at least for UTF-8)
     *
     * For example:
     *    "There ... is. More!"
     *        ... becomes ...
     *    [ "There ", "...", " is", ".", " More", "!" ]
     *
     * @param string $line
     * @return string[]
     */
    private function punctuationSplit(string $line): array
    {
        $chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY); // This is UTF8 multibyte safe!
        if ($chars === false) {
            // The line could not be split into characters (e.g. invalid UTF-8);
            // keep it as one part rather than losing the text.
            return [$line];
        }
        if ($chars === []) {
            return [];
        }

        $parts = [];
        $part = '';
        $is_terminal = in_array($chars[0], $this->terminals, true);

        foreach ($chars as $char) {
            // Start a new part whenever we cross a terminal/non-terminal boundary.
            if (in_array($char, $this->terminals, true) !== $is_terminal) {
                $parts[] = $part;
                $part = '';
                $is_terminal = !$is_terminal;
            }
            $part .= $char;
        }

        if ($part !== '') {
            $parts[] = $part;
        }

        return $parts;
    }

    /**
     * Appends each terminal item after its preceding non-terminals.
     *
     * A merge is closed by a single terminal character, or by any item that
     * contains a terminal which is not also an abbreviator.
     *
     * Multibyte.php safe (at least for UTF-8)
     *
     * For example:
     *    [ "There ", "...", " is", ".", " More", "!" ]
     *        ... becomes ...
     *    [ "There ... is.", " More!" ]
     *
     * @param string[] $punctuations
     * @return string[]
     */
    private function punctuationMerge(array $punctuations): array
    {
        $definite_terminals = array_diff($this->terminals, $this->abbreviators);

        $merges = [];
        $merge = '';

        foreach ($punctuations as $punctuation) {
            if ($punctuation === '') {
                continue;
            }

            $merge .= $punctuation;

            $is_single_terminal = mb_strlen($punctuation) === 1
                && in_array($punctuation, $this->terminals, true);

            if ($is_single_terminal || self::containsAny($punctuation, $definite_terminals)) {
                $merges[] = $merge;
                $merge = '';
            }
        }

        if ($merge !== '') {
            $merges[] = $merge;
        }

        return $merges;
    }

    /**
     * Check whether the text contains at least one of the given strings.
     *
     * @param string $haystack
     * @param string[] $needles
     * @return bool
     */
    private static function containsAny(string $haystack, array $needles): bool
    {
        foreach ($needles as $needle) {
            if (mb_strpos($haystack, $needle) !== false) {
                return true;
            }
        }

        return false;
    }

    /**
     * Looks for capitalized abbreviations & includes them with the following fragment.
     *
     * For example:
     *    [ "Mr.", " Comey was not available for comment." ]
     *        ... becomes ...
     *    [ "Mr. Comey was not available for comment." ]
     *
     * @param string[] $fragments
     * @return string[]
     */
    private function abbreviationMerge(array $fragments): array
    {
        $merged = [];

        // Text collected so far that ended in an abbreviation and still
        // waits for the fragment that follows it.
        $pending = '';

        foreach ($fragments as $fragment) {
            $pending .= $fragment;

            if (!self::isAbreviation($fragment)) {
                $merged[] = $pending;
                $pending = '';
            }
        }

        // The input ended on an abbreviation: keep what was collected.
        if ($pending !== '') {
            $merged[] = $pending;
        }

        return $merged;
    }

    /**
     * Check if the last word of fragment starts with a capital, ends in "."
     * & has at most 3 characters (the period included).
     *
     * @param string $fragment
     * @return bool
     */
    private static function isAbreviation(string $fragment): bool
    {
        $trimmed = Multibyte::trim($fragment);
        $words = mb_split('\s+', $trimmed);
        $last_word = Multibyte::trim($words[count($words) - 1]);

        return mb_substr($trimmed, -1) === '.'
            && mb_strlen($last_word) <= 3
            && preg_match('#^\p{Lu}#u', $last_word) === 1;
    }

    /**
     * Merges any part starting with a closing parenthesis ')' to the previous
     * part. Without a previous part, the part is kept as it is.
     *
     * @param string[] $parts
     * @return string[]
     */
    private function parenthesesMerge(array $parts): array
    {
        $subsentences = [];

        foreach ($parts as $part) {
            $last = count($subsentences) - 1;

            if ($last >= 0 && $part !== '' && $part[0] === ')') {
                $subsentences[$last] .= $part;
            } else {
                $subsentences[] = $part;
            }
        }

        return $subsentences;
    }

    /**
     * Looks for closing quotes to include them with the previous statement.
     * "That was very interesting," he said.
     * "That was very interesting."
     *
     * @param string[] $statements
     * @return string[]
     */
    private function closeQuotesMerge(array $statements): array
    {
        $merged = [];

        foreach ($statements as $statement) {
            $last = count($merged) - 1;

            if ($last >= 0 && self::isEndQuote($statement)) {
                $merged[$last] .= $statement;
            } else {
                $merged[] = $statement;
            }
        }

        return $merged;
    }

    /**
     * Check if the entire string is a quotation mark, or if it starts with a
     * quotation mark, then a space, then a lowercase letter.
     *
     * @param string $statement
     * @return bool
     */
    private static function isEndQuote(string $statement): bool
    {
        $quotes = ['"', '\''];

        if (in_array(Multibyte::trim($statement), $quotes, true)) {
            return true;
        }

        // \p{Ll} also recognises multibyte lowercase letters, in line with
        // the \p{Lu} check used for abbreviations.
        return in_array(mb_substr($statement, 0, 1), $quotes, true)
            && mb_substr($statement, 1, 1) === ' '
            && preg_match('#^\p{Ll}#u', mb_substr($statement, 2, 1)) === 1;
    }

    /**
     * Merges items into larger sentences.
     * Multibyte.php safe
     *
     * A new sentence is started after an item that ended in a terminal which
     * is not an abbreviator, or when a multi-word item follows a sentence that
     * already contains a multi-word item.
     *
     * @param string[] $shorts
     * @return string[]
     */
    private function sentenceMerge(array $shorts): array
    {
        $non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);

        $sentences = [];

        $sentence = '';
        $has_words = false;
        $previous_word_ending = null;
        foreach ($shorts as $short) {
            $word_count = count(mb_split('\s+', Multibyte::trim($short)));
            $after_non_abbreviating_terminal = in_array(
                $previous_word_ending,
                $non_abbreviating_terminals,
                true
            );

            if ($after_non_abbreviating_terminal
                || ($has_words && $word_count > 1)) {
                $sentences[] = $sentence;

                $sentence = '';
                $has_words = false;
            }

            $has_words = $has_words
                || $word_count > 1;

            $sentence .= $short;
            $previous_word_ending = mb_substr($short, -1);
        }

        if ($sentence !== '') {
            $sentences[] = $sentence;
        }

        return $sentences;
    }

    /**
     * Return the sentences detected in the provided text.
     * Set the Sentence::SPLIT_TRIM flag to trim whitespace.
     *
     * @param string $text
     * @param int $flags
     * @return string[]
     */
    public function split($text, $flags = 0)
    {
        // Each step receives the result of the previous one; the first step
        // takes a single line, the remaining steps pass arrays of strings.
        static $pipeline = [
            'floatNumberClean',
            'punctuationSplit',
            'parenthesesMerge', // also works after punctuationMerge or abbreviationMerge
            'punctuationMerge',
            'abbreviationMerge',
            'closeQuotesMerge',
            'sentenceMerge',
            'floatNumberRevert',
        ];

        // clean funny quotes
        $text = Multibyte::cleanUnicode($text);

        // Split
        $sentences = [];
        foreach (self::linebreakSplit($text) as $input) {
            if (Multibyte::trim($input) !== '') {
                foreach ($pipeline as $method) {
                    $input = $this->$method($input);
                }
                $sentences = array_merge($sentences, $input);
            }
        }

        // Post process
        if ($flags & self::SPLIT_TRIM) {
            return self::trimSentences($sentences);
        }

        return $sentences;
    }

    /**
     * Multibyte.php trim each string in an array.
     *
     * @param string[] $sentences
     * @return string[]
     */
    private static function trimSentences(array $sentences): array
    {
        return array_map(static function ($sentence) {
            return Multibyte::trim($sentence);
        }, $sentences);
    }

    /**
     * Return the number of sentences detected in the provided text.
     *
     * @param string $text
     * @return int
     */
    public function count($text)
    {
        return count($this->split($text));
    }

}


1			<?php
2
3			namespace Vanderlee\Sentence;
4
5			/**
6			* Segments sentences.
7			* Clipping may not be perfect.
8			* Sentence count should be VERY close to the truth.
9			*
10			* Multibyte.php safe (atleast for UTF-8), but rules based on germanic
11			* language stucture (English, Dutch, German). Should work for most
12			* latin-alphabet languages.
13			*
14			* @author Martijn van der Lee (@vanderlee)
15			* @author @marktaw
16			*/
17			class Sentence
18			{
19
20			/**
21			* Specify this flag with the split method to trim whitespace.
22			*/
23			const SPLIT_TRIM = 0x1;
24
25			/**
26			* List of characters used to terminate sentences.
27			*
28			* @var string[]
29			*/
30			private $terminals = ['.', '!', '?'];
31
32			/**
33			* List of characters used for abbreviations.
34			*
35			* @var string[]
36			*/
37			private $abbreviators = ['.'];
38
39			/**
40			* List of float numbers in the text
41			*
42			* @var string[]
43			*/
44			private $floatNumbers = [];
45
46			/**
47			* Clean floating point numbers by replace them with their md5 hash
48			*
49			* @param string $text
50			*
51			* @return string
52			*/
53			private function floatNumberClean(string $text)
54			{
55			preg_match_all('!\d+(?:\.\d+)?!', $text, $matches);
56
57			foreach ($matches[0] as $floatNumber) {
58			$this->floatNumbers[$floatNumber] = md5($floatNumber);
59
60			$text = str_replace($floatNumber, md5($floatNumber), $text);
61			}
62
63			return $text;
64			}
65
66			/**
67			* Revert the hashed floating number back
68			*
69			* @param string[] $text
70			*
71			* @return string[]
72			*/
73			private function floatNumberRevert($text)
74			{
75
76			return array_map(function($value) {
77			foreach ($this->floatNumbers as $number => $hash) {
78			$value = str_replace($hash, $number, $value);
79			}
80			return $value;
81			}, $text);
82			}
83
84			/**
85			* Breaks a piece of text into lines by linebreak.
86			* Eats up any linebreak characters as if one.
87			*
88			* Multibyte.php safe
89			*
90			* @param string $text
91			* @return string[]
92			*/
93			private static function linebreakSplit($text)
94			{
95			$lines = [];
96			$line = '';
97
98			foreach (Multibyte::split('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
99			$line .= $part;
100			if (Multibyte::trim($part) === '') {
101			$lines[] = $line;
102			$line = '';
103			}
104			}
105			$lines[] = $line;
106
107			return $lines;
108			}
109
110			/**
111			* Splits an array of lines by (consecutive sequences of)
112			* terminals, keeping terminals.
113			*
114			* Multibyte.php safe (atleast for UTF-8)
115			*
116			* For example:
117			* "There ... is. More!"
118			* ... becomes ...
119			* [ "There ", "...", " is", ".", " More", "!" ]
120			*
121			* @param string $line
122			* @return string[]
123			*/
124			private function punctuationSplit($line)
125			{
126			$parts = [];
127
128			$chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY); // This is UTF8 multibyte safe!
129			$is_terminal = in_array($chars[0], $this->terminals);
130
131			$part = '';
132			foreach ($chars as $index => $char) {
133			if (in_array($char, $this->terminals) !== $is_terminal) {
134			$parts[] = $part;
135			$part = '';
136			$is_terminal = !$is_terminal;
137			}
138			$part .= $char;
139			}
140
141			if (!empty($part)) {
142			$parts[] = $part;
143			}
144
145			return $parts;
146			}
147
148			/**
149			* Appends each terminal item after it's preceding
150			* non-terminals.
151			*
152			* Multibyte.php safe (atleast for UTF-8)
153			*
154			* For example:
155			* [ "There ", "...", " is", ".", " More", "!" ]
156			* ... becomes ...
157			* [ "There ... is.", "More!" ]
158			*
159			* @param string[] $punctuations
160			* @return string[]
161			*/
162			private function punctuationMerge($punctuations)
163			{
164			$definite_terminals = array_diff($this->terminals, $this->abbreviators);
165
166			$merges = [];
167			$merge = '';
168
169			$filtered = array_filter($punctuations, function ($p) {
170			return $p !== '';
171			});
172
173			foreach ($filtered as $punctuation) {
174			$merge .= $punctuation;
175			if (mb_strlen($punctuation) === 1
176			&& in_array($punctuation, $this->terminals)) {
177			$merges[] = $merge;
178			$merge = '';
179			} else {
180			foreach ($definite_terminals as $terminal) {
181			if (mb_strpos($punctuation, $terminal) !== false) {
182			$merges[] = $merge;
183			$merge = '';
184			break;
185			}
186			}
187			}
188			}
189			if (!empty($merge)) {
190			$merges[] = $merge;
191			}
192
193			return $merges;
194			}
195
196			/**
197			* Looks for capitalized abbreviations & includes them with the following fragment.
198			*
199			* For example:
200			* [ "Last week, former director of the F.B.I. James B. Comey was fired. Mr. Comey was not available for comment." ]
201			* ... becomes ...
202			* [ "Last week, former director of the F.B.I. James B. Comey was fired." ]
203			* [ "Mr. Comey was not available for comment." ]
204			*
205			* @param string[] $fragments
206			* @return string[]
207			*/
208			private function abbreviationMerge($fragments)
209			{
210			$return_fragment = [];
211
212			$previous_fragment = '';
213			$previous_is_abbreviation = false;
214			$i = 0;
215			foreach ($fragments as $fragment) {
216			$is_abbreviation = self::isAbreviation($fragment);
217
218			// merge previous fragment with this
219			if ($previous_is_abbreviation) {
220			$fragment = $previous_fragment . $fragment;
221			}
222			$return_fragment[$i] = $fragment;
223
224			$previous_is_abbreviation = $is_abbreviation;
225			$previous_fragment = $fragment;
226
227			// only increment if this isn't an abbreviation
228			if (!$is_abbreviation) {
229			$i++;
230			}
231			}
232			return $return_fragment;
233			}
234
235			/**
236			* Check if the last word of fragment starts with a Capital, ends in "." & has less than 3 characters.
237			*
238			* @param $fragment
239			* @return bool
240			*/
241			private static function isAbreviation($fragment)
242			{
243			$words = mb_split('\s+', Multibyte::trim($fragment));
244
245			$word_count = count($words);
246
247			$last_word = Multibyte::trim($words[$word_count - 1]);
248			$last_is_capital = preg_match('#^\p{Lu}#u', $last_word);
249			$last_is_abbreviation = mb_substr(Multibyte::trim($fragment), -1) === '.';
250
251			return $last_is_capital > 0
252			&& $last_is_abbreviation > 0
253			&& mb_strlen($last_word) <= 3;
254			}
255
256			/**
257			* Merges any part starting with a closing parenthesis ')' to the previous
258			* part.
259			*
260			* @param string[] $parts
261			* @return string[]
262			*/
263			private function parenthesesMerge($parts)
264			{
265			$subsentences = [];
266
267			foreach ($parts as $part) {
268			if ($part[0] === ')') {
269			$subsentences[count($subsentences) - 1] .= $part;
270			} else {
271			$subsentences[] = $part;
272			}
273			}
274
275			return $subsentences;
276			}
277
278			/**
279			* Looks for closing quotes to include them with the previous statement.
280			* "That was very interesting," he said.
281			* "That was very interesting."
282			*
283			* @param string[] $statements
284			* @return string[]
285			*/
286			private function closeQuotesMerge($statements)
287			{
288			$i = 0;
289			$previous_statement = '';
290			$return = [];
291			foreach ($statements as $statement) {
292			if (self::isEndQuote($statement)) {
293			$statement = $previous_statement . $statement;
294			} else {
295			$i++;
296			}
297
298			$return[$i] = $statement;
299			$previous_statement = $statement;
300			}
301
302			return $return;
303			}
304
305			/**
306			* Check if the entire string is a quotation mark or quote, then space, then lowercase.
307			*
308			* @param $statement
309			* @return bool
310			*/
311			private static function isEndQuote($statement)
312			{
313			$trimmed = Multibyte::trim($statement);
314			$first = mb_substr($statement, 0, 1);
315
316			return in_array($trimmed, ['"', '\''])
317			|| (
318			in_array($first, ['"', '\''])
319			&& mb_substr($statement, 1, 1) === ' '
320			&& ctype_lower(mb_substr($statement, 2, 1)) === true
321			);
322			}
323
324			/**
325			* Merges items into larger sentences.
326			* Multibyte.php safe
327			*
328			* @param string[] $shorts
329			* @return string[]
330			*/
331			private function sentenceMerge($shorts)
332			{
333			$non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);
334
335			$sentences = [];
336
337			$sentence = '';
338			$has_words = false;
339			$previous_word_ending = null;
340			foreach ($shorts as $short) {
341			$word_count = count(mb_split('\s+', Multibyte::trim($short)));
342			$after_non_abbreviating_terminal = in_array($previous_word_ending, $non_abbreviating_terminals);
343
344			if ($after_non_abbreviating_terminal
345			|| ($has_words && $word_count > 1)) {
346
347			$sentences[] = $sentence;
348
349			$sentence = '';
350			$has_words = false;
351			}
352
353			$has_words = $has_words
354			|| $word_count > 1;
355
356			$sentence .= $short;
357			$previous_word_ending = mb_substr($short, -1);
358			}
359
360			if (!empty($sentence)) {
361			$sentences[] = $sentence;
362			}
363
364			return $sentences;
365			}
366
367			/**
368			* Return the sentences sentences detected in the provided text.
369			* Set the Sentence::SPLIT_TRIM flag to trim whitespace.
370			* @param string $text
371			* @param integer $flags
372			* @return string[]
373			*/
374			public function split($text, $flags = 0)
375			{
376			static $pipeline = [
377			'floatNumberClean',
378			'punctuationSplit',
379			'parenthesesMerge', // also works after punctuationMerge or abbreviationMerge
380			'punctuationMerge',
381			'abbreviationMerge',
382			'closeQuotesMerge',
383			'sentenceMerge',
384			'floatNumberRevert'
385			];
386
387			// clean funny quotes
388			$text = Multibyte::cleanUnicode($text);
389
390			// Split
391			$sentences = [];
392			foreach (self::linebreakSplit($text) as $input) {
393			if (Multibyte::trim($input) !== '') {
394			foreach ($pipeline as $method) {
395			$input = $this->$method($input);
396			}
397			$sentences = array_merge($sentences, $input);
398			}
399			}
400
401			// Post process
402			if ($flags & self::SPLIT_TRIM) {
403			return self::trimSentences($sentences);
404			}
405
406			return $sentences;
407			}
408
409			/**
410			* Multibyte.php trim each string in an array.
411			* @param string[] $sentences
412			* @return string[]
413			*/
414			private static function trimSentences($sentences)
415			{
416			return array_map(function ($sentence) {
417			return Multibyte::trim($sentence);
418			}, $sentences);
419			}
420
421			/**
422			* Return the number of sentences detected in the provided text.
423			* @param string $text
424			* @return integer
425			*/
426			public function count($text)
427			{
428			return count($this->split($text));
429			}
430
431			}
432

vanderlee / php-sentence

Pull Request — master (#16)

Sentence::floatNumberClean() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like