Sentence::cleanUnicode() - Code Metrics - Inspection of "Improve code quality" - vanderlee/php-sentence - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( 20ad1a...1e5204 )

by Martijn

created 2019-04-01 10:16 UTC

Sentence::cleanUnicode() A

↳ Parent: Sentence

Complexity

Conditions	1
Paths	1

Size

Total Lines	32
Code Lines	24

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
eloc	24
dl	0
loc	32
rs	9.536
c	0
b	0
f	0
cc	1
nc	1
nop	1

<?php

namespace Vanderlee\Sentence;

/**
 * Segments sentences.
 * Clipping may not be perfect.
 * Sentence count should be VERY close to the truth.
 *
 * Multibyte.php safe (at least for UTF-8), but rules based on Germanic
 * language structure (English, Dutch, German). Should work for most
 * Latin-alphabet languages.
 *
 * @author Martijn van der Lee (@vanderlee)
 * @author @marktaw
 */
class Sentence
{

    /**
     * Specify this flag with the split method to trim whitespace.
     */
    const SPLIT_TRIM = 0x1;

    /**
     * List of characters used to terminate sentences.
     *
     * @var string[]
     */
    private $terminals = array('.', '!', '?');

    /**
     * List of characters used for abbreviations.
     *
     * @var string[]
     */
    private $abbreviators = array('.');

    /**
     * Breaks a piece of text into lines by linebreak.
     * Eats up any linebreak characters as if one.
     *
     * Every chunk returned by Multibyte::split() (text and captured linebreak
     * runs alike) is appended to the current line; a chunk that is empty after
     * Multibyte::trim() closes that line. The remaining buffer is always
     * appended as the last line, so the last element may be an empty string.
     *
     * Multibyte.php safe
     *
     * @param string $text
     * @return string[]
     */
    private static function linebreakSplit($text)
    {
        $lines = array();
        $line = '';

        foreach (Multibyte::split('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
            $line .= $part;
            if (Multibyte::trim($part) === '') {
                $lines[] = $line;
                $line = '';
            }
        }
        $lines[] = $line;

        return $lines;
    }

    /**
     * Normalizes quotation marks to plain ASCII quotes.
     *
     * HTML entities (including both quote styles) are decoded as UTF-8 first;
     * after that every typographic single or double quote listed in the map
     * below is replaced by ' or " respectively.
     *
     * @staticvar array $character_map
     * @param string $string
     * @return string
     */
    private static function cleanUnicode($string)
    {
        //https://stackoverflow.com/questions/20025030/convert-all-types-of-smart-quotes-with-php
        static $character_map = array(
            // Windows codepage 1252
            "\xC2\x82" => "'", // U+0082⇒U+201A single low-9 quotation mark
            "\xC2\x84" => '"', // U+0084⇒U+201E double low-9 quotation mark
            "\xC2\x8B" => "'", // U+008B⇒U+2039 single left-pointing angle quotation mark
            "\xC2\x91" => "'", // U+0091⇒U+2018 left single quotation mark
            "\xC2\x92" => "'", // U+0092⇒U+2019 right single quotation mark
            "\xC2\x93" => '"', // U+0093⇒U+201C left double quotation mark
            "\xC2\x94" => '"', // U+0094⇒U+201D right double quotation mark
            "\xC2\x9B" => "'", // U+009B⇒U+203A single right-pointing angle quotation mark
            // Regular Unicode     // U+0022 quotation mark (")
            // U+0027 apostrophe     (')
            "\xC2\xAB" => '"', // U+00AB left-pointing double angle quotation mark
            "\xC2\xBB" => '"', // U+00BB right-pointing double angle quotation mark
            "\xE2\x80\x98" => "'", // U+2018 left single quotation mark
            "\xE2\x80\x99" => "'", // U+2019 right single quotation mark
            "\xE2\x80\x9A" => "'", // U+201A single low-9 quotation mark
            "\xE2\x80\x9B" => "'", // U+201B single high-reversed-9 quotation mark
            "\xE2\x80\x9C" => '"', // U+201C left double quotation mark
            "\xE2\x80\x9D" => '"', // U+201D right double quotation mark
            "\xE2\x80\x9E" => '"', // U+201E double low-9 quotation mark
            "\xE2\x80\x9F" => '"', // U+201F double high-reversed-9 quotation mark
            "\xE2\x80\xB9" => "'", // U+2039 single left-pointing angle quotation mark
            "\xE2\x80\xBA" => "'", // U+203A single right-pointing angle quotation mark
        );

        // The search/replace lists are derived from the map once and reused
        // on every later call instead of being rebuilt each time.
        static $characters = null;
        static $replacements = null;
        if ($characters === null) {
            $characters = array_keys($character_map);
            $replacements = array_values($character_map);
        }

        return str_replace($characters, $replacements, html_entity_decode($string, ENT_QUOTES, "UTF-8"));
    }

    /**
     * Splits a line by (consecutive sequences of) terminals, keeping
     * the terminals.
     *
     * Multibyte.php safe (at least for UTF-8)
     *
     * For example:
     *    "There ... is. More!"
     *        ... becomes ...
     *    [ "There ", "...", " is", ".", " More", "!" ]
     *
     * An empty line, or a line that cannot be split into UTF-8 characters,
     * yields an empty array.
     *
     * @param string $line
     * @return string[]
     */
    private function punctuationSplit($line)
    {
        $parts = array();

        $chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY); // This is UTF8 multibyte safe!
        if (empty($chars)) {
            // Nothing to split: empty input, or preg_split() returned false.
            return $parts;
        }
        $is_terminal = in_array($chars[0], $this->terminals, true);

        $part = '';
        foreach ($chars as $char) {
            // A switch between terminal and non-terminal characters closes the current part.
            if (in_array($char, $this->terminals, true) !== $is_terminal) {
                $parts[] = $part;
                $part = '';
                $is_terminal = !$is_terminal;
            }
            $part .= $char;
        }

        // Compare against '' explicitly: empty() would also discard a trailing "0".
        if ($part !== '') {
            $parts[] = $part;
        }

        return $parts;
    }

    /**
     * Appends each terminal item after its preceding
     * non-terminals.
     *
     * Multibyte.php safe (at least for UTF-8)
     *
     * For example:
     *    [ "There ", "...", " is", ".", " More", "!" ]
     *        ... becomes ...
     *    [ "There ... is.", " More!" ]
     *
     * @param string[] $punctuations
     * @return string[]
     */
    private function punctuationMerge($punctuations)
    {
        // Terminals that can never be part of an abbreviation.
        $definite_terminals = array_diff($this->terminals, $this->abbreviators);

        $merges = array();
        $merge = '';

        foreach ($punctuations as $punctuation) {
            if ($punctuation !== '') {
                $merge .= $punctuation;
                if (mb_strlen($punctuation) === 1
                    && in_array($punctuation, $this->terminals, true)) {
                    // A lone terminal character closes the merge.
                    $merges[] = $merge;
                    $merge = '';
                } else {
                    // Any other item only closes the merge when it contains a definite terminal.
                    foreach ($definite_terminals as $terminal) {
                        if (mb_strpos($punctuation, $terminal) !== false) {
                            $merges[] = $merge;
                            $merge = '';
                            break;
                        }
                    }
                }
            }
        }
        // Compare against '' explicitly: empty() would also discard a trailing "0".
        if ($merge !== '') {
            $merges[] = $merge;
        }

        return $merges;
    }

    /**
     * Looks for capitalized abbreviations & includes them with the following fragment.
     *
     * A fragment counts as an abbreviation when its last word starts with an
     * uppercase letter, the trimmed fragment ends in "." and that last word is
     * at most 3 characters long.
     *
     * For example:
     *    [ "Mr.", " Comey was fired." ]
     *        ... becomes ...
     *    [ "Mr. Comey was fired." ]
     *
     * @param string[] $fragments
     * @return string[]
     */
    private function abbreviationMerge($fragments)
    {
        $return_fragment = array();

        $previous_string = '';
        $previous_is_abbreviation = false;
        $i = 0;

        foreach ($fragments as $fragment) {
            $current_string = $fragment;
            $words = mb_split('\s+', Multibyte::trim($fragment));

            $word_count = count($words);

            // if last word of fragment starts with a Capital, ends in "." & has at most 3 characters, trigger "is abbreviation"
            $last_word = trim($words[$word_count - 1]);
            $last_is_capital = preg_match('#^\p{Lu}#u', $last_word);
            $last_is_abbreviation = substr(trim($fragment), -1) === '.';
            $is_abbreviation = $last_is_capital > 0
                && $last_is_abbreviation > 0
                && mb_strlen($last_word) <= 3;

            // merge previous fragment with this
            if ($previous_is_abbreviation === true) {
                $current_string = $previous_string . $current_string;
            }
            // When the previous fragment was an abbreviation, $i was not advanced,
            // so this overwrites that entry with the merged string.
            $return_fragment[$i] = $current_string;

            $previous_is_abbreviation = $is_abbreviation;
            $previous_string = $current_string;
            // only increment if this isn't an abbreviation
            if ($is_abbreviation === false) {
                $i++;
            }
        }
        return $return_fragment;
    }

    /**
     * Merges any part starting with a closing parenthesis ')' to the previous
     * part.
     *
     * A part starting with ')' that has no previous part to attach to is kept
     * as a part of its own.
     *
     * @param string[] $parts
     * @return string[]
     */
    private function parenthesesMerge($parts)
    {
        $subsentences = array();

        foreach ($parts as $part) {
            $last = count($subsentences) - 1;
            // Only append when there is a previous part; otherwise index -1 would be written.
            if ($last >= 0 && substr($part, 0, 1) === ')') {
                $subsentences[$last] .= $part;
            } else {
                $subsentences[] = $part;
            }
        }

        return $subsentences;
    }

    /**
     * Looks for closing quotes to include them with the previous statement.
     * "That was very interesting," he said.
     * "That was very interesting."
     *
     * The keys of the returned array are not guaranteed to start at 0;
     * only the order of the values is meaningful.
     *
     * @param string[] $statements
     * @return string[]
     */
    private function closeQuotesMerge($statements)
    {
        $i = 0;
        $previous_statement = "";
        $return = array();
        foreach ($statements as $statement) {
            // detect end quote - if the entire string is a quotation mark, or it's [quote, space, lowercase]
            if (trim($statement) === '"'
                || trim($statement) === "'"
                || (
                    (substr($statement, 0, 1) === '"'
                        || substr($statement, 0, 1) === "'")
                    && substr($statement, 1, 1) === ' '
                    && ctype_lower(substr($statement, 2, 1)) === true
                )
            ) {
                // Keep the index: the merged statement replaces the previous entry.
                $statement = $previous_statement . $statement;
            } else {
                $i++;
            }

            $return[$i] = $statement;
            $previous_statement = $statement;
        }

        return $return;
    }

    /**
     * Merges items into larger sentences.
     *
     * A new sentence is started when the previous item ended in a terminal
     * that is not an abbreviator, or when the sentence being built already
     * contains a multi-word item and the current item has more than one word.
     *
     * Multibyte.php safe
     *
     * @param string[] $shorts
     * @return string[]
     */
    private function sentenceMerge($shorts)
    {
        $non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);

        $sentences = array();

        $sentence = '';
        $has_words = false;
        $previous_word_ending = null;
        foreach ($shorts as $short) {
            $word_count = count(mb_split('\s+', Multibyte::trim($short)));
            $after_non_abbreviating_terminal = in_array($previous_word_ending, $non_abbreviating_terminals, true);

            if ($after_non_abbreviating_terminal
                || ($has_words && $word_count > 1)) {
                $sentences[] = $sentence;
                $sentence = '';
                $has_words = $word_count > 1;
            } else {
                $has_words = ($has_words
                    || $word_count > 1);
            }

            $sentence .= $short;
            $previous_word_ending = mb_substr($short, -1);
        }
        // Compare against '' explicitly: empty() would also discard a sentence that is just "0".
        if ($sentence !== '') {
            $sentences[] = $sentence;
        }

        return $sentences;
    }

    /**
     * Return the sentences detected in the provided text.
     * Set the Sentence::SPLIT_TRIM flag to trim whitespace.
     * @param string $text
     * @param integer $flags
     * @return string[]
     */
    public function split($text, $flags = 0)
    {
        $sentences = array();

        // clean funny quotes
        $text = self::cleanUnicode($text);

        // Split
        foreach (self::linebreakSplit($text) as $line) {
            if (Multibyte::trim($line) !== '') {
                $punctuations = $this->punctuationSplit($line);
                $parentheses = $this->parenthesesMerge($punctuations); // also works after punctuationMerge or abbreviationMerge
                $merges = $this->punctuationMerge($parentheses);
                $shorts = $this->abbreviationMerge($merges);
                $quotes = $this->closeQuotesMerge($shorts);
                $sentences = array_merge($sentences, $this->sentenceMerge($quotes));
            }
        }

        // Post process
        if ($flags & self::SPLIT_TRIM) {
            return self::trimSentences($sentences);
        }

        return $sentences;
    }

    /**
     * Multibyte.php trim each string in an array.
     * @param string[] $sentences
     * @return string[]
     */
    private static function trimSentences($sentences)
    {
        $trimmed = array();
        foreach ($sentences as $sentence) {
            $trimmed[] = Multibyte::trim($sentence);
        }
        return $trimmed;
    }

    /**
     * Return the number of sentences detected in the provided text.
     * @param string $text
     * @return integer
     */
    public function count($text)
    {
        return count($this->split($text));
    }

}


1			<?php
2
3			namespace Vanderlee\Sentence;
4
5			/**
6			* Segments sentences.
7			* Clipping may not be perfect.
8			* Sentence count should be VERY close to the truth.
9			*
10			* Multibyte.php safe (atleast for UTF-8), but rules based on germanic
11			* language stucture (English, Dutch, German). Should work for most
12			* latin-alphabet languages.
13			*
14			* @author Martijn van der Lee (@vanderlee)
15			* @author @marktaw
16			*/
17			class Sentence
18			{
19
20			/**
21			* Specify this flag with the split method to trim whitespace.
22			*/
23			const SPLIT_TRIM = 0x1;
24
25			/**
26			* List of characters used to terminate sentences.
27			*
28			* @var string[]
29			*/
30			private $terminals = array('.', '!', '?');
31
32			/**
33			* List of characters used for abbreviations.
34			*
35			* @var string[]
36			*/
37			private $abbreviators = array('.');
38
39			/**
40			* Breaks a piece of text into lines by linebreak.
41			* Eats up any linebreak characters as if one.
42			*
43			* Multibyte.php safe
44			*
45			* @param string $text
46			* @return string[]
47			*/
48			private static function linebreakSplit($text)
49			{
50			$lines = array();
51			$line = '';
52
53			foreach (Multibyte::split('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
54			$line .= $part;
55			if (Multibyte::trim($part) === '') {
56			$lines[] = $line;
57			$line = '';
58			}
59			}
60			$lines[] = $line;
61
62			return $lines;
63			}
64
65			/**
66			* Replace
67			*
68			* @staticvar array $chr_map
69			* @param string $string
70			* @return string
71			*/
72			private static function cleanUnicode($string)
73			{
74			//https://stackoverflow.com/questions/20025030/convert-all-types-of-smart-quotes-with-php
75			static $character_map = array(
76			// Windows codepage 1252
77			"\xC2\x82" => "'", // U+0082⇒U+201A single low-9 quotation mark
78			"\xC2\x84" => '"', // U+0084⇒U+201E double low-9 quotation mark
79			"\xC2\x8B" => "'", // U+008B⇒U+2039 single left-pointing angle quotation mark
80			"\xC2\x91" => "'", // U+0091⇒U+2018 left single quotation mark
81			"\xC2\x92" => "'", // U+0092⇒U+2019 right single quotation mark
82			"\xC2\x93" => '"', // U+0093⇒U+201C left double quotation mark
83			"\xC2\x94" => '"', // U+0094⇒U+201D right double quotation mark
84			"\xC2\x9B" => "'", // U+009B⇒U+203A single right-pointing angle quotation mark
85			// Regular Unicode // U+0022 quotation mark (")
86			// U+0027 apostrophe (')
87			"\xC2\xAB" => '"', // U+00AB left-pointing double angle quotation mark
88			"\xC2\xBB" => '"', // U+00BB right-pointing double angle quotation mark
89			"\xE2\x80\x98" => "'", // U+2018 left single quotation mark
90			"\xE2\x80\x99" => "'", // U+2019 right single quotation mark
91			"\xE2\x80\x9A" => "'", // U+201A single low-9 quotation mark
92			"\xE2\x80\x9B" => "'", // U+201B single high-reversed-9 quotation mark
93			"\xE2\x80\x9C" => '"', // U+201C left double quotation mark
94			"\xE2\x80\x9D" => '"', // U+201D right double quotation mark
95			"\xE2\x80\x9E" => '"', // U+201E double low-9 quotation mark
96			"\xE2\x80\x9F" => '"', // U+201F double high-reversed-9 quotation mark
97			"\xE2\x80\xB9" => "'", // U+2039 single left-pointing angle quotation mark
98			"\xE2\x80\xBA" => "'", // U+203A single right-pointing angle quotation mark
99			);
100
101			$character = array_keys($character_map); // but: for efficiency you should
102			$replace = array_values($character_map); // pre-calculate these two arrays
103			return str_replace($character, $replace, html_entity_decode($string, ENT_QUOTES, "UTF-8"));
104			}
105
106			/**
107			* Splits an array of lines by (consecutive sequences of)
108			* terminals, keeping terminals.
109			*
110			* Multibyte.php safe (atleast for UTF-8)
111			*
112			* For example:
113			* "There ... is. More!"
114			* ... becomes ...
115			* [ "There ", "...", " is", ".", " More", "!" ]
116			*
117			* @param string $line
118			* @return string[]
119			*/
120			private function punctuationSplit($line)
121			{
122			$parts = array();
123
124			$chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY); // This is UTF8 multibyte safe!
125			$is_terminal = in_array($chars[0], $this->terminals);
126
127			$part = '';
128			foreach ($chars as $index => $char) {
129			if (in_array($char, $this->terminals) !== $is_terminal) {
130			$parts[] = $part;
131			$part = '';
132			$is_terminal = !$is_terminal;
133			}
134			$part .= $char;
135			}
136
137			if (!empty($part)) {
138			$parts[] = $part;
139			}
140
141			return $parts;
142			}
143
144			/**
145			* Appends each terminal item after it's preceding
146			* non-terminals.
147			*
148			* Multibyte.php safe (atleast for UTF-8)
149			*
150			* For example:
151			* [ "There ", "...", " is", ".", " More", "!" ]
152			* ... becomes ...
153			* [ "There ... is.", "More!" ]
154			*
155			* @param string[] $punctuations
156			* @return string[]
157			*/
158			private function punctuationMerge($punctuations)
159			{
160			$definite_terminals = array_diff($this->terminals, $this->abbreviators);
161
162			$merges = array();
163			$merge = '';
164
165			foreach ($punctuations as $punctuation) {
166			if ($punctuation !== '') {
167			$merge .= $punctuation;
168			if (mb_strlen($punctuation) === 1
169			&& in_array($punctuation, $this->terminals)) {
170			$merges[] = $merge;
171			$merge = '';
172			} else {
173			foreach ($definite_terminals as $terminal) {
174			if (mb_strpos($punctuation, $terminal) !== false) {
175			$merges[] = $merge;
176			$merge = '';
177			break;
178			}
179			}
180			}
181			}
182			}
183			if (!empty($merge)) {
184			$merges[] = $merge;
185			}
186
187			return $merges;
188			}
189
190			/**
191			* Looks for capitalized abbreviations & includes them with the following fragment.
192			*
193			* For example:
194			* [ "Last week, former director of the F.B.I. James B. Comey was fired. Mr. Comey was not available for comment." ]
195			* ... becomes ...
196			* [ "Last week, former director of the F.B.I. James B. Comey was fired." ]
197			* [ "Mr. Comey was not available for comment." ]
198			*
199			* @param string[] $fragments
200			* @return string[]
201			*/
202			private function abbreviationMerge($fragments)
203			{
204			$return_fragment = array();
205
206			$previous_string = '';
207			$previous_is_abbreviation = false;
208			$i = 0;
209
210			foreach ($fragments as $fragment) {
211			$current_string = $fragment;
212			$words = mb_split('\s+', Multibyte::trim($fragment));
213
214			$word_count = count($words);
215
216			// if last word of fragment starts with a Capital, ends in "." & has less than 3 characters, trigger "is abbreviation"
217			$last_word = trim($words[$word_count - 1]);
218			$last_is_capital = preg_match('#^\p{Lu}#u', $last_word);
219			$last_is_abbreviation = substr(trim($fragment), -1) === '.';
220			$is_abbreviation = $last_is_capital > 0
221			&& $last_is_abbreviation > 0
222			&& mb_strlen($last_word) <= 3;
223
224			// merge previous fragment with this
225			if ($previous_is_abbreviation === true) {
226			$current_string = $previous_string . $current_string;
227			}
228			$return_fragment[$i] = $current_string;
229
230			$previous_is_abbreviation = $is_abbreviation;
231			$previous_string = $current_string;
232			// only increment if this isn't an abbreviation
233			if ($is_abbreviation === false) {
234			$i++;
235			}
236			}
237			return $return_fragment;
238			}
239
240			/**
241			* Merges any part starting with a closing parenthesis ')' to the previous
242			* part.
243			*
244			* @param string[] $parts
245			* @return string[]
246			*/
247			private function parenthesesMerge($parts)
248			{
249			$subsentences = array();
250
251			foreach ($parts as $part) {
252			if ($part[0] === ')') {
253			$subsentences[count($subsentences) - 1] .= $part;
254			} else {
255			$subsentences[] = $part;
256			}
257			}
258
259			return $subsentences;
260			}
261
262			/**
263			* Looks for closing quotes to include them with the previous statement.
264			* "That was very interesting," he said.
265			* "That was very interesting."
266			*
267			* @param string[] $statements
268			* @return string[]
269			*/
270			private function closeQuotesMerge($statements)
271			{
272			$i = 0;
273			$previous_statement = "";
274			$return = array();
275			foreach ($statements as $statement) {
276			// detect end quote - if the entire string is a quotation mark, or it's [quote, space, lowercase]
277			if (trim($statement) === '"'
278			|| trim($statement) === "'"
279			|| (
280			(substr($statement, 0, 1) === '"'
281			|| substr($statement, 0, 1) === "'")
282			&& substr($statement, 1, 1) === ' '
283			&& ctype_lower(substr($statement, 2, 1)) === true
284			)
285			) {
286			$statement = $previous_statement . $statement;
287			} else {
288			$i++;
289			}
290
291			$return[$i] = $statement;
292			$previous_statement = $statement;
293			}
294
295			return $return;
296			}
297
298			/**
299			* Merges items into larger sentences.
300			* Multibyte.php safe
301			*
302			* @param string[] $shorts
303			* @return string[]
304			*/
305			private function sentenceMerge($shorts)
306			{
307			$non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);
308
309			$sentences = array();
310
311			$sentence = '';
312			$has_words = false;
313			$previous_word_ending = null;
314			foreach ($shorts as $short) {
315			$word_count = count(mb_split('\s+', Multibyte::trim($short)));
316			$after_non_abbreviating_terminal = in_array($previous_word_ending, $non_abbreviating_terminals);
317
318			if ($after_non_abbreviating_terminal
319			|| ($has_words && $word_count > 1)) {
320			$sentences[] = $sentence;
321			$sentence = '';
322			$has_words = $word_count > 1;
323			} else {
324			$has_words = ($has_words
325			|| $word_count > 1);
326			}
327
328			$sentence .= $short;
329			$previous_word_ending = mb_substr($short, -1);
330			}
331			if (!empty($sentence)) {
332			$sentences[] = $sentence;
333			}
334
335			return $sentences;
336			}
337
338			/**
339			* Return the sentences sentences detected in the provided text.
340			* Set the Sentence::SPLIT_TRIM flag to trim whitespace.
341			* @param string $text
342			* @param integer $flags
343			* @return string[]
344			*/
345			public function split($text, $flags = 0)
346			{
347			$sentences = array();
348
349			// clean funny quotes
350			$text = self::cleanUnicode($text);
351
352			// Split
353			foreach (self::linebreakSplit($text) as $line) {
354			if (Multibyte::trim($line) !== '') {
355			$punctuations = $this->punctuationSplit($line);
356			$parentheses = $this->parenthesesMerge($punctuations); // also works after punctuationMerge or abbreviationMerge
357			$merges = $this->punctuationMerge($parentheses);
358			$shorts = $this->abbreviationMerge($merges);
359			$quotes = $this->closeQuotesMerge($shorts);
360			$sentences = array_merge($sentences, $this->sentenceMerge($quotes));
361			}
362			}
363
364			// Post process
365			if ($flags & self::SPLIT_TRIM) {
366			return self::trimSentences($sentences);
367			}
368
369			return $sentences;
370			}
371
372			/**
373			* Multibyte.php trim each string in an array.
374			* @param string[] $sentences
375			* @return string[]
376			*/
377			private static function trimSentences($sentences)
378			{
379			$trimmed = array();
380			foreach ($sentences as $sentence) {
381			$trimmed[] = Multibyte::trim($sentence);
382			}
383			return $trimmed;
384			}
385
386			/**
387			* Return the number of sentences detected in the provided text.
388			* @param string $text
389			* @return integer
390			*/
391			public function count($text)
392			{
393			return count($this->split($text));
394			}
395
396			}
397

vanderlee / php-sentence

Push — master ( 20ad1a...1e5204 )

Sentence::cleanUnicode() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like