Inspection of "Refactor some of the methods" - vanderlee/php-sentence - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( d7563a...f1d8ce )

by Martijn

created 2019-04-01 13:15 UTC

Status

Indentation +363 added lines, -363 removed lines patch added patch discarded remove patch

@@ -17,368 +17,368 @@
 block discarded – undo
 class Sentence
 {
 
-    /**
-     * Specify this flag with the split method to trim whitespace.
-     */
-    const SPLIT_TRIM = 0x1;
-
-    /**
-     * List of characters used to terminate sentences.
-     *
-     * @var string[]
-     */
-    private $terminals = ['.', '!', '?'];
-
-    /**
-     * List of characters used for abbreviations.
-     *
-     * @var string[]
-     */
-    private $abbreviators = ['.'];
-
-    /**
-     * Breaks a piece of text into lines by linebreak.
-     * Eats up any linebreak characters as if one.
-     *
-     * Multibyte.php safe
-     *
-     * @param string $text
-     * @return string[]
-     */
-    private static function linebreakSplit($text)
-    {
-        $lines = [];
-        $line = '';
-
-        foreach (Multibyte::split('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
-            $line .= $part;
-            if (Multibyte::trim($part) === '') {
-                $lines[] = $line;
-                $line = '';
-            }
-        }
-        $lines[] = $line;
-
-        return $lines;
-    }
-
-    /**
-     * Splits an array of lines by (consecutive sequences of)
-     * terminals, keeping terminals.
-     *
-     * Multibyte.php safe (atleast for UTF-8)
-     *
-     * For example:
-     *    "There ... is. More!"
-     *        ... becomes ...
-     *    [ "There ", "...", " is", ".", " More", "!" ]
-     *
-     * @param string $line
-     * @return string[]
-     */
-    private function punctuationSplit($line)
-    {
-        $parts = [];
-
-        $chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY); // This is UTF8 multibyte safe!
-        $is_terminal = in_array($chars[0], $this->terminals);
-
-        $part = '';
-        foreach ($chars as $index => $char) {
-            if (in_array($char, $this->terminals) !== $is_terminal) {
-                $parts[] = $part;
-                $part = '';
-                $is_terminal = !$is_terminal;
-            }
-            $part .= $char;
-        }
-
-        if (!empty($part)) {
-            $parts[] = $part;
-        }
-
-        return $parts;
-    }
-
-    /**
-     * Appends each terminal item after it's preceding
-     * non-terminals.
-     *
-     * Multibyte.php safe (atleast for UTF-8)
-     *
-     * For example:
-     *    [ "There ", "...", " is", ".", " More", "!" ]
-     *        ... becomes ...
-     *    [ "There ... is.", "More!" ]
-     *
-     * @param string[] $punctuations
-     * @return string[]
-     */
-    private function punctuationMerge($punctuations)
-    {
-        $definite_terminals = array_diff($this->terminals, $this->abbreviators);
-
-        $merges = [];
-        $merge = '';
-
-        $filtered = array_filter($punctuations, function ($p) {
-            return $p !== '';
-        });
-
-        foreach ($filtered as $punctuation) {
-            $merge .= $punctuation;
-            if (mb_strlen($punctuation) === 1
-                && in_array($punctuation, $this->terminals)) {
-                $merges[] = $merge;
-                $merge = '';
-            } else {
-                foreach ($definite_terminals as $terminal) {
-                    if (mb_strpos($punctuation, $terminal) !== false) {
-                        $merges[] = $merge;
-                        $merge = '';
-                        break;
-                    }
-                }
-            }
-        }
-        if (!empty($merge)) {
-            $merges[] = $merge;
-        }
-
-        return $merges;
-    }
-
-    /**
-     * Looks for capitalized abbreviations & includes them with the following fragment.
-     *
-     * For example:
-     *    [ "Last week, former director of the F.B.I. James B. Comey was fired. Mr. Comey was not available for comment." ]
-     *        ... becomes ...
-     *    [ "Last week, former director of the F.B.I. James B. Comey was fired." ]
-     *  [ "Mr. Comey was not available for comment." ]
-     *
-     * @param string[] $fragments
-     * @return string[]
-     */
-    private function abbreviationMerge($fragments)
-    {
-        $return_fragment = [];
-
-        $previous_fragment = '';
-        $previous_is_abbreviation = false;
-        $i = 0;
-        foreach ($fragments as $fragment) {
-            $is_abbreviation = self::isAbreviation($fragment);
-
-            // merge previous fragment with this
-            if ($previous_is_abbreviation) {
-                $fragment = $previous_fragment . $fragment;
-            }
-            $return_fragment[$i] = $fragment;
-
-            $previous_is_abbreviation = $is_abbreviation;
-            $previous_fragment = $fragment;
-
-            // only increment if this isn't an abbreviation
-            if (!$is_abbreviation) {
-                $i++;
-            }
-        }
-        return $return_fragment;
-    }
-
-    /**
-     * Check if the last word of fragment starts with a Capital, ends in "." & has less than 3 characters.
-     *
-     * @param $fragment
-     * @return bool
-     */
-    private static function isAbreviation($fragment)
-    {
-        $words = mb_split('\s+', Multibyte::trim($fragment));
-
-        $word_count = count($words);
-
-        $last_word = Multibyte::trim($words[$word_count - 1]);
-        $last_is_capital = preg_match('#^\p{Lu}#u', $last_word);
-        $last_is_abbreviation = mb_substr(Multibyte::trim($fragment), -1) === '.';
-
-        return $last_is_capital > 0
-            && $last_is_abbreviation > 0
-            && mb_strlen($last_word) <= 3;
-    }
-
-    /**
-     * Merges any part starting with a closing parenthesis ')' to the previous
-     * part.
-     *
-     * @param string[] $parts
-     * @return string[]
-     */
-    private function parenthesesMerge($parts)
-    {
-        $subsentences = [];
-
-        foreach ($parts as $part) {
-            if ($part[0] === ')') {
-                $subsentences[count($subsentences) - 1] .= $part;
-            } else {
-                $subsentences[] = $part;
-            }
-        }
-
-        return $subsentences;
-    }
-
-    /**
-     * Looks for closing quotes to include them with the previous statement.
-     * "That was very interesting," he said.
-     * "That was very interesting."
-     *
-     * @param string[] $statements
-     * @return string[]
-     */
-    private function closeQuotesMerge($statements)
-    {
-        $i = 0;
-        $previous_statement = '';
-        $return = [];
-        foreach ($statements as $statement) {
-            if (self::isEndQuote($statement)) {
-                $statement = $previous_statement . $statement;
-            } else {
-                $i++;
-            }
-
-            $return[$i] = $statement;
-            $previous_statement = $statement;
-        }
-
-        return $return;
-    }
-
-    /**
-     * Check if the entire string is a quotation mark or quote, then space, then lowercase.
-     *
-     * @param $statement
-     * @return bool
-     */
-    private static function isEndQuote($statement)
-    {
-        $trimmed = Multibyte::trim($statement);
-        $first = mb_substr($statement, 0, 1);
-
-        return in_array($trimmed, ['"', '\''])
-            || (
-                in_array($first, ['"', '\''])
-                && mb_substr($statement, 1, 1) === ' '
-                && ctype_lower(mb_substr($statement, 2, 1)) === true
-            );
-    }
-
-    /**
-     * Merges items into larger sentences.
-     * Multibyte.php safe
-     *
-     * @param string[] $shorts
-     * @return string[]
-     */
-    private function sentenceMerge($shorts)
-    {
-        $non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);
-
-        $sentences = [];
-
-        $sentence = '';
-        $has_words = false;
-        $previous_word_ending = null;
-        foreach ($shorts as $short) {
-            $word_count = count(mb_split('\s+', Multibyte::trim($short)));
-            $after_non_abbreviating_terminal = in_array($previous_word_ending, $non_abbreviating_terminals);
-
-            if ($after_non_abbreviating_terminal
-                || ($has_words && $word_count > 1)) {
-
-                $sentences[] = $sentence;
-
-                $sentence = '';
-                $has_words = false;
-            }
-
-            $has_words = $has_words
-                || $word_count > 1;
-
-            $sentence .= $short;
-            $previous_word_ending = mb_substr($short, -1);
-        }
-
-        if (!empty($sentence)) {
-            $sentences[] = $sentence;
-        }
-
-        return $sentences;
-    }
-
-    /**
-     * Return the sentences sentences detected in the provided text.
-     * Set the Sentence::SPLIT_TRIM flag to trim whitespace.
-     * @param string $text
-     * @param integer $flags
-     * @return string[]
-     */
-    public function split($text, $flags = 0)
-    {
-        static $pipeline = [
-            'punctuationSplit',
-            'parenthesesMerge', // also works after punctuationMerge or abbreviationMerge
-            'punctuationMerge',
-            'abbreviationMerge',
-            'closeQuotesMerge',
-            'sentenceMerge',
-        ];
-
-        // clean funny quotes
-        $text = Multibyte::cleanUnicode($text);
-
-        // Split
-        $sentences = [];
-        foreach (self::linebreakSplit($text) as $input) {
-            if (Multibyte::trim($input) !== '') {
-                foreach ($pipeline as $method) {
-                    $input = $this->$method($input);
-                }
-                $sentences = array_merge($sentences, $input);
-            }
-        }
-
-        // Post process
-        if ($flags & self::SPLIT_TRIM) {
-            return self::trimSentences($sentences);
-        }
-
-        return $sentences;
-    }
-
-    /**
-     * Multibyte.php trim each string in an array.
-     * @param string[] $sentences
-     * @return string[]
-     */
-    private static function trimSentences($sentences)
-    {
-        return array_map(function ($sentence) {
-            return Multibyte::trim($sentence);
-        }, $sentences);
-    }
-
-    /**
-     * Return the number of sentences detected in the provided text.
-     * @param string $text
-     * @return integer
-     */
-    public function count($text)
-    {
-        return count($this->split($text));
-    }
+	/**
+	 * Specify this flag with the split method to trim whitespace.
+	 */
+	const SPLIT_TRIM = 0x1;
+
+	/**
+	 * List of characters used to terminate sentences.
+	 *
+	 * @var string[]
+	 */
+	private $terminals = ['.', '!', '?'];
+
+	/**
+	 * List of characters used for abbreviations.
+	 *
+	 * @var string[]
+	 */
+	private $abbreviators = ['.'];
+
+	/**
+	 * Breaks a piece of text into lines by linebreak.
+	 * Eats up any linebreak characters as if one.
+	 *
+	 * Multibyte.php safe
+	 *
+	 * @param string $text
+	 * @return string[]
+	 */
+	private static function linebreakSplit($text)
+	{
+		$lines = [];
+		$line = '';
+
+		foreach (Multibyte::split('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
+			$line .= $part;
+			if (Multibyte::trim($part) === '') {
+				$lines[] = $line;
+				$line = '';
+			}
+		}
+		$lines[] = $line;
+
+		return $lines;
+	}
+
+	/**
+	 * Splits an array of lines by (consecutive sequences of)
+	 * terminals, keeping terminals.
+	 *
+	 * Multibyte.php safe (atleast for UTF-8)
+	 *
+	 * For example:
+	 *    "There ... is. More!"
+	 *        ... becomes ...
+	 *    [ "There ", "...", " is", ".", " More", "!" ]
+	 *
+	 * @param string $line
+	 * @return string[]
+	 */
+	private function punctuationSplit($line)
+	{
+		$parts = [];
+
+		$chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY); // This is UTF8 multibyte safe!
+		$is_terminal = in_array($chars[0], $this->terminals);
+
+		$part = '';
+		foreach ($chars as $index => $char) {
+			if (in_array($char, $this->terminals) !== $is_terminal) {
+				$parts[] = $part;
+				$part = '';
+				$is_terminal = !$is_terminal;
+			}
+			$part .= $char;
+		}
+
+		if (!empty($part)) {
+			$parts[] = $part;
+		}
+
+		return $parts;
+	}
+
+	/**
+	 * Appends each terminal item after it's preceding
+	 * non-terminals.
+	 *
+	 * Multibyte.php safe (atleast for UTF-8)
+	 *
+	 * For example:
+	 *    [ "There ", "...", " is", ".", " More", "!" ]
+	 *        ... becomes ...
+	 *    [ "There ... is.", "More!" ]
+	 *
+	 * @param string[] $punctuations
+	 * @return string[]
+	 */
+	private function punctuationMerge($punctuations)
+	{
+		$definite_terminals = array_diff($this->terminals, $this->abbreviators);
+
+		$merges = [];
+		$merge = '';
+
+		$filtered = array_filter($punctuations, function ($p) {
+			return $p !== '';
+		});
+
+		foreach ($filtered as $punctuation) {
+			$merge .= $punctuation;
+			if (mb_strlen($punctuation) === 1
+				&& in_array($punctuation, $this->terminals)) {
+				$merges[] = $merge;
+				$merge = '';
+			} else {
+				foreach ($definite_terminals as $terminal) {
+					if (mb_strpos($punctuation, $terminal) !== false) {
+						$merges[] = $merge;
+						$merge = '';
+						break;
+					}
+				}
+			}
+		}
+		if (!empty($merge)) {
+			$merges[] = $merge;
+		}
+
+		return $merges;
+	}
+
+	/**
+	 * Looks for capitalized abbreviations & includes them with the following fragment.
+	 *
+	 * For example:
+	 *    [ "Last week, former director of the F.B.I. James B. Comey was fired. Mr. Comey was not available for comment." ]
+	 *        ... becomes ...
+	 *    [ "Last week, former director of the F.B.I. James B. Comey was fired." ]
+	 *  [ "Mr. Comey was not available for comment." ]
+	 *
+	 * @param string[] $fragments
+	 * @return string[]
+	 */
+	private function abbreviationMerge($fragments)
+	{
+		$return_fragment = [];
+
+		$previous_fragment = '';
+		$previous_is_abbreviation = false;
+		$i = 0;
+		foreach ($fragments as $fragment) {
+			$is_abbreviation = self::isAbreviation($fragment);
+
+			// merge previous fragment with this
+			if ($previous_is_abbreviation) {
+				$fragment = $previous_fragment . $fragment;
+			}
+			$return_fragment[$i] = $fragment;
+
+			$previous_is_abbreviation = $is_abbreviation;
+			$previous_fragment = $fragment;
+
+			// only increment if this isn't an abbreviation
+			if (!$is_abbreviation) {
+				$i++;
+			}
+		}
+		return $return_fragment;
+	}
+
+	/**
+	 * Check if the last word of fragment starts with a Capital, ends in "." & has less than 3 characters.
+	 *
+	 * @param $fragment
+	 * @return bool
+	 */
+	private static function isAbreviation($fragment)
+	{
+		$words = mb_split('\s+', Multibyte::trim($fragment));
+
+		$word_count = count($words);
+
+		$last_word = Multibyte::trim($words[$word_count - 1]);
+		$last_is_capital = preg_match('#^\p{Lu}#u', $last_word);
+		$last_is_abbreviation = mb_substr(Multibyte::trim($fragment), -1) === '.';
+
+		return $last_is_capital > 0
+			&& $last_is_abbreviation > 0
+			&& mb_strlen($last_word) <= 3;
+	}
+
+	/**
+	 * Merges any part starting with a closing parenthesis ')' to the previous
+	 * part.
+	 *
+	 * @param string[] $parts
+	 * @return string[]
+	 */
+	private function parenthesesMerge($parts)
+	{
+		$subsentences = [];
+
+		foreach ($parts as $part) {
+			if ($part[0] === ')') {
+				$subsentences[count($subsentences) - 1] .= $part;
+			} else {
+				$subsentences[] = $part;
+			}
+		}
+
+		return $subsentences;
+	}
+
+	/**
+	 * Looks for closing quotes to include them with the previous statement.
+	 * "That was very interesting," he said.
+	 * "That was very interesting."
+	 *
+	 * @param string[] $statements
+	 * @return string[]
+	 */
+	private function closeQuotesMerge($statements)
+	{
+		$i = 0;
+		$previous_statement = '';
+		$return = [];
+		foreach ($statements as $statement) {
+			if (self::isEndQuote($statement)) {
+				$statement = $previous_statement . $statement;
+			} else {
+				$i++;
+			}
+
+			$return[$i] = $statement;
+			$previous_statement = $statement;
+		}
+
+		return $return;
+	}
+
+	/**
+	 * Check if the entire string is a quotation mark or quote, then space, then lowercase.
+	 *
+	 * @param $statement
+	 * @return bool
+	 */
+	private static function isEndQuote($statement)
+	{
+		$trimmed = Multibyte::trim($statement);
+		$first = mb_substr($statement, 0, 1);
+
+		return in_array($trimmed, ['"', '\''])
+			|| (
+				in_array($first, ['"', '\''])
+				&& mb_substr($statement, 1, 1) === ' '
+				&& ctype_lower(mb_substr($statement, 2, 1)) === true
+			);
+	}
+
+	/**
+	 * Merges items into larger sentences.
+	 * Multibyte.php safe
+	 *
+	 * @param string[] $shorts
+	 * @return string[]
+	 */
+	private function sentenceMerge($shorts)
+	{
+		$non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);
+
+		$sentences = [];
+
+		$sentence = '';
+		$has_words = false;
+		$previous_word_ending = null;
+		foreach ($shorts as $short) {
+			$word_count = count(mb_split('\s+', Multibyte::trim($short)));
+			$after_non_abbreviating_terminal = in_array($previous_word_ending, $non_abbreviating_terminals);
+
+			if ($after_non_abbreviating_terminal
+				|| ($has_words && $word_count > 1)) {
+
+				$sentences[] = $sentence;
+
+				$sentence = '';
+				$has_words = false;
+			}
+
+			$has_words = $has_words
+				|| $word_count > 1;
+
+			$sentence .= $short;
+			$previous_word_ending = mb_substr($short, -1);
+		}
+
+		if (!empty($sentence)) {
+			$sentences[] = $sentence;
+		}
+
+		return $sentences;
+	}
+
+	/**
+	 * Return the sentences sentences detected in the provided text.
+	 * Set the Sentence::SPLIT_TRIM flag to trim whitespace.
+	 * @param string $text
+	 * @param integer $flags
+	 * @return string[]
+	 */
+	public function split($text, $flags = 0)
+	{
+		static $pipeline = [
+			'punctuationSplit',
+			'parenthesesMerge', // also works after punctuationMerge or abbreviationMerge
+			'punctuationMerge',
+			'abbreviationMerge',
+			'closeQuotesMerge',
+			'sentenceMerge',
+		];
+
+		// clean funny quotes
+		$text = Multibyte::cleanUnicode($text);
+
+		// Split
+		$sentences = [];
+		foreach (self::linebreakSplit($text) as $input) {
+			if (Multibyte::trim($input) !== '') {
+				foreach ($pipeline as $method) {
+					$input = $this->$method($input);
+				}
+				$sentences = array_merge($sentences, $input);
+			}
+		}
+
+		// Post process
+		if ($flags & self::SPLIT_TRIM) {
+			return self::trimSentences($sentences);
+		}
+
+		return $sentences;
+	}
+
+	/**
+	 * Multibyte.php trim each string in an array.
+	 * @param string[] $sentences
+	 * @return string[]
+	 */
+	private static function trimSentences($sentences)
+	{
+		return array_map(function ($sentence) {
+			return Multibyte::trim($sentence);
+		}, $sentences);
+	}
+
+	/**
+	 * Return the number of sentences detected in the provided text.
+	 * @param string $text
+	 * @return integer
+	 */
+	public function count($text)
+	{
+		return count($this->split($text));
+	}
 
 }

Please log in to merge, or discard this patch.

Spacing +2 added lines, -2 removed lines patch added patch discarded remove patch

@@ -121,7 +121,7 @@  discard block
 block discarded – undo
         $merges = [];
         $merge = '';
 
-        $filtered = array_filter($punctuations, function ($p) {
+        $filtered = array_filter($punctuations, function($p) {
             return $p !== '';
         });
 
@@ -366,7 +366,7 @@  discard block
 block discarded – undo
      */
     private static function trimSentences($sentences)
     {
-        return array_map(function ($sentence) {
+        return array_map(function($sentence) {
             return Multibyte::trim($sentence);
         }, $sentences);
     }

Please log in to merge, or discard this patch.

		@@ -17,368 +17,368 @@
		block discarded – undo
17	17	class Sentence
18	18	{
19	19
20		- /**
21		- * Specify this flag with the split method to trim whitespace.
22		- */
23		- const SPLIT_TRIM = 0x1;
24		-
25		- /**
26		- * List of characters used to terminate sentences.
27		- *
28		- * @var string[]
29		- */
30		- private $terminals = ['.', '!', '?'];
31		-
32		- /**
33		- * List of characters used for abbreviations.
34		- *
35		- * @var string[]
36		- */
37		- private $abbreviators = ['.'];
38		-
39		- /**
40		- * Breaks a piece of text into lines by linebreak.
41		- * Eats up any linebreak characters as if one.
42		- *
43		- * Multibyte.php safe
44		- *
45		- * @param string $text
46		- * @return string[]
47		- */
48		- private static function linebreakSplit($text)
49		- {
50		- $lines = [];
51		- $line = '';
52		-
53		- foreach (Multibyte::split('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
54		- $line .= $part;
55		- if (Multibyte::trim($part) === '') {
56		- $lines[] = $line;
57		- $line = '';
58		- }
59		- }
60		- $lines[] = $line;
61		-
62		- return $lines;
63		- }
64		-
65		- /**
66		- * Splits an array of lines by (consecutive sequences of)
67		- * terminals, keeping terminals.
68		- *
69		- * Multibyte.php safe (atleast for UTF-8)
70		- *
71		- * For example:
72		- * "There ... is. More!"
73		- * ... becomes ...
74		- * [ "There ", "...", " is", ".", " More", "!" ]
75		- *
76		- * @param string $line
77		- * @return string[]
78		- */
79		- private function punctuationSplit($line)
80		- {
81		- $parts = [];
82		-
83		- $chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY); // This is UTF8 multibyte safe!
84		- $is_terminal = in_array($chars[0], $this->terminals);
85		-
86		- $part = '';
87		- foreach ($chars as $index => $char) {
88		- if (in_array($char, $this->terminals) !== $is_terminal) {
89		- $parts[] = $part;
90		- $part = '';
91		- $is_terminal = !$is_terminal;
92		- }
93		- $part .= $char;
94		- }
95		-
96		- if (!empty($part)) {
97		- $parts[] = $part;
98		- }
99		-
100		- return $parts;
101		- }
102		-
103		- /**
104		- * Appends each terminal item after it's preceding
105		- * non-terminals.
106		- *
107		- * Multibyte.php safe (atleast for UTF-8)
108		- *
109		- * For example:
110		- * [ "There ", "...", " is", ".", " More", "!" ]
111		- * ... becomes ...
112		- * [ "There ... is.", "More!" ]
113		- *
114		- * @param string[] $punctuations
115		- * @return string[]
116		- */
117		- private function punctuationMerge($punctuations)
118		- {
119		- $definite_terminals = array_diff($this->terminals, $this->abbreviators);
120		-
121		- $merges = [];
122		- $merge = '';
123		-
124		- $filtered = array_filter($punctuations, function ($p) {
125		- return $p !== '';
126		- });
127		-
128		- foreach ($filtered as $punctuation) {
129		- $merge .= $punctuation;
130		- if (mb_strlen($punctuation) === 1
131		- && in_array($punctuation, $this->terminals)) {
132		- $merges[] = $merge;
133		- $merge = '';
134		- } else {
135		- foreach ($definite_terminals as $terminal) {
136		- if (mb_strpos($punctuation, $terminal) !== false) {
137		- $merges[] = $merge;
138		- $merge = '';
139		- break;
140		- }
141		- }
142		- }
143		- }
144		- if (!empty($merge)) {
145		- $merges[] = $merge;
146		- }
147		-
148		- return $merges;
149		- }
150		-
151		- /**
152		- * Looks for capitalized abbreviations & includes them with the following fragment.
153		- *
154		- * For example:
155		- * [ "Last week, former director of the F.B.I. James B. Comey was fired. Mr. Comey was not available for comment." ]
156		- * ... becomes ...
157		- * [ "Last week, former director of the F.B.I. James B. Comey was fired." ]
158		- * [ "Mr. Comey was not available for comment." ]
159		- *
160		- * @param string[] $fragments
161		- * @return string[]
162		- */
163		- private function abbreviationMerge($fragments)
164		- {
165		- $return_fragment = [];
166		-
167		- $previous_fragment = '';
168		- $previous_is_abbreviation = false;
169		- $i = 0;
170		- foreach ($fragments as $fragment) {
171		- $is_abbreviation = self::isAbreviation($fragment);
172		-
173		- // merge previous fragment with this
174		- if ($previous_is_abbreviation) {
175		- $fragment = $previous_fragment . $fragment;
176		- }
177		- $return_fragment[$i] = $fragment;
178		-
179		- $previous_is_abbreviation = $is_abbreviation;
180		- $previous_fragment = $fragment;
181		-
182		- // only increment if this isn't an abbreviation
183		- if (!$is_abbreviation) {
184		- $i++;
185		- }
186		- }
187		- return $return_fragment;
188		- }
189		-
190		- /**
191		- * Check if the last word of fragment starts with a Capital, ends in "." & has less than 3 characters.
192		- *
193		- * @param $fragment
194		- * @return bool
195		- */
196		- private static function isAbreviation($fragment)
197		- {
198		- $words = mb_split('\s+', Multibyte::trim($fragment));
199		-
200		- $word_count = count($words);
201		-
202		- $last_word = Multibyte::trim($words[$word_count - 1]);
203		- $last_is_capital = preg_match('#^\p{Lu}#u', $last_word);
204		- $last_is_abbreviation = mb_substr(Multibyte::trim($fragment), -1) === '.';
205		-
206		- return $last_is_capital > 0
207		- && $last_is_abbreviation > 0
208		- && mb_strlen($last_word) <= 3;
209		- }
210		-
211		- /**
212		- * Merges any part starting with a closing parenthesis ')' to the previous
213		- * part.
214		- *
215		- * @param string[] $parts
216		- * @return string[]
217		- */
218		- private function parenthesesMerge($parts)
219		- {
220		- $subsentences = [];
221		-
222		- foreach ($parts as $part) {
223		- if ($part[0] === ')') {
224		- $subsentences[count($subsentences) - 1] .= $part;
225		- } else {
226		- $subsentences[] = $part;
227		- }
228		- }
229		-
230		- return $subsentences;
231		- }
232		-
233		- /**
234		- * Looks for closing quotes to include them with the previous statement.
235		- * "That was very interesting," he said.
236		- * "That was very interesting."
237		- *
238		- * @param string[] $statements
239		- * @return string[]
240		- */
241		- private function closeQuotesMerge($statements)
242		- {
243		- $i = 0;
244		- $previous_statement = '';
245		- $return = [];
246		- foreach ($statements as $statement) {
247		- if (self::isEndQuote($statement)) {
248		- $statement = $previous_statement . $statement;
249		- } else {
250		- $i++;
251		- }
252		-
253		- $return[$i] = $statement;
254		- $previous_statement = $statement;
255		- }
256		-
257		- return $return;
258		- }
259		-
260		- /**
261		- * Check if the entire string is a quotation mark or quote, then space, then lowercase.
262		- *
263		- * @param $statement
264		- * @return bool
265		- */
266		- private static function isEndQuote($statement)
267		- {
268		- $trimmed = Multibyte::trim($statement);
269		- $first = mb_substr($statement, 0, 1);
270		-
271		- return in_array($trimmed, ['"', '\''])
272		- \|\| (
273		- in_array($first, ['"', '\''])
274		- && mb_substr($statement, 1, 1) === ' '
275		- && ctype_lower(mb_substr($statement, 2, 1)) === true
276		- );
277		- }
278		-
279		- /**
280		- * Merges items into larger sentences.
281		- * Multibyte.php safe
282		- *
283		- * @param string[] $shorts
284		- * @return string[]
285		- */
286		- private function sentenceMerge($shorts)
287		- {
288		- $non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);
289		-
290		- $sentences = [];
291		-
292		- $sentence = '';
293		- $has_words = false;
294		- $previous_word_ending = null;
295		- foreach ($shorts as $short) {
296		- $word_count = count(mb_split('\s+', Multibyte::trim($short)));
297		- $after_non_abbreviating_terminal = in_array($previous_word_ending, $non_abbreviating_terminals);
298		-
299		- if ($after_non_abbreviating_terminal
300		- \|\| ($has_words && $word_count > 1)) {
301		-
302		- $sentences[] = $sentence;
303		-
304		- $sentence = '';
305		- $has_words = false;
306		- }
307		-
308		- $has_words = $has_words
309		- \|\| $word_count > 1;
310		-
311		- $sentence .= $short;
312		- $previous_word_ending = mb_substr($short, -1);
313		- }
314		-
315		- if (!empty($sentence)) {
316		- $sentences[] = $sentence;
317		- }
318		-
319		- return $sentences;
320		- }
321		-
322		- /**
323		- * Return the sentences sentences detected in the provided text.
324		- * Set the Sentence::SPLIT_TRIM flag to trim whitespace.
325		- * @param string $text
326		- * @param integer $flags
327		- * @return string[]
328		- */
329		- public function split($text, $flags = 0)
330		- {
331		- static $pipeline = [
332		- 'punctuationSplit',
333		- 'parenthesesMerge', // also works after punctuationMerge or abbreviationMerge
334		- 'punctuationMerge',
335		- 'abbreviationMerge',
336		- 'closeQuotesMerge',
337		- 'sentenceMerge',
338		- ];
339		-
340		- // clean funny quotes
341		- $text = Multibyte::cleanUnicode($text);
342		-
343		- // Split
344		- $sentences = [];
345		- foreach (self::linebreakSplit($text) as $input) {
346		- if (Multibyte::trim($input) !== '') {
347		- foreach ($pipeline as $method) {
348		- $input = $this->$method($input);
349		- }
350		- $sentences = array_merge($sentences, $input);
351		- }
352		- }
353		-
354		- // Post process
355		- if ($flags & self::SPLIT_TRIM) {
356		- return self::trimSentences($sentences);
357		- }
358		-
359		- return $sentences;
360		- }
361		-
362		- /**
363		- * Multibyte.php trim each string in an array.
364		- * @param string[] $sentences
365		- * @return string[]
366		- */
367		- private static function trimSentences($sentences)
368		- {
369		- return array_map(function ($sentence) {
370		- return Multibyte::trim($sentence);
371		- }, $sentences);
372		- }
373		-
374		- /**
375		- * Return the number of sentences detected in the provided text.
376		- * @param string $text
377		- * @return integer
378		- */
379		- public function count($text)
380		- {
381		- return count($this->split($text));
382		- }
	20	+ /**
	21	+ * Specify this flag with the split method to trim whitespace.
	22	+ */
	23	+ const SPLIT_TRIM = 0x1;
	24	+
	25	+ /**
	26	+ * List of characters used to terminate sentences.
	27	+ *
	28	+ * @var string[]
	29	+ */
	30	+ private $terminals = ['.', '!', '?'];
	31	+
	32	+ /**
	33	+ * List of characters used for abbreviations.
	34	+ *
	35	+ * @var string[]
	36	+ */
	37	+ private $abbreviators = ['.'];
	38	+
	39	+ /**
	40	+ * Breaks a piece of text into lines by linebreak.
	41	+ * Eats up any linebreak characters as if one.
	42	+ *
	43	+ * Multibyte.php safe
	44	+ *
	45	+ * @param string $text
	46	+ * @return string[]
	47	+ */
	48	+ private static function linebreakSplit($text)
	49	+ {
	50	+ $lines = [];
	51	+ $line = '';
	52	+
	53	+ foreach (Multibyte::split('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
	54	+ $line .= $part;
	55	+ if (Multibyte::trim($part) === '') {
	56	+ $lines[] = $line;
	57	+ $line = '';
	58	+ }
	59	+ }
	60	+ $lines[] = $line;
	61	+
	62	+ return $lines;
	63	+ }
	64	+
	65	+ /**
	66	+ * Splits an array of lines by (consecutive sequences of)
	67	+ * terminals, keeping terminals.
	68	+ *
	69	+ * Multibyte.php safe (atleast for UTF-8)
	70	+ *
	71	+ * For example:
	72	+ * "There ... is. More!"
	73	+ * ... becomes ...
	74	+ * [ "There ", "...", " is", ".", " More", "!" ]
	75	+ *
	76	+ * @param string $line
	77	+ * @return string[]
	78	+ */
	79	+ private function punctuationSplit($line)
	80	+ {
	81	+ $parts = [];
	82	+
	83	+ $chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY); // This is UTF8 multibyte safe!
	84	+ $is_terminal = in_array($chars[0], $this->terminals);
	85	+
	86	+ $part = '';
	87	+ foreach ($chars as $index => $char) {
	88	+ if (in_array($char, $this->terminals) !== $is_terminal) {
	89	+ $parts[] = $part;
	90	+ $part = '';
	91	+ $is_terminal = !$is_terminal;
	92	+ }
	93	+ $part .= $char;
	94	+ }
	95	+
	96	+ if (!empty($part)) {
	97	+ $parts[] = $part;
	98	+ }
	99	+
	100	+ return $parts;
	101	+ }
	102	+
	103	+ /**
	104	+ * Appends each terminal item after it's preceding
	105	+ * non-terminals.
	106	+ *
	107	+ * Multibyte.php safe (atleast for UTF-8)
	108	+ *
	109	+ * For example:
	110	+ * [ "There ", "...", " is", ".", " More", "!" ]
	111	+ * ... becomes ...
	112	+ * [ "There ... is.", "More!" ]
	113	+ *
	114	+ * @param string[] $punctuations
	115	+ * @return string[]
	116	+ */
	117	+ private function punctuationMerge($punctuations)
	118	+ {
	119	+ $definite_terminals = array_diff($this->terminals, $this->abbreviators);
	120	+
	121	+ $merges = [];
	122	+ $merge = '';
	123	+
	124	+ $filtered = array_filter($punctuations, function ($p) {
	125	+ return $p !== '';
	126	+ });
	127	+
	128	+ foreach ($filtered as $punctuation) {
	129	+ $merge .= $punctuation;
	130	+ if (mb_strlen($punctuation) === 1
	131	+ && in_array($punctuation, $this->terminals)) {
	132	+ $merges[] = $merge;
	133	+ $merge = '';
	134	+ } else {
	135	+ foreach ($definite_terminals as $terminal) {
	136	+ if (mb_strpos($punctuation, $terminal) !== false) {
	137	+ $merges[] = $merge;
	138	+ $merge = '';
	139	+ break;
	140	+ }
	141	+ }
	142	+ }
	143	+ }
	144	+ if (!empty($merge)) {
	145	+ $merges[] = $merge;
	146	+ }
	147	+
	148	+ return $merges;
	149	+ }
	150	+
	151	+ /**
	152	+ * Looks for capitalized abbreviations & includes them with the following fragment.
	153	+ *
	154	+ * For example:
	155	+ * [ "Last week, former director of the F.B.I. James B. Comey was fired. Mr. Comey was not available for comment." ]
	156	+ * ... becomes ...
	157	+ * [ "Last week, former director of the F.B.I. James B. Comey was fired." ]
	158	+ * [ "Mr. Comey was not available for comment." ]
	159	+ *
	160	+ * @param string[] $fragments
	161	+ * @return string[]
	162	+ */
	163	+ private function abbreviationMerge($fragments)
	164	+ {
	165	+ $return_fragment = [];
	166	+
	167	+ $previous_fragment = '';
	168	+ $previous_is_abbreviation = false;
	169	+ $i = 0;
	170	+ foreach ($fragments as $fragment) {
	171	+ $is_abbreviation = self::isAbreviation($fragment);
	172	+
	173	+ // merge previous fragment with this
	174	+ if ($previous_is_abbreviation) {
	175	+ $fragment = $previous_fragment . $fragment;
	176	+ }
	177	+ $return_fragment[$i] = $fragment;
	178	+
	179	+ $previous_is_abbreviation = $is_abbreviation;
	180	+ $previous_fragment = $fragment;
	181	+
	182	+ // only increment if this isn't an abbreviation
	183	+ if (!$is_abbreviation) {
	184	+ $i++;
	185	+ }
	186	+ }
	187	+ return $return_fragment;
	188	+ }
	189	+
	190	+ /**
	191	+ * Check if the last word of fragment starts with a Capital, ends in "." & has less than 3 characters.
	192	+ *
	193	+ * @param $fragment
	194	+ * @return bool
	195	+ */
	196	+ private static function isAbreviation($fragment)
	197	+ {
	198	+ $words = mb_split('\s+', Multibyte::trim($fragment));
	199	+
	200	+ $word_count = count($words);
	201	+
	202	+ $last_word = Multibyte::trim($words[$word_count - 1]);
	203	+ $last_is_capital = preg_match('#^\p{Lu}#u', $last_word);
	204	+ $last_is_abbreviation = mb_substr(Multibyte::trim($fragment), -1) === '.';
	205	+
	206	+ return $last_is_capital > 0
	207	+ && $last_is_abbreviation > 0
	208	+ && mb_strlen($last_word) <= 3;
	209	+ }
	210	+
	211	+ /**
	212	+ * Merges any part starting with a closing parenthesis ')' to the previous
	213	+ * part.
	214	+ *
	215	+ * @param string[] $parts
	216	+ * @return string[]
	217	+ */
	218	+ private function parenthesesMerge($parts)
	219	+ {
	220	+ $subsentences = [];
	221	+
	222	+ foreach ($parts as $part) {
	223	+ if ($part[0] === ')') {
	224	+ $subsentences[count($subsentences) - 1] .= $part;
	225	+ } else {
	226	+ $subsentences[] = $part;
	227	+ }
	228	+ }
	229	+
	230	+ return $subsentences;
	231	+ }
	232	+
	233	+ /**
	234	+ * Looks for closing quotes to include them with the previous statement.
	235	+ * "That was very interesting," he said.
	236	+ * "That was very interesting."
	237	+ *
	238	+ * @param string[] $statements
	239	+ * @return string[]
	240	+ */
	241	+ private function closeQuotesMerge($statements)
	242	+ {
	243	+ $i = 0;
	244	+ $previous_statement = '';
	245	+ $return = [];
	246	+ foreach ($statements as $statement) {
	247	+ if (self::isEndQuote($statement)) {
	248	+ $statement = $previous_statement . $statement;
	249	+ } else {
	250	+ $i++;
	251	+ }
	252	+
	253	+ $return[$i] = $statement;
	254	+ $previous_statement = $statement;
	255	+ }
	256	+
	257	+ return $return;
	258	+ }
	259	+
	260	+ /**
	261	+ * Check if the entire string is a quotation mark or quote, then space, then lowercase.
	262	+ *
	263	+ * @param $statement
	264	+ * @return bool
	265	+ */
	266	+ private static function isEndQuote($statement)
	267	+ {
	268	+ $trimmed = Multibyte::trim($statement);
	269	+ $first = mb_substr($statement, 0, 1);
	270	+
	271	+ return in_array($trimmed, ['"', '\''])
	272	+ \|\| (
	273	+ in_array($first, ['"', '\''])
	274	+ && mb_substr($statement, 1, 1) === ' '
	275	+ && ctype_lower(mb_substr($statement, 2, 1)) === true
	276	+ );
	277	+ }
	278	+
	279	+ /**
	280	+ * Merges items into larger sentences.
	281	+ * Multibyte.php safe
	282	+ *
	283	+ * @param string[] $shorts
	284	+ * @return string[]
	285	+ */
	286	+ private function sentenceMerge($shorts)
	287	+ {
	288	+ $non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);
	289	+
	290	+ $sentences = [];
	291	+
	292	+ $sentence = '';
	293	+ $has_words = false;
	294	+ $previous_word_ending = null;
	295	+ foreach ($shorts as $short) {
	296	+ $word_count = count(mb_split('\s+', Multibyte::trim($short)));
	297	+ $after_non_abbreviating_terminal = in_array($previous_word_ending, $non_abbreviating_terminals);
	298	+
	299	+ if ($after_non_abbreviating_terminal
	300	+ \|\| ($has_words && $word_count > 1)) {
	301	+
	302	+ $sentences[] = $sentence;
	303	+
	304	+ $sentence = '';
	305	+ $has_words = false;
	306	+ }
	307	+
	308	+ $has_words = $has_words
	309	+ \|\| $word_count > 1;
	310	+
	311	+ $sentence .= $short;
	312	+ $previous_word_ending = mb_substr($short, -1);
	313	+ }
	314	+
	315	+ if (!empty($sentence)) {
	316	+ $sentences[] = $sentence;
	317	+ }
	318	+
	319	+ return $sentences;
	320	+ }
	321	+
	322	+ /**
	323	+ * Return the sentences sentences detected in the provided text.
	324	+ * Set the Sentence::SPLIT_TRIM flag to trim whitespace.
	325	+ * @param string $text
	326	+ * @param integer $flags
	327	+ * @return string[]
	328	+ */
	329	+ public function split($text, $flags = 0)
	330	+ {
	331	+ static $pipeline = [
	332	+ 'punctuationSplit',
	333	+ 'parenthesesMerge', // also works after punctuationMerge or abbreviationMerge
	334	+ 'punctuationMerge',
	335	+ 'abbreviationMerge',
	336	+ 'closeQuotesMerge',
	337	+ 'sentenceMerge',
	338	+ ];
	339	+
	340	+ // clean funny quotes
	341	+ $text = Multibyte::cleanUnicode($text);
	342	+
	343	+ // Split
	344	+ $sentences = [];
	345	+ foreach (self::linebreakSplit($text) as $input) {
	346	+ if (Multibyte::trim($input) !== '') {
	347	+ foreach ($pipeline as $method) {
	348	+ $input = $this->$method($input);
	349	+ }
	350	+ $sentences = array_merge($sentences, $input);
	351	+ }
	352	+ }
	353	+
	354	+ // Post process
	355	+ if ($flags & self::SPLIT_TRIM) {
	356	+ return self::trimSentences($sentences);
	357	+ }
	358	+
	359	+ return $sentences;
	360	+ }
	361	+
	362	+ /**
	363	+ * Multibyte.php trim each string in an array.
	364	+ * @param string[] $sentences
	365	+ * @return string[]
	366	+ */
	367	+ private static function trimSentences($sentences)
	368	+ {
	369	+ return array_map(function ($sentence) {
	370	+ return Multibyte::trim($sentence);
	371	+ }, $sentences);
	372	+ }
	373	+
	374	+ /**
	375	+ * Return the number of sentences detected in the provided text.
	376	+ * @param string $text
	377	+ * @return integer
	378	+ */
	379	+ public function count($text)
	380	+ {
	381	+ return count($this->split($text));
	382	+ }
383	383
384	384	}

		@@ -121,7 +121,7 @@ discard block
		block discarded – undo
121	121	$merges = [];
122	122	$merge = '';
123	123
124		- $filtered = array_filter($punctuations, function ($p) {
	124	+ $filtered = array_filter($punctuations, function($p) {
125	125	return $p !== '';
126	126	});
127	127
		@@ -366,7 +366,7 @@ discard block
		block discarded – undo
366	366	*/
367	367	private static function trimSentences($sentences)
368	368	{
369		- return array_map(function ($sentence) {
	369	+ return array_map(function($sentence) {
370	370	return Multibyte::trim($sentence);
371	371	}, $sentences);
372	372	}

vanderlee / php-sentence

Push — master ( d7563a...f1d8ce )

Status

Category

Indentation +363 added lines, -363 removed lines patch added patch discarded remove patch

Spacing +2 added lines, -2 removed lines patch added patch discarded remove patch