Inspection of "Refactor cleanUnicode" - vanderlee/php-sentence - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( e85d9b...d7563a )

by Martijn

created 2019-04-01 11:08 UTC

Status

Indentation +338 added lines, -338 removed lines patch added patch discarded remove patch

@@ -17,343 +17,343 @@
 block discarded – undo
 class Sentence
 {
 
-    /**
-     * Specify this flag with the split method to trim whitespace.
-     */
-    const SPLIT_TRIM = 0x1;
-
-    /**
-     * List of characters used to terminate sentences.
-     *
-     * @var string[]
-     */
-    private $terminals = ['.', '!', '?'];
-
-    /**
-     * List of characters used for abbreviations.
-     *
-     * @var string[]
-     */
-    private $abbreviators = ['.'];
-
-    /**
-     * Breaks a piece of text into lines by linebreak.
-     * Eats up any linebreak characters as if one.
-     *
-     * Multibyte.php safe
-     *
-     * @param string $text
-     * @return string[]
-     */
-    private static function linebreakSplit($text)
-    {
-        $lines = [];
-        $line = '';
-
-        foreach (Multibyte::split('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
-            $line .= $part;
-            if (Multibyte::trim($part) === '') {
-                $lines[] = $line;
-                $line = '';
-            }
-        }
-        $lines[] = $line;
-
-        return $lines;
-    }
-
-    /**
-     * Splits an array of lines by (consecutive sequences of)
-     * terminals, keeping terminals.
-     *
-     * Multibyte.php safe (atleast for UTF-8)
-     *
-     * For example:
-     *    "There ... is. More!"
-     *        ... becomes ...
-     *    [ "There ", "...", " is", ".", " More", "!" ]
-     *
-     * @param string $line
-     * @return string[]
-     */
-    private function punctuationSplit($line)
-    {
-        $parts = [];
-
-        $chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY); // This is UTF8 multibyte safe!
-        $is_terminal = in_array($chars[0], $this->terminals);
-
-        $part = '';
-        foreach ($chars as $index => $char) {
-            if (in_array($char, $this->terminals) !== $is_terminal) {
-                $parts[] = $part;
-                $part = '';
-                $is_terminal = !$is_terminal;
-            }
-            $part .= $char;
-        }
-
-        if (!empty($part)) {
-            $parts[] = $part;
-        }
-
-        return $parts;
-    }
-
-    /**
-     * Appends each terminal item after it's preceding
-     * non-terminals.
-     *
-     * Multibyte.php safe (atleast for UTF-8)
-     *
-     * For example:
-     *    [ "There ", "...", " is", ".", " More", "!" ]
-     *        ... becomes ...
-     *    [ "There ... is.", "More!" ]
-     *
-     * @param string[] $punctuations
-     * @return string[]
-     */
-    private function punctuationMerge($punctuations)
-    {
-        $definite_terminals = array_diff($this->terminals, $this->abbreviators);
-
-        $merges = [];
-        $merge = '';
-
-        foreach ($punctuations as $punctuation) {
-            if ($punctuation !== '') {
-                $merge .= $punctuation;
-                if (mb_strlen($punctuation) === 1
-                    && in_array($punctuation, $this->terminals)) {
-                    $merges[] = $merge;
-                    $merge = '';
-                } else {
-                    foreach ($definite_terminals as $terminal) {
-                        if (mb_strpos($punctuation, $terminal) !== false) {
-                            $merges[] = $merge;
-                            $merge = '';
-                            break;
-                        }
-                    }
-                }
-            }
-        }
-        if (!empty($merge)) {
-            $merges[] = $merge;
-        }
-
-        return $merges;
-    }
-
-    /**
-     * Looks for capitalized abbreviations & includes them with the following fragment.
-     *
-     * For example:
-     *    [ "Last week, former director of the F.B.I. James B. Comey was fired. Mr. Comey was not available for comment." ]
-     *        ... becomes ...
-     *    [ "Last week, former director of the F.B.I. James B. Comey was fired." ]
-     *  [ "Mr. Comey was not available for comment." ]
-     *
-     * @param string[] $fragments
-     * @return string[]
-     */
-    private function abbreviationMerge($fragments)
-    {
-        $return_fragment = [];
-
-        $previous_string = '';
-        $previous_is_abbreviation = false;
-        $i = 0;
-
-        foreach ($fragments as $fragment) {
-            $current_string = $fragment;
-            $words = mb_split('\s+', Multibyte::trim($fragment));
-
-            $word_count = count($words);
-
-            // if last word of fragment starts with a Capital, ends in "." & has less than 3 characters, trigger "is abbreviation"
-            $last_word = trim($words[$word_count - 1]);
-            $last_is_capital = preg_match('#^\p{Lu}#u', $last_word);
-            $last_is_abbreviation = substr(trim($fragment), -1) === '.';
-            $is_abbreviation = $last_is_capital > 0
-                && $last_is_abbreviation > 0
-                && mb_strlen($last_word) <= 3;
-
-            // merge previous fragment with this
-            if ($previous_is_abbreviation === true) {
-                $current_string = $previous_string . $current_string;
-            }
-            $return_fragment[$i] = $current_string;
-
-            $previous_is_abbreviation = $is_abbreviation;
-            $previous_string = $current_string;
-            // only increment if this isn't an abbreviation
-            if ($is_abbreviation === false) {
-                $i++;
-            }
-        }
-        return $return_fragment;
-    }
-
-    /**
-     * Merges any part starting with a closing parenthesis ')' to the previous
-     * part.
-     *
-     * @param string[] $parts
-     * @return string[]
-     */
-    private function parenthesesMerge($parts)
-    {
-        $subsentences = [];
-
-        foreach ($parts as $part) {
-            if ($part[0] === ')') {
-                $subsentences[count($subsentences) - 1] .= $part;
-            } else {
-                $subsentences[] = $part;
-            }
-        }
-
-        return $subsentences;
-    }
-
-    /**
-     * Looks for closing quotes to include them with the previous statement.
-     * "That was very interesting," he said.
-     * "That was very interesting."
-     *
-     * @param string[] $statements
-     * @return string[]
-     */
-    private function closeQuotesMerge($statements)
-    {
-        $i = 0;
-        $previous_statement = "";
-        $return = [];
-        foreach ($statements as $statement) {
-            // detect end quote - if the entire string is a quotation mark, or it's [quote, space, lowercase]
-            if (trim($statement) === '"'
-                || trim($statement) === "'"
-                || (
-                    (substr($statement, 0, 1) === '"'
-                        || substr($statement, 0, 1) === "'")
-                    && substr($statement, 1, 1) === ' '
-                    && ctype_lower(substr($statement, 2, 1)) === true
-                )
-            ) {
-                $statement = $previous_statement . $statement;
-            } else {
-                $i++;
-            }
-
-            $return[$i] = $statement;
-            $previous_statement = $statement;
-        }
-
-        return $return;
-    }
-
-    /**
-     * Merges items into larger sentences.
-     * Multibyte.php safe
-     *
-     * @param string[] $shorts
-     * @return string[]
-     */
-    private function sentenceMerge($shorts)
-    {
-        $non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);
-
-        $sentences = [];
-
-        $sentence = '';
-        $has_words = false;
-        $previous_word_ending = null;
-        foreach ($shorts as $short) {
-            $word_count = count(mb_split('\s+', Multibyte::trim($short)));
-            $after_non_abbreviating_terminal = in_array($previous_word_ending, $non_abbreviating_terminals);
-
-            if ($after_non_abbreviating_terminal
-                || ($has_words && $word_count > 1)) {
-                $sentences[] = $sentence;
-                $sentence = '';
-                $has_words = $word_count > 1;
-            } else {
-                $has_words = ($has_words
-                    || $word_count > 1);
-            }
-
-            $sentence .= $short;
-            $previous_word_ending = mb_substr($short, -1);
-        }
-        if (!empty($sentence)) {
-            $sentences[] = $sentence;
-        }
-
-        return $sentences;
-    }
-
-    /**
-     * Return the sentences sentences detected in the provided text.
-     * Set the Sentence::SPLIT_TRIM flag to trim whitespace.
-     * @param string $text
-     * @param integer $flags
-     * @return string[]
-     */
-    public function split($text, $flags = 0)
-    {
-        static $pipeline = [
-            'punctuationSplit',
-            'parenthesesMerge', // also works after punctuationMerge or abbreviationMerge
-            'punctuationMerge',
-            'abbreviationMerge',
-            'closeQuotesMerge',
-            'sentenceMerge',
-        ];
-
-        // clean funny quotes
-        $text = Multibyte::cleanUnicode($text);
-
-        // Split
-        $sentences = [];
-        foreach (self::linebreakSplit($text) as $input) {
-            if (Multibyte::trim($input) !== '') {
-                foreach ($pipeline as $method) {
-                    $input = $this->$method($input);
-                }
-                $sentences = array_merge($sentences, $input);
-            }
-        }
-
-        // Post process
-        if ($flags & self::SPLIT_TRIM) {
-            return self::trimSentences($sentences);
-        }
-
-        return $sentences;
-    }
-
-    /**
-     * Multibyte.php trim each string in an array.
-     * @param string[] $sentences
-     * @return string[]
-     */
-    private static function trimSentences($sentences)
-    {
-        return array_map(function($sentence) {
-            return Multibyte::trim($sentence);
-        }, $sentences);
-    }
-
-    /**
-     * Return the number of sentences detected in the provided text.
-     * @param string $text
-     * @return integer
-     */
-    public function count($text)
-    {
-        return count($this->split($text));
-    }
+	/**
+	 * Flag for split(): when set, every returned sentence is trimmed of
+	 * leading and trailing whitespace.
+	 */
+	const SPLIT_TRIM = 0x1;
+
+	/**
+	 * List of characters used to terminate sentences.
+	 *
+	 * @var string[]
+	 */
+	private $terminals = ['.', '!', '?'];
+
+	/**
+	 * List of characters used for abbreviations.
+	 *
+	 * punctuationMerge() and sentenceMerge() subtract this list from
+	 * $terminals to find the terminals that cannot be abbreviation markers.
+	 *
+	 * @var string[]
+	 */
+	private $abbreviators = ['.'];
+
+	/**
+	 * Breaks a piece of text into lines by linebreak.
+	 * A run of consecutive linebreak characters is handled as one delimiter.
+	 *
+	 * Multibyte safe (splitting and trimming go through the Multibyte helper).
+	 *
+	 * Parts are accumulated until one is blank after trimming, which includes
+	 * every linebreak run, so a line normally keeps its trailing linebreak
+	 * characters. Whatever remains after the last delimiter is always
+	 * appended, so the final element may be an empty string.
+	 *
+	 * @param string $text
+	 * @return string[]
+	 */
+	private static function linebreakSplit($text)
+	{
+		$lines = [];
+		$line = '';
+
+		foreach (Multibyte::split('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
+			$line .= $part;
+			if (Multibyte::trim($part) === '') {
+				$lines[] = $line;
+				$line = '';
+			}
+		}
+		$lines[] = $line;
+
+		return $lines;
+	}
+
+	/**
+	 * Splits a single line into alternating runs of non-terminal and
+	 * terminal characters, keeping the terminals.
+	 *
+	 * Multibyte safe (at least for UTF-8)
+	 *
+	 * For example:
+	 *    "There ... is. More!"
+	 *        ... becomes ...
+	 *    [ "There ", "...", " is", ".", " More", "!" ]
+	 *
+	 * NOTE(review): $chars[0] is read unconditionally, so $line is expected
+	 * to be non-empty; split() only passes lines that are not blank.
+	 * NOTE(review): the final run is added via !empty(), and PHP treats the
+	 * string "0" as empty, so a trailing run of exactly "0" is not returned.
+	 * NOTE(review): $index is never used inside the loop.
+	 *
+	 * @param string $line
+	 * @return string[]
+	 */
+	private function punctuationSplit($line)
+	{
+		$parts = [];
+
+		$chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY); // one element per UTF-8 character
+		$is_terminal = in_array($chars[0], $this->terminals);
+
+		$part = '';
+		foreach ($chars as $index => $char) {
+			if (in_array($char, $this->terminals) !== $is_terminal) {
+				$parts[] = $part;
+				$part = '';
+				$is_terminal = !$is_terminal;
+			}
+			$part .= $char;
+		}
+
+		if (!empty($part)) {
+			$parts[] = $part;
+		}
+
+		return $parts;
+	}
+
+	/**
+	 * Appends each terminal item after its preceding
+	 * non-terminals.
+	 *
+	 * An item closes the current merge when it is a single terminal
+	 * character, or when it contains a terminal that is not also an
+	 * abbreviator ('!' or '?' with the default lists). Any other item, such
+	 * as "...", is appended without closing the merge. Empty items are
+	 * skipped.
+	 *
+	 * Multibyte safe (at least for UTF-8)
+	 *
+	 * For example:
+	 *    [ "There ", "...", " is", ".", " More", "!" ]
+	 *        ... becomes ...
+	 *    [ "There ... is.", " More!" ]
+	 *
+	 * NOTE(review): the remainder is added via !empty(), and PHP treats the
+	 * string "0" as empty, so a trailing merge of exactly "0" is dropped.
+	 *
+	 * @param string[] $punctuations
+	 * @return string[]
+	 */
+	private function punctuationMerge($punctuations)
+	{
+		$definite_terminals = array_diff($this->terminals, $this->abbreviators);
+
+		$merges = [];
+		$merge = '';
+
+		foreach ($punctuations as $punctuation) {
+			if ($punctuation !== '') {
+				$merge .= $punctuation;
+				if (mb_strlen($punctuation) === 1
+					&& in_array($punctuation, $this->terminals)) {
+					$merges[] = $merge;
+					$merge = '';
+				} else {
+					foreach ($definite_terminals as $terminal) {
+						if (mb_strpos($punctuation, $terminal) !== false) {
+							$merges[] = $merge;
+							$merge = '';
+							break;
+						}
+					}
+				}
+			}
+		}
+		if (!empty($merge)) {
+			$merges[] = $merge;
+		}
+
+		return $merges;
+	}
+
+	/**
+	 * Looks for capitalized abbreviations & includes them with the following fragment.
+	 *
+	 * A fragment counts as an abbreviation when, after trimming, it ends in
+	 * "." and its last whitespace-separated word starts with an uppercase
+	 * letter and is at most 3 characters long. Such a fragment is prepended
+	 * to the next fragment and the combined string takes over its slot.
+	 *
+	 * For example:
+	 *    [ "Last week, former director of the F.", "B.", "I.", " James B.",
+	 *      " Comey was fired.", " Mr.", " Comey was not available for comment." ]
+	 *        ... becomes ...
+	 *    [ "Last week, former director of the F.B.I. James B. Comey was fired.",
+	 *      " Mr. Comey was not available for comment." ]
+	 *
+	 * @param string[] $fragments
+	 * @return string[]
+	 */
+	private function abbreviationMerge($fragments)
+	{
+		$return_fragment = [];
+
+		$previous_string = '';
+		$previous_is_abbreviation = false;
+		$i = 0;
+
+		foreach ($fragments as $fragment) {
+			$current_string = $fragment;
+			$words = mb_split('\s+', Multibyte::trim($fragment));
+
+			$word_count = count($words);
+
+			// if last word of fragment starts with a capital, the fragment ends in "." & the last word has at most 3 characters (the "." included), trigger "is abbreviation"
+			$last_word = trim($words[$word_count - 1]);
+			$last_is_capital = preg_match('#^\p{Lu}#u', $last_word);
+			$last_is_abbreviation = substr(trim($fragment), -1) === '.';
+			$is_abbreviation = $last_is_capital > 0
+				&& $last_is_abbreviation > 0
+				&& mb_strlen($last_word) <= 3;
+
+			// merge previous fragment with this
+			if ($previous_is_abbreviation === true) {
+				$current_string = $previous_string . $current_string;
+			}
+			$return_fragment[$i] = $current_string;
+
+			$previous_is_abbreviation = $is_abbreviation;
+			$previous_string = $current_string;
+			// only increment if this isn't an abbreviation, so the next (merged) fragment overwrites this slot
+			if ($is_abbreviation === false) {
+				$i++;
+			}
+		}
+		return $return_fragment;
+	}
+
+	/**
+	 * Merges any part starting with a closing parenthesis ')' to the previous
+	 * part.
+	 *
+	 * NOTE(review): if the very first part starts with ')', there is no
+	 * previous part and the append targets index -1. $part[0] also assumes
+	 * that no part is an empty string. Confirm callers never pass either.
+	 *
+	 * @param string[] $parts
+	 * @return string[]
+	 */
+	private function parenthesesMerge($parts)
+	{
+		$subsentences = [];
+
+		foreach ($parts as $part) {
+			if ($part[0] === ')') {
+				$subsentences[count($subsentences) - 1] .= $part;
+			} else {
+				$subsentences[] = $part;
+			}
+		}
+
+		return $subsentences;
+	}
+
+	/**
+	 * Looks for closing quotes to include them with the previous statement.
+	 * "That was very interesting," he said.
+	 * "That was very interesting."
+	 *
+	 * A statement is merged into the previous one when, after trimming, it
+	 * is a lone quotation mark (" or '), or when it starts with a quotation
+	 * mark, then a space, then a character accepted by ctype_lower().
+	 *
+	 * The index is incremented before a non-merged statement is stored, so
+	 * the keys of the returned array do not necessarily start at 0.
+	 *
+	 * NOTE(review): the checks use byte-wise substr(), so presumably only
+	 * single-byte lowercase letters are recognised after the quote.
+	 *
+	 * @param string[] $statements
+	 * @return string[]
+	 */
+	private function closeQuotesMerge($statements)
+	{
+		$i = 0;
+		$previous_statement = "";
+		$return = [];
+		foreach ($statements as $statement) {
+			// detect end quote - if the entire string is a quotation mark, or it's [quote, space, lowercase]
+			if (trim($statement) === '"'
+				|| trim($statement) === "'"
+				|| (
+					(substr($statement, 0, 1) === '"'
+						|| substr($statement, 0, 1) === "'")
+					&& substr($statement, 1, 1) === ' '
+					&& ctype_lower(substr($statement, 2, 1)) === true
+				)
+			) {
+				$statement = $previous_statement . $statement;
+			} else {
+				$i++;
+			}
+
+			$return[$i] = $statement;
+			$previous_statement = $statement;
+		}
+
+		return $return;
+	}
+
+	/**
+	 * Merges items into larger sentences.
+	 * Multibyte safe
+	 *
+	 * The sentence built so far is closed before the current item is added
+	 * when either:
+	 *  - the previous item ended in a terminal that is not an abbreviator, or
+	 *  - the sentence already holds an item of more than one word and the
+	 *    current item also has more than one word.
+	 * Otherwise the item is appended to the sentence being built.
+	 *
+	 * NOTE(review): the remainder is added via !empty(), and PHP treats the
+	 * string "0" as empty, so a trailing sentence of exactly "0" is dropped.
+	 *
+	 * @param string[] $shorts
+	 * @return string[]
+	 */
+	private function sentenceMerge($shorts)
+	{
+		$non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);
+
+		$sentences = [];
+
+		$sentence = '';
+		$has_words = false;
+		$previous_word_ending = null;
+		foreach ($shorts as $short) {
+			$word_count = count(mb_split('\s+', Multibyte::trim($short)));
+			$after_non_abbreviating_terminal = in_array($previous_word_ending, $non_abbreviating_terminals);
+
+			if ($after_non_abbreviating_terminal
+				|| ($has_words && $word_count > 1)) {
+				$sentences[] = $sentence;
+				$sentence = '';
+				$has_words = $word_count > 1;
+			} else {
+				$has_words = ($has_words
+					|| $word_count > 1);
+			}
+
+			$sentence .= $short;
+			$previous_word_ending = mb_substr($short, -1);
+		}
+		if (!empty($sentence)) {
+			$sentences[] = $sentence;
+		}
+
+		return $sentences;
+	}
+
+	/**
+	 * Return the sentences detected in the provided text.
+	 * Set the Sentence::SPLIT_TRIM flag to trim whitespace.
+	 *
+	 * The text is first passed through Multibyte::cleanUnicode() and broken
+	 * into lines. Each line that is not blank is then run through the
+	 * pipeline methods in the listed order, each step receiving the output
+	 * of the previous one, and the per-line results are concatenated.
+	 *
+	 * @param string $text
+	 * @param integer $flags
+	 * @return string[]
+	 */
+	public function split($text, $flags = 0)
+	{
+		static $pipeline = [
+			'punctuationSplit',
+			'parenthesesMerge', // also works after punctuationMerge or abbreviationMerge
+			'punctuationMerge',
+			'abbreviationMerge',
+			'closeQuotesMerge',
+			'sentenceMerge',
+		];
+
+		// decode HTML entities and replace typographic quotes with plain ones
+		$text = Multibyte::cleanUnicode($text);
+
+		// Split: run every non-blank line through the pipeline
+		$sentences = [];
+		foreach (self::linebreakSplit($text) as $input) {
+			if (Multibyte::trim($input) !== '') {
+				foreach ($pipeline as $method) {
+					$input = $this->$method($input);
+				}
+				$sentences = array_merge($sentences, $input);
+			}
+		}
+
+		// Post process: trim each sentence when SPLIT_TRIM is set
+		if ($flags & self::SPLIT_TRIM) {
+			return self::trimSentences($sentences);
+		}
+
+		return $sentences;
+	}
+
+	/**
+	 * Trim each string in an array using Multibyte::trim().
+	 * @param string[] $sentences
+	 * @return string[]
+	 */
+	private static function trimSentences($sentences)
+	{
+		return array_map(function($sentence) {
+			return Multibyte::trim($sentence);
+		}, $sentences);
+	}
+
+	/**
+	 * Return the number of sentences detected in the provided text,
+	 * i.e. the number of elements split() returns for it.
+	 * @param string $text
+	 * @return integer
+	 */
+	public function count($text)
+	{
+		return count($this->split($text));
+	}
 
 }

Please login to merge, or discard this patch.

src/Multibyte.php 1 patch

Indentation +142 added lines, -142 removed lines patch added patch discarded remove patch

@@ -7,146 +7,146 @@
 block discarded – undo
  */
 class Multibyte
 {
-    //https://stackoverflow.com/questions/20025030/convert-all-types-of-smart-quotes-with-php
-    private static $unicodeCharacterMap = [
-        // Windows codepage 1252
-        "\xC2\x82" => "'", // U+0082⇒U+201A single low-9 quotation mark
-        "\xC2\x84" => '"', // U+0084⇒U+201E double low-9 quotation mark
-        "\xC2\x8B" => "'", // U+008B⇒U+2039 single left-pointing angle quotation mark
-        "\xC2\x91" => "'", // U+0091⇒U+2018 left single quotation mark
-        "\xC2\x92" => "'", // U+0092⇒U+2019 right single quotation mark
-        "\xC2\x93" => '"', // U+0093⇒U+201C left double quotation mark
-        "\xC2\x94" => '"', // U+0094⇒U+201D right double quotation mark
-        "\xC2\x9B" => "'", // U+009B⇒U+203A single right-pointing angle quotation mark
-        // Regular Unicode     // U+0022 quotation mark (")
-        // U+0027 apostrophe     (')
-        "\xC2\xAB" => '"', // U+00AB left-pointing double angle quotation mark
-        "\xC2\xBB" => '"', // U+00BB right-pointing double angle quotation mark
-        "\xE2\x80\x98" => "'", // U+2018 left single quotation mark
-        "\xE2\x80\x99" => "'", // U+2019 right single quotation mark
-        "\xE2\x80\x9A" => "'", // U+201A single low-9 quotation mark
-        "\xE2\x80\x9B" => "'", // U+201B single high-reversed-9 quotation mark
-        "\xE2\x80\x9C" => '"', // U+201C left double quotation mark
-        "\xE2\x80\x9D" => '"', // U+201D right double quotation mark
-        "\xE2\x80\x9E" => '"', // U+201E double low-9 quotation mark
-        "\xE2\x80\x9F" => '"', // U+201F double high-reversed-9 quotation mark
-        "\xE2\x80\xB9" => "'", // U+2039 single left-pointing angle quotation mark
-        "\xE2\x80\xBA" => "'", // U+203A single right-pointing angle quotation mark
-    ];
-
-    /**
-     * Replace
-     *
-     * @staticvar array $chr_map
-     * @param string $string
-     * @return string
-     */
-    public static function cleanUnicode($string)
-    {
-        $character = array_keys(self::$unicodeCharacterMap); // but: for efficiency you should
-        $replace = array_values(self::$unicodeCharacterMap); // pre-calculate these two arrays
-        return str_replace($character, $replace, html_entity_decode($string, ENT_QUOTES, "UTF-8"));
-    }
-
-    /**
-     * Multibyte.php safe version of standard trim() function.
-     *
-     * @param string $string
-     * @return string
-     */
-    public static function trim($string)
-    {
-        return mb_ereg_replace('^\s*([\s\S]*?)\s*$', '\1', $string);
-    }
-
-    /**
-     * A cross between mb_split and preg_split, adding the preg_split flags
-     * to mb_split.
-     *
-     * @param string $pattern
-     * @param string $string
-     * @param int $limit
-     * @param int $flags
-     * @return array
-     */
-    public static function split($pattern, $string, $limit = -1, $flags = 0)
-    {
-        $split_no_empty = (bool)($flags & PREG_SPLIT_NO_EMPTY);
-        $offset_capture = (bool)($flags & PREG_SPLIT_OFFSET_CAPTURE);
-        $delim_capture = (bool)($flags & PREG_SPLIT_DELIM_CAPTURE);
-
-        $lengths = self::getSplitLengths($pattern, $string);
-
-        // Substrings
-        $parts = [];
-        $position = 0;
-        $count = 1;
-        foreach ($lengths as $length) {
-            $split_empty = !$split_no_empty || $length[0];
-            $is_delimiter = $length[1];
-            $is_captured = $delim_capture && $length[2];
-
-            if ($limit > 0
-                && !$is_delimiter
-                && $split_empty
-                && ++$count > $limit) {
-
-                $cut = mb_strcut($string, $position);
-
-                $parts[] = $offset_capture
-                    ? [$cut, $position]
-                    : $cut;
-
-                break;
-            } elseif ((!$is_delimiter
-                    || $is_captured)
-                && $split_empty) {
-
-                $cut = mb_strcut($string, $position, $length[0]);
-
-                $parts[] = $offset_capture
-                    ? [$cut, $position]
-                    : $cut;
-            }
-
-            $position += $length[0];
-        }
-
-        return $parts;
-    }
-
-    /**
-     * Splits the string by pattern and for each element (part or split) returns:
-     *  [ 0 => length, 1 => is_delimiter?, 2 =>
-     *
-     * @param $pattern
-     * @param $string
-     * @return array
-     */
-    private static function getSplitLengths($pattern, $string)
-    {
-        $strlen = strlen($string); // bytes!
-        $lengths = [];
-
-        mb_ereg_search_init($string);
-
-        $position = 0;
-        while ($position < $strlen
-            && ($array = mb_ereg_search_pos($pattern, '')) !== false) {
-            // capture split
-            $lengths[] = [$array[0] - $position, false, null];
-
-            // move position
-            $position = $array[0] + $array[1];
-
-            // capture delimiter
-            $regs = mb_ereg_search_getregs();
-            $lengths[] = [$array[1], true, isset($regs[1]) && $regs[1]];
-        }
-
-        // Add last bit, if not ending with split
-        $lengths[] = [$strlen - $position, false, null];
-
-        return $lengths;
-    }
+	/**
+	 * Maps the UTF-8 byte sequences of typographic quotation marks to the
+	 * plain ASCII quote (' or ") that cleanUnicode() substitutes for them.
+	 *
+	 * Source: https://stackoverflow.com/questions/20025030/convert-all-types-of-smart-quotes-with-php
+	 */
+	private static $unicodeCharacterMap = [
+		// Windows codepage 1252 (quotes that arrive as code points U+0082..U+009B)
+		"\xC2\x82" => "'", // U+0082⇒U+201A single low-9 quotation mark
+		"\xC2\x84" => '"', // U+0084⇒U+201E double low-9 quotation mark
+		"\xC2\x8B" => "'", // U+008B⇒U+2039 single left-pointing angle quotation mark
+		"\xC2\x91" => "'", // U+0091⇒U+2018 left single quotation mark
+		"\xC2\x92" => "'", // U+0092⇒U+2019 right single quotation mark
+		"\xC2\x93" => '"', // U+0093⇒U+201C left double quotation mark
+		"\xC2\x94" => '"', // U+0094⇒U+201D right double quotation mark
+		"\xC2\x9B" => "'", // U+009B⇒U+203A single right-pointing angle quotation mark
+		// Regular Unicode
+		// (U+0022 quotation mark and U+0027 apostrophe are already plain and have no entry)
+		"\xC2\xAB" => '"', // U+00AB left-pointing double angle quotation mark
+		"\xC2\xBB" => '"', // U+00BB right-pointing double angle quotation mark
+		"\xE2\x80\x98" => "'", // U+2018 left single quotation mark
+		"\xE2\x80\x99" => "'", // U+2019 right single quotation mark
+		"\xE2\x80\x9A" => "'", // U+201A single low-9 quotation mark
+		"\xE2\x80\x9B" => "'", // U+201B single high-reversed-9 quotation mark
+		"\xE2\x80\x9C" => '"', // U+201C left double quotation mark
+		"\xE2\x80\x9D" => '"', // U+201D right double quotation mark
+		"\xE2\x80\x9E" => '"', // U+201E double low-9 quotation mark
+		"\xE2\x80\x9F" => '"', // U+201F double high-reversed-9 quotation mark
+		"\xE2\x80\xB9" => "'", // U+2039 single left-pointing angle quotation mark
+		"\xE2\x80\xBA" => "'", // U+203A single right-pointing angle quotation mark
+	];
+
+	/**
+	 * Decodes HTML entities (quotes included, as UTF-8) and then replaces
+	 * every byte sequence listed in self::$unicodeCharacterMap with its
+	 * plain ' or " counterpart.
+	 *
+	 * @param string $string
+	 * @return string
+	 */
+	public static function cleanUnicode($string)
+	{
+		$character = array_keys(self::$unicodeCharacterMap); // NOTE: both arrays are rebuilt on every call;
+		$replace = array_values(self::$unicodeCharacterMap); // they could be pre-calculated once for efficiency
+		return str_replace($character, $replace, html_entity_decode($string, ENT_QUOTES, "UTF-8"));
+	}
+
+	/**
+	 * Multibyte safe version of standard trim() function.
+	 *
+	 * Strips everything the mb_ereg class \s matches from both ends of the
+	 * string and keeps the (lazily matched) text in between.
+	 *
+	 * @param string $string
+	 * @return string
+	 */
+	public static function trim($string)
+	{
+		return mb_ereg_replace('^\s*([\s\S]*?)\s*$', '\1', $string);
+	}
+
+	/**
+	 * A cross between mb_split and preg_split, adding the preg_split flags
+	 * to mb_split.
+	 *
+	 * Supported flags:
+	 *  - PREG_SPLIT_NO_EMPTY: leave out elements of length 0.
+	 *  - PREG_SPLIT_DELIM_CAPTURE: also return delimiters for which
+	 *    getSplitLengths() reports a captured group.
+	 *  - PREG_SPLIT_OFFSET_CAPTURE: return [substring, offset] pairs instead
+	 *    of plain substrings; the offset is the running byte position.
+	 *
+	 * When $limit is greater than 0, at most $limit non-delimiter parts are
+	 * returned and the last one holds the whole remainder of the string.
+	 * Any other value means no limit.
+	 *
+	 * @param string $pattern
+	 * @param string $string
+	 * @param int $limit
+	 * @param int $flags
+	 * @return array
+	 */
+	public static function split($pattern, $string, $limit = -1, $flags = 0)
+	{
+		$split_no_empty = (bool)($flags & PREG_SPLIT_NO_EMPTY);
+		$offset_capture = (bool)($flags & PREG_SPLIT_OFFSET_CAPTURE);
+		$delim_capture = (bool)($flags & PREG_SPLIT_DELIM_CAPTURE);
+
+		$lengths = self::getSplitLengths($pattern, $string);
+
+		// Substrings: walk the [length, is_delimiter, is_captured] entries in order
+		$parts = [];
+		$position = 0;
+		$count = 1;
+		foreach ($lengths as $length) {
+			$split_empty = !$split_no_empty || $length[0];
+			$is_delimiter = $length[1];
+			$is_captured = $delim_capture && $length[2];
+
+			if ($limit > 0
+				&& !$is_delimiter
+				&& $split_empty
+				&& ++$count > $limit) {
+
+				$cut = mb_strcut($string, $position);
+
+				$parts[] = $offset_capture
+					? [$cut, $position]
+					: $cut;
+
+				break;
+			} elseif ((!$is_delimiter
+					|| $is_captured)
+				&& $split_empty) {
+
+				$cut = mb_strcut($string, $position, $length[0]);
+
+				$parts[] = $offset_capture
+					? [$cut, $position]
+					: $cut;
+			}
+
+			$position += $length[0];
+		}
+
+		return $parts;
+	}
+
+	/**
+	 * Splits the string by pattern and for each element (part or split) returns:
+	 *  [ 0 => length, 1 => is_delimiter?, 2 => is_captured? ]
+	 *
+	 * Parts and delimiters alternate in string order. Lengths are measured
+	 * in bytes. Element 2 is null for parts; for delimiters it is true when
+	 * capture group 1 of the match is set and truthy.
+	 *
+	 * A closing part is always appended; its length is 0 when the string
+	 * ends with a delimiter.
+	 *
+	 * @param $pattern
+	 * @param $string
+	 * @return array
+	 */
+	private static function getSplitLengths($pattern, $string)
+	{
+		$strlen = strlen($string); // bytes!
+		$lengths = [];
+
+		mb_ereg_search_init($string);
+
+		$position = 0;
+		while ($position < $strlen
+			&& ($array = mb_ereg_search_pos($pattern, '')) !== false) {
+			// capture split: the text between the previous delimiter and this match
+			$lengths[] = [$array[0] - $position, false, null];
+
+			// move position to just past the matched delimiter
+			$position = $array[0] + $array[1];
+
+			// capture delimiter
+			$regs = mb_ereg_search_getregs();
+			$lengths[] = [$array[1], true, isset($regs[1]) && $regs[1]];
+		}
+
+		// Add the remainder after the last delimiter (length 0 if the string ends with one)
+		$lengths[] = [$strlen - $position, false, null];
+
+		return $lengths;
+	}
 }
\ No newline at end of file

Please login to merge, or discard this patch.

		@@ -17,343 +17,343 @@
		block discarded – undo
17	17	class Sentence
18	18	{
19	19
20		- /**
21		- * Specify this flag with the split method to trim whitespace.
22		- */
23		- const SPLIT_TRIM = 0x1;
24		-
25		- /**
26		- * List of characters used to terminate sentences.
27		- *
28		- * @var string[]
29		- */
30		- private $terminals = ['.', '!', '?'];
31		-
32		- /**
33		- * List of characters used for abbreviations.
34		- *
35		- * @var string[]
36		- */
37		- private $abbreviators = ['.'];
38		-
39		- /**
40		- * Breaks a piece of text into lines by linebreak.
41		- * Eats up any linebreak characters as if one.
42		- *
43		- * Multibyte.php safe
44		- *
45		- * @param string $text
46		- * @return string[]
47		- */
48		- private static function linebreakSplit($text)
49		- {
50		- $lines = [];
51		- $line = '';
52		-
53		- foreach (Multibyte::split('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
54		- $line .= $part;
55		- if (Multibyte::trim($part) === '') {
56		- $lines[] = $line;
57		- $line = '';
58		- }
59		- }
60		- $lines[] = $line;
61		-
62		- return $lines;
63		- }
64		-
65		- /**
66		- * Splits an array of lines by (consecutive sequences of)
67		- * terminals, keeping terminals.
68		- *
69		- * Multibyte.php safe (atleast for UTF-8)
70		- *
71		- * For example:
72		- * "There ... is. More!"
73		- * ... becomes ...
74		- * [ "There ", "...", " is", ".", " More", "!" ]
75		- *
76		- * @param string $line
77		- * @return string[]
78		- */
79		- private function punctuationSplit($line)
80		- {
81		- $parts = [];
82		-
83		- $chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY); // This is UTF8 multibyte safe!
84		- $is_terminal = in_array($chars[0], $this->terminals);
85		-
86		- $part = '';
87		- foreach ($chars as $index => $char) {
88		- if (in_array($char, $this->terminals) !== $is_terminal) {
89		- $parts[] = $part;
90		- $part = '';
91		- $is_terminal = !$is_terminal;
92		- }
93		- $part .= $char;
94		- }
95		-
96		- if (!empty($part)) {
97		- $parts[] = $part;
98		- }
99		-
100		- return $parts;
101		- }
102		-
103		- /**
104		- * Appends each terminal item after it's preceding
105		- * non-terminals.
106		- *
107		- * Multibyte.php safe (atleast for UTF-8)
108		- *
109		- * For example:
110		- * [ "There ", "...", " is", ".", " More", "!" ]
111		- * ... becomes ...
112		- * [ "There ... is.", "More!" ]
113		- *
114		- * @param string[] $punctuations
115		- * @return string[]
116		- */
117		- private function punctuationMerge($punctuations)
118		- {
119		- $definite_terminals = array_diff($this->terminals, $this->abbreviators);
120		-
121		- $merges = [];
122		- $merge = '';
123		-
124		- foreach ($punctuations as $punctuation) {
125		- if ($punctuation !== '') {
126		- $merge .= $punctuation;
127		- if (mb_strlen($punctuation) === 1
128		- && in_array($punctuation, $this->terminals)) {
129		- $merges[] = $merge;
130		- $merge = '';
131		- } else {
132		- foreach ($definite_terminals as $terminal) {
133		- if (mb_strpos($punctuation, $terminal) !== false) {
134		- $merges[] = $merge;
135		- $merge = '';
136		- break;
137		- }
138		- }
139		- }
140		- }
141		- }
142		- if (!empty($merge)) {
143		- $merges[] = $merge;
144		- }
145		-
146		- return $merges;
147		- }
148		-
149		- /**
150		- * Looks for capitalized abbreviations & includes them with the following fragment.
151		- *
152		- * For example:
153		- * [ "Last week, former director of the F.B.I. James B. Comey was fired. Mr. Comey was not available for comment." ]
154		- * ... becomes ...
155		- * [ "Last week, former director of the F.B.I. James B. Comey was fired." ]
156		- * [ "Mr. Comey was not available for comment." ]
157		- *
158		- * @param string[] $fragments
159		- * @return string[]
160		- */
161		- private function abbreviationMerge($fragments)
162		- {
163		- $return_fragment = [];
164		-
165		- $previous_string = '';
166		- $previous_is_abbreviation = false;
167		- $i = 0;
168		-
169		- foreach ($fragments as $fragment) {
170		- $current_string = $fragment;
171		- $words = mb_split('\s+', Multibyte::trim($fragment));
172		-
173		- $word_count = count($words);
174		-
175		- // if last word of fragment starts with a Capital, ends in "." & has less than 3 characters, trigger "is abbreviation"
176		- $last_word = trim($words[$word_count - 1]);
177		- $last_is_capital = preg_match('#^\p{Lu}#u', $last_word);
178		- $last_is_abbreviation = substr(trim($fragment), -1) === '.';
179		- $is_abbreviation = $last_is_capital > 0
180		- && $last_is_abbreviation > 0
181		- && mb_strlen($last_word) <= 3;
182		-
183		- // merge previous fragment with this
184		- if ($previous_is_abbreviation === true) {
185		- $current_string = $previous_string . $current_string;
186		- }
187		- $return_fragment[$i] = $current_string;
188		-
189		- $previous_is_abbreviation = $is_abbreviation;
190		- $previous_string = $current_string;
191		- // only increment if this isn't an abbreviation
192		- if ($is_abbreviation === false) {
193		- $i++;
194		- }
195		- }
196		- return $return_fragment;
197		- }
198		-
199		- /**
200		- * Merges any part starting with a closing parenthesis ')' to the previous
201		- * part.
202		- *
203		- * @param string[] $parts
204		- * @return string[]
205		- */
206		- private function parenthesesMerge($parts)
207		- {
208		- $subsentences = [];
209		-
210		- foreach ($parts as $part) {
211		- if ($part[0] === ')') {
212		- $subsentences[count($subsentences) - 1] .= $part;
213		- } else {
214		- $subsentences[] = $part;
215		- }
216		- }
217		-
218		- return $subsentences;
219		- }
220		-
221		- /**
222		- * Looks for closing quotes to include them with the previous statement.
223		- * "That was very interesting," he said.
224		- * "That was very interesting."
225		- *
226		- * @param string[] $statements
227		- * @return string[]
228		- */
229		- private function closeQuotesMerge($statements)
230		- {
231		- $i = 0;
232		- $previous_statement = "";
233		- $return = [];
234		- foreach ($statements as $statement) {
235		- // detect end quote - if the entire string is a quotation mark, or it's [quote, space, lowercase]
236		- if (trim($statement) === '"'
237		- || trim($statement) === "'"
238		- || (
239		- (substr($statement, 0, 1) === '"'
240		- || substr($statement, 0, 1) === "'")
241		- && substr($statement, 1, 1) === ' '
242		- && ctype_lower(substr($statement, 2, 1)) === true
243		- )
244		- ) {
245		- $statement = $previous_statement . $statement;
246		- } else {
247		- $i++;
248		- }
249		-
250		- $return[$i] = $statement;
251		- $previous_statement = $statement;
252		- }
253		-
254		- return $return;
255		- }
256		-
257		- /**
258		- * Merges items into larger sentences.
259		- * Multibyte.php safe
260		- *
261		- * @param string[] $shorts
262		- * @return string[]
263		- */
264		- private function sentenceMerge($shorts)
265		- {
266		- $non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);
267		-
268		- $sentences = [];
269		-
270		- $sentence = '';
271		- $has_words = false;
272		- $previous_word_ending = null;
273		- foreach ($shorts as $short) {
274		- $word_count = count(mb_split('\s+', Multibyte::trim($short)));
275		- $after_non_abbreviating_terminal = in_array($previous_word_ending, $non_abbreviating_terminals);
276		-
277		- if ($after_non_abbreviating_terminal
278		- || ($has_words && $word_count > 1)) {
279		- $sentences[] = $sentence;
280		- $sentence = '';
281		- $has_words = $word_count > 1;
282		- } else {
283		- $has_words = ($has_words
284		- || $word_count > 1);
285		- }
286		-
287		- $sentence .= $short;
288		- $previous_word_ending = mb_substr($short, -1);
289		- }
290		- if (!empty($sentence)) {
291		- $sentences[] = $sentence;
292		- }
293		-
294		- return $sentences;
295		- }
296		-
297		- /**
298		- * Return the sentences sentences detected in the provided text.
299		- * Set the Sentence::SPLIT_TRIM flag to trim whitespace.
300		- * @param string $text
301		- * @param integer $flags
302		- * @return string[]
303		- */
304		- public function split($text, $flags = 0)
305		- {
306		- static $pipeline = [
307		- 'punctuationSplit',
308		- 'parenthesesMerge', // also works after punctuationMerge or abbreviationMerge
309		- 'punctuationMerge',
310		- 'abbreviationMerge',
311		- 'closeQuotesMerge',
312		- 'sentenceMerge',
313		- ];
314		-
315		- // clean funny quotes
316		- $text = Multibyte::cleanUnicode($text);
317		-
318		- // Split
319		- $sentences = [];
320		- foreach (self::linebreakSplit($text) as $input) {
321		- if (Multibyte::trim($input) !== '') {
322		- foreach ($pipeline as $method) {
323		- $input = $this->$method($input);
324		- }
325		- $sentences = array_merge($sentences, $input);
326		- }
327		- }
328		-
329		- // Post process
330		- if ($flags & self::SPLIT_TRIM) {
331		- return self::trimSentences($sentences);
332		- }
333		-
334		- return $sentences;
335		- }
336		-
337		- /**
338		- * Multibyte.php trim each string in an array.
339		- * @param string[] $sentences
340		- * @return string[]
341		- */
342		- private static function trimSentences($sentences)
343		- {
344		- return array_map(function($sentence) {
345		- return Multibyte::trim($sentence);
346		- }, $sentences);
347		- }
348		-
349		- /**
350		- * Return the number of sentences detected in the provided text.
351		- * @param string $text
352		- * @return integer
353		- */
354		- public function count($text)
355		- {
356		- return count($this->split($text));
357		- }
	20	+ /**
	21	+ * Specify this flag with the split method to trim whitespace.
	22	+ */
	23	+ const SPLIT_TRIM = 0x1;
	24	+
	25	+ /**
	26	+ * List of characters used to terminate sentences.
	27	+ *
	28	+ * @var string[]
	29	+ */
	30	+ private $terminals = ['.', '!', '?'];
	31	+
	32	+ /**
	33	+ * List of characters used for abbreviations.
	34	+ *
	35	+ * @var string[]
	36	+ */
	37	+ private $abbreviators = ['.'];
	38	+
	39	+ /**
	40	+ * Breaks a piece of text into lines by linebreak.
	41	+ * Eats up any linebreak characters as if one.
	42	+ *
	43	+ * Multibyte.php safe
	44	+ *
	45	+ * @param string $text
	46	+ * @return string[]
	47	+ */
	48	+ private static function linebreakSplit($text)
	49	+ {
	50	+ $lines = [];
	51	+ $line = '';
	52	+
	53	+ foreach (Multibyte::split('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
	54	+ $line .= $part;
	55	+ if (Multibyte::trim($part) === '') {
	56	+ $lines[] = $line;
	57	+ $line = '';
	58	+ }
	59	+ }
	60	+ $lines[] = $line;
	61	+
	62	+ return $lines;
	63	+ }
	64	+
	65	+ /**
	66	+ * Splits an array of lines by (consecutive sequences of)
	67	+ * terminals, keeping terminals.
	68	+ *
	69	+ * Multibyte.php safe (atleast for UTF-8)
	70	+ *
	71	+ * For example:
	72	+ * "There ... is. More!"
	73	+ * ... becomes ...
	74	+ * [ "There ", "...", " is", ".", " More", "!" ]
	75	+ *
	76	+ * @param string $line
	77	+ * @return string[]
	78	+ */
	79	+ private function punctuationSplit($line)
	80	+ {
	81	+ $parts = [];
	82	+
	83	+ $chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY); // This is UTF8 multibyte safe!
	84	+ $is_terminal = in_array($chars[0], $this->terminals);
	85	+
	86	+ $part = '';
	87	+ foreach ($chars as $index => $char) {
	88	+ if (in_array($char, $this->terminals) !== $is_terminal) {
	89	+ $parts[] = $part;
	90	+ $part = '';
	91	+ $is_terminal = !$is_terminal;
	92	+ }
	93	+ $part .= $char;
	94	+ }
	95	+
	96	+ if (!empty($part)) {
	97	+ $parts[] = $part;
	98	+ }
	99	+
	100	+ return $parts;
	101	+ }
	102	+
	103	+ /**
	104	+ * Appends each terminal item after it's preceding
	105	+ * non-terminals.
	106	+ *
	107	+ * Multibyte.php safe (atleast for UTF-8)
	108	+ *
	109	+ * For example:
	110	+ * [ "There ", "...", " is", ".", " More", "!" ]
	111	+ * ... becomes ...
	112	+ * [ "There ... is.", "More!" ]
	113	+ *
	114	+ * @param string[] $punctuations
	115	+ * @return string[]
	116	+ */
	117	+ private function punctuationMerge($punctuations)
	118	+ {
	119	+ $definite_terminals = array_diff($this->terminals, $this->abbreviators);
	120	+
	121	+ $merges = [];
	122	+ $merge = '';
	123	+
	124	+ foreach ($punctuations as $punctuation) {
	125	+ if ($punctuation !== '') {
	126	+ $merge .= $punctuation;
	127	+ if (mb_strlen($punctuation) === 1
	128	+ && in_array($punctuation, $this->terminals)) {
	129	+ $merges[] = $merge;
	130	+ $merge = '';
	131	+ } else {
	132	+ foreach ($definite_terminals as $terminal) {
	133	+ if (mb_strpos($punctuation, $terminal) !== false) {
	134	+ $merges[] = $merge;
	135	+ $merge = '';
	136	+ break;
	137	+ }
	138	+ }
	139	+ }
	140	+ }
	141	+ }
	142	+ if (!empty($merge)) {
	143	+ $merges[] = $merge;
	144	+ }
	145	+
	146	+ return $merges;
	147	+ }
	148	+
	149	+ /**
	150	+ * Looks for capitalized abbreviations & includes them with the following fragment.
	151	+ *
	152	+ * For example:
	153	+ * [ "Last week, former director of the F.B.I. James B. Comey was fired. Mr. Comey was not available for comment." ]
	154	+ * ... becomes ...
	155	+ * [ "Last week, former director of the F.B.I. James B. Comey was fired." ]
	156	+ * [ "Mr. Comey was not available for comment." ]
	157	+ *
	158	+ * @param string[] $fragments
	159	+ * @return string[]
	160	+ */
	161	+ private function abbreviationMerge($fragments)
	162	+ {
	163	+ $return_fragment = [];
	164	+
	165	+ $previous_string = '';
	166	+ $previous_is_abbreviation = false;
	167	+ $i = 0;
	168	+
	169	+ foreach ($fragments as $fragment) {
	170	+ $current_string = $fragment;
	171	+ $words = mb_split('\s+', Multibyte::trim($fragment));
	172	+
	173	+ $word_count = count($words);
	174	+
	175	+ // if last word of fragment starts with a Capital, ends in "." & has less than 3 characters, trigger "is abbreviation"
	176	+ $last_word = trim($words[$word_count - 1]);
	177	+ $last_is_capital = preg_match('#^\p{Lu}#u', $last_word);
	178	+ $last_is_abbreviation = substr(trim($fragment), -1) === '.';
	179	+ $is_abbreviation = $last_is_capital > 0
	180	+ && $last_is_abbreviation > 0
	181	+ && mb_strlen($last_word) <= 3;
	182	+
	183	+ // merge previous fragment with this
	184	+ if ($previous_is_abbreviation === true) {
	185	+ $current_string = $previous_string . $current_string;
	186	+ }
	187	+ $return_fragment[$i] = $current_string;
	188	+
	189	+ $previous_is_abbreviation = $is_abbreviation;
	190	+ $previous_string = $current_string;
	191	+ // only increment if this isn't an abbreviation
	192	+ if ($is_abbreviation === false) {
	193	+ $i++;
	194	+ }
	195	+ }
	196	+ return $return_fragment;
	197	+ }
	198	+
	199	+ /**
	200	+ * Merges any part starting with a closing parenthesis ')' to the previous
	201	+ * part.
	202	+ *
	203	+ * @param string[] $parts
	204	+ * @return string[]
	205	+ */
	206	+ private function parenthesesMerge($parts)
	207	+ {
	208	+ $subsentences = [];
	209	+
	210	+ foreach ($parts as $part) {
	211	+ if ($part[0] === ')') {
	212	+ $subsentences[count($subsentences) - 1] .= $part;
	213	+ } else {
	214	+ $subsentences[] = $part;
	215	+ }
	216	+ }
	217	+
	218	+ return $subsentences;
	219	+ }
	220	+
	221	+ /**
	222	+ * Looks for closing quotes to include them with the previous statement.
	223	+ * "That was very interesting," he said.
	224	+ * "That was very interesting."
	225	+ *
	226	+ * @param string[] $statements
	227	+ * @return string[]
	228	+ */
	229	+ private function closeQuotesMerge($statements)
	230	+ {
	231	+ $i = 0;
	232	+ $previous_statement = "";
	233	+ $return = [];
	234	+ foreach ($statements as $statement) {
	235	+ // detect end quote - if the entire string is a quotation mark, or it's [quote, space, lowercase]
	236	+ if (trim($statement) === '"'
	237	+ || trim($statement) === "'"
	238	+ || (
	239	+ (substr($statement, 0, 1) === '"'
	240	+ || substr($statement, 0, 1) === "'")
	241	+ && substr($statement, 1, 1) === ' '
	242	+ && ctype_lower(substr($statement, 2, 1)) === true
	243	+ )
	244	+ ) {
	245	+ $statement = $previous_statement . $statement;
	246	+ } else {
	247	+ $i++;
	248	+ }
	249	+
	250	+ $return[$i] = $statement;
	251	+ $previous_statement = $statement;
	252	+ }
	253	+
	254	+ return $return;
	255	+ }
	256	+
	257	+ /**
	258	+ * Merges items into larger sentences.
	259	+ * Multibyte.php safe
	260	+ *
	261	+ * @param string[] $shorts
	262	+ * @return string[]
	263	+ */
	264	+ private function sentenceMerge($shorts)
	265	+ {
	266	+ $non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);
	267	+
	268	+ $sentences = [];
	269	+
	270	+ $sentence = '';
	271	+ $has_words = false;
	272	+ $previous_word_ending = null;
	273	+ foreach ($shorts as $short) {
	274	+ $word_count = count(mb_split('\s+', Multibyte::trim($short)));
	275	+ $after_non_abbreviating_terminal = in_array($previous_word_ending, $non_abbreviating_terminals);
	276	+
	277	+ if ($after_non_abbreviating_terminal
	278	+ || ($has_words && $word_count > 1)) {
	279	+ $sentences[] = $sentence;
	280	+ $sentence = '';
	281	+ $has_words = $word_count > 1;
	282	+ } else {
	283	+ $has_words = ($has_words
	284	+ || $word_count > 1);
	285	+ }
	286	+
	287	+ $sentence .= $short;
	288	+ $previous_word_ending = mb_substr($short, -1);
	289	+ }
	290	+ if (!empty($sentence)) {
	291	+ $sentences[] = $sentence;
	292	+ }
	293	+
	294	+ return $sentences;
	295	+ }
	296	+
	297	+ /**
	298	+ * Return the sentences sentences detected in the provided text.
	299	+ * Set the Sentence::SPLIT_TRIM flag to trim whitespace.
	300	+ * @param string $text
	301	+ * @param integer $flags
	302	+ * @return string[]
	303	+ */
	304	+ public function split($text, $flags = 0)
	305	+ {
	306	+ static $pipeline = [
	307	+ 'punctuationSplit',
	308	+ 'parenthesesMerge', // also works after punctuationMerge or abbreviationMerge
	309	+ 'punctuationMerge',
	310	+ 'abbreviationMerge',
	311	+ 'closeQuotesMerge',
	312	+ 'sentenceMerge',
	313	+ ];
	314	+
	315	+ // clean funny quotes
	316	+ $text = Multibyte::cleanUnicode($text);
	317	+
	318	+ // Split
	319	+ $sentences = [];
	320	+ foreach (self::linebreakSplit($text) as $input) {
	321	+ if (Multibyte::trim($input) !== '') {
	322	+ foreach ($pipeline as $method) {
	323	+ $input = $this->$method($input);
	324	+ }
	325	+ $sentences = array_merge($sentences, $input);
	326	+ }
	327	+ }
	328	+
	329	+ // Post process
	330	+ if ($flags & self::SPLIT_TRIM) {
	331	+ return self::trimSentences($sentences);
	332	+ }
	333	+
	334	+ return $sentences;
	335	+ }
	336	+
	337	+ /**
	338	+ * Multibyte.php trim each string in an array.
	339	+ * @param string[] $sentences
	340	+ * @return string[]
	341	+ */
	342	+ private static function trimSentences($sentences)
	343	+ {
	344	+ return array_map(function($sentence) {
	345	+ return Multibyte::trim($sentence);
	346	+ }, $sentences);
	347	+ }
	348	+
	349	+ /**
	350	+ * Return the number of sentences detected in the provided text.
	351	+ * @param string $text
	352	+ * @return integer
	353	+ */
	354	+ public function count($text)
	355	+ {
	356	+ return count($this->split($text));
	357	+ }
358	358
359	359	}

		@@ -7,146 +7,146 @@
		block discarded – undo
7	7	*/
8	8	class Multibyte
9	9	{
10		- //https://stackoverflow.com/questions/20025030/convert-all-types-of-smart-quotes-with-php
11		- private static $unicodeCharacterMap = [
12		- // Windows codepage 1252
13		- "\xC2\x82" => "'", // U+0082⇒U+201A single low-9 quotation mark
14		- "\xC2\x84" => '"', // U+0084⇒U+201E double low-9 quotation mark
15		- "\xC2\x8B" => "'", // U+008B⇒U+2039 single left-pointing angle quotation mark
16		- "\xC2\x91" => "'", // U+0091⇒U+2018 left single quotation mark
17		- "\xC2\x92" => "'", // U+0092⇒U+2019 right single quotation mark
18		- "\xC2\x93" => '"', // U+0093⇒U+201C left double quotation mark
19		- "\xC2\x94" => '"', // U+0094⇒U+201D right double quotation mark
20		- "\xC2\x9B" => "'", // U+009B⇒U+203A single right-pointing angle quotation mark
21		- // Regular Unicode // U+0022 quotation mark (")
22		- // U+0027 apostrophe (')
23		- "\xC2\xAB" => '"', // U+00AB left-pointing double angle quotation mark
24		- "\xC2\xBB" => '"', // U+00BB right-pointing double angle quotation mark
25		- "\xE2\x80\x98" => "'", // U+2018 left single quotation mark
26		- "\xE2\x80\x99" => "'", // U+2019 right single quotation mark
27		- "\xE2\x80\x9A" => "'", // U+201A single low-9 quotation mark
28		- "\xE2\x80\x9B" => "'", // U+201B single high-reversed-9 quotation mark
29		- "\xE2\x80\x9C" => '"', // U+201C left double quotation mark
30		- "\xE2\x80\x9D" => '"', // U+201D right double quotation mark
31		- "\xE2\x80\x9E" => '"', // U+201E double low-9 quotation mark
32		- "\xE2\x80\x9F" => '"', // U+201F double high-reversed-9 quotation mark
33		- "\xE2\x80\xB9" => "'", // U+2039 single left-pointing angle quotation mark
34		- "\xE2\x80\xBA" => "'", // U+203A single right-pointing angle quotation mark
35		- ];
36		-
37		- /**
38		- * Replace
39		- *
40		- * @staticvar array $chr_map
41		- * @param string $string
42		- * @return string
43		- */
44		- public static function cleanUnicode($string)
45		- {
46		- $character = array_keys(self::$unicodeCharacterMap); // but: for efficiency you should
47		- $replace = array_values(self::$unicodeCharacterMap); // pre-calculate these two arrays
48		- return str_replace($character, $replace, html_entity_decode($string, ENT_QUOTES, "UTF-8"));
49		- }
50		-
51		- /**
52		- * Multibyte.php safe version of standard trim() function.
53		- *
54		- * @param string $string
55		- * @return string
56		- */
57		- public static function trim($string)
58		- {
59		- return mb_ereg_replace('^\s([\s\S]?)\s*$', '\1', $string);
60		- }
61		-
62		- /**
63		- * A cross between mb_split and preg_split, adding the preg_split flags
64		- * to mb_split.
65		- *
66		- * @param string $pattern
67		- * @param string $string
68		- * @param int $limit
69		- * @param int $flags
70		- * @return array
71		- */
72		- public static function split($pattern, $string, $limit = -1, $flags = 0)
73		- {
74		- $split_no_empty = (bool)($flags & PREG_SPLIT_NO_EMPTY);
75		- $offset_capture = (bool)($flags & PREG_SPLIT_OFFSET_CAPTURE);
76		- $delim_capture = (bool)($flags & PREG_SPLIT_DELIM_CAPTURE);
77		-
78		- $lengths = self::getSplitLengths($pattern, $string);
79		-
80		- // Substrings
81		- $parts = [];
82		- $position = 0;
83		- $count = 1;
84		- foreach ($lengths as $length) {
85		- $split_empty = !$split_no_empty || $length[0];
86		- $is_delimiter = $length[1];
87		- $is_captured = $delim_capture && $length[2];
88		-
89		- if ($limit > 0
90		- && !$is_delimiter
91		- && $split_empty
92		- && ++$count > $limit) {
93		-
94		- $cut = mb_strcut($string, $position);
95		-
96		- $parts[] = $offset_capture
97		- ? [$cut, $position]
98		- : $cut;
99		-
100		- break;
101		- } elseif ((!$is_delimiter
102		- || $is_captured)
103		- && $split_empty) {
104		-
105		- $cut = mb_strcut($string, $position, $length[0]);
106		-
107		- $parts[] = $offset_capture
108		- ? [$cut, $position]
109		- : $cut;
110		- }
111		-
112		- $position += $length[0];
113		- }
114		-
115		- return $parts;
116		- }
117		-
118		- /**
119		- * Splits the string by pattern and for each element (part or split) returns:
120		- * [ 0 => length, 1 => is_delimiter?, 2 =>
121		- *
122		- * @param $pattern
123		- * @param $string
124		- * @return array
125		- */
126		- private static function getSplitLengths($pattern, $string)
127		- {
128		- $strlen = strlen($string); // bytes!
129		- $lengths = [];
130		-
131		- mb_ereg_search_init($string);
132		-
133		- $position = 0;
134		- while ($position < $strlen
135		- && ($array = mb_ereg_search_pos($pattern, '')) !== false) {
136		- // capture split
137		- $lengths[] = [$array[0] - $position, false, null];
138		-
139		- // move position
140		- $position = $array[0] + $array[1];
141		-
142		- // capture delimiter
143		- $regs = mb_ereg_search_getregs();
144		- $lengths[] = [$array[1], true, isset($regs[1]) && $regs[1]];
145		- }
146		-
147		- // Add last bit, if not ending with split
148		- $lengths[] = [$strlen - $position, false, null];
149		-
150		- return $lengths;
151		- }
	10	+ //https://stackoverflow.com/questions/20025030/convert-all-types-of-smart-quotes-with-php
	11	+ private static $unicodeCharacterMap = [
	12	+ // Windows codepage 1252
	13	+ "\xC2\x82" => "'", // U+0082⇒U+201A single low-9 quotation mark
	14	+ "\xC2\x84" => '"', // U+0084⇒U+201E double low-9 quotation mark
	15	+ "\xC2\x8B" => "'", // U+008B⇒U+2039 single left-pointing angle quotation mark
	16	+ "\xC2\x91" => "'", // U+0091⇒U+2018 left single quotation mark
	17	+ "\xC2\x92" => "'", // U+0092⇒U+2019 right single quotation mark
	18	+ "\xC2\x93" => '"', // U+0093⇒U+201C left double quotation mark
	19	+ "\xC2\x94" => '"', // U+0094⇒U+201D right double quotation mark
	20	+ "\xC2\x9B" => "'", // U+009B⇒U+203A single right-pointing angle quotation mark
	21	+ // Regular Unicode // U+0022 quotation mark (")
	22	+ // U+0027 apostrophe (')
	23	+ "\xC2\xAB" => '"', // U+00AB left-pointing double angle quotation mark
	24	+ "\xC2\xBB" => '"', // U+00BB right-pointing double angle quotation mark
	25	+ "\xE2\x80\x98" => "'", // U+2018 left single quotation mark
	26	+ "\xE2\x80\x99" => "'", // U+2019 right single quotation mark
	27	+ "\xE2\x80\x9A" => "'", // U+201A single low-9 quotation mark
	28	+ "\xE2\x80\x9B" => "'", // U+201B single high-reversed-9 quotation mark
	29	+ "\xE2\x80\x9C" => '"', // U+201C left double quotation mark
	30	+ "\xE2\x80\x9D" => '"', // U+201D right double quotation mark
	31	+ "\xE2\x80\x9E" => '"', // U+201E double low-9 quotation mark
	32	+ "\xE2\x80\x9F" => '"', // U+201F double high-reversed-9 quotation mark
	33	+ "\xE2\x80\xB9" => "'", // U+2039 single left-pointing angle quotation mark
	34	+ "\xE2\x80\xBA" => "'", // U+203A single right-pointing angle quotation mark
	35	+ ];
	36	+
	37	+ /**
	38	+ * Replace
	39	+ *
	40	+ * @staticvar array $chr_map
	41	+ * @param string $string
	42	+ * @return string
	43	+ */
	44	+ public static function cleanUnicode($string)
	45	+ {
	46	+ $character = array_keys(self::$unicodeCharacterMap); // but: for efficiency you should
	47	+ $replace = array_values(self::$unicodeCharacterMap); // pre-calculate these two arrays
	48	+ return str_replace($character, $replace, html_entity_decode($string, ENT_QUOTES, "UTF-8"));
	49	+ }
	50	+
	51	+ /**
	52	+ * Multibyte.php safe version of standard trim() function.
	53	+ *
	54	+ * @param string $string
	55	+ * @return string
	56	+ */
	57	+ public static function trim($string)
	58	+ {
	59	+ return mb_ereg_replace('^\s([\s\S]?)\s*$', '\1', $string);
	60	+ }
	61	+
	62	+ /**
	63	+ * A cross between mb_split and preg_split, adding the preg_split flags
	64	+ * to mb_split.
	65	+ *
	66	+ * @param string $pattern
	67	+ * @param string $string
	68	+ * @param int $limit
	69	+ * @param int $flags
	70	+ * @return array
	71	+ */
	72	+ public static function split($pattern, $string, $limit = -1, $flags = 0)
	73	+ {
	74	+ $split_no_empty = (bool)($flags & PREG_SPLIT_NO_EMPTY);
	75	+ $offset_capture = (bool)($flags & PREG_SPLIT_OFFSET_CAPTURE);
	76	+ $delim_capture = (bool)($flags & PREG_SPLIT_DELIM_CAPTURE);
	77	+
	78	+ $lengths = self::getSplitLengths($pattern, $string);
	79	+
	80	+ // Substrings
	81	+ $parts = [];
	82	+ $position = 0;
	83	+ $count = 1;
	84	+ foreach ($lengths as $length) {
	85	+ $split_empty = !$split_no_empty || $length[0];
	86	+ $is_delimiter = $length[1];
	87	+ $is_captured = $delim_capture && $length[2];
	88	+
	89	+ if ($limit > 0
	90	+ && !$is_delimiter
	91	+ && $split_empty
	92	+ && ++$count > $limit) {
	93	+
	94	+ $cut = mb_strcut($string, $position);
	95	+
	96	+ $parts[] = $offset_capture
	97	+ ? [$cut, $position]
	98	+ : $cut;
	99	+
	100	+ break;
	101	+ } elseif ((!$is_delimiter
	102	+ || $is_captured)
	103	+ && $split_empty) {
	104	+
	105	+ $cut = mb_strcut($string, $position, $length[0]);
	106	+
	107	+ $parts[] = $offset_capture
	108	+ ? [$cut, $position]
	109	+ : $cut;
	110	+ }
	111	+
	112	+ $position += $length[0];
	113	+ }
	114	+
	115	+ return $parts;
	116	+ }
	117	+
	118	+ /**
	119	+ * Splits the string by pattern and for each element (part or split) returns:
	120	+ * [ 0 => length, 1 => is_delimiter?, 2 =>
	121	+ *
	122	+ * @param $pattern
	123	+ * @param $string
	124	+ * @return array
	125	+ */
	126	+ private static function getSplitLengths($pattern, $string)
	127	+ {
	128	+ $strlen = strlen($string); // bytes!
	129	+ $lengths = [];
	130	+
	131	+ mb_ereg_search_init($string);
	132	+
	133	+ $position = 0;
	134	+ while ($position < $strlen
	135	+ && ($array = mb_ereg_search_pos($pattern, '')) !== false) {
	136	+ // capture split
	137	+ $lengths[] = [$array[0] - $position, false, null];
	138	+
	139	+ // move position
	140	+ $position = $array[0] + $array[1];
	141	+
	142	+ // capture delimiter
	143	+ $regs = mb_ereg_search_getregs();
	144	+ $lengths[] = [$array[1], true, isset($regs[1]) && $regs[1]];
	145	+ }
	146	+
	147	+ // Add last bit, if not ending with split
	148	+ $lengths[] = [$strlen - $position, false, null];
	149	+
	150	+ return $lengths;
	151	+ }
152	152	}
153	153	\ No newline at end of file

vanderlee / php-sentence

Push — master ( e85d9b...d7563a )

Status

Category

Indentation +338 added lines, -338 removed lines patch added patch discarded remove patch

Indentation +142 added lines, -142 removed lines patch added patch discarded remove patch