Inspection of "Support custom pipeline" - vanderlee/php-sentence - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — master (#19)

unknown

created 2022-06-09 18:11 UTC

Status

Indentation +173 added lines, -173 removed lines patch added patch discarded remove patch

@@ -7,177 +7,177 @@
 block discarded – undo
  */
 class Multibyte
 {
-    //https://stackoverflow.com/questions/20025030/convert-all-types-of-smart-quotes-with-php
-    private static $unicodeCharacterMap = [
-        // Windows codepage 1252
-        "\xC2\x82" => "'", // U+0082⇒U+201A single low-9 quotation mark
-        "\xC2\x84" => '"', // U+0084⇒U+201E double low-9 quotation mark
-        "\xC2\x8B" => "'", // U+008B⇒U+2039 single left-pointing angle quotation mark
-        "\xC2\x91" => "'", // U+0091⇒U+2018 left single quotation mark
-        "\xC2\x92" => "'", // U+0092⇒U+2019 right single quotation mark
-        "\xC2\x93" => '"', // U+0093⇒U+201C left double quotation mark
-        "\xC2\x94" => '"', // U+0094⇒U+201D right double quotation mark
-        "\xC2\x9B" => "'", // U+009B⇒U+203A single right-pointing angle quotation mark
-        // Regular Unicode     // U+0022 quotation mark (")
-        // U+0027 apostrophe     (')
-        "\xC2\xAB" => '"', // U+00AB left-pointing double angle quotation mark
-        "\xC2\xBB" => '"', // U+00BB right-pointing double angle quotation mark
-        "\xE2\x80\x98" => "'", // U+2018 left single quotation mark
-        "\xE2\x80\x99" => "'", // U+2019 right single quotation mark
-        "\xE2\x80\x9A" => "'", // U+201A single low-9 quotation mark
-        "\xE2\x80\x9B" => "'", // U+201B single high-reversed-9 quotation mark
-        "\xE2\x80\x9C" => '"', // U+201C left double quotation mark
-        "\xE2\x80\x9D" => '"', // U+201D right double quotation mark
-        "\xE2\x80\x9E" => '"', // U+201E double low-9 quotation mark
-        "\xE2\x80\x9F" => '"', // U+201F double high-reversed-9 quotation mark
-        "\xE2\x80\xB9" => "'", // U+2039 single left-pointing angle quotation mark
-        "\xE2\x80\xBA" => "'", // U+203A single right-pointing angle quotation mark
-    ];
-
-    /**
-     * Replace
-     *
-     * @staticvar array $chr_map
-     * @param string $string
-     * @return string
-     */
-    public static function cleanUnicode($string)
-    {
-        $character = array_keys(self::$unicodeCharacterMap); // but: for efficiency you should
-        $replace = array_values(self::$unicodeCharacterMap); // pre-calculate these two arrays
-        return str_replace($character, $replace, html_entity_decode($string, ENT_QUOTES, "UTF-8"));
-    }
-
-    /**
-     * Multibyte-safe version of the standard trim() function.
-     *
-     * @param string $string
-     * @return string
-     */
-    public static function trim($string)
-    {
-        return mb_ereg_replace('^\s*([\s\S]*?)\s*$', '\1', $string);
-    }
-
-    /**
-     * A cross between mb_split and preg_split, adding the preg_split flags
-     * to mb_split.
-     *
-     * @param string $pattern
-     * @param string $string
-     * @param int $limit
-     * @param int $flags
-     * @return array
-     */
-    public static function split($pattern, $string, $limit = -1, $flags = 0)
-    {
-        $offset_capture = (bool)($flags & PREG_SPLIT_OFFSET_CAPTURE);
-
-        $lengths = self::getSplitLengths($pattern, $string);
-
-        // Substrings
-        $parts = [];
-        $position = 0;
-        $count = 1;
-        foreach ($lengths as $length) {
-            if (self::isLastPart($length, $flags, $limit, $count)) {
-                $parts[] = self::makePart($string, $position, null, $offset_capture);
-                return $parts;
-            }
-
-            if (self::isPart($length, $flags)) {
-                $parts[] = self::makePart($string, $position, $length[0], $offset_capture);
-            }
-
-            $position += $length[0];
-        }
-
-        return $parts;
-    }
-
-    /**
-     * @param $length
-     * @param $flags
-     * @param $limit
-     * @param $count
-     * @return bool
-     */
-    private static function isLastPart($length, $flags, $limit, &$count)
-    {
-        $split_empty = !($flags & PREG_SPLIT_NO_EMPTY) || $length[0];
-        $is_delimiter = $length[1];
-
-        return $limit > 0
-            && !$is_delimiter
-            && $split_empty
-            && ++$count > $limit;
-    }
-
-    /**
-     * @param $length
-     * @param $flags
-     * @return bool
-     */
-    private static function isPart($length, $flags)
-    {
-        $split_empty = !($flags & PREG_SPLIT_NO_EMPTY) || $length[0];
-        $is_delimiter = $length[1];
-        $is_captured = ($flags & PREG_SPLIT_DELIM_CAPTURE) && $length[2];
-
-        return (!$is_delimiter
-                || $is_captured)
-            && $split_empty;
-    }
-
-    /**
-     * Make part
-     * @param string $string
-     * @param integer $position
-     * @param integer|null $length
-     * @param bool $offset_capture
-     * @return array|string
-     */
-    private static function makePart($string, $position, $length = null, $offset_capture = false)
-    {
-        $cut = mb_strcut($string, $position, $length);
-
-        return $offset_capture
-            ? [$cut, $position]
-            : $cut;
-    }
-
-    /**
-     * Splits the string by pattern and for each element (part or split) returns:
-     *  [ 0 => length, 1 => is_delimiter?, 2 => is_captured? ]
-     *
-     * @param $pattern
-     * @param $string
-     * @return array
-     */
-    private static function getSplitLengths($pattern, $string)
-    {
-        $strlen = strlen($string); // bytes!
-        $lengths = [];
-
-        mb_ereg_search_init($string);
-
-        $position = 0;
-        while ($position < $strlen
-            && ($array = mb_ereg_search_pos($pattern, '')) !== false) {
-            // capture split
-            $lengths[] = [$array[0] - $position, false, null];
-
-            // move position
-            $position = $array[0] + $array[1];
-
-            // capture delimiter
-            $regs = mb_ereg_search_getregs();
-            $lengths[] = [$array[1], true, isset($regs[1]) && $regs[1]];
-        }
-
-        // Add last bit, if not ending with split
-        $lengths[] = [$strlen - $position, false, null];
-
-        return $lengths;
-    }
+	//https://stackoverflow.com/questions/20025030/convert-all-types-of-smart-quotes-with-php
+	private static $unicodeCharacterMap = [
+		// Windows codepage 1252
+		"\xC2\x82" => "'", // U+0082⇒U+201A single low-9 quotation mark
+		"\xC2\x84" => '"', // U+0084⇒U+201E double low-9 quotation mark
+		"\xC2\x8B" => "'", // U+008B⇒U+2039 single left-pointing angle quotation mark
+		"\xC2\x91" => "'", // U+0091⇒U+2018 left single quotation mark
+		"\xC2\x92" => "'", // U+0092⇒U+2019 right single quotation mark
+		"\xC2\x93" => '"', // U+0093⇒U+201C left double quotation mark
+		"\xC2\x94" => '"', // U+0094⇒U+201D right double quotation mark
+		"\xC2\x9B" => "'", // U+009B⇒U+203A single right-pointing angle quotation mark
+		// Regular Unicode     // U+0022 quotation mark (")
+		// U+0027 apostrophe     (')
+		"\xC2\xAB" => '"', // U+00AB left-pointing double angle quotation mark
+		"\xC2\xBB" => '"', // U+00BB right-pointing double angle quotation mark
+		"\xE2\x80\x98" => "'", // U+2018 left single quotation mark
+		"\xE2\x80\x99" => "'", // U+2019 right single quotation mark
+		"\xE2\x80\x9A" => "'", // U+201A single low-9 quotation mark
+		"\xE2\x80\x9B" => "'", // U+201B single high-reversed-9 quotation mark
+		"\xE2\x80\x9C" => '"', // U+201C left double quotation mark
+		"\xE2\x80\x9D" => '"', // U+201D right double quotation mark
+		"\xE2\x80\x9E" => '"', // U+201E double low-9 quotation mark
+		"\xE2\x80\x9F" => '"', // U+201F double high-reversed-9 quotation mark
+		"\xE2\x80\xB9" => "'", // U+2039 single left-pointing angle quotation mark
+		"\xE2\x80\xBA" => "'", // U+203A single right-pointing angle quotation mark
+	];
+
+	/**
+	 * Replace
+	 *
+	 * @staticvar array $chr_map
+	 * @param string $string
+	 * @return string
+	 */
+	public static function cleanUnicode($string)
+	{
+		$character = array_keys(self::$unicodeCharacterMap); // but: for efficiency you should
+		$replace = array_values(self::$unicodeCharacterMap); // pre-calculate these two arrays
+		return str_replace($character, $replace, html_entity_decode($string, ENT_QUOTES, "UTF-8"));
+	}
+
+	/**
+	 * Multibyte-safe version of the standard trim() function.
+	 *
+	 * @param string $string
+	 * @return string
+	 */
+	public static function trim($string)
+	{
+		return mb_ereg_replace('^\s*([\s\S]*?)\s*$', '\1', $string);
+	}
+
+	/**
+	 * A cross between mb_split and preg_split, adding the preg_split flags
+	 * to mb_split.
+	 *
+	 * @param string $pattern
+	 * @param string $string
+	 * @param int $limit
+	 * @param int $flags
+	 * @return array
+	 */
+	public static function split($pattern, $string, $limit = -1, $flags = 0)
+	{
+		$offset_capture = (bool)($flags & PREG_SPLIT_OFFSET_CAPTURE);
+
+		$lengths = self::getSplitLengths($pattern, $string);
+
+		// Substrings
+		$parts = [];
+		$position = 0;
+		$count = 1;
+		foreach ($lengths as $length) {
+			if (self::isLastPart($length, $flags, $limit, $count)) {
+				$parts[] = self::makePart($string, $position, null, $offset_capture);
+				return $parts;
+			}
+
+			if (self::isPart($length, $flags)) {
+				$parts[] = self::makePart($string, $position, $length[0], $offset_capture);
+			}
+
+			$position += $length[0];
+		}
+
+		return $parts;
+	}
+
+	/**
+	 * @param $length
+	 * @param $flags
+	 * @param $limit
+	 * @param $count
+	 * @return bool
+	 */
+	private static function isLastPart($length, $flags, $limit, &$count)
+	{
+		$split_empty = !($flags & PREG_SPLIT_NO_EMPTY) || $length[0];
+		$is_delimiter = $length[1];
+
+		return $limit > 0
+			&& !$is_delimiter
+			&& $split_empty
+			&& ++$count > $limit;
+	}
+
+	/**
+	 * @param $length
+	 * @param $flags
+	 * @return bool
+	 */
+	private static function isPart($length, $flags)
+	{
+		$split_empty = !($flags & PREG_SPLIT_NO_EMPTY) || $length[0];
+		$is_delimiter = $length[1];
+		$is_captured = ($flags & PREG_SPLIT_DELIM_CAPTURE) && $length[2];
+
+		return (!$is_delimiter
+				|| $is_captured)
+			&& $split_empty;
+	}
+
+	/**
+	 * Make part
+	 * @param string $string
+	 * @param integer $position
+	 * @param integer|null $length
+	 * @param bool $offset_capture
+	 * @return array|string
+	 */
+	private static function makePart($string, $position, $length = null, $offset_capture = false)
+	{
+		$cut = mb_strcut($string, $position, $length);
+
+		return $offset_capture
+			? [$cut, $position]
+			: $cut;
+	}
+
+	/**
+	 * Splits the string by pattern and for each element (part or split) returns:
+	 *  [ 0 => length, 1 => is_delimiter?, 2 => is_captured? ]
+	 *
+	 * @param $pattern
+	 * @param $string
+	 * @return array
+	 */
+	private static function getSplitLengths($pattern, $string)
+	{
+		$strlen = strlen($string); // bytes!
+		$lengths = [];
+
+		mb_ereg_search_init($string);
+
+		$position = 0;
+		while ($position < $strlen
+			&& ($array = mb_ereg_search_pos($pattern, '')) !== false) {
+			// capture split
+			$lengths[] = [$array[0] - $position, false, null];
+
+			// move position
+			$position = $array[0] + $array[1];
+
+			// capture delimiter
+			$regs = mb_ereg_search_getregs();
+			$lengths[] = [$array[1], true, isset($regs[1]) && $regs[1]];
+		}
+
+		// Add last bit, if not ending with split
+		$lengths[] = [$strlen - $position, false, null];
+
+		return $lengths;
+	}
 }
\ No newline at end of file

Please login to merge, or discard this patch.

Spacing +1 added lines, -1 removed lines patch added patch discarded remove patch

@@ -71,7 +71,7 @@
 block discarded – undo
      */
     public static function split($pattern, $string, $limit = -1, $flags = 0)
     {
-        $offset_capture = (bool)($flags & PREG_SPLIT_OFFSET_CAPTURE);
+        $offset_capture = (bool) ($flags & PREG_SPLIT_OFFSET_CAPTURE);
 
         $lengths = self::getSplitLengths($pattern, $string);
 

Please login to merge, or discard this patch.

src/Sentence.php 2 patches

Spacing +4 added lines, -4 removed lines patch added patch discarded remove patch

@@ -75,7 +75,7 @@  discard block
 block discarded – undo
 
             $this->replacements[$index] = $number;
 
-            $text = (string)substr_replace($text, $code, $offset, mb_strlen($number));
+            $text = (string) substr_replace($text, $code, $offset, mb_strlen($number));
 
             ++$index;
         }
@@ -92,7 +92,7 @@  discard block
 block discarded – undo
      */
     private function restoreReplacements($text)
     {
-        return array_map(function ($value) {
+        return array_map(function($value) {
             foreach ($this->replacements as $index => $number) {
                 $code = $this->getReplaceCode($index);
                 $value = str_replace($code, $number, $value);
@@ -190,7 +190,7 @@  discard block
 block discarded – undo
         $merges = [];
         $merge = '';
 
-        $filtered = array_filter($punctuations, function ($p) {
+        $filtered = array_filter($punctuations, function($p) {
             return $p !== '';
         });
 
@@ -448,7 +448,7 @@  discard block
 block discarded – undo
      */
     private static function trimSentences($sentences)
     {
-        return array_map(function ($sentence) {
+        return array_map(function($sentence) {
             return Multibyte::trim($sentence);
         }, $sentences);
     }

Please login to merge, or discard this patch.

Indentation +449 added lines, -449 removed lines patch added patch discarded remove patch

@@ -17,454 +17,454 @@
 block discarded – undo
 class Sentence
 {
 
-    /**
-     * Specify this flag with the split method to trim whitespace.
-     */
-    const SPLIT_TRIM = 0x1;
-
-    /**
-     * List of characters used to terminate sentences.
-     *
-     * @var string[]
-     */
-    private $terminals = ['.', '!', '?'];
-
-    /**
-     * List of characters used for abbreviations.
-     *
-     * @var string[]
-     */
-    private $abbreviators = ['.'];
-
-    /**
-     * List of replacements in the text.
-     *
-     * @var string[]
-     */
-    private $replacements = [];
-
-    /**
-     * Generate an in-text replacement code for the specified index
-     *
-     * @param int $index
-     *
-     * @return string
-     */
-    private function getReplaceCode(int $index)
-    {
-        return 0x02 . $index . 0x03;
-    }
-
-    /**
-     * Clean floating point numbers by replacing them with an in-text index
-     *
-     * @param string $text
-     *
-     * @return string
-     */
-    private function replaceFloatNumbers(string $text)
-    {
-        preg_match_all('!\d+(?:\.\d+)?!', $text, $matches, PREG_OFFSET_CAPTURE);
-
-        $this->replacements = [];
-        $index = 0;
-        foreach (array_reverse($matches[0]) as $match) {
-            $number = $match[0];
-            $offset = $match[1];
-            $code = $this->getReplaceCode($index);
-
-            $this->replacements[$index] = $number;
-
-            $text = (string)substr_replace($text, $code, $offset, mb_strlen($number));
-
-            ++$index;
-        }
-
-        return $text;
-    }
-
-    /**
-     * Restore any stored replacements
-     *
-     * @param string[] $text
-     *
-     * @return string[]
-     */
-    private function restoreReplacements($text)
-    {
-        return array_map(function ($value) {
-            foreach ($this->replacements as $index => $number) {
-                $code = $this->getReplaceCode($index);
-                $value = str_replace($code, $number, $value);
-            }
-
-            return $value;
-        }, $text);
-    }
-
-    /**
-     * Breaks a piece of text into lines by linebreak.
-     * Eats up any linebreak characters as if one.
-     *
-     * Multibyte safe
-     *
-     * @param string $text
-     *
-     * @return string[]
-     */
-    private static function linebreakSplit($text)
-    {
-        $lines = [];
-        $line = '';
-
-        foreach (Multibyte::split('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
-            $line .= $part;
-            if (Multibyte::trim($part) === '') {
-                $lines[] = $line;
-                $line = '';
-            }
-        }
-        $lines[] = $line;
-
-        return $lines;
-    }
-
-    /**
-     * Splits an array of lines by (consecutive sequences of)
-     * terminals, keeping terminals.
-     *
-     * Multibyte safe (at least for UTF-8)
-     *
-     * For example:
-     *    "There ... is. More!"
-     *        ... becomes ...
-     *    [ "There ", "...", " is", ".", " More", "!" ]
-     *
-     * @param string $line
-     *
-     * @return string[]
-     */
-    private function punctuationSplit($line)
-    {
-        $parts = [];
-
-        $chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY); // This is UTF8 multibyte safe!
-        $is_terminal = in_array($chars[0], $this->terminals);
-
-        $part = '';
-        foreach ($chars as $char) {
-            if (in_array($char, $this->terminals) !== $is_terminal) {
-                $parts[] = $part;
-                $part = '';
-                $is_terminal = !$is_terminal;
-            }
-            $part .= $char;
-        }
-
-        if (!empty($part)) {
-            $parts[] = $part;
-        }
-
-        return $parts;
-    }
-
-    /**
-     * Appends each terminal item after its preceding
-     * non-terminals.
-     *
-     * Multibyte safe (at least for UTF-8)
-     *
-     * For example:
-     *    [ "There ", "...", " is", ".", " More", "!" ]
-     *        ... becomes ...
-     *    [ "There ... is.", "More!" ]
-     *
-     * @param string[] $punctuations
-     *
-     * @return string[]
-     */
-    private function punctuationMerge($punctuations)
-    {
-        $definite_terminals = array_diff($this->terminals, $this->abbreviators);
-
-        $merges = [];
-        $merge = '';
-
-        $filtered = array_filter($punctuations, function ($p) {
-            return $p !== '';
-        });
-
-        foreach ($filtered as $punctuation) {
-            $merge .= $punctuation;
-            if (mb_strlen($punctuation) === 1
-                && in_array($punctuation, $this->terminals)) {
-                $merges[] = $merge;
-                $merge = '';
-            } else {
-                foreach ($definite_terminals as $terminal) {
-                    if (mb_strpos($punctuation, $terminal) !== false) {
-                        $merges[] = $merge;
-                        $merge = '';
-                        break;
-                    }
-                }
-            }
-        }
-        if (!empty($merge)) {
-            $merges[] = $merge;
-        }
-
-        return $merges;
-    }
-
-    /**
-     * Looks for capitalized abbreviations & includes them with the following fragment.
-     *
-     * For example:
-     *    [ "Last week, former director of the F.B.I. James B. Comey was fired. Mr. Comey was not available for comment." ]
-     *        ... becomes ...
-     *    [ "Last week, former director of the F.B.I. James B. Comey was fired." ]
-     *  [ "Mr. Comey was not available for comment." ]
-     *
-     * @param string[] $fragments
-     *
-     * @return string[]
-     */
-    private function abbreviationMerge($fragments)
-    {
-        $return_fragment = [];
-
-        $previous_fragment = '';
-        $previous_is_abbreviation = false;
-        $i = 0;
-        foreach ($fragments as $fragment) {
-            $is_abbreviation = self::isAbreviation($fragment);
-
-            // merge previous fragment with this
-            if ($previous_is_abbreviation) {
-                $fragment = $previous_fragment . $fragment;
-            }
-            $return_fragment[$i] = $fragment;
-
-            $previous_is_abbreviation = $is_abbreviation;
-            $previous_fragment = $fragment;
-
-            // only increment if this isn't an abbreviation
-            if (!$is_abbreviation) {
-                $i++;
-            }
-        }
-
-        return $return_fragment;
-    }
-
-    /**
-     * Check if the last word of fragment starts with a Capital, ends in "." & is at most 3 characters long.
-     *
-     * @param $fragment
-     *
-     * @return bool
-     */
-    private static function isAbreviation($fragment)
-    {
-        $words = mb_split('\s+', Multibyte::trim($fragment));
-
-        $word_count = count($words);
-
-        $last_word = Multibyte::trim($words[$word_count - 1]);
-        $last_is_capital = preg_match('#^\p{Lu}#u', $last_word);
-        $last_is_abbreviation = mb_substr(Multibyte::trim($fragment), -1) === '.';
-
-        return $last_is_capital > 0
-            && $last_is_abbreviation > 0
-            && mb_strlen($last_word) <= 3;
-    }
-
-    /**
-     * Merges any part starting with a closing parenthesis ')' to the previous
-     * part.
-     *
-     * @param string[] $parts
-     *
-     * @return string[]
-     */
-    private function parenthesesMerge($parts)
-    {
-        $subsentences = [];
-
-        foreach ($parts as $part) {
-            if ($part[0] === ')' && !empty($subsentences)) {
-                $subsentences[count($subsentences) - 1] .= $part;
-            } else {
-                $subsentences[] = $part;
-            }
-        }
-
-        return $subsentences;
-    }
-
-    /**
-     * Looks for closing quotes to include them with the previous statement.
-     * "That was very interesting," he said.
-     * "That was very interesting."
-     *
-     * @param string[] $statements
-     *
-     * @return string[]
-     */
-    private function closeQuotesMerge($statements)
-    {
-        $i = 0;
-        $previous_statement = '';
-        $return = [];
-        foreach ($statements as $statement) {
-            if (self::isEndQuote($statement)) {
-                $statement = $previous_statement . $statement;
-            } else {
-                $i++;
-            }
-
-            $return[$i] = $statement;
-            $previous_statement = $statement;
-        }
-
-        return $return;
-    }
-
-    /**
-     * Check if the entire string is a quotation mark or quote, then space, then lowercase.
-     *
-     * @param $statement
-     *
-     * @return bool
-     */
-    private static function isEndQuote($statement)
-    {
-        $trimmed = Multibyte::trim($statement);
-        $first = mb_substr($statement, 0, 1);
-
-        return in_array($trimmed, ['"', '\''])
-            || (
-                in_array($first, ['"', '\''])
-                && mb_substr($statement, 1, 1) === ' '
-                && ctype_lower(mb_substr($statement, 2, 1)) === true
-            );
-    }
-
-    /**
-     * Merges items into larger sentences.
-     * Multibyte safe
-     *
-     * @param string[] $shorts
-     *
-     * @return string[]
-     */
-    private function sentenceMerge($shorts)
-    {
-        $non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);
-
-        $sentences = [];
-
-        $sentence = '';
-        $has_words = false;
-        $previous_word_ending = null;
-        foreach ($shorts as $short) {
-            $word_count = count(mb_split('\s+', Multibyte::trim($short)));
-            $after_non_abbreviating_terminal = in_array($previous_word_ending, $non_abbreviating_terminals);
-
-            if ($after_non_abbreviating_terminal
-                || ($has_words && $word_count > 1)) {
-
-                $sentences[] = $sentence;
-
-                $sentence = '';
-                $has_words = false;
-            }
-
-            $has_words = $has_words
-                || $word_count > 1;
-
-            $sentence .= $short;
-            $previous_word_ending = mb_substr($short, -1);
-        }
-
-        if (!empty($sentence)) {
-            $sentences[] = $sentence;
-        }
-
-        return $sentences;
-    }
-
-    /**
-     * Return the sentences detected in the provided text.
-     * Set the Sentence::SPLIT_TRIM flag to trim whitespace.
-     *
-     * @param string  $text
-     * @param integer $flags
-     *
-     * @return string[]
-     */
-    public function split($text, $flags = 0)
-    {
-        if (empty($pipeline)) {
-            static $pipeline = [
-                'replaceFloatNumbers',
-                'punctuationSplit',
-                'parenthesesMerge', // also works after punctuationMerge or abbreviationMerge
-                'punctuationMerge',
-                'abbreviationMerge',
-                'closeQuotesMerge',
-                'sentenceMerge',
-                'restoreReplacements',
-            ];
-        }
-
-        // clean funny quotes
-        $text = Multibyte::cleanUnicode($text);
-
-        // Split
-        $sentences = [];
-        foreach (self::linebreakSplit($text) as $input) {
-            if (Multibyte::trim($input) !== '') {
-                foreach ($pipeline as $method) {
-                    $input = $this->$method($input);
-                }
-                $sentences = array_merge($sentences, $input);
-            }
-        }
-
-        // Post process
-        if ($flags & self::SPLIT_TRIM) {
-            return self::trimSentences($sentences);
-        }
-
-        return $sentences;
-    }
-
-    /**
-     * Multibyte-safe trim of each string in an array.
-     *
-     * @param string[] $sentences
-     *
-     * @return string[]
-     */
-    private static function trimSentences($sentences)
-    {
-        return array_map(function ($sentence) {
-            return Multibyte::trim($sentence);
-        }, $sentences);
-    }
-
-    /**
-     * Return the number of sentences detected in the provided text.
-     *
-     * @param string $text
-     *
-     * @return integer
-     */
-    public function count($text)
-    {
-        return count($this->split($text));
-    }
+	/**
+	 * Specify this flag with the split method to trim whitespace.
+	 */
+	const SPLIT_TRIM = 0x1;
+
+	/**
+	 * List of characters used to terminate sentences.
+	 *
+	 * @var string[]
+	 */
+	private $terminals = ['.', '!', '?'];
+
+	/**
+	 * List of characters used for abbreviations.
+	 *
+	 * @var string[]
+	 */
+	private $abbreviators = ['.'];
+
+	/**
+	 * List of replacements in the text.
+	 *
+	 * @var string[]
+	 */
+	private $replacements = [];
+
+	/**
+	 * Generate an in-text replacement code for the specified index
+	 *
+	 * @param int $index
+	 *
+	 * @return string
+	 */
+	private function getReplaceCode(int $index)
+	{
+		return 0x02 . $index . 0x03;
+	}
+
+	/**
+	 * Clean floating point numbers by replacing them with an in-text index
+	 *
+	 * @param string $text
+	 *
+	 * @return string
+	 */
+	private function replaceFloatNumbers(string $text)
+	{
+		preg_match_all('!\d+(?:\.\d+)?!', $text, $matches, PREG_OFFSET_CAPTURE);
+
+		$this->replacements = [];
+		$index = 0;
+		foreach (array_reverse($matches[0]) as $match) {
+			$number = $match[0];
+			$offset = $match[1];
+			$code = $this->getReplaceCode($index);
+
+			$this->replacements[$index] = $number;
+
+			$text = (string)substr_replace($text, $code, $offset, mb_strlen($number));
+
+			++$index;
+		}
+
+		return $text;
+	}
+
+	/**
+	 * Restore any stored replacements
+	 *
+	 * @param string[] $text
+	 *
+	 * @return string[]
+	 */
+	private function restoreReplacements($text)
+	{
+		return array_map(function ($value) {
+			foreach ($this->replacements as $index => $number) {
+				$code = $this->getReplaceCode($index);
+				$value = str_replace($code, $number, $value);
+			}
+
+			return $value;
+		}, $text);
+	}
+
+	/**
+	 * Breaks a piece of text into lines by linebreak.
+	 * Eats up any linebreak characters as if one.
+	 *
+	 * Multibyte safe
+	 *
+	 * @param string $text
+	 *
+	 * @return string[]
+	 */
+	private static function linebreakSplit($text)
+	{
+		$lines = [];
+		$line = '';
+
+		foreach (Multibyte::split('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
+			$line .= $part;
+			if (Multibyte::trim($part) === '') {
+				$lines[] = $line;
+				$line = '';
+			}
+		}
+		$lines[] = $line;
+
+		return $lines;
+	}
+
+	/**
+	 * Splits an array of lines by (consecutive sequences of)
+	 * terminals, keeping terminals.
+	 *
+	 * Multibyte safe (at least for UTF-8)
+	 *
+	 * For example:
+	 *    "There ... is. More!"
+	 *        ... becomes ...
+	 *    [ "There ", "...", " is", ".", " More", "!" ]
+	 *
+	 * @param string $line
+	 *
+	 * @return string[]
+	 */
+	private function punctuationSplit($line)
+	{
+		$parts = [];
+
+		$chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY); // This is UTF8 multibyte safe!
+		$is_terminal = in_array($chars[0], $this->terminals);
+
+		$part = '';
+		foreach ($chars as $char) {
+			if (in_array($char, $this->terminals) !== $is_terminal) {
+				$parts[] = $part;
+				$part = '';
+				$is_terminal = !$is_terminal;
+			}
+			$part .= $char;
+		}
+
+		if (!empty($part)) {
+			$parts[] = $part;
+		}
+
+		return $parts;
+	}
+
+	/**
+	 * Appends each terminal item after its preceding
+	 * non-terminals.
+	 *
+	 * Multibyte safe (at least for UTF-8)
+	 *
+	 * For example:
+	 *    [ "There ", "...", " is", ".", " More", "!" ]
+	 *        ... becomes ...
+	 *    [ "There ... is.", "More!" ]
+	 *
+	 * @param string[] $punctuations
+	 *
+	 * @return string[]
+	 */
+	private function punctuationMerge($punctuations)
+	{
+		$definite_terminals = array_diff($this->terminals, $this->abbreviators);
+
+		$merges = [];
+		$merge = '';
+
+		$filtered = array_filter($punctuations, function ($p) {
+			return $p !== '';
+		});
+
+		foreach ($filtered as $punctuation) {
+			$merge .= $punctuation;
+			if (mb_strlen($punctuation) === 1
+				&& in_array($punctuation, $this->terminals)) {
+				$merges[] = $merge;
+				$merge = '';
+			} else {
+				foreach ($definite_terminals as $terminal) {
+					if (mb_strpos($punctuation, $terminal) !== false) {
+						$merges[] = $merge;
+						$merge = '';
+						break;
+					}
+				}
+			}
+		}
+		if (!empty($merge)) {
+			$merges[] = $merge;
+		}
+
+		return $merges;
+	}
+
+	/**
+	 * Looks for capitalized abbreviations & includes them with the following fragment.
+	 *
+	 * For example:
+	 *    [ "Last week, former director of the F.B.I. James B. Comey was fired. Mr. Comey was not available for comment." ]
+	 *        ... becomes ...
+	 *    [ "Last week, former director of the F.B.I. James B. Comey was fired." ]
+	 *  [ "Mr. Comey was not available for comment." ]
+	 *
+	 * @param string[] $fragments
+	 *
+	 * @return string[]
+	 */
+	private function abbreviationMerge($fragments)
+	{
+		$return_fragment = [];
+
+		$previous_fragment = '';
+		$previous_is_abbreviation = false;
+		$i = 0;
+		foreach ($fragments as $fragment) {
+			$is_abbreviation = self::isAbreviation($fragment);
+
+			// merge previous fragment with this
+			if ($previous_is_abbreviation) {
+				$fragment = $previous_fragment . $fragment;
+			}
+			$return_fragment[$i] = $fragment;
+
+			$previous_is_abbreviation = $is_abbreviation;
+			$previous_fragment = $fragment;
+
+			// only increment if this isn't an abbreviation
+			if (!$is_abbreviation) {
+				$i++;
+			}
+		}
+
+		return $return_fragment;
+	}
+
+	/**
+	 * Check if the last word of fragment starts with a Capital, ends in "." & is at most 3 characters long.
+	 *
+	 * @param $fragment
+	 *
+	 * @return bool
+	 */
+	private static function isAbreviation($fragment)
+	{
+		$words = mb_split('\s+', Multibyte::trim($fragment));
+
+		$word_count = count($words);
+
+		$last_word = Multibyte::trim($words[$word_count - 1]);
+		$last_is_capital = preg_match('#^\p{Lu}#u', $last_word);
+		$last_is_abbreviation = mb_substr(Multibyte::trim($fragment), -1) === '.';
+
+		return $last_is_capital > 0
+			&& $last_is_abbreviation > 0
+			&& mb_strlen($last_word) <= 3;
+	}
+
+	/**
+	 * Merges any part starting with a closing parenthesis ')' to the previous
+	 * part.
+	 *
+	 * A part that starts with ')' but has no predecessor is kept as-is. Empty
+	 * parts are passed through untouched (and without triggering an
+	 * "uninitialized string offset" warning).
+	 *
+	 * @param string[] $parts
+	 *
+	 * @return string[]
+	 */
+	private function parenthesesMerge($parts)
+	{
+		$subsentences = [];
+
+		// index of the most recently added subsentence, -1 while there is none
+		$last = -1;
+
+		foreach ($parts as $part) {
+			// isset() guards against reading offset 0 of an empty string
+			$starts_with_close = isset($part[0]) && $part[0] === ')';
+
+			if ($starts_with_close && $last >= 0) {
+				$subsentences[$last] .= $part;
+			} else {
+				$subsentences[] = $part;
+				$last++;
+			}
+		}
+
+		return $subsentences;
+	}
+
+	/**
+	 * Attaches closing-quote statements to the statement that precedes them.
+	 *
+	 * A statement counts as a closing quote when isEndQuote() accepts it, e.g.
+	 * the `" he said.` part of
+	 *    "That was very interesting," he said.
+	 * or the lone `"` at the end of
+	 *    "That was very interesting."
+	 *
+	 * Note: the keys of the returned array are not guaranteed to start at 0;
+	 * they start at 1 unless the very first statement is itself a closing quote.
+	 *
+	 * @param string[] $statements
+	 *
+	 * @return string[]
+	 */
+	private function closeQuotesMerge($statements)
+	{
+		$merged = [];
+		$slot = 0;
+		$latest = '';
+		foreach ($statements as $statement) {
+			if (!self::isEndQuote($statement)) {
+				// a regular statement opens a new slot
+				$slot++;
+				$latest = $statement;
+			} else {
+				// a closing quote is appended to whatever occupies the current slot
+				$latest = $latest . $statement;
+			}
+
+			$merged[$slot] = $latest;
+		}
+
+		return $merged;
+	}
+
+	/**
+	 * Check if the statement is a closing quote: either the entire (trimmed)
+	 * string is a single quotation mark, or it starts with a quotation mark
+	 * followed by one space and a lowercase letter.
+	 *
+	 * The lowercase test is Unicode-aware (\p{Ll}), so letters such as "é" or
+	 * "я" are recognised as well; a byte-based ctype_lower() check would reject
+	 * every multibyte letter.
+	 *
+	 * @param string $statement
+	 *
+	 * @return bool
+	 */
+	private static function isEndQuote($statement)
+	{
+		$trimmed = Multibyte::trim($statement);
+		if ($trimmed === '"' || $trimmed === '\'') {
+			return true;
+		}
+
+		// quote, exactly one space, then a lowercase letter of any script;
+		// matched against the untrimmed statement, so the quote must be its first character
+		return preg_match('#^["\'] \p{Ll}#u', $statement) === 1;
+	}
+
+	/**
+	 * Merges items into larger sentences.
+	 * Multibyte safe.
+	 *
+	 * A new sentence is started before an item when the previous item ended in
+	 * a terminal that is not also listed as an abbreviator, or when the
+	 * sentence collected so far already holds a multi-word item and the
+	 * current item is multi-word too.
+	 *
+	 * @param string[] $shorts
+	 *
+	 * @return string[]
+	 */
+	private function sentenceMerge($shorts)
+	{
+		// terminals that cannot be mistaken for an abbreviation marker
+		$hard_terminals = array_diff($this->terminals, $this->abbreviators);
+
+		$sentences = [];
+
+		$buffer = '';
+		$buffer_has_words = false;
+		$last_character = null;
+		foreach ($shorts as $short) {
+			$is_multi_word = count(mb_split('\s+', Multibyte::trim($short))) > 1;
+
+			$must_flush = in_array($last_character, $hard_terminals)
+				|| ($buffer_has_words && $is_multi_word);
+
+			if ($must_flush) {
+				$sentences[] = $buffer;
+
+				$buffer = '';
+				$buffer_has_words = false;
+			}
+
+			if ($is_multi_word) {
+				$buffer_has_words = true;
+			}
+
+			$buffer .= $short;
+			$last_character = mb_substr($short, -1);
+		}
+
+		// whatever is left over forms the final sentence
+		if (!empty($buffer)) {
+			$sentences[] = $buffer;
+		}
+
+		return $sentences;
+	}
+
+	/**
+	 * Return the sentences detected in the provided text.
+	 * Set the Sentence::SPLIT_TRIM flag to trim whitespace.
+	 *
+	 * The text is first passed through Multibyte::cleanUnicode() and split on
+	 * line breaks; every non-blank line is then run through a fixed list of
+	 * processing methods, each one receiving the output of the previous one.
+	 *
+	 * @param string  $text
+	 * @param integer $flags
+	 *
+	 * @return string[]
+	 */
+	public function split($text, $flags = 0)
+	{
+		// names of the methods applied, in this order, to every line
+		$stages = [
+			'replaceFloatNumbers',
+			'punctuationSplit',
+			'parenthesesMerge', // also works after punctuationMerge or abbreviationMerge
+			'punctuationMerge',
+			'abbreviationMerge',
+			'closeQuotesMerge',
+			'sentenceMerge',
+			'restoreReplacements',
+		];
+
+		// clean funny quotes
+		$text = Multibyte::cleanUnicode($text);
+
+		// Split
+		$sentences = [];
+		foreach (self::linebreakSplit($text) as $line) {
+			// blank lines contribute nothing
+			if (Multibyte::trim($line) === '') {
+				continue;
+			}
+
+			$processed = array_reduce($stages, function ($carry, $stage) {
+				return $this->$stage($carry);
+			}, $line);
+
+			$sentences = array_merge($sentences, $processed);
+		}
+
+		// Post process
+		return ($flags & self::SPLIT_TRIM)
+			? self::trimSentences($sentences)
+			: $sentences;
+	}
+
+	/**
+	 * Apply the multibyte-safe Multibyte::trim() to each string in an array.
+	 * Keys are preserved.
+	 *
+	 * @param string[] $sentences
+	 *
+	 * @return string[]
+	 */
+	private static function trimSentences($sentences)
+	{
+		$trimmed = [];
+		foreach ($sentences as $key => $sentence) {
+			$trimmed[$key] = Multibyte::trim($sentence);
+		}
+
+		return $trimmed;
+	}
+
+	/**
+	 * Count the sentences detected in the provided text, i.e. the number of
+	 * entries split() returns for it (without any flags).
+	 *
+	 * @param string $text
+	 *
+	 * @return integer
+	 */
+	public function count($text)
+	{
+		$sentences = $this->split($text);
+
+		return count($sentences);
+	}
 
 }

Please login to merge, or discard this patch.

		@@ -7,177 +7,177 @@
		block discarded – undo
7	7	*/
8	8	class Multibyte
9	9	{
10		- //https://stackoverflow.com/questions/20025030/convert-all-types-of-smart-quotes-with-php
11		- private static $unicodeCharacterMap = [
12		- // Windows codepage 1252
13		- "\xC2\x82" => "'", // U+0082⇒U+201A single low-9 quotation mark
14		- "\xC2\x84" => '"', // U+0084⇒U+201E double low-9 quotation mark
15		- "\xC2\x8B" => "'", // U+008B⇒U+2039 single left-pointing angle quotation mark
16		- "\xC2\x91" => "'", // U+0091⇒U+2018 left single quotation mark
17		- "\xC2\x92" => "'", // U+0092⇒U+2019 right single quotation mark
18		- "\xC2\x93" => '"', // U+0093⇒U+201C left double quotation mark
19		- "\xC2\x94" => '"', // U+0094⇒U+201D right double quotation mark
20		- "\xC2\x9B" => "'", // U+009B⇒U+203A single right-pointing angle quotation mark
21		- // Regular Unicode // U+0022 quotation mark (")
22		- // U+0027 apostrophe (')
23		- "\xC2\xAB" => '"', // U+00AB left-pointing double angle quotation mark
24		- "\xC2\xBB" => '"', // U+00BB right-pointing double angle quotation mark
25		- "\xE2\x80\x98" => "'", // U+2018 left single quotation mark
26		- "\xE2\x80\x99" => "'", // U+2019 right single quotation mark
27		- "\xE2\x80\x9A" => "'", // U+201A single low-9 quotation mark
28		- "\xE2\x80\x9B" => "'", // U+201B single high-reversed-9 quotation mark
29		- "\xE2\x80\x9C" => '"', // U+201C left double quotation mark
30		- "\xE2\x80\x9D" => '"', // U+201D right double quotation mark
31		- "\xE2\x80\x9E" => '"', // U+201E double low-9 quotation mark
32		- "\xE2\x80\x9F" => '"', // U+201F double high-reversed-9 quotation mark
33		- "\xE2\x80\xB9" => "'", // U+2039 single left-pointing angle quotation mark
34		- "\xE2\x80\xBA" => "'", // U+203A single right-pointing angle quotation mark
35		- ];
36		-
37		- /**
38		- * Replace
39		- *
40		- * @staticvar array $chr_map
41		- * @param string $string
42		- * @return string
43		- */
44		- public static function cleanUnicode($string)
45		- {
46		- $character = array_keys(self::$unicodeCharacterMap); // but: for efficiency you should
47		- $replace = array_values(self::$unicodeCharacterMap); // pre-calculate these two arrays
48		- return str_replace($character, $replace, html_entity_decode($string, ENT_QUOTES, "UTF-8"));
49		- }
50		-
51		- /**
52		- * Multibyte.php safe version of standard trim() function.
53		- *
54		- * @param string $string
55		- * @return string
56		- */
57		- public static function trim($string)
58		- {
59		- return mb_ereg_replace('^\s([\s\S]?)\s*$', '\1', $string);
60		- }
61		-
62		- /**
63		- * A cross between mb_split and preg_split, adding the preg_split flags
64		- * to mb_split.
65		- *
66		- * @param string $pattern
67		- * @param string $string
68		- * @param int $limit
69		- * @param int $flags
70		- * @return array
71		- */
72		- public static function split($pattern, $string, $limit = -1, $flags = 0)
73		- {
74		- $offset_capture = (bool)($flags & PREG_SPLIT_OFFSET_CAPTURE);
75		-
76		- $lengths = self::getSplitLengths($pattern, $string);
77		-
78		- // Substrings
79		- $parts = [];
80		- $position = 0;
81		- $count = 1;
82		- foreach ($lengths as $length) {
83		- if (self::isLastPart($length, $flags, $limit, $count)) {
84		- $parts[] = self::makePart($string, $position, null, $offset_capture);
85		- return $parts;
86		- }
87		-
88		- if (self::isPart($length, $flags)) {
89		- $parts[] = self::makePart($string, $position, $length[0], $offset_capture);
90		- }
91		-
92		- $position += $length[0];
93		- }
94		-
95		- return $parts;
96		- }
97		-
98		- /**
99		- * @param $length
100		- * @param $flags
101		- * @param $limit
102		- * @param $count
103		- * @return bool
104		- */
105		- private static function isLastPart($length, $flags, $limit, &$count)
106		- {
107		- $split_empty = !($flags & PREG_SPLIT_NO_EMPTY) \|\| $length[0];
108		- $is_delimiter = $length[1];
109		-
110		- return $limit > 0
111		- && !$is_delimiter
112		- && $split_empty
113		- && ++$count > $limit;
114		- }
115		-
116		- /**
117		- * @param $length
118		- * @param $flags
119		- * @return bool
120		- */
121		- private static function isPart($length, $flags)
122		- {
123		- $split_empty = !($flags & PREG_SPLIT_NO_EMPTY) \|\| $length[0];
124		- $is_delimiter = $length[1];
125		- $is_captured = ($flags & PREG_SPLIT_DELIM_CAPTURE) && $length[2];
126		-
127		- return (!$is_delimiter
128		- \|\| $is_captured)
129		- && $split_empty;
130		- }
131		-
132		- /**
133		- * Make part
134		- * @param string $string
135		- * @param integer $position
136		- * @param integer\|null $length
137		- * @param bool $offset_capture
138		- * @return array\|string
139		- */
140		- private static function makePart($string, $position, $length = null, $offset_capture = false)
141		- {
142		- $cut = mb_strcut($string, $position, $length);
143		-
144		- return $offset_capture
145		- ? [$cut, $position]
146		- : $cut;
147		- }
148		-
149		- /**
150		- * Splits the string by pattern and for each element (part or split) returns:
151		- * [ 0 => length, 1 => is_delimiter?, 2 =>
152		- *
153		- * @param $pattern
154		- * @param $string
155		- * @return array
156		- */
157		- private static function getSplitLengths($pattern, $string)
158		- {
159		- $strlen = strlen($string); // bytes!
160		- $lengths = [];
161		-
162		- mb_ereg_search_init($string);
163		-
164		- $position = 0;
165		- while ($position < $strlen
166		- && ($array = mb_ereg_search_pos($pattern, '')) !== false) {
167		- // capture split
168		- $lengths[] = [$array[0] - $position, false, null];
169		-
170		- // move position
171		- $position = $array[0] + $array[1];
172		-
173		- // capture delimiter
174		- $regs = mb_ereg_search_getregs();
175		- $lengths[] = [$array[1], true, isset($regs[1]) && $regs[1]];
176		- }
177		-
178		- // Add last bit, if not ending with split
179		- $lengths[] = [$strlen - $position, false, null];
180		-
181		- return $lengths;
182		- }
	10	+ //https://stackoverflow.com/questions/20025030/convert-all-types-of-smart-quotes-with-php
	11	+ private static $unicodeCharacterMap = [
	12	+ // Windows codepage 1252
	13	+ "\xC2\x82" => "'", // U+0082⇒U+201A single low-9 quotation mark
	14	+ "\xC2\x84" => '"', // U+0084⇒U+201E double low-9 quotation mark
	15	+ "\xC2\x8B" => "'", // U+008B⇒U+2039 single left-pointing angle quotation mark
	16	+ "\xC2\x91" => "'", // U+0091⇒U+2018 left single quotation mark
	17	+ "\xC2\x92" => "'", // U+0092⇒U+2019 right single quotation mark
	18	+ "\xC2\x93" => '"', // U+0093⇒U+201C left double quotation mark
	19	+ "\xC2\x94" => '"', // U+0094⇒U+201D right double quotation mark
	20	+ "\xC2\x9B" => "'", // U+009B⇒U+203A single right-pointing angle quotation mark
	21	+ // Regular Unicode // U+0022 quotation mark (")
	22	+ // U+0027 apostrophe (')
	23	+ "\xC2\xAB" => '"', // U+00AB left-pointing double angle quotation mark
	24	+ "\xC2\xBB" => '"', // U+00BB right-pointing double angle quotation mark
	25	+ "\xE2\x80\x98" => "'", // U+2018 left single quotation mark
	26	+ "\xE2\x80\x99" => "'", // U+2019 right single quotation mark
	27	+ "\xE2\x80\x9A" => "'", // U+201A single low-9 quotation mark
	28	+ "\xE2\x80\x9B" => "'", // U+201B single high-reversed-9 quotation mark
	29	+ "\xE2\x80\x9C" => '"', // U+201C left double quotation mark
	30	+ "\xE2\x80\x9D" => '"', // U+201D right double quotation mark
	31	+ "\xE2\x80\x9E" => '"', // U+201E double low-9 quotation mark
	32	+ "\xE2\x80\x9F" => '"', // U+201F double high-reversed-9 quotation mark
	33	+ "\xE2\x80\xB9" => "'", // U+2039 single left-pointing angle quotation mark
	34	+ "\xE2\x80\xBA" => "'", // U+203A single right-pointing angle quotation mark
	35	+ ];
	36	+
	37	+ /**
	38	+ * Replace
	39	+ *
	40	+ * @staticvar array $chr_map
	41	+ * @param string $string
	42	+ * @return string
	43	+ */
	44	+ public static function cleanUnicode($string)
	45	+ {
	46	+ $character = array_keys(self::$unicodeCharacterMap); // but: for efficiency you should
	47	+ $replace = array_values(self::$unicodeCharacterMap); // pre-calculate these two arrays
	48	+ return str_replace($character, $replace, html_entity_decode($string, ENT_QUOTES, "UTF-8"));
	49	+ }
	50	+
	51	+ /**
	52	+ * Multibyte.php safe version of standard trim() function.
	53	+ *
	54	+ * @param string $string
	55	+ * @return string
	56	+ */
	57	+ public static function trim($string)
	58	+ {
	59	+ return mb_ereg_replace('^\s([\s\S]?)\s*$', '\1', $string);
	60	+ }
	61	+
	62	+ /**
	63	+ * A cross between mb_split and preg_split, adding the preg_split flags
	64	+ * to mb_split.
	65	+ *
	66	+ * @param string $pattern
	67	+ * @param string $string
	68	+ * @param int $limit
	69	+ * @param int $flags
	70	+ * @return array
	71	+ */
	72	+ public static function split($pattern, $string, $limit = -1, $flags = 0)
	73	+ {
	74	+ $offset_capture = (bool)($flags & PREG_SPLIT_OFFSET_CAPTURE);
	75	+
	76	+ $lengths = self::getSplitLengths($pattern, $string);
	77	+
	78	+ // Substrings
	79	+ $parts = [];
	80	+ $position = 0;
	81	+ $count = 1;
	82	+ foreach ($lengths as $length) {
	83	+ if (self::isLastPart($length, $flags, $limit, $count)) {
	84	+ $parts[] = self::makePart($string, $position, null, $offset_capture);
	85	+ return $parts;
	86	+ }
	87	+
	88	+ if (self::isPart($length, $flags)) {
	89	+ $parts[] = self::makePart($string, $position, $length[0], $offset_capture);
	90	+ }
	91	+
	92	+ $position += $length[0];
	93	+ }
	94	+
	95	+ return $parts;
	96	+ }
	97	+
	98	+ /**
	99	+ * @param $length
	100	+ * @param $flags
	101	+ * @param $limit
	102	+ * @param $count
	103	+ * @return bool
	104	+ */
	105	+ private static function isLastPart($length, $flags, $limit, &$count)
	106	+ {
	107	+ $split_empty = !($flags & PREG_SPLIT_NO_EMPTY) \|\| $length[0];
	108	+ $is_delimiter = $length[1];
	109	+
	110	+ return $limit > 0
	111	+ && !$is_delimiter
	112	+ && $split_empty
	113	+ && ++$count > $limit;
	114	+ }
	115	+
	116	+ /**
	117	+ * @param $length
	118	+ * @param $flags
	119	+ * @return bool
	120	+ */
	121	+ private static function isPart($length, $flags)
	122	+ {
	123	+ $split_empty = !($flags & PREG_SPLIT_NO_EMPTY) \|\| $length[0];
	124	+ $is_delimiter = $length[1];
	125	+ $is_captured = ($flags & PREG_SPLIT_DELIM_CAPTURE) && $length[2];
	126	+
	127	+ return (!$is_delimiter
	128	+ \|\| $is_captured)
	129	+ && $split_empty;
	130	+ }
	131	+
	132	+ /**
	133	+ * Make part
	134	+ * @param string $string
	135	+ * @param integer $position
	136	+ * @param integer\|null $length
	137	+ * @param bool $offset_capture
	138	+ * @return array\|string
	139	+ */
	140	+ private static function makePart($string, $position, $length = null, $offset_capture = false)
	141	+ {
	142	+ $cut = mb_strcut($string, $position, $length);
	143	+
	144	+ return $offset_capture
	145	+ ? [$cut, $position]
	146	+ : $cut;
	147	+ }
	148	+
	149	+ /**
	150	+ * Splits the string by pattern and for each element (part or split) returns:
	151	+ * [ 0 => length, 1 => is_delimiter?, 2 =>
	152	+ *
	153	+ * @param $pattern
	154	+ * @param $string
	155	+ * @return array
	156	+ */
	157	+ private static function getSplitLengths($pattern, $string)
	158	+ {
	159	+ $strlen = strlen($string); // bytes!
	160	+ $lengths = [];
	161	+
	162	+ mb_ereg_search_init($string);
	163	+
	164	+ $position = 0;
	165	+ while ($position < $strlen
	166	+ && ($array = mb_ereg_search_pos($pattern, '')) !== false) {
	167	+ // capture split
	168	+ $lengths[] = [$array[0] - $position, false, null];
	169	+
	170	+ // move position
	171	+ $position = $array[0] + $array[1];
	172	+
	173	+ // capture delimiter
	174	+ $regs = mb_ereg_search_getregs();
	175	+ $lengths[] = [$array[1], true, isset($regs[1]) && $regs[1]];
	176	+ }
	177	+
	178	+ // Add last bit, if not ending with split
	179	+ $lengths[] = [$strlen - $position, false, null];
	180	+
	181	+ return $lengths;
	182	+ }
183	183	}
184	184	\ No newline at end of file

		@@ -71,7 +71,7 @@
		block discarded – undo
71	71	*/
72	72	public static function split($pattern, $string, $limit = -1, $flags = 0)
73	73	{
74		- $offset_capture = (bool)($flags & PREG_SPLIT_OFFSET_CAPTURE);
	74	+ $offset_capture = (bool) ($flags & PREG_SPLIT_OFFSET_CAPTURE);
75	75
76	76	$lengths = self::getSplitLengths($pattern, $string);
77	77

		@@ -75,7 +75,7 @@ discard block
		block discarded – undo
75	75
76	76	$this->replacements[$index] = $number;
77	77
78		- $text = (string)substr_replace($text, $code, $offset, mb_strlen($number));
	78	+ $text = (string) substr_replace($text, $code, $offset, mb_strlen($number));
79	79
80	80	++$index;
81	81	}
		@@ -92,7 +92,7 @@ discard block
		block discarded – undo
92	92	*/
93	93	private function restoreReplacements($text)
94	94	{
95		- return array_map(function ($value) {
	95	+ return array_map(function($value) {
96	96	foreach ($this->replacements as $index => $number) {
97	97	$code = $this->getReplaceCode($index);
98	98	$value = str_replace($code, $number, $value);
		@@ -190,7 +190,7 @@ discard block
		block discarded – undo
190	190	$merges = [];
191	191	$merge = '';
192	192
193		- $filtered = array_filter($punctuations, function ($p) {
	193	+ $filtered = array_filter($punctuations, function($p) {
194	194	return $p !== '';
195	195	});
196	196
		@@ -448,7 +448,7 @@ discard block
		block discarded – undo
448	448	*/
449	449	private static function trimSentences($sentences)
450	450	{
451		- return array_map(function ($sentence) {
	451	+ return array_map(function($sentence) {
452	452	return Multibyte::trim($sentence);
453	453	}, $sentences);
454	454	}

vanderlee / php-sentence

Pull Request — master (#19)

Status

Category

Indentation +173 added lines, -173 removed lines patch added patch discarded remove patch

Spacing +1 added lines, -1 removed lines patch added patch discarded remove patch

Spacing +4 added lines, -4 removed lines patch added patch discarded remove patch

Indentation +449 added lines, -449 removed lines patch added patch discarded remove patch