Passed
Push — master ( e85d9b...d7563a )
by Martijn
01:34
created
src/Multibyte.php 2 patches
Spacing   +3 added lines, -3 removed lines patch added patch discarded remove patch
@@ -30,9 +30,9 @@
 block discarded – undo
30 30
      */
31 31
     public static function split($pattern, $string, $limit = -1, $flags = 0)
32 32
     {
33
-        $split_no_empty = (bool)($flags & PREG_SPLIT_NO_EMPTY);
34
-        $offset_capture = (bool)($flags & PREG_SPLIT_OFFSET_CAPTURE);
35
-        $delim_capture = (bool)($flags & PREG_SPLIT_DELIM_CAPTURE);
33
+        $split_no_empty = (bool) ($flags & PREG_SPLIT_NO_EMPTY);
34
+        $offset_capture = (bool) ($flags & PREG_SPLIT_OFFSET_CAPTURE);
35
+        $delim_capture = (bool) ($flags & PREG_SPLIT_DELIM_CAPTURE);
36 36
 
37 37
         $lengths = self::getSplitLengths($pattern, $string);
38 38
 
Please login to merge, or discard this patch.
Indentation   +142 added lines, -142 removed lines patch added patch discarded remove patch
@@ -7,146 +7,146 @@
 block discarded – undo
7 7
  */
8 8
 class Multibyte
9 9
 {
10
-    //https://stackoverflow.com/questions/20025030/convert-all-types-of-smart-quotes-with-php
11
-    private static $unicodeCharacterMap = [
12
-        // Windows codepage 1252
13
-        "\xC2\x82" => "'", // U+0082⇒U+201A single low-9 quotation mark
14
-        "\xC2\x84" => '"', // U+0084⇒U+201E double low-9 quotation mark
15
-        "\xC2\x8B" => "'", // U+008B⇒U+2039 single left-pointing angle quotation mark
16
-        "\xC2\x91" => "'", // U+0091⇒U+2018 left single quotation mark
17
-        "\xC2\x92" => "'", // U+0092⇒U+2019 right single quotation mark
18
-        "\xC2\x93" => '"', // U+0093⇒U+201C left double quotation mark
19
-        "\xC2\x94" => '"', // U+0094⇒U+201D right double quotation mark
20
-        "\xC2\x9B" => "'", // U+009B⇒U+203A single right-pointing angle quotation mark
21
-        // Regular Unicode     // U+0022 quotation mark (")
22
-        // U+0027 apostrophe     (')
23
-        "\xC2\xAB" => '"', // U+00AB left-pointing double angle quotation mark
24
-        "\xC2\xBB" => '"', // U+00BB right-pointing double angle quotation mark
25
-        "\xE2\x80\x98" => "'", // U+2018 left single quotation mark
26
-        "\xE2\x80\x99" => "'", // U+2019 right single quotation mark
27
-        "\xE2\x80\x9A" => "'", // U+201A single low-9 quotation mark
28
-        "\xE2\x80\x9B" => "'", // U+201B single high-reversed-9 quotation mark
29
-        "\xE2\x80\x9C" => '"', // U+201C left double quotation mark
30
-        "\xE2\x80\x9D" => '"', // U+201D right double quotation mark
31
-        "\xE2\x80\x9E" => '"', // U+201E double low-9 quotation mark
32
-        "\xE2\x80\x9F" => '"', // U+201F double high-reversed-9 quotation mark
33
-        "\xE2\x80\xB9" => "'", // U+2039 single left-pointing angle quotation mark
34
-        "\xE2\x80\xBA" => "'", // U+203A single right-pointing angle quotation mark
35
-    ];
36
-
37
-    /**
38
-     * Replace
39
-     *
40
-     * @staticvar array $chr_map
41
-     * @param string $string
42
-     * @return string
43
-     */
44
-    public static function cleanUnicode($string)
45
-    {
46
-        $character = array_keys(self::$unicodeCharacterMap); // but: for efficiency you should
47
-        $replace = array_values(self::$unicodeCharacterMap); // pre-calculate these two arrays
48
-        return str_replace($character, $replace, html_entity_decode($string, ENT_QUOTES, "UTF-8"));
49
-    }
50
-
51
-    /**
52
-     * Multibyte.php safe version of standard trim() function.
53
-     *
54
-     * @param string $string
55
-     * @return string
56
-     */
57
-    public static function trim($string)
58
-    {
59
-        return mb_ereg_replace('^\s*([\s\S]*?)\s*$', '\1', $string);
60
-    }
61
-
62
-    /**
63
-     * A cross between mb_split and preg_split, adding the preg_split flags
64
-     * to mb_split.
65
-     *
66
-     * @param string $pattern
67
-     * @param string $string
68
-     * @param int $limit
69
-     * @param int $flags
70
-     * @return array
71
-     */
72
-    public static function split($pattern, $string, $limit = -1, $flags = 0)
73
-    {
74
-        $split_no_empty = (bool)($flags & PREG_SPLIT_NO_EMPTY);
75
-        $offset_capture = (bool)($flags & PREG_SPLIT_OFFSET_CAPTURE);
76
-        $delim_capture = (bool)($flags & PREG_SPLIT_DELIM_CAPTURE);
77
-
78
-        $lengths = self::getSplitLengths($pattern, $string);
79
-
80
-        // Substrings
81
-        $parts = [];
82
-        $position = 0;
83
-        $count = 1;
84
-        foreach ($lengths as $length) {
85
-            $split_empty = !$split_no_empty || $length[0];
86
-            $is_delimiter = $length[1];
87
-            $is_captured = $delim_capture && $length[2];
88
-
89
-            if ($limit > 0
90
-                && !$is_delimiter
91
-                && $split_empty
92
-                && ++$count > $limit) {
93
-
94
-                $cut = mb_strcut($string, $position);
95
-
96
-                $parts[] = $offset_capture
97
-                    ? [$cut, $position]
98
-                    : $cut;
99
-
100
-                break;
101
-            } elseif ((!$is_delimiter
102
-                    || $is_captured)
103
-                && $split_empty) {
104
-
105
-                $cut = mb_strcut($string, $position, $length[0]);
106
-
107
-                $parts[] = $offset_capture
108
-                    ? [$cut, $position]
109
-                    : $cut;
110
-            }
111
-
112
-            $position += $length[0];
113
-        }
114
-
115
-        return $parts;
116
-    }
117
-
118
-    /**
119
-     * Splits the string by pattern and for each element (part or split) returns:
120
-     *  [ 0 => length, 1 => is_delimiter?, 2 =>
121
-     *
122
-     * @param $pattern
123
-     * @param $string
124
-     * @return array
125
-     */
126
-    private static function getSplitLengths($pattern, $string)
127
-    {
128
-        $strlen = strlen($string); // bytes!
129
-        $lengths = [];
130
-
131
-        mb_ereg_search_init($string);
132
-
133
-        $position = 0;
134
-        while ($position < $strlen
135
-            && ($array = mb_ereg_search_pos($pattern, '')) !== false) {
136
-            // capture split
137
-            $lengths[] = [$array[0] - $position, false, null];
138
-
139
-            // move position
140
-            $position = $array[0] + $array[1];
141
-
142
-            // capture delimiter
143
-            $regs = mb_ereg_search_getregs();
144
-            $lengths[] = [$array[1], true, isset($regs[1]) && $regs[1]];
145
-        }
146
-
147
-        // Add last bit, if not ending with split
148
-        $lengths[] = [$strlen - $position, false, null];
149
-
150
-        return $lengths;
151
-    }
10
+	//https://stackoverflow.com/questions/20025030/convert-all-types-of-smart-quotes-with-php
11
+	private static $unicodeCharacterMap = [
12
+		// Windows codepage 1252
13
+		"\xC2\x82" => "'", // U+0082⇒U+201A single low-9 quotation mark
14
+		"\xC2\x84" => '"', // U+0084⇒U+201E double low-9 quotation mark
15
+		"\xC2\x8B" => "'", // U+008B⇒U+2039 single left-pointing angle quotation mark
16
+		"\xC2\x91" => "'", // U+0091⇒U+2018 left single quotation mark
17
+		"\xC2\x92" => "'", // U+0092⇒U+2019 right single quotation mark
18
+		"\xC2\x93" => '"', // U+0093⇒U+201C left double quotation mark
19
+		"\xC2\x94" => '"', // U+0094⇒U+201D right double quotation mark
20
+		"\xC2\x9B" => "'", // U+009B⇒U+203A single right-pointing angle quotation mark
21
+		// Regular Unicode     // U+0022 quotation mark (")
22
+		// U+0027 apostrophe     (')
23
+		"\xC2\xAB" => '"', // U+00AB left-pointing double angle quotation mark
24
+		"\xC2\xBB" => '"', // U+00BB right-pointing double angle quotation mark
25
+		"\xE2\x80\x98" => "'", // U+2018 left single quotation mark
26
+		"\xE2\x80\x99" => "'", // U+2019 right single quotation mark
27
+		"\xE2\x80\x9A" => "'", // U+201A single low-9 quotation mark
28
+		"\xE2\x80\x9B" => "'", // U+201B single high-reversed-9 quotation mark
29
+		"\xE2\x80\x9C" => '"', // U+201C left double quotation mark
30
+		"\xE2\x80\x9D" => '"', // U+201D right double quotation mark
31
+		"\xE2\x80\x9E" => '"', // U+201E double low-9 quotation mark
32
+		"\xE2\x80\x9F" => '"', // U+201F double high-reversed-9 quotation mark
33
+		"\xE2\x80\xB9" => "'", // U+2039 single left-pointing angle quotation mark
34
+		"\xE2\x80\xBA" => "'", // U+203A single right-pointing angle quotation mark
35
+	];
36
+
37
+	/**
38
+	 * Decode HTML entities, then replace typographic ("smart") quotes with plain ASCII quotes.
39
+	 *
40
+	 * @staticvar array $chr_map
41
+	 * @param string $string
42
+	 * @return string
43
+	 */
44
+	public static function cleanUnicode($string)
45
+	{
46
+		$character = array_keys(self::$unicodeCharacterMap); // but: for efficiency you should
47
+		$replace = array_values(self::$unicodeCharacterMap); // pre-calculate these two arrays
48
+		return str_replace($character, $replace, html_entity_decode($string, ENT_QUOTES, "UTF-8"));
49
+	}
50
+
51
+	/**
52
+	 * Multibyte.php safe version of standard trim() function.
53
+	 *
54
+	 * @param string $string
55
+	 * @return string
56
+	 */
57
+	public static function trim($string)
58
+	{
59
+		return mb_ereg_replace('^\s*([\s\S]*?)\s*$', '\1', $string);
60
+	}
61
+
62
+	/**
63
+	 * A cross between mb_split and preg_split, adding the preg_split flags
64
+	 * to mb_split.
65
+	 *
66
+	 * @param string $pattern
67
+	 * @param string $string
68
+	 * @param int $limit
69
+	 * @param int $flags
70
+	 * @return array
71
+	 */
72
+	public static function split($pattern, $string, $limit = -1, $flags = 0)
73
+	{
74
+		$split_no_empty = (bool)($flags & PREG_SPLIT_NO_EMPTY);
75
+		$offset_capture = (bool)($flags & PREG_SPLIT_OFFSET_CAPTURE);
76
+		$delim_capture = (bool)($flags & PREG_SPLIT_DELIM_CAPTURE);
77
+
78
+		$lengths = self::getSplitLengths($pattern, $string);
79
+
80
+		// Substrings
81
+		$parts = [];
82
+		$position = 0;
83
+		$count = 1;
84
+		foreach ($lengths as $length) {
85
+			$split_empty = !$split_no_empty || $length[0];
86
+			$is_delimiter = $length[1];
87
+			$is_captured = $delim_capture && $length[2];
88
+
89
+			if ($limit > 0
90
+				&& !$is_delimiter
91
+				&& $split_empty
92
+				&& ++$count > $limit) {
93
+
94
+				$cut = mb_strcut($string, $position);
95
+
96
+				$parts[] = $offset_capture
97
+					? [$cut, $position]
98
+					: $cut;
99
+
100
+				break;
101
+			} elseif ((!$is_delimiter
102
+					|| $is_captured)
103
+				&& $split_empty) {
104
+
105
+				$cut = mb_strcut($string, $position, $length[0]);
106
+
107
+				$parts[] = $offset_capture
108
+					? [$cut, $position]
109
+					: $cut;
110
+			}
111
+
112
+			$position += $length[0];
113
+		}
114
+
115
+		return $parts;
116
+	}
117
+
118
+	/**
119
+	 * Splits the string by pattern and for each element (part or split) returns:
120
+	 *  [ 0 => length in bytes, 1 => is_delimiter?, 2 => first capture group matched? (null for non-delimiters) ]
121
+	 *
122
+	 * @param $pattern
123
+	 * @param $string
124
+	 * @return array
125
+	 */
126
+	private static function getSplitLengths($pattern, $string)
127
+	{
128
+		$strlen = strlen($string); // bytes!
129
+		$lengths = [];
130
+
131
+		mb_ereg_search_init($string);
132
+
133
+		$position = 0;
134
+		while ($position < $strlen
135
+			&& ($array = mb_ereg_search_pos($pattern, '')) !== false) {
136
+			// capture split
137
+			$lengths[] = [$array[0] - $position, false, null];
138
+
139
+			// move position
140
+			$position = $array[0] + $array[1];
141
+
142
+			// capture delimiter
143
+			$regs = mb_ereg_search_getregs();
144
+			$lengths[] = [$array[1], true, isset($regs[1]) && $regs[1]];
145
+		}
146
+
147
+		// Add last bit, if not ending with split
148
+		$lengths[] = [$strlen - $position, false, null];
149
+
150
+		return $lengths;
151
+	}
152 152
 }
153 153
\ No newline at end of file
Please login to merge, or discard this patch.
src/Sentence.php 1 patch
Indentation   +338 added lines, -338 removed lines patch added patch discarded remove patch
@@ -17,343 +17,343 @@
 block discarded – undo
17 17
 class Sentence
18 18
 {
19 19
 
20
-    /**
21
-     * Specify this flag with the split method to trim whitespace.
22
-     */
23
-    const SPLIT_TRIM = 0x1;
24
-
25
-    /**
26
-     * List of characters used to terminate sentences.
27
-     *
28
-     * @var string[]
29
-     */
30
-    private $terminals = ['.', '!', '?'];
31
-
32
-    /**
33
-     * List of characters used for abbreviations.
34
-     *
35
-     * @var string[]
36
-     */
37
-    private $abbreviators = ['.'];
38
-
39
-    /**
40
-     * Breaks a piece of text into lines by linebreak.
41
-     * Eats up any linebreak characters as if one.
42
-     *
43
-     * Multibyte.php safe
44
-     *
45
-     * @param string $text
46
-     * @return string[]
47
-     */
48
-    private static function linebreakSplit($text)
49
-    {
50
-        $lines = [];
51
-        $line = '';
52
-
53
-        foreach (Multibyte::split('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
54
-            $line .= $part;
55
-            if (Multibyte::trim($part) === '') {
56
-                $lines[] = $line;
57
-                $line = '';
58
-            }
59
-        }
60
-        $lines[] = $line;
61
-
62
-        return $lines;
63
-    }
64
-
65
-    /**
66
-     * Splits an array of lines by (consecutive sequences of)
67
-     * terminals, keeping terminals.
68
-     *
69
-     * Multibyte.php safe (atleast for UTF-8)
70
-     *
71
-     * For example:
72
-     *    "There ... is. More!"
73
-     *        ... becomes ...
74
-     *    [ "There ", "...", " is", ".", " More", "!" ]
75
-     *
76
-     * @param string $line
77
-     * @return string[]
78
-     */
79
-    private function punctuationSplit($line)
80
-    {
81
-        $parts = [];
82
-
83
-        $chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY); // This is UTF8 multibyte safe!
84
-        $is_terminal = in_array($chars[0], $this->terminals);
85
-
86
-        $part = '';
87
-        foreach ($chars as $index => $char) {
88
-            if (in_array($char, $this->terminals) !== $is_terminal) {
89
-                $parts[] = $part;
90
-                $part = '';
91
-                $is_terminal = !$is_terminal;
92
-            }
93
-            $part .= $char;
94
-        }
95
-
96
-        if (!empty($part)) {
97
-            $parts[] = $part;
98
-        }
99
-
100
-        return $parts;
101
-    }
102
-
103
-    /**
104
-     * Appends each terminal item after it's preceding
105
-     * non-terminals.
106
-     *
107
-     * Multibyte.php safe (atleast for UTF-8)
108
-     *
109
-     * For example:
110
-     *    [ "There ", "...", " is", ".", " More", "!" ]
111
-     *        ... becomes ...
112
-     *    [ "There ... is.", "More!" ]
113
-     *
114
-     * @param string[] $punctuations
115
-     * @return string[]
116
-     */
117
-    private function punctuationMerge($punctuations)
118
-    {
119
-        $definite_terminals = array_diff($this->terminals, $this->abbreviators);
120
-
121
-        $merges = [];
122
-        $merge = '';
123
-
124
-        foreach ($punctuations as $punctuation) {
125
-            if ($punctuation !== '') {
126
-                $merge .= $punctuation;
127
-                if (mb_strlen($punctuation) === 1
128
-                    && in_array($punctuation, $this->terminals)) {
129
-                    $merges[] = $merge;
130
-                    $merge = '';
131
-                } else {
132
-                    foreach ($definite_terminals as $terminal) {
133
-                        if (mb_strpos($punctuation, $terminal) !== false) {
134
-                            $merges[] = $merge;
135
-                            $merge = '';
136
-                            break;
137
-                        }
138
-                    }
139
-                }
140
-            }
141
-        }
142
-        if (!empty($merge)) {
143
-            $merges[] = $merge;
144
-        }
145
-
146
-        return $merges;
147
-    }
148
-
149
-    /**
150
-     * Looks for capitalized abbreviations & includes them with the following fragment.
151
-     *
152
-     * For example:
153
-     *    [ "Last week, former director of the F.B.I. James B. Comey was fired. Mr. Comey was not available for comment." ]
154
-     *        ... becomes ...
155
-     *    [ "Last week, former director of the F.B.I. James B. Comey was fired." ]
156
-     *  [ "Mr. Comey was not available for comment." ]
157
-     *
158
-     * @param string[] $fragments
159
-     * @return string[]
160
-     */
161
-    private function abbreviationMerge($fragments)
162
-    {
163
-        $return_fragment = [];
164
-
165
-        $previous_string = '';
166
-        $previous_is_abbreviation = false;
167
-        $i = 0;
168
-
169
-        foreach ($fragments as $fragment) {
170
-            $current_string = $fragment;
171
-            $words = mb_split('\s+', Multibyte::trim($fragment));
172
-
173
-            $word_count = count($words);
174
-
175
-            // if last word of fragment starts with a Capital, ends in "." & has less than 3 characters, trigger "is abbreviation"
176
-            $last_word = trim($words[$word_count - 1]);
177
-            $last_is_capital = preg_match('#^\p{Lu}#u', $last_word);
178
-            $last_is_abbreviation = substr(trim($fragment), -1) === '.';
179
-            $is_abbreviation = $last_is_capital > 0
180
-                && $last_is_abbreviation > 0
181
-                && mb_strlen($last_word) <= 3;
182
-
183
-            // merge previous fragment with this
184
-            if ($previous_is_abbreviation === true) {
185
-                $current_string = $previous_string . $current_string;
186
-            }
187
-            $return_fragment[$i] = $current_string;
188
-
189
-            $previous_is_abbreviation = $is_abbreviation;
190
-            $previous_string = $current_string;
191
-            // only increment if this isn't an abbreviation
192
-            if ($is_abbreviation === false) {
193
-                $i++;
194
-            }
195
-        }
196
-        return $return_fragment;
197
-    }
198
-
199
-    /**
200
-     * Merges any part starting with a closing parenthesis ')' to the previous
201
-     * part.
202
-     *
203
-     * @param string[] $parts
204
-     * @return string[]
205
-     */
206
-    private function parenthesesMerge($parts)
207
-    {
208
-        $subsentences = [];
209
-
210
-        foreach ($parts as $part) {
211
-            if ($part[0] === ')') {
212
-                $subsentences[count($subsentences) - 1] .= $part;
213
-            } else {
214
-                $subsentences[] = $part;
215
-            }
216
-        }
217
-
218
-        return $subsentences;
219
-    }
220
-
221
-    /**
222
-     * Looks for closing quotes to include them with the previous statement.
223
-     * "That was very interesting," he said.
224
-     * "That was very interesting."
225
-     *
226
-     * @param string[] $statements
227
-     * @return string[]
228
-     */
229
-    private function closeQuotesMerge($statements)
230
-    {
231
-        $i = 0;
232
-        $previous_statement = "";
233
-        $return = [];
234
-        foreach ($statements as $statement) {
235
-            // detect end quote - if the entire string is a quotation mark, or it's [quote, space, lowercase]
236
-            if (trim($statement) === '"'
237
-                || trim($statement) === "'"
238
-                || (
239
-                    (substr($statement, 0, 1) === '"'
240
-                        || substr($statement, 0, 1) === "'")
241
-                    && substr($statement, 1, 1) === ' '
242
-                    && ctype_lower(substr($statement, 2, 1)) === true
243
-                )
244
-            ) {
245
-                $statement = $previous_statement . $statement;
246
-            } else {
247
-                $i++;
248
-            }
249
-
250
-            $return[$i] = $statement;
251
-            $previous_statement = $statement;
252
-        }
253
-
254
-        return $return;
255
-    }
256
-
257
-    /**
258
-     * Merges items into larger sentences.
259
-     * Multibyte.php safe
260
-     *
261
-     * @param string[] $shorts
262
-     * @return string[]
263
-     */
264
-    private function sentenceMerge($shorts)
265
-    {
266
-        $non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);
267
-
268
-        $sentences = [];
269
-
270
-        $sentence = '';
271
-        $has_words = false;
272
-        $previous_word_ending = null;
273
-        foreach ($shorts as $short) {
274
-            $word_count = count(mb_split('\s+', Multibyte::trim($short)));
275
-            $after_non_abbreviating_terminal = in_array($previous_word_ending, $non_abbreviating_terminals);
276
-
277
-            if ($after_non_abbreviating_terminal
278
-                || ($has_words && $word_count > 1)) {
279
-                $sentences[] = $sentence;
280
-                $sentence = '';
281
-                $has_words = $word_count > 1;
282
-            } else {
283
-                $has_words = ($has_words
284
-                    || $word_count > 1);
285
-            }
286
-
287
-            $sentence .= $short;
288
-            $previous_word_ending = mb_substr($short, -1);
289
-        }
290
-        if (!empty($sentence)) {
291
-            $sentences[] = $sentence;
292
-        }
293
-
294
-        return $sentences;
295
-    }
296
-
297
-    /**
298
-     * Return the sentences sentences detected in the provided text.
299
-     * Set the Sentence::SPLIT_TRIM flag to trim whitespace.
300
-     * @param string $text
301
-     * @param integer $flags
302
-     * @return string[]
303
-     */
304
-    public function split($text, $flags = 0)
305
-    {
306
-        static $pipeline = [
307
-            'punctuationSplit',
308
-            'parenthesesMerge', // also works after punctuationMerge or abbreviationMerge
309
-            'punctuationMerge',
310
-            'abbreviationMerge',
311
-            'closeQuotesMerge',
312
-            'sentenceMerge',
313
-        ];
314
-
315
-        // clean funny quotes
316
-        $text = Multibyte::cleanUnicode($text);
317
-
318
-        // Split
319
-        $sentences = [];
320
-        foreach (self::linebreakSplit($text) as $input) {
321
-            if (Multibyte::trim($input) !== '') {
322
-                foreach ($pipeline as $method) {
323
-                    $input = $this->$method($input);
324
-                }
325
-                $sentences = array_merge($sentences, $input);
326
-            }
327
-        }
328
-
329
-        // Post process
330
-        if ($flags & self::SPLIT_TRIM) {
331
-            return self::trimSentences($sentences);
332
-        }
333
-
334
-        return $sentences;
335
-    }
336
-
337
-    /**
338
-     * Multibyte.php trim each string in an array.
339
-     * @param string[] $sentences
340
-     * @return string[]
341
-     */
342
-    private static function trimSentences($sentences)
343
-    {
344
-        return array_map(function($sentence) {
345
-            return Multibyte::trim($sentence);
346
-        }, $sentences);
347
-    }
348
-
349
-    /**
350
-     * Return the number of sentences detected in the provided text.
351
-     * @param string $text
352
-     * @return integer
353
-     */
354
-    public function count($text)
355
-    {
356
-        return count($this->split($text));
357
-    }
20
+	/**
21
+	 * Specify this flag with the split method to trim whitespace.
22
+	 */
23
+	const SPLIT_TRIM = 0x1;
24
+
25
+	/**
26
+	 * List of characters used to terminate sentences.
27
+	 *
28
+	 * @var string[]
29
+	 */
30
+	private $terminals = ['.', '!', '?'];
31
+
32
+	/**
33
+	 * List of characters used for abbreviations.
34
+	 *
35
+	 * @var string[]
36
+	 */
37
+	private $abbreviators = ['.'];
38
+
39
+	/**
40
+	 * Breaks a piece of text into lines by linebreak.
41
+	 * Eats up any linebreak characters as if one.
42
+	 *
43
+	 * Multibyte.php safe
44
+	 *
45
+	 * @param string $text
46
+	 * @return string[]
47
+	 */
48
+	private static function linebreakSplit($text)
49
+	{
50
+		$lines = [];
51
+		$line = '';
52
+
53
+		foreach (Multibyte::split('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
54
+			$line .= $part;
55
+			if (Multibyte::trim($part) === '') {
56
+				$lines[] = $line;
57
+				$line = '';
58
+			}
59
+		}
60
+		$lines[] = $line;
61
+
62
+		return $lines;
63
+	}
64
+
65
+	/**
66
+	 * Splits an array of lines by (consecutive sequences of)
67
+	 * terminals, keeping terminals.
68
+	 *
69
+	 * Multibyte.php safe (at least for UTF-8)
70
+	 *
71
+	 * For example:
72
+	 *    "There ... is. More!"
73
+	 *        ... becomes ...
74
+	 *    [ "There ", "...", " is", ".", " More", "!" ]
75
+	 *
76
+	 * @param string $line
77
+	 * @return string[]
78
+	 */
79
+	private function punctuationSplit($line)
80
+	{
81
+		$parts = [];
82
+
83
+		$chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY); // This is UTF8 multibyte safe!
84
+		$is_terminal = in_array($chars[0], $this->terminals);
85
+
86
+		$part = '';
87
+		foreach ($chars as $index => $char) {
88
+			if (in_array($char, $this->terminals) !== $is_terminal) {
89
+				$parts[] = $part;
90
+				$part = '';
91
+				$is_terminal = !$is_terminal;
92
+			}
93
+			$part .= $char;
94
+		}
95
+
96
+		if (!empty($part)) {
97
+			$parts[] = $part;
98
+		}
99
+
100
+		return $parts;
101
+	}
102
+
103
+	/**
104
+	 * Appends each terminal item after its preceding
105
+	 * non-terminals.
106
+	 *
107
+	 * Multibyte.php safe (at least for UTF-8)
108
+	 *
109
+	 * For example:
110
+	 *    [ "There ", "...", " is", ".", " More", "!" ]
111
+	 *        ... becomes ...
112
+	 *    [ "There ... is.", "More!" ]
113
+	 *
114
+	 * @param string[] $punctuations
115
+	 * @return string[]
116
+	 */
117
+	private function punctuationMerge($punctuations)
118
+	{
119
+		$definite_terminals = array_diff($this->terminals, $this->abbreviators);
120
+
121
+		$merges = [];
122
+		$merge = '';
123
+
124
+		foreach ($punctuations as $punctuation) {
125
+			if ($punctuation !== '') {
126
+				$merge .= $punctuation;
127
+				if (mb_strlen($punctuation) === 1
128
+					&& in_array($punctuation, $this->terminals)) {
129
+					$merges[] = $merge;
130
+					$merge = '';
131
+				} else {
132
+					foreach ($definite_terminals as $terminal) {
133
+						if (mb_strpos($punctuation, $terminal) !== false) {
134
+							$merges[] = $merge;
135
+							$merge = '';
136
+							break;
137
+						}
138
+					}
139
+				}
140
+			}
141
+		}
142
+		if (!empty($merge)) {
143
+			$merges[] = $merge;
144
+		}
145
+
146
+		return $merges;
147
+	}
148
+
149
+	/**
150
+	 * Looks for capitalized abbreviations & includes them with the following fragment.
151
+	 *
152
+	 * For example:
153
+	 *    [ "Last week, former director of the F.B.I. James B. Comey was fired. Mr. Comey was not available for comment." ]
154
+	 *        ... becomes ...
155
+	 *    [ "Last week, former director of the F.B.I. James B. Comey was fired." ]
156
+	 *  [ "Mr. Comey was not available for comment." ]
157
+	 *
158
+	 * @param string[] $fragments
159
+	 * @return string[]
160
+	 */
161
+	private function abbreviationMerge($fragments)
162
+	{
163
+		$return_fragment = [];
164
+
165
+		$previous_string = '';
166
+		$previous_is_abbreviation = false;
167
+		$i = 0;
168
+
169
+		foreach ($fragments as $fragment) {
170
+			$current_string = $fragment;
171
+			$words = mb_split('\s+', Multibyte::trim($fragment));
172
+
173
+			$word_count = count($words);
174
+
175
+			// if last word of fragment starts with a capital, the fragment ends in "." & the word has at most 3 characters, trigger "is abbreviation"
176
+			$last_word = trim($words[$word_count - 1]);
177
+			$last_is_capital = preg_match('#^\p{Lu}#u', $last_word);
178
+			$last_is_abbreviation = substr(trim($fragment), -1) === '.';
179
+			$is_abbreviation = $last_is_capital > 0
180
+				&& $last_is_abbreviation > 0
181
+				&& mb_strlen($last_word) <= 3;
182
+
183
+			// merge previous fragment with this
184
+			if ($previous_is_abbreviation === true) {
185
+				$current_string = $previous_string . $current_string;
186
+			}
187
+			$return_fragment[$i] = $current_string;
188
+
189
+			$previous_is_abbreviation = $is_abbreviation;
190
+			$previous_string = $current_string;
191
+			// only increment if this isn't an abbreviation
192
+			if ($is_abbreviation === false) {
193
+				$i++;
194
+			}
195
+		}
196
+		return $return_fragment;
197
+	}
198
+
199
+	/**
200
+	 * Merges any part starting with a closing parenthesis ')' to the previous
201
+	 * part.
202
+	 *
203
+	 * @param string[] $parts
204
+	 * @return string[]
205
+	 */
206
+	private function parenthesesMerge($parts)
207
+	{
208
+		$subsentences = [];
209
+
210
+		foreach ($parts as $part) {
211
+			if ($part[0] === ')') {
212
+				$subsentences[count($subsentences) - 1] .= $part;
213
+			} else {
214
+				$subsentences[] = $part;
215
+			}
216
+		}
217
+
218
+		return $subsentences;
219
+	}
220
+
221
+	/**
222
+	 * Looks for closing quotes to include them with the previous statement.
223
+	 * "That was very interesting," he said.
224
+	 * "That was very interesting."
225
+	 *
226
+	 * @param string[] $statements
227
+	 * @return string[]
228
+	 */
229
+	private function closeQuotesMerge($statements)
230
+	{
231
+		$i = 0;
232
+		$previous_statement = "";
233
+		$return = [];
234
+		foreach ($statements as $statement) {
235
+			// detect end quote - if the entire string is a quotation mark, or it's [quote, space, lowercase]
236
+			if (trim($statement) === '"'
237
+				|| trim($statement) === "'"
238
+				|| (
239
+					(substr($statement, 0, 1) === '"'
240
+						|| substr($statement, 0, 1) === "'")
241
+					&& substr($statement, 1, 1) === ' '
242
+					&& ctype_lower(substr($statement, 2, 1)) === true
243
+				)
244
+			) {
245
+				$statement = $previous_statement . $statement;
246
+			} else {
247
+				$i++;
248
+			}
249
+
250
+			$return[$i] = $statement;
251
+			$previous_statement = $statement;
252
+		}
253
+
254
+		return $return;
255
+	}
256
+
257
+	/**
258
+	 * Merges items into larger sentences.
259
+	 * Multibyte.php safe
260
+	 *
261
+	 * @param string[] $shorts
262
+	 * @return string[]
263
+	 */
264
+	private function sentenceMerge($shorts)
265
+	{
266
+		$non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);
267
+
268
+		$sentences = [];
269
+
270
+		$sentence = '';
271
+		$has_words = false;
272
+		$previous_word_ending = null;
273
+		foreach ($shorts as $short) {
274
+			$word_count = count(mb_split('\s+', Multibyte::trim($short)));
275
+			$after_non_abbreviating_terminal = in_array($previous_word_ending, $non_abbreviating_terminals);
276
+
277
+			if ($after_non_abbreviating_terminal
278
+				|| ($has_words && $word_count > 1)) {
279
+				$sentences[] = $sentence;
280
+				$sentence = '';
281
+				$has_words = $word_count > 1;
282
+			} else {
283
+				$has_words = ($has_words
284
+					|| $word_count > 1);
285
+			}
286
+
287
+			$sentence .= $short;
288
+			$previous_word_ending = mb_substr($short, -1);
289
+		}
290
+		if (!empty($sentence)) {
291
+			$sentences[] = $sentence;
292
+		}
293
+
294
+		return $sentences;
295
+	}
296
+
297
+	/**
298
+	 * Return the sentences detected in the provided text.
299
+	 * Set the Sentence::SPLIT_TRIM flag to trim whitespace.
300
+	 * @param string $text
301
+	 * @param integer $flags
302
+	 * @return string[]
303
+	 */
304
+	public function split($text, $flags = 0)
305
+	{
306
+		static $pipeline = [
307
+			'punctuationSplit',
308
+			'parenthesesMerge', // also works after punctuationMerge or abbreviationMerge
309
+			'punctuationMerge',
310
+			'abbreviationMerge',
311
+			'closeQuotesMerge',
312
+			'sentenceMerge',
313
+		];
314
+
315
+		// clean funny quotes
316
+		$text = Multibyte::cleanUnicode($text);
317
+
318
+		// Split
319
+		$sentences = [];
320
+		foreach (self::linebreakSplit($text) as $input) {
321
+			if (Multibyte::trim($input) !== '') {
322
+				foreach ($pipeline as $method) {
323
+					$input = $this->$method($input);
324
+				}
325
+				$sentences = array_merge($sentences, $input);
326
+			}
327
+		}
328
+
329
+		// Post process
330
+		if ($flags & self::SPLIT_TRIM) {
331
+			return self::trimSentences($sentences);
332
+		}
333
+
334
+		return $sentences;
335
+	}
336
+
337
+	/**
338
+	 * Multibyte.php trim each string in an array.
339
+	 * @param string[] $sentences
340
+	 * @return string[]
341
+	 */
342
+	private static function trimSentences($sentences)
343
+	{
344
+		return array_map(function($sentence) {
345
+			return Multibyte::trim($sentence);
346
+		}, $sentences);
347
+	}
348
+
349
+	/**
350
+	 * Return the number of sentences detected in the provided text.
351
+	 * @param string $text
352
+	 * @return integer
353
+	 */
354
+	public function count($text)
355
+	{
356
+		return count($this->split($text));
357
+	}
358 358
 
359 359
 }
Please login to merge, or discard this patch.