Passed
Push — master ( 538637...3febfa )
by Martijn
02:31
created
src/Multibyte.php 1 patch
Indentation   +173 added lines, -173 removed lines patch added patch discarded remove patch
@@ -7,177 +7,177 @@
 block discarded – undo
7 7
  */
8 8
 class Multibyte
9 9
 {
10
-    //https://stackoverflow.com/questions/20025030/convert-all-types-of-smart-quotes-with-php
11
-    private static $unicodeCharacterMap = [
12
-        // Windows codepage 1252
13
-        "\xC2\x82" => "'", // U+0082⇒U+201A single low-9 quotation mark
14
-        "\xC2\x84" => '"', // U+0084⇒U+201E double low-9 quotation mark
15
-        "\xC2\x8B" => "'", // U+008B⇒U+2039 single left-pointing angle quotation mark
16
-        "\xC2\x91" => "'", // U+0091⇒U+2018 left single quotation mark
17
-        "\xC2\x92" => "'", // U+0092⇒U+2019 right single quotation mark
18
-        "\xC2\x93" => '"', // U+0093⇒U+201C left double quotation mark
19
-        "\xC2\x94" => '"', // U+0094⇒U+201D right double quotation mark
20
-        "\xC2\x9B" => "'", // U+009B⇒U+203A single right-pointing angle quotation mark
21
-        // Regular Unicode     // U+0022 quotation mark (")
22
-        // U+0027 apostrophe     (')
23
-        "\xC2\xAB" => '"', // U+00AB left-pointing double angle quotation mark
24
-        "\xC2\xBB" => '"', // U+00BB right-pointing double angle quotation mark
25
-        "\xE2\x80\x98" => "'", // U+2018 left single quotation mark
26
-        "\xE2\x80\x99" => "'", // U+2019 right single quotation mark
27
-        "\xE2\x80\x9A" => "'", // U+201A single low-9 quotation mark
28
-        "\xE2\x80\x9B" => "'", // U+201B single high-reversed-9 quotation mark
29
-        "\xE2\x80\x9C" => '"', // U+201C left double quotation mark
30
-        "\xE2\x80\x9D" => '"', // U+201D right double quotation mark
31
-        "\xE2\x80\x9E" => '"', // U+201E double low-9 quotation mark
32
-        "\xE2\x80\x9F" => '"', // U+201F double high-reversed-9 quotation mark
33
-        "\xE2\x80\xB9" => "'", // U+2039 single left-pointing angle quotation mark
34
-        "\xE2\x80\xBA" => "'", // U+203A single right-pointing angle quotation mark
35
-    ];
36
-
37
-    /**
38
-     * Replace
39
-     *
40
-     * @staticvar array $chr_map
41
-     * @param string $string
42
-     * @return string
43
-     */
44
-    public static function cleanUnicode($string)
45
-    {
46
-        $character = array_keys(self::$unicodeCharacterMap); // but: for efficiency you should
47
-        $replace = array_values(self::$unicodeCharacterMap); // pre-calculate these two arrays
48
-        return str_replace($character, $replace, html_entity_decode($string, ENT_QUOTES, "UTF-8"));
49
-    }
50
-
51
-    /**
52
-     * Multibyte.php safe version of standard trim() function.
53
-     *
54
-     * @param string $string
55
-     * @return string
56
-     */
57
-    public static function trim($string)
58
-    {
59
-        return mb_ereg_replace('(^\s*)|(\s*$)', '', $string);
60
-    }
61
-
62
-    /**
63
-     * A cross between mb_split and preg_split, adding the preg_split flags
64
-     * to mb_split.
65
-     *
66
-     * @param string $pattern
67
-     * @param string $string
68
-     * @param int $limit
69
-     * @param int $flags
70
-     * @return array
71
-     */
72
-    public static function split($pattern, $string, $limit = -1, $flags = 0)
73
-    {
74
-        $offset_capture = (bool)($flags & PREG_SPLIT_OFFSET_CAPTURE);
75
-
76
-        $lengths = self::getSplitLengths($pattern, $string);
77
-
78
-        // Substrings
79
-        $parts = [];
80
-        $position = 0;
81
-        $count = 1;
82
-        foreach ($lengths as $length) {
83
-            if (self::isLastPart($length, $flags, $limit, $count)) {
84
-                $parts[] = self::makePart($string, $position, null, $offset_capture);
85
-                return $parts;
86
-            }
87
-
88
-            if (self::isPart($length, $flags)) {
89
-                $parts[] = self::makePart($string, $position, $length[0], $offset_capture);
90
-            }
91
-
92
-            $position += $length[0];
93
-        }
94
-
95
-        return $parts;
96
-    }
97
-
98
-    /**
99
-     * @param $length
100
-     * @param $flags
101
-     * @param $limit
102
-     * @param $count
103
-     * @return bool
104
-     */
105
-    private static function isLastPart($length, $flags, $limit, &$count)
106
-    {
107
-        $split_empty = !($flags & PREG_SPLIT_NO_EMPTY) || $length[0];
108
-        $is_delimiter = $length[1];
109
-
110
-        return $limit > 0
111
-            && !$is_delimiter
112
-            && $split_empty
113
-            && ++$count > $limit;
114
-    }
115
-
116
-    /**
117
-     * @param $length
118
-     * @param $flags
119
-     * @return bool
120
-     */
121
-    private static function isPart($length, $flags)
122
-    {
123
-        $split_empty = !($flags & PREG_SPLIT_NO_EMPTY) || $length[0];
124
-        $is_delimiter = $length[1];
125
-        $is_captured = ($flags & PREG_SPLIT_DELIM_CAPTURE) && $length[2];
126
-
127
-        return (!$is_delimiter
128
-                || $is_captured)
129
-            && $split_empty;
130
-    }
131
-
132
-    /**
133
-     * Make part
134
-     * @param string $string
135
-     * @param integer $position
136
-     * @param integer|null $length
137
-     * @param bool $offset_capture
138
-     * @return array|string
139
-     */
140
-    private static function makePart($string, $position, $length = null, $offset_capture = false)
141
-    {
142
-        $cut = mb_strcut($string, $position, $length);
143
-
144
-        return $offset_capture
145
-            ? [$cut, $position]
146
-            : $cut;
147
-    }
148
-
149
-    /**
150
-     * Splits the string by pattern and for each element (part or split) returns:
151
-     *  [ 0 => length, 1 => is_delimiter?, 2 =>
152
-     *
153
-     * @param $pattern
154
-     * @param $string
155
-     * @return array
156
-     */
157
-    private static function getSplitLengths($pattern, $string)
158
-    {
159
-        $strlen = strlen($string); // bytes!
160
-        $lengths = [];
161
-
162
-        mb_ereg_search_init($string);
163
-
164
-        $position = 0;
165
-        while ($position < $strlen
166
-            && ($array = mb_ereg_search_pos($pattern, '')) !== false) {
167
-            // capture split
168
-            $lengths[] = [$array[0] - $position, false, null];
169
-
170
-            // move position
171
-            $position = $array[0] + $array[1];
172
-
173
-            // capture delimiter
174
-            $regs = mb_ereg_search_getregs();
175
-            $lengths[] = [$array[1], true, isset($regs[1]) && $regs[1]];
176
-        }
177
-
178
-        // Add last bit, if not ending with split
179
-        $lengths[] = [$strlen - $position, false, null];
180
-
181
-        return $lengths;
182
-    }
10
+	//https://stackoverflow.com/questions/20025030/convert-all-types-of-smart-quotes-with-php
11
+	private static $unicodeCharacterMap = [
12
+		// Windows codepage 1252
13
+		"\xC2\x82" => "'", // U+0082⇒U+201A single low-9 quotation mark
14
+		"\xC2\x84" => '"', // U+0084⇒U+201E double low-9 quotation mark
15
+		"\xC2\x8B" => "'", // U+008B⇒U+2039 single left-pointing angle quotation mark
16
+		"\xC2\x91" => "'", // U+0091⇒U+2018 left single quotation mark
17
+		"\xC2\x92" => "'", // U+0092⇒U+2019 right single quotation mark
18
+		"\xC2\x93" => '"', // U+0093⇒U+201C left double quotation mark
19
+		"\xC2\x94" => '"', // U+0094⇒U+201D right double quotation mark
20
+		"\xC2\x9B" => "'", // U+009B⇒U+203A single right-pointing angle quotation mark
21
+		// Regular Unicode     // U+0022 quotation mark (")
22
+		// U+0027 apostrophe     (')
23
+		"\xC2\xAB" => '"', // U+00AB left-pointing double angle quotation mark
24
+		"\xC2\xBB" => '"', // U+00BB right-pointing double angle quotation mark
25
+		"\xE2\x80\x98" => "'", // U+2018 left single quotation mark
26
+		"\xE2\x80\x99" => "'", // U+2019 right single quotation mark
27
+		"\xE2\x80\x9A" => "'", // U+201A single low-9 quotation mark
28
+		"\xE2\x80\x9B" => "'", // U+201B single high-reversed-9 quotation mark
29
+		"\xE2\x80\x9C" => '"', // U+201C left double quotation mark
30
+		"\xE2\x80\x9D" => '"', // U+201D right double quotation mark
31
+		"\xE2\x80\x9E" => '"', // U+201E double low-9 quotation mark
32
+		"\xE2\x80\x9F" => '"', // U+201F double high-reversed-9 quotation mark
33
+		"\xE2\x80\xB9" => "'", // U+2039 single left-pointing angle quotation mark
34
+		"\xE2\x80\xBA" => "'", // U+203A single right-pointing angle quotation mark
35
+	];
36
+
37
+	/**
38
+	 * Decode HTML entities and replace typographic (smart) quotes with plain ASCII quotes.
39
+	 *
40
+	 * @staticvar array $chr_map
41
+	 * @param string $string
42
+	 * @return string
43
+	 */
44
+	public static function cleanUnicode($string)
45
+	{
46
+		$character = array_keys(self::$unicodeCharacterMap); // but: for efficiency you should
47
+		$replace = array_values(self::$unicodeCharacterMap); // pre-calculate these two arrays
48
+		return str_replace($character, $replace, html_entity_decode($string, ENT_QUOTES, "UTF-8"));
49
+	}
50
+
51
+	/**
52
+	 * Multibyte-safe version of the standard trim() function.
53
+	 *
54
+	 * @param string $string
55
+	 * @return string
56
+	 */
57
+	public static function trim($string)
58
+	{
59
+		return mb_ereg_replace('(^\s*)|(\s*$)', '', $string);
60
+	}
61
+
62
+	/**
63
+	 * A cross between mb_split and preg_split, adding the preg_split flags
64
+	 * to mb_split.
65
+	 *
66
+	 * @param string $pattern
67
+	 * @param string $string
68
+	 * @param int $limit
69
+	 * @param int $flags
70
+	 * @return array
71
+	 */
72
+	public static function split($pattern, $string, $limit = -1, $flags = 0)
73
+	{
74
+		$offset_capture = (bool)($flags & PREG_SPLIT_OFFSET_CAPTURE);
75
+
76
+		$lengths = self::getSplitLengths($pattern, $string);
77
+
78
+		// Substrings
79
+		$parts = [];
80
+		$position = 0;
81
+		$count = 1;
82
+		foreach ($lengths as $length) {
83
+			if (self::isLastPart($length, $flags, $limit, $count)) {
84
+				$parts[] = self::makePart($string, $position, null, $offset_capture);
85
+				return $parts;
86
+			}
87
+
88
+			if (self::isPart($length, $flags)) {
89
+				$parts[] = self::makePart($string, $position, $length[0], $offset_capture);
90
+			}
91
+
92
+			$position += $length[0];
93
+		}
94
+
95
+		return $parts;
96
+	}
97
+
98
+	/**
99
+	 * @param $length
100
+	 * @param $flags
101
+	 * @param $limit
102
+	 * @param $count
103
+	 * @return bool
104
+	 */
105
+	private static function isLastPart($length, $flags, $limit, &$count)
106
+	{
107
+		$split_empty = !($flags & PREG_SPLIT_NO_EMPTY) || $length[0];
108
+		$is_delimiter = $length[1];
109
+
110
+		return $limit > 0
111
+			&& !$is_delimiter
112
+			&& $split_empty
113
+			&& ++$count > $limit;
114
+	}
115
+
116
+	/**
117
+	 * @param $length
118
+	 * @param $flags
119
+	 * @return bool
120
+	 */
121
+	private static function isPart($length, $flags)
122
+	{
123
+		$split_empty = !($flags & PREG_SPLIT_NO_EMPTY) || $length[0];
124
+		$is_delimiter = $length[1];
125
+		$is_captured = ($flags & PREG_SPLIT_DELIM_CAPTURE) && $length[2];
126
+
127
+		return (!$is_delimiter
128
+				|| $is_captured)
129
+			&& $split_empty;
130
+	}
131
+
132
+	/**
133
+	 * Make part
134
+	 * @param string $string
135
+	 * @param integer $position
136
+	 * @param integer|null $length
137
+	 * @param bool $offset_capture
138
+	 * @return array|string
139
+	 */
140
+	private static function makePart($string, $position, $length = null, $offset_capture = false)
141
+	{
142
+		$cut = mb_strcut($string, $position, $length);
143
+
144
+		return $offset_capture
145
+			? [$cut, $position]
146
+			: $cut;
147
+	}
148
+
149
+	/**
150
+	 * Splits the string by pattern and for each element (part or split) returns:
151
+	 *  [ 0 => length (in bytes), 1 => is_delimiter?, 2 => is_captured? (delimiters only; null for parts) ]
152
+	 *
153
+	 * @param $pattern
154
+	 * @param $string
155
+	 * @return array
156
+	 */
157
+	private static function getSplitLengths($pattern, $string)
158
+	{
159
+		$strlen = strlen($string); // bytes!
160
+		$lengths = [];
161
+
162
+		mb_ereg_search_init($string);
163
+
164
+		$position = 0;
165
+		while ($position < $strlen
166
+			&& ($array = mb_ereg_search_pos($pattern, '')) !== false) {
167
+			// capture split
168
+			$lengths[] = [$array[0] - $position, false, null];
169
+
170
+			// move position
171
+			$position = $array[0] + $array[1];
172
+
173
+			// capture delimiter
174
+			$regs = mb_ereg_search_getregs();
175
+			$lengths[] = [$array[1], true, isset($regs[1]) && $regs[1]];
176
+		}
177
+
178
+		// Add last bit, if not ending with split
179
+		$lengths[] = [$strlen - $position, false, null];
180
+
181
+		return $lengths;
182
+	}
183 183
 }
184 184
\ No newline at end of file
Please login to merge, or discard this patch.
src/Sentence.php 1 patch
Indentation   +450 added lines, -450 removed lines patch added patch discarded remove patch
@@ -17,455 +17,455 @@
 block discarded – undo
17 17
 class Sentence
18 18
 {
19 19
 
20
-    /**
21
-     * Specify this flag with the split method to trim whitespace.
22
-     */
23
-    const SPLIT_TRIM = 0x1;
24
-
25
-    /**
26
-     * List of characters used to terminate sentences.
27
-     *
28
-     * @var string[]
29
-     */
30
-    private $terminals = ['.', '!', '?'];
31
-
32
-    /**
33
-     * List of characters used for abbreviations.
34
-     *
35
-     * @var string[]
36
-     */
37
-    private $abbreviators = ['.'];
38
-
39
-    /**
40
-     * List of replacements in the text.
41
-     *
42
-     * @var string[]
43
-     */
44
-    private $replacements = [];
45
-
46
-    /**
47
-     * Generate an in-text replacement code for the specified index
48
-     *
49
-     * @param int $index
50
-     *
51
-     * @return string
52
-     */
53
-    private function getReplaceCode($index)
54
-    {
55
-        return 0x02 . $index . 0x03;
56
-    }
57
-
58
-    /**
59
-     * Clean floating point numbers by replace them with an in-text index
60
-     *
61
-     * @param string $text
62
-     *
63
-     * @return string
64
-     */
65
-    private function replaceFloatNumbers($text)
66
-    {
67
-        $matches = array();
68
-        preg_match_all('!\d+(?:\.\d+)?!', $text, $matches, PREG_OFFSET_CAPTURE);
69
-
70
-        $this->replacements = [];
71
-        $index = 0;
72
-        foreach (array_reverse($matches[0]) as $match) {
73
-            $number = $match[0];
74
-            $offset = $match[1];
75
-            $code = $this->getReplaceCode($index);
76
-
77
-            $this->replacements[$index] = $number;
78
-
79
-            $text = (string)substr_replace($text, $code, $offset, mb_strlen($number));
80
-
81
-            ++$index;
82
-        }
83
-
84
-        return $text;
85
-    }
86
-
87
-    /**
88
-     * Restore any stored replacements
89
-     *
90
-     * @param string[] $text
91
-     *
92
-     * @return string[]
93
-     */
94
-    private function restoreReplacements($text)
95
-    {
96
-        return array_map(function ($value) {
97
-            foreach ($this->replacements as $index => $number) {
98
-                $code = $this->getReplaceCode($index);
99
-                $value = str_replace($code, $number, $value);
100
-            }
101
-
102
-            return $value;
103
-        }, $text);
104
-    }
105
-
106
-    /**
107
-     * Breaks a piece of text into lines by linebreak.
108
-     * Eats up any linebreak characters as if one.
109
-     *
110
-     * Multibyte.php safe
111
-     *
112
-     * @param string $text
113
-     *
114
-     * @return string[]
115
-     */
116
-    private static function linebreakSplit($text)
117
-    {
118
-        $lines = [];
119
-        $line = '';
120
-
121
-        foreach (Multibyte::split('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
122
-            $line .= $part;
123
-            if (Multibyte::trim($part) === '') {
124
-                $lines[] = $line;
125
-                $line = '';
126
-            }
127
-        }
128
-        $lines[] = $line;
129
-
130
-        return $lines;
131
-    }
132
-
133
-    /**
134
-     * Splits an array of lines by (consecutive sequences of)
135
-     * terminals, keeping terminals.
136
-     *
137
-     * Multibyte.php safe (at least for UTF-8)
138
-     *
139
-     * For example:
140
-     *    "There ... is. More!"
141
-     *        ... becomes ...
142
-     *    [ "There ", "...", " is", ".", " More", "!" ]
143
-     *
144
-     * @param string $line
145
-     *
146
-     * @return string[]
147
-     */
148
-    private function punctuationSplit($line)
149
-    {
150
-        $parts = [];
151
-
152
-        $chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY); // This is UTF8 multibyte safe!
153
-        $is_terminal = in_array($chars[0], $this->terminals);
154
-
155
-        $part = '';
156
-        foreach ($chars as $char) {
157
-            if (in_array($char, $this->terminals) !== $is_terminal) {
158
-                $parts[] = $part;
159
-                $part = '';
160
-                $is_terminal = !$is_terminal;
161
-            }
162
-            $part .= $char;
163
-        }
164
-
165
-        if (!empty($part)) {
166
-            $parts[] = $part;
167
-        }
168
-
169
-        return $parts;
170
-    }
171
-
172
-    /**
173
-     * Appends each terminal item after it's preceding
174
-     * non-terminals.
175
-     *
176
-     * Multibyte.php safe (at least for UTF-8)
177
-     *
178
-     * For example:
179
-     *    [ "There ", "...", " is", ".", " More", "!" ]
180
-     *        ... becomes ...
181
-     *    [ "There ... is.", "More!" ]
182
-     *
183
-     * @param string[] $punctuations
184
-     *
185
-     * @return string[]
186
-     */
187
-    private function punctuationMerge($punctuations)
188
-    {
189
-        $definite_terminals = array_diff($this->terminals, $this->abbreviators);
190
-
191
-        $merges = [];
192
-        $merge = '';
193
-
194
-        $filtered = array_filter($punctuations, function ($p) {
195
-            return $p !== '';
196
-        });
197
-
198
-        foreach ($filtered as $punctuation) {
199
-            $merge .= $punctuation;
200
-            if (mb_strlen($punctuation) === 1
201
-                && in_array($punctuation, $this->terminals)) {
202
-                $merges[] = $merge;
203
-                $merge = '';
204
-            } else {
205
-                foreach ($definite_terminals as $terminal) {
206
-                    if (mb_strpos($punctuation, $terminal) !== false) {
207
-                        $merges[] = $merge;
208
-                        $merge = '';
209
-                        break;
210
-                    }
211
-                }
212
-            }
213
-        }
214
-        if (!empty($merge)) {
215
-            $merges[] = $merge;
216
-        }
217
-
218
-        return $merges;
219
-    }
220
-
221
-    /**
222
-     * Looks for capitalized abbreviations & includes them with the following fragment.
223
-     *
224
-     * For example:
225
-     *    [ "Last week, former director of the F.B.I. James B. Comey was fired. Mr. Comey was not available for comment." ]
226
-     *        ... becomes ...
227
-     *    [ "Last week, former director of the F.B.I. James B. Comey was fired." ]
228
-     *  [ "Mr. Comey was not available for comment." ]
229
-     *
230
-     * @param string[] $fragments
231
-     *
232
-     * @return string[]
233
-     */
234
-    private function abbreviationMerge($fragments)
235
-    {
236
-        $return_fragment = [];
237
-
238
-        $previous_fragment = '';
239
-        $previous_is_abbreviation = false;
240
-        $i = 0;
241
-        foreach ($fragments as $fragment) {
242
-            $is_abbreviation = self::isAbbreviation($fragment);
243
-
244
-            // merge previous fragment with this
245
-            if ($previous_is_abbreviation) {
246
-                $fragment = $previous_fragment . $fragment;
247
-            }
248
-            $return_fragment[$i] = $fragment;
249
-
250
-            $previous_is_abbreviation = $is_abbreviation;
251
-            $previous_fragment = $fragment;
252
-
253
-            // only increment if this isn't an abbreviation
254
-            if (!$is_abbreviation) {
255
-                $i++;
256
-            }
257
-        }
258
-
259
-        return $return_fragment;
260
-    }
261
-
262
-    /**
263
-     * Check if the last word of fragment starts with a Capital, ends in "." & has less than 3 characters.
264
-     *
265
-     * @param $fragment
266
-     *
267
-     * @return bool
268
-     */
269
-    private static function isAbbreviation($fragment)
270
-    {
271
-        $words = mb_split('\s+', Multibyte::trim($fragment));
272
-
273
-        $word_count = count($words);
274
-
275
-        $last_word = Multibyte::trim($words[$word_count - 1]);
276
-        $last_is_capital = preg_match('#^\p{Lu}#u', $last_word);
277
-        $last_is_abbreviation = mb_substr(Multibyte::trim($fragment), -1) === '.';
278
-
279
-        return $last_is_capital > 0
280
-            && $last_is_abbreviation > 0
281
-            && mb_strlen($last_word) <= 3;
282
-    }
283
-
284
-    /**
285
-     * Merges any part starting with a closing parenthesis ')' to the previous
286
-     * part.
287
-     *
288
-     * @param string[] $parts
289
-     *
290
-     * @return string[]
291
-     */
292
-    private function parenthesesMerge($parts)
293
-    {
294
-        $subSentences = [];
295
-
296
-        foreach ($parts as $part) {
297
-            if ($part[0] === ')' && !empty($subSentences)) {
298
-                $subSentences[count($subSentences) - 1] .= $part;
299
-            } else {
300
-                $subSentences[] = $part;
301
-            }
302
-        }
303
-
304
-        return $subSentences;
305
-    }
306
-
307
-    /**
308
-     * Looks for closing quotes to include them with the previous statement.
309
-     * "That was very interesting," he said.
310
-     * "That was very interesting."
311
-     *
312
-     * @param string[] $statements
313
-     *
314
-     * @return string[]
315
-     */
316
-    private function closeQuotesMerge($statements)
317
-    {
318
-        $i = 0;
319
-        $previous_statement = '';
320
-        $return = [];
321
-        foreach ($statements as $statement) {
322
-            if (self::isEndQuote($statement)) {
323
-                $statement = $previous_statement . $statement;
324
-            } else {
325
-                $i++;
326
-            }
327
-
328
-            $return[$i] = $statement;
329
-            $previous_statement = $statement;
330
-        }
331
-
332
-        return $return;
333
-    }
334
-
335
-    /**
336
-     * Check if the entire string is a quotation mark or quote, then space, then lowercase.
337
-     *
338
-     * @param $statement
339
-     *
340
-     * @return bool
341
-     */
342
-    private static function isEndQuote($statement)
343
-    {
344
-        $trimmed = Multibyte::trim($statement);
345
-        $first = mb_substr($statement, 0, 1);
346
-
347
-        return in_array($trimmed, ['"', '\''])
348
-            || (
349
-                in_array($first, ['"', '\''])
350
-                && mb_substr($statement, 1, 1) === ' '
351
-                && ctype_lower(mb_substr($statement, 2, 1)) === true
352
-            );
353
-    }
354
-
355
-    /**
356
-     * Merges items into larger sentences.
357
-     * Multibyte.php safe
358
-     *
359
-     * @param string[] $shorts
360
-     *
361
-     * @return string[]
362
-     */
363
-    private function sentenceMerge($shorts)
364
-    {
365
-        $non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);
366
-
367
-        $sentences = [];
368
-
369
-        $sentence = '';
370
-        $has_words = false;
371
-        $previous_word_ending = null;
372
-        foreach ($shorts as $short) {
373
-            $word_count = count(mb_split('\s+', Multibyte::trim($short)));
374
-            $after_non_abbreviating_terminal = in_array($previous_word_ending, $non_abbreviating_terminals);
375
-
376
-            if ($after_non_abbreviating_terminal
377
-                || ($has_words && $word_count > 1)) {
378
-
379
-                $sentences[] = $sentence;
380
-
381
-                $sentence = '';
382
-                $has_words = false;
383
-            }
384
-
385
-            $has_words = $has_words
386
-                || $word_count > 1;
387
-
388
-            $sentence .= $short;
389
-            $previous_word_ending = mb_substr($short, -1);
390
-        }
391
-
392
-        if (!empty($sentence)) {
393
-            $sentences[] = $sentence;
394
-        }
395
-
396
-        return $sentences;
397
-    }
398
-
399
-    /**
400
-     * Return the sentences detected in the provided text.
401
-     * Set the Sentence::SPLIT_TRIM flag to trim whitespace.
402
-     *
403
-     * @param string  $text
404
-     * @param integer $flags
405
-     *
406
-     * @return string[]
407
-     */
408
-    public function split($text, $flags = 0, $pipeline = [])
409
-    {
410
-        if (empty($pipeline)) {
411
-            static $pipeline = [
412
-                'replaceFloatNumbers',
413
-                'punctuationSplit',
414
-                'parenthesesMerge', // also works after punctuationMerge or abbreviationMerge
415
-                'punctuationMerge',
416
-                'abbreviationMerge',
417
-                'closeQuotesMerge',
418
-                'sentenceMerge',
419
-                'restoreReplacements',
420
-            ];
421
-        }
422
-
423
-        // clean funny quotes
424
-        $text = Multibyte::cleanUnicode($text);
425
-
426
-        // Split
427
-        $sentences = [];
428
-        foreach (self::linebreakSplit($text) as $input) {
429
-            if (Multibyte::trim($input) !== '') {
430
-                foreach ($pipeline as $method) {
431
-                    $input = $this->$method($input);
432
-                }
433
-                $sentences = array_merge($sentences, $input);
434
-            }
435
-        }
436
-
437
-        // Post process
438
-        if ($flags & self::SPLIT_TRIM) {
439
-            return self::trimSentences($sentences);
440
-        }
441
-
442
-        return $sentences;
443
-    }
444
-
445
-    /**
446
-     * Multibyte.php trim each string in an array.
447
-     *
448
-     * @param string[] $sentences
449
-     *
450
-     * @return string[]
451
-     */
452
-    private static function trimSentences($sentences)
453
-    {
454
-        return array_map(function ($sentence) {
455
-            return Multibyte::trim($sentence);
456
-        }, $sentences);
457
-    }
458
-
459
-    /**
460
-     * Return the number of sentences detected in the provided text.
461
-     *
462
-     * @param string $text
463
-     *
464
-     * @return integer
465
-     */
466
-    public function count($text)
467
-    {
468
-        return count($this->split($text));
469
-    }
20
+	/**
21
+	 * Specify this flag with the split method to trim whitespace.
22
+	 */
23
+	const SPLIT_TRIM = 0x1;
24
+
25
+	/**
26
+	 * List of characters used to terminate sentences.
27
+	 *
28
+	 * @var string[]
29
+	 */
30
+	private $terminals = ['.', '!', '?'];
31
+
32
+	/**
33
+	 * List of characters used for abbreviations.
34
+	 *
35
+	 * @var string[]
36
+	 */
37
+	private $abbreviators = ['.'];
38
+
39
+	/**
40
+	 * List of replacements in the text.
41
+	 *
42
+	 * @var string[]
43
+	 */
44
+	private $replacements = [];
45
+
46
+	/**
47
+	 * Generate an in-text replacement code for the specified index
48
+	 *
49
+	 * @param int $index
50
+	 *
51
+	 * @return string
52
+	 */
53
+	private function getReplaceCode($index)
54
+	{
55
+		return 0x02 . $index . 0x03;
56
+	}
57
+
58
+	/**
59
+	 * Clean floating point numbers by replacing them with an in-text replacement code
60
+	 *
61
+	 * @param string $text
62
+	 *
63
+	 * @return string
64
+	 */
65
+	private function replaceFloatNumbers($text)
66
+	{
67
+		$matches = array();
68
+		preg_match_all('!\d+(?:\.\d+)?!', $text, $matches, PREG_OFFSET_CAPTURE);
69
+
70
+		$this->replacements = [];
71
+		$index = 0;
72
+		foreach (array_reverse($matches[0]) as $match) {
73
+			$number = $match[0];
74
+			$offset = $match[1];
75
+			$code = $this->getReplaceCode($index);
76
+
77
+			$this->replacements[$index] = $number;
78
+
79
+			$text = (string)substr_replace($text, $code, $offset, mb_strlen($number));
80
+
81
+			++$index;
82
+		}
83
+
84
+		return $text;
85
+	}
86
+
87
+	/**
88
+	 * Restore any stored replacements
89
+	 *
90
+	 * @param string[] $text
91
+	 *
92
+	 * @return string[]
93
+	 */
94
+	private function restoreReplacements($text)
95
+	{
96
+		return array_map(function ($value) {
97
+			foreach ($this->replacements as $index => $number) {
98
+				$code = $this->getReplaceCode($index);
99
+				$value = str_replace($code, $number, $value);
100
+			}
101
+
102
+			return $value;
103
+		}, $text);
104
+	}
105
+
106
+	/**
107
+	 * Breaks a piece of text into lines by linebreak.
108
+	 * Eats up any linebreak characters as if one.
109
+	 *
110
+	 * Multibyte-safe
111
+	 *
112
+	 * @param string $text
113
+	 *
114
+	 * @return string[]
115
+	 */
116
+	private static function linebreakSplit($text)
117
+	{
118
+		$lines = [];
119
+		$line = '';
120
+
121
+		foreach (Multibyte::split('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
122
+			$line .= $part;
123
+			if (Multibyte::trim($part) === '') {
124
+				$lines[] = $line;
125
+				$line = '';
126
+			}
127
+		}
128
+		$lines[] = $line;
129
+
130
+		return $lines;
131
+	}
132
+
133
+	/**
134
+	 * Splits a line by (consecutive sequences of)
135
+	 * terminals, keeping terminals.
136
+	 *
137
+	 * Multibyte-safe (at least for UTF-8)
138
+	 *
139
+	 * For example:
140
+	 *    "There ... is. More!"
141
+	 *        ... becomes ...
142
+	 *    [ "There ", "...", " is", ".", " More", "!" ]
143
+	 *
144
+	 * @param string $line
145
+	 *
146
+	 * @return string[]
147
+	 */
148
+	private function punctuationSplit($line)
149
+	{
150
+		$parts = [];
151
+
152
+		$chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY); // This is UTF8 multibyte safe!
153
+		$is_terminal = in_array($chars[0], $this->terminals);
154
+
155
+		$part = '';
156
+		foreach ($chars as $char) {
157
+			if (in_array($char, $this->terminals) !== $is_terminal) {
158
+				$parts[] = $part;
159
+				$part = '';
160
+				$is_terminal = !$is_terminal;
161
+			}
162
+			$part .= $char;
163
+		}
164
+
165
+		if (!empty($part)) {
166
+			$parts[] = $part;
167
+		}
168
+
169
+		return $parts;
170
+	}
171
+
172
+	/**
173
+	 * Appends each terminal item after its preceding
174
+	 * non-terminals.
175
+	 *
176
+	 * Multibyte-safe (at least for UTF-8)
177
+	 *
178
+	 * For example:
179
+	 *    [ "There ", "...", " is", ".", " More", "!" ]
180
+	 *        ... becomes ...
181
+	 *    [ "There ... is.", "More!" ]
182
+	 *
183
+	 * @param string[] $punctuations
184
+	 *
185
+	 * @return string[]
186
+	 */
187
+	private function punctuationMerge($punctuations)
188
+	{
189
+		$definite_terminals = array_diff($this->terminals, $this->abbreviators);
190
+
191
+		$merges = [];
192
+		$merge = '';
193
+
194
+		$filtered = array_filter($punctuations, function ($p) {
195
+			return $p !== '';
196
+		});
197
+
198
+		foreach ($filtered as $punctuation) {
199
+			$merge .= $punctuation;
200
+			if (mb_strlen($punctuation) === 1
201
+				&& in_array($punctuation, $this->terminals)) {
202
+				$merges[] = $merge;
203
+				$merge = '';
204
+			} else {
205
+				foreach ($definite_terminals as $terminal) {
206
+					if (mb_strpos($punctuation, $terminal) !== false) {
207
+						$merges[] = $merge;
208
+						$merge = '';
209
+						break;
210
+					}
211
+				}
212
+			}
213
+		}
214
+		if (!empty($merge)) {
215
+			$merges[] = $merge;
216
+		}
217
+
218
+		return $merges;
219
+	}
220
+
221
+	/**
222
+	 * Looks for capitalized abbreviations & includes them with the following fragment.
223
+	 *
224
+	 * For example:
225
+	 *    [ "Last week, former director of the F.B.I. James B. Comey was fired. Mr. Comey was not available for comment." ]
226
+	 *        ... becomes ...
227
+	 *    [ "Last week, former director of the F.B.I. James B. Comey was fired." ]
228
+	 *  [ "Mr. Comey was not available for comment." ]
229
+	 *
230
+	 * @param string[] $fragments
231
+	 *
232
+	 * @return string[]
233
+	 */
234
+	private function abbreviationMerge($fragments)
235
+	{
236
+		$return_fragment = [];
237
+
238
+		$previous_fragment = '';
239
+		$previous_is_abbreviation = false;
240
+		$i = 0;
241
+		foreach ($fragments as $fragment) {
242
+			$is_abbreviation = self::isAbbreviation($fragment);
243
+
244
+			// merge previous fragment with this
245
+			if ($previous_is_abbreviation) {
246
+				$fragment = $previous_fragment . $fragment;
247
+			}
248
+			$return_fragment[$i] = $fragment;
249
+
250
+			$previous_is_abbreviation = $is_abbreviation;
251
+			$previous_fragment = $fragment;
252
+
253
+			// only increment if this isn't an abbreviation
254
+			if (!$is_abbreviation) {
255
+				$i++;
256
+			}
257
+		}
258
+
259
+		return $return_fragment;
260
+	}
261
+
262
+	/**
263
+	 * Check if the last word of fragment starts with a Capital, ends in "." & has less than 3 characters.
264
+	 *
265
+	 * @param $fragment
266
+	 *
267
+	 * @return bool
268
+	 */
269
+	private static function isAbbreviation($fragment)
270
+	{
271
+		$words = mb_split('\s+', Multibyte::trim($fragment));
272
+
273
+		$word_count = count($words);
274
+
275
+		$last_word = Multibyte::trim($words[$word_count - 1]);
276
+		$last_is_capital = preg_match('#^\p{Lu}#u', $last_word);
277
+		$last_is_abbreviation = mb_substr(Multibyte::trim($fragment), -1) === '.';
278
+
279
+		return $last_is_capital > 0
280
+			&& $last_is_abbreviation > 0
281
+			&& mb_strlen($last_word) <= 3;
282
+	}
283
+
284
+	/**
285
+	 * Merges any part starting with a closing parenthesis ')' to the previous
286
+	 * part.
287
+	 *
288
+	 * @param string[] $parts
289
+	 *
290
+	 * @return string[]
291
+	 */
292
+	private function parenthesesMerge($parts)
293
+	{
294
+		$subSentences = [];
295
+
296
+		foreach ($parts as $part) {
297
+			if ($part[0] === ')' && !empty($subSentences)) {
298
+				$subSentences[count($subSentences) - 1] .= $part;
299
+			} else {
300
+				$subSentences[] = $part;
301
+			}
302
+		}
303
+
304
+		return $subSentences;
305
+	}
306
+
307
+	/**
308
+	 * Looks for closing quotes to include them with the previous statement.
309
+	 * "That was very interesting," he said.
310
+	 * "That was very interesting."
311
+	 *
312
+	 * @param string[] $statements
313
+	 *
314
+	 * @return string[]
315
+	 */
316
+	private function closeQuotesMerge($statements)
317
+	{
318
+		$i = 0;
319
+		$previous_statement = '';
320
+		$return = [];
321
+		foreach ($statements as $statement) {
322
+			if (self::isEndQuote($statement)) {
323
+				$statement = $previous_statement . $statement;
324
+			} else {
325
+				$i++;
326
+			}
327
+
328
+			$return[$i] = $statement;
329
+			$previous_statement = $statement;
330
+		}
331
+
332
+		return $return;
333
+	}
334
+
335
+	/**
336
+	 * Check if the entire string is a quotation mark or quote, then space, then lowercase.
337
+	 *
338
+	 * @param $statement
339
+	 *
340
+	 * @return bool
341
+	 */
342
+	private static function isEndQuote($statement)
343
+	{
344
+		$trimmed = Multibyte::trim($statement);
345
+		$first = mb_substr($statement, 0, 1);
346
+
347
+		return in_array($trimmed, ['"', '\''])
348
+			|| (
349
+				in_array($first, ['"', '\''])
350
+				&& mb_substr($statement, 1, 1) === ' '
351
+				&& ctype_lower(mb_substr($statement, 2, 1)) === true
352
+			);
353
+	}
354
+
355
+	/**
356
+	 * Merges items into larger sentences.
357
+	 * Multibyte.php safe
358
+	 *
359
+	 * @param string[] $shorts
360
+	 *
361
+	 * @return string[]
362
+	 */
363
+	private function sentenceMerge($shorts)
364
+	{
365
+		$non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);
366
+
367
+		$sentences = [];
368
+
369
+		$sentence = '';
370
+		$has_words = false;
371
+		$previous_word_ending = null;
372
+		foreach ($shorts as $short) {
373
+			$word_count = count(mb_split('\s+', Multibyte::trim($short)));
374
+			$after_non_abbreviating_terminal = in_array($previous_word_ending, $non_abbreviating_terminals);
375
+
376
+			if ($after_non_abbreviating_terminal
377
+				|| ($has_words && $word_count > 1)) {
378
+
379
+				$sentences[] = $sentence;
380
+
381
+				$sentence = '';
382
+				$has_words = false;
383
+			}
384
+
385
+			$has_words = $has_words
386
+				|| $word_count > 1;
387
+
388
+			$sentence .= $short;
389
+			$previous_word_ending = mb_substr($short, -1);
390
+		}
391
+
392
+		if (!empty($sentence)) {
393
+			$sentences[] = $sentence;
394
+		}
395
+
396
+		return $sentences;
397
+	}
398
+
399
+	/**
400
+	 * Return the sentences detected in the provided text.
401
+	 * Set the Sentence::SPLIT_TRIM flag to trim whitespace.
402
+	 *
403
+	 * @param string  $text
404
+	 * @param integer $flags
405
+	 *
406
+	 * @return string[]
407
+	 */
408
+	public function split($text, $flags = 0, $pipeline = [])
409
+	{
410
+		if (empty($pipeline)) {
411
+			static $pipeline = [
412
+				'replaceFloatNumbers',
413
+				'punctuationSplit',
414
+				'parenthesesMerge', // also works after punctuationMerge or abbreviationMerge
415
+				'punctuationMerge',
416
+				'abbreviationMerge',
417
+				'closeQuotesMerge',
418
+				'sentenceMerge',
419
+				'restoreReplacements',
420
+			];
421
+		}
422
+
423
+		// clean funny quotes
424
+		$text = Multibyte::cleanUnicode($text);
425
+
426
+		// Split
427
+		$sentences = [];
428
+		foreach (self::linebreakSplit($text) as $input) {
429
+			if (Multibyte::trim($input) !== '') {
430
+				foreach ($pipeline as $method) {
431
+					$input = $this->$method($input);
432
+				}
433
+				$sentences = array_merge($sentences, $input);
434
+			}
435
+		}
436
+
437
+		// Post process
438
+		if ($flags & self::SPLIT_TRIM) {
439
+			return self::trimSentences($sentences);
440
+		}
441
+
442
+		return $sentences;
443
+	}
444
+
445
+	/**
446
+	 * Multibyte.php trim each string in an array.
447
+	 *
448
+	 * @param string[] $sentences
449
+	 *
450
+	 * @return string[]
451
+	 */
452
+	private static function trimSentences($sentences)
453
+	{
454
+		return array_map(function ($sentence) {
455
+			return Multibyte::trim($sentence);
456
+		}, $sentences);
457
+	}
458
+
459
+	/**
460
+	 * Return the number of sentences detected in the provided text.
461
+	 *
462
+	 * @param string $text
463
+	 *
464
+	 * @return integer
465
+	 */
466
+	public function count($text)
467
+	{
468
+		return count($this->split($text));
469
+	}
470 470
 
471 471
 }
Please login to merge, or discard this patch.