Scrutinizer GitHub App not installed

We could not synchronize checks via GitHub's checks API since Scrutinizer's GitHub App is not installed for this repository.

Install GitHub App

GitHub Access Token became invalid

It seems like the GitHub access token used for retrieving details about this repository from GitHub became invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.
Completed
Push — master ( a16ac0...33e1e8 )
by Der Mundschenk
15s
created

Text_Parser::get_spaces()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 3
rs 10
c 0
b 0
f 0
cc 1
eloc 2
nc 1
nop 0
1
<?php
2
/**
3
 *  This file is part of PHP-Typography.
4
 *
5
 *  Copyright 2014-2017 Peter Putzer.
6
 *  Copyright 2012-2013 Marie Hogebrandt.
7
 *  Copyright 2009-2011 KINGdesk, LLC.
8
 *
9
 *  This program is free software; you can redistribute it and/or
10
 *  modify it under the terms of the GNU General Public License
11
 *  as published by the Free Software Foundation; either version 2
12
 *  of the License, or (at your option) any later version.
13
 *
14
 *  This program is distributed in the hope that it will be useful,
15
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
16
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17
 *  GNU General Public License for more details.
18
 *
19
 *  You should have received a copy of the GNU General Public License
20
 *  along with this program; if not, write to the Free Software
21
 *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
22
 *
23
 *  ***
24
 *
25
 *  @package mundschenk-at/php-typography
26
 *  @license http://www.gnu.org/licenses/gpl-2.0.html
27
 */
28
29
namespace PHP_Typography;
30
31
use PHP_Typography\Text_Parser\Token;
32
33
/**
 * A class to parse plain text (such as the data of DOMText).
 *
 * Text_Parser assumes no HTML markup in the text (except for HTML character
 * references like &gt;). If multibyte characters are passed, they must be
 * encoded as UTF-8.
 */
class Text_Parser {

	// Bit flags accepted by get_words(); one group of three per policy.
	const NO_ALL_LETTERS      = 0b000000000001;
	const ALLOW_ALL_LETTERS   = 0b000000000010;
	const REQUIRE_ALL_LETTERS = 0b000000000100;
	const NO_ALL_CAPS         = 0b000000001000;
	const ALLOW_ALL_CAPS      = 0b000000010000;
	const REQUIRE_ALL_CAPS    = 0b000000100000;
	const NO_COMPOUNDS        = 0b000001000000;
	const ALLOW_COMPOUNDS     = 0b000010000000;
	const REQUIRE_COMPOUNDS   = 0b000100000000;

	/**
	 * Find spacing FIRST (as it is the primary delimiter).
	 *
	 * Matches the HTML character references and the literal characters for:
	 *      tab | line feed | carriage return | space | non-breaking space | ethiopic wordspace
	 *      ogham space mark | en quad space | em quad space | en-space | three-per-em space
	 *      four-per-em space | six-per-em space | figure space | punctuation space | em-space
	 *      thin space | hair space | narrow no-break space
	 *      medium mathematical space | ideographic space
	 *
	 * The tab is accepted both as "&#9;" and as "&#09;".
	 *
	 * Some characters are used inside words, we will not count these as a space for the purpose
	 * of finding word boundaries:
	 *      zero-width-space ("&#8203;", "&#x200b;")
	 *      zero-width-non-joiner ("&#8204;", "&#x200c;", "&zwnj;")
	 *      zero-width-joiner ("&#8205;", "&#x200d;", "&zwj;")
	 */
	const _HTML_SPACING = '
			(?:
				(?:										# alpha matches
					&
					(?: nbsp|ensp|emsp|thinsp )
					;
				)
				|
				(?:										# decimal matches
					&\#
					(?: 0?9|1[03]|32|160|4961|5760|819[2-9]|820[0-2]|8239|8287|12288 )
					;
				)
				|
				(?:										# hexidecimal matches
					&\#x
					(?: 000[9ad]|0020|00a0|1361|1680|200[0-9a]|202f|205f|3000 )
					;
				)
				|
				(?:										# actual characters
					\x{0009}|\x{000a}|\x{000d}|\x{0020}|\x{00a0}|\x{1361}|\x{1680}|\x{2000}|\x{2001}|\x{2002}|\x{2003}|
					\x{2004}|\x{2005}|\x{2006}|\x{2007}|\x{2008}|\x{2009}|\x{200a}|\x{202f}|\x{205f}|\x{3000}
				)
			)
		'; // required modifiers: x (multiline pattern) i (case insensitive) u (utf8).
	const _SPACE = '(?:\s|' . self::_HTML_SPACING . ')+'; // required modifiers: x (multiline pattern) i (case insensitive) u (utf8).

	/**
	 * Find punctuation and symbols before words (to capture preceding delimiting characters like hyphens or underscores).
	 *
	 * @see http://www.unicode.org/charts/PDF/U2000.pdf
	 *
	 * Find punctuation and symbols
	 *  dec matches =   33-44|46-47|58-60|62-64|91-94|96|123-126|161-172|174-191|215|247|710|732|977-978|982|8211-8231|8240-8286|8289-8292|8352-8399|8448-8527|8592-9215|9632-9983|11776-11903
	 *  hex matches =   0021-002c|002e-002f|003a-003c|003e-0040|005b-005e|0060|007b-007e|00a1-00ac|00ae-00bf|00d7|00f7|02c6|02dc|03d1-03d2|
	 *                  03d6|2013-2027|2030-205e|2061-2064|20a0-20cf|2100-214f|2190-23ff|25a0-26ff|2e00-2e7f
	 *
	 * Some characters are used inside words, we will not count these as punctuation for the purpose
	 * of finding word boundaries:
	 *      hyphens ("&#45;", "&#173;", "&#8208;", "&#8209;", "&#8210;", "&#x002d;", "&#x00ad;", "&#x2010;", "&#x2011;", "&#x2012;", "&shy;")
	 *      underscore ("&#95;", "&#x005f;")
	 */
	const _HTML_PUNCTUATION = '
			(?:
				(?:										# alpha matches
					&
					(?:quot|amp|frasl|lt|gt|iexcl|cent|pound|curren|yen|brvbar|sect|uml|copy|ordf|laquo|not|reg|macr|deg|plusmn|sup2|sup3|acute|micro|para|middot|cedil|sup1|ordm|raquo|frac14|frac12|frac34|iquest|times|divide|circ|tilde|thetasym|upsih|piv|ndash|mdash|lsquo|rsquo|sbquo|ldquo|rdquo|bdquo|dagger|Dagger|bull|hellip|permil|prime|Prime|lsaquo|rsaquo|oline|frasl|euro|trade|alefsym|larr|uarr|rarr|darr|harr|crarr|lArr|uArr|rArr|dArr|hArr|forall|part|exist|empty|nabla|isin|notin|ni|prod|sum|minus|lowast|radic|prop|infin|ang|and|or|cap|cup|int|there4|sim|cong|asymp|ne|equiv|le|ge|sub|sup|nsub|sube|supe|oplus|otimes|perp|sdot|lceil|rceil|lfloor|rfloor|lang|rang|loz|spades|clubs|hearts|diams)
					;
				)
				|
				(?:										# decimal matches
					&\#
					(?: 3[3-9]|4[0-467]|5[89]|6[02-4]|9[1-46]|12[3-6]|16[1-9]|17[0-24-9]|18[0-9]|19[01]|215|247|710|732|97[78]|982|821[1-9]|822[0-9]|823[01]|82[4-7][0-9]|828[0-6]|8289|829[0-2]|835[2-9]|83[6-9][0-9]|844[89]|84[5-9][0-9]|85[01][0-9]|852[0-7]|859[2-9]|85[6-9][0-9]|8[6-9][0-9][0-9]|9[01][0-9][0-9]|920[0-9]|921[0-5]|963[2-9]|96[4-9][0-9]|9[78][0-9][0-9]|99[0-7][0-9]|998[0-3]|1177[6-9]|117[89][0-9]|118[0-9][0-9]|1190[0-3] )
					;
				)
				|
				(?:										# hexidecimal matches
					&\#x
					(?: 002[1-9a-cef]|003[a-cef]|0040|005[b-e]|0060|007[b-e]|00a[1-9a-cef]|00b[0-9a-f]|00d7|00f7|02c6|02dc|03d[126]|201[3-9a-f]|202[0-7]|20[34][0-9a-f]|205[0-9a-e]|206[1-4]|20[a-c][0-9a-f]|21[0-4][0-9a-f]|219[0-9a-f]|21[a-f][0-9a-f]|2[23][0-9a-f][0-9a-f]|25[a-f][0-9a-f]|26[0-9a-f][0-9a-f]|2e[0-7][0-9a-f] )
					;
				)
			)
		'; // required modifiers: x (multiline pattern) i (case insensitive) u (utf8).
	const _PUNCTUATION = '
	(?:
		(?:
			[^\w\s\&\/\@]  # assume characters that are not word spaces or whitespace are punctuation
						   # exclude & as that is an illegal stand-alone character (and would interfere with HTML character representations
						   # exclude slash \/as to not include the last slash in a URL
						   # exclude @ as to keep twitter names together
			|
			' . self::_HTML_PUNCTUATION . ' # catch any HTML reps of punctuation
		)+
	)
	';// required modifiers: x (multiline pattern) i (case insensitive) u (utf8).

	/**
	 * Letter connectors allowed in words
	 *      hyphens ("&#45;", "&#173;", "&#8208;", "&#8209;", "&#8210;", "&#x002d;", "&#x00ad;", "&#x2010;", "&#x2011;", "&#x2012;", "&shy;")
	 *      underscore ("&#95;", "&#x005f;")
	 *      zero-width-space ("&#8203;", "&#x200b;")
	 *      zero-width-non-joiner ("&#8204;", "&#x200c;", "&zwnj;")
	 *      zero-width-joiner ("&#8205;", "&#x200d;", "&zwj;")
	 */
	const _HTML_LETTER_CONNECTORS = '
		(?:
			(?:												# alpha matches
				&
				(?: shy|zwj|zwnj )
				;
			)
			|
			(?:												# decimal matches
				&\#
				(?: 45|95|173|820[3-589]|8210 )
				;
			)
			|
			(?:												# hexidecimal matches
				&\#x
				(?: 002d|005f|00ad|200[b-d]|201[0-2] )
				;
			)
			|
			(?:												# actual characters
				\x{002d}|\x{005f}|\x{00ad}|\x{200b}|\x{200c}|\x{200d}|\x{2010}|\x{2011}|\x{2012}
			)
		)
	'; // required modifiers: x (multiline pattern) i (case insensitive) u (utf8).

	/**
	 * Word character html entities
	 *   characters  0-9__ A-Z__ a-z___ other_special_chrs_____
	 *   decimal     48-57 65-90 97-122 192-214,216-246,248-255, 256-383
	 *   hex         30-39 41-5a 61-7a  c0-d6   d8-f6   f8-ff    0100-017f
	 *
	 * The literal characters are matched with a single character class covering
	 * the same code points (U+00D7 and U+00F7, the multiplication and division
	 * signs, are deliberately left out).
	 */
	const _HTML_LETTERS = '
		(?:
			(?:												# alpha matches
				&
				(?:Agrave|Aacute|Acirc|Atilde|Auml|Aring|AElig|Ccedil|Egrave|Eacute|Ecirc|Euml|Igrave|Iacute|Icirc|Iuml|ETH|Ntilde|Ograve|Oacute|Ocirc|Otilde|Ouml|Oslash|Ugrave|Uacute|Ucirc|Uuml|Yacute|THORN|szlig|agrave|aacute|acirc|atilde|auml|aring|aelig|ccedil|egrave|eacute|ecirc|euml|igrave|iacute|icirc|iuml|eth|ntilde|ograve|oacute|ocirc|otilde|ouml|oslash|ugrave|uacute|ucirc|uuml|yacute|thorn|yuml)
				;
			)
			|
			(?:												# decimal matches
				&\#
				(?: 4[89]|5[0-7]|6[5-9]|[78][0-9]|90|9[7-9]|1[01][0-9]|12[0-2]|19[2-9]|20[0-9]|21[0-46-9]|2[23][0-9]|24[0-68-9]|2[5-9][0-9]|3[0-7][0-9]|38[0-3] )
				;
			)
			|
			(?:												# hexidecimal matches
				(?:
					&\#x00
					(?: 3[0-9]|4[1-9a-f]|5[0-9a]|6[1-9a-f]|7[0-9a]|c[0-9a-f]|d[0-689a-f]|e[0-9a-f]|f[0-689a-f] )
					;
				)
				|
				(?:
					&\#x01[0-7][0-9a-f];
				)
			)
			|
			(?:												# actual characters
				[0-9A-Za-z\x{00c0}-\x{00d6}\x{00d8}-\x{00f6}\x{00f8}-\x{017f}]
			)
		)
	'; // required modifiers: x (multiline pattern) i (case insensitive) u (utf8).

	const _WORD = '
	(?:
		(?<![\w\&])	 # negative lookbehind to ensure
					 #	1) we are proceeded by a non-word-character, and
					 #	2) we are not inside an HTML character def
		(?:
			[\w\-\_\/]
			|
			' . self::_HTML_LETTERS . '
			|
			' . self::_HTML_LETTER_CONNECTORS . '
		)+
	)
	'; // required modifiers: x (multiline pattern) u (utf8).

	// Find any text.
	const _ANY_TEXT = self::_SPACE . '|' . self::_PUNCTUATION . '|' . self::_WORD; // required modifiers: x (multiline pattern) i (case insensitive) u (utf8).

	// Complete regular expressions (delimiters and modifiers included).
	const _RE_ANY_TEXT               = '/(' . self::_ANY_TEXT . ')/Sxiu';
	const _RE_SPACE                  = '/\A' . self::_SPACE . '\Z/Sxiu';
	const _RE_PUNCTUATION            = '/\A' . self::_PUNCTUATION . '\Z/Ssxiu';
	const _RE_WORD                   = '/\A' . self::_WORD . '\Z/Sxu';
	const _RE_HTML_LETTER_CONNECTORS = '/' . self::_HTML_LETTER_CONNECTORS . '|[0-9\-_&#;\/]/Sxu';
	const _RE_MAX_STRING_LENGTH      = '/\w{500}/Ss';

	/**
	 * The current strtoupper function to use (either 'strtoupper' or 'mb_strtoupper').
	 *
	 * Set by load() and reset to null by clear().
	 *
	 * @var callable|null
	 */
	private $current_strtoupper = null;

	/**
	 * The tokenized text.
	 *
	 * @var array $text {
	 *      @type Text_Parser\Token $index
	 * }
	 */
	private $text = [];

	/**
	 * An array of various regex components (not complete patterns).
	 *
	 * @var array $components
	 */
	private $components = [];

	/**
	 * An array of regex patterns.
	 *
	 * @var array $regex
	 */
	private $regex = [];

	/**
	 * Creates a new parser object.
	 */
	public function __construct() {
	}

	/**
	 * Tokenizes a string and stores the tokens in $this->text.
	 *
	 * When `false` is returned because of one of the early checks (non-string
	 * input, overlong word, unknown encoding, failed split), the previously
	 * loaded tokens are left untouched.
	 *
	 * @param string $raw_text A text fragment without any HTML markup.
	 *
	 * @return bool Returns `true` on successful completion, `false` otherwise.
	 */
	public function load( $raw_text ) {
		if ( ! is_string( $raw_text ) ) {
			return false; // we have an error, abort.
		}

		// Abort if a single run of word characters reaches 500 characters (security concern).
		if ( preg_match( self::_RE_MAX_STRING_LENGTH, $raw_text ) ) {
			return false;
		}

		// Detect encoding.
		$str_functions = Strings::functions( $raw_text );
		if ( empty( $str_functions ) ) {
			return false; // unknown encoding.
		}

		// Split into raw parts; preg_split() returns false if the PCRE engine fails,
		// which must not be handed on to tokenize().
		$parts = preg_split( self::_RE_ANY_TEXT, $raw_text, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY );
		if ( false === $parts ) {
			return false;
		}

		// Tokenize the raw text parts.
		$this->current_strtoupper = $str_functions['strtoupper'];
		$this->text               = self::tokenize( $parts );

		// The token array should never be empty.
		return ! empty( $this->text );
	}

	/**
	 * Turns the array of strings into an array of tokens.
	 *
	 * Each part is classified as space, punctuation, word or "other". Words and
	 * "other" parts may be merged with their predecessors, see
	 * parse_ambiguous_token().
	 *
	 * @param string[] $parts An array of non-empty strings.
	 *
	 * @return Token[] An array of numerically indexed tokens.
	 */
	protected static function tokenize( array $parts ) {
		$tokens = [];
		$index  = 0;

		foreach ( $parts as $part ) {
			if ( preg_match( self::_RE_SPACE, $part ) ) {
				$tokens[ $index ] = new Token( $part, Token::SPACE );
			} elseif ( preg_match( self::_RE_PUNCTUATION, $part ) ) {
				$tokens[ $index ] = new Token( $part, Token::PUNCTUATION );
			} elseif ( preg_match( self::_RE_WORD, $part ) ) {
				// Make sure that things like email addresses and URLs are not broken up
				// into words and punctuation not preceded by an 'other'.
				self::parse_ambiguous_token( Token::WORD, $part, $tokens, $index );
			} else {
				// Make sure that things like email addresses and URLs are not broken up into words
				// and punctuation not preceded by an 'other' or 'word'.
				self::parse_ambiguous_token( Token::OTHER, $part, $tokens, $index );
			}

			// parse_ambiguous_token() may have moved $index back to a merged token.
			$index++;
		}

		return $tokens;
	}

	/**
	 * Parse ambiguous tokens (that may need to be combined with the predecessors).
	 *
	 * Three outcomes are possible:
	 *  - the part is appended to the directly preceding token, which becomes an "other" token,
	 *  - the part is merged with the two preceding tokens into one "other" token, or
	 *  - the part is stored as a new token of the expected type.
	 *
	 * @param int     $expected_type Either Token::WORD or Token::OTHER.
	 * @param string  $part          The string fragment to parse.
	 * @param Token[] $tokens        The token array. Passed by reference.
	 * @param int     $index         The current index. Passed by reference.
	 */
	protected static function parse_ambiguous_token( $expected_type, $part, array &$tokens, &$index ) {

		// Make sure that things like email addresses and URLs are not broken up incorrectly.
		if ( self::is_preceeded_by( Token::OTHER, $tokens, $index ) || ( Token::OTHER === $expected_type && self::is_preceeded_by( Token::WORD, $tokens, $index ) ) ) {
			// Glue the part onto its direct predecessor.
			$index--;
			$old_part         = $tokens[ $index ]->value;
			$tokens[ $index ] = new Token( $old_part . $part, Token::OTHER );

		} elseif ( self::is_preceeded_by( Token::PUNCTUATION, $tokens, $index ) && self::is_not_preceeded_by( Token::SPACE, $tokens, $index, 2 ) ) {
			// Preceded by a non-space + punctuation: fold all three into one token.
			$old_part   = $tokens[ $index - 1 ]->value;
			$older_part = $tokens[ $index - 2 ]->value;

			$tokens[ $index - 2 ] = new Token( $older_part . $old_part . $part, Token::OTHER );
			unset( $tokens[ $index - 1 ] );
			$index = $index - 2;

		} else {
			// All good.
			$tokens[ $index ] = new Token( $part, $expected_type );
		}
	}

	/**
	 * Checks if the predecessor of the current token is of a certain type.
	 *
	 * @param  int   $type   A valid token type (e.g. Token::WORD).
	 * @param  array $tokens An array of tokens.
	 * @param  int   $index  The current token index.
	 * @param  int   $steps  Optional. The number steps to go back for the check. Default 1.
	 *
	 * @return bool `false` if there is no token that many steps back.
	 */
	protected static function is_preceeded_by( $type, array $tokens, $index, $steps = 1 ) {
		return $index - $steps >= 0 && $type === $tokens[ $index - $steps ]->type;
	}

	/**
	 * Checks if the predecessor of the current token is not of a certain type.
	 *
	 * Note that this is not the negation of is_preceeded_by(): a predecessor
	 * has to exist for the result to be `true`.
	 *
	 * @param  int   $type   A valid token type (e.g. Token::WORD).
	 * @param  array $tokens An array of tokens.
	 * @param  int   $index  The current token index.
	 * @param  int   $steps  Optional. The number steps to go back for the check. Default 1.
	 *
	 * @return bool `false` if there is no token that many steps back.
	 */
	protected static function is_not_preceeded_by( $type, array $tokens, $index, $steps = 1 ) {
		return $index - $steps >= 0 && $type !== $tokens[ $index - $steps ]->type;
	}

	/**
	 * Reloads $this->text (i.e. capture new inserted text, or remove those tokens whose values have been deleted).
	 *
	 * Warning: Tokens previously acquired through 'get' methods may not match new tokenization.
	 *
	 * @return bool Returns true on successful completion.
	 */
	public function reload() {
		return $this->load( $this->unload() );
	}

	/**
	 * Returns the complete text as a string and clears the parser.
	 *
	 * @return string
	 */
	public function unload() {
		$reassembled_text = '';

		foreach ( $this->text as $token ) {
			$reassembled_text .= $token->value;
		}

		$this->clear();

		return $reassembled_text;
	}

	/**
	 * Clears the currently set text from the parser.
	 */
	public function clear() {
		$this->text               = [];
		$this->current_strtoupper = null;
	}

	/**
	 * Updates the 'value' field for all matching tokens.
	 *
	 * Tokens are matched by array index; indices that do not exist in the
	 * currently loaded text are ignored.
	 *
	 * @param Token[] $tokens An array of tokens.
	 */
	public function update( $tokens ) {
		foreach ( $tokens as $index => $token ) {
			if ( ! isset( $this->text[ $index ] ) ) {
				continue; // nothing to update at this position.
			}

			$this->text[ $index ] = $this->text[ $index ]->with_value( $token->value );
		}
	}

	/**
	 * Retrieves all tokens of the currently set text.
	 *
	 * @return Token[] An array of numerically indexed tokens.
	 */
	public function get_all() {
		return $this->text;
	}

	/**
	 * Retrieves all tokens of the type "space".
	 *
	 * @return Token[] An array of numerically indexed tokens.
	 */
	public function get_spaces() {
		return $this->get_type( Token::SPACE );
	}

	/**
	 * Retrieves all tokens of the type "punctuation".
	 *
	 * @return Token[] An array of numerically indexed tokens.
	 */
	public function get_punctuation() {
		return $this->get_type( Token::PUNCTUATION );
	}

	/**
	 * Retrieves all tokens of the type "word".
	 *
	 * @param int $abc   Optional. Handling of all-letter words. Allowed values NO_ALL_LETTERS, ALLOW_ALL_LETTERS, REQUIRE_ALL_LETTERS. Default ALLOW_ALL_LETTERS.
	 * @param int $caps  Optional. Handling of capitalized words (setting does not affect non-letter characters). Allowed values NO_ALL_CAPS, ALLOW_ALL_CAPS, REQUIRE_ALL_CAPS. Default ALLOW_ALL_CAPS.
	 * @param int $comps Optional. Handling of compound words (setting does not affect all-letter words). Allowed values NO_COMPOUNDS, ALLOW_COMPOUNDS, REQUIRE_COMPOUNDS. Default ALLOW_COMPOUNDS.
	 *
	 * @return Token[] An array of numerically indexed tokens.
	 */
	public function get_words( $abc = self::ALLOW_ALL_LETTERS, $caps = self::ALLOW_ALL_CAPS, $comps = self::ALLOW_COMPOUNDS ) {
		// Return early if no text has been loaded.
		if ( ! isset( $this->text ) || ! is_callable( $this->current_strtoupper ) ) {
			return []; // abort.
		}

		// Result set (original token indices are preserved).
		$tokens = [];

		foreach ( $this->get_type( Token::WORD ) as $index => $token ) {
			if ( $this->conforms_to_letters_policy( $token, $abc ) &&
				 $this->conforms_to_caps_policy( $token, $caps ) &&
				 $this->conforms_to_compounds_policy( $token, $comps ) ) {

				$tokens[ $index ] = $token;
			}
		}

		return $tokens;
	}

	/**
	 * Check if the value of the token conforms to the given policy for letters.
	 *
	 * A token counts as "all letters" when stripping everything matched by
	 * _RE_HTML_LETTER_CONNECTORS leaves its value unchanged.
	 *
	 * @param  Token $token  Required.
	 * @param  int   $policy Either ALLOW_ALL_LETTERS, REQUIRE_ALL_LETTERS or NO_ALL_LETTERS.
	 *
	 * @return bool
	 */
	protected function conforms_to_letters_policy( Token $token, $policy ) {
		return $this->check_policy( $token, $policy, self::ALLOW_ALL_LETTERS, self::REQUIRE_ALL_LETTERS, self::NO_ALL_LETTERS, function( $value ) {
			return preg_replace( self::_RE_HTML_LETTER_CONNECTORS, '', $value );
		} );
	}

	/**
	 * Check if the value of the token conforms to the given policy for all-caps words.
	 *
	 * A token counts as "all caps" when upper-casing it with the current
	 * strtoupper function leaves its value unchanged.
	 *
	 * @param  Token $token  Required.
	 * @param  int   $policy Either ALLOW_ALL_CAPS, REQUIRE_ALL_CAPS or NO_ALL_CAPS.
	 *
	 * @return bool
	 */
	protected function conforms_to_caps_policy( Token $token, $policy ) {
		return $this->check_policy( $token, $policy, self::ALLOW_ALL_CAPS, self::REQUIRE_ALL_CAPS, self::NO_ALL_CAPS, function( $value ) {
			return call_user_func( $this->current_strtoupper, $value );
		} );
	}

	/**
	 * Check if the value of the token conforms to the given policy for compound words.
	 *
	 * A token counts as a compound when it contains at least one "-", i.e. when
	 * removing the hyphens changes its value. That is why NO_COMPOUNDS is the
	 * "equal" policy here and REQUIRE_COMPOUNDS the "non-equal" one.
	 *
	 * @param  Token $token  Required.
	 * @param  int   $policy Either ALLOW_COMPOUNDS, REQUIRE_COMPOUNDS or NO_COMPOUNDS.
	 *
	 * @return bool
	 */
	protected function conforms_to_compounds_policy( Token $token, $policy ) {
		return $this->check_policy( $token, $policy, self::ALLOW_COMPOUNDS, self::NO_COMPOUNDS, self::REQUIRE_COMPOUNDS, function( $value ) {
			return preg_replace( '/-/S', '', $value );
		} );
	}

	/**
	 * Check if the value of the token conforms to the given policy.
	 *
	 * @param  Token    $token             Required.
	 * @param  int      $policy            The policy to check.
	 * @param  int      $permissive_policy ALLOW_* policy constant.
	 * @param  int      $equal_policy      Policy constant to check when the transformed value is equal to the original token.
	 * @param  int      $non_equal_policy  Policy constant to check when the transformed value is different from the original token.
	 * @param  callable $transform_token   Function to transform the token value.
	 *
	 * @return bool `false` as well if $policy is none of the three given constants.
	 */
	protected function check_policy( Token $token, $policy, $permissive_policy, $equal_policy, $non_equal_policy, callable $transform_token ) {

		// Short circuit: the permissive policy accepts everything, no transformation needed.
		if ( $permissive_policy === $policy ) {
			return true;
		}

		$transformed = $transform_token( $token->value );

		return
			( $equal_policy === $policy && $transformed === $token->value ) ||
			( $non_equal_policy === $policy && $transformed !== $token->value );
	}

	/**
	 * Retrieves all tokens of the type "other".
	 *
	 * @return Token[] An array of numerically indexed tokens.
	 */
	public function get_other() {
		return $this->get_type( Token::OTHER );
	}

	/**
	 * Retrieves all tokens of the given type.
	 *
	 * The keys of the returned array are the positions of the tokens in the
	 * loaded text, so the result can be passed back to update().
	 *
	 * @param int $type The type to get.
	 *
	 * @return Token[] An array of numerically indexed tokens.
	 */
	public function get_type( $type ) {
		$tokens = [];

		foreach ( $this->text as $index => $token ) {
			if ( $token->type === $type ) {
				$tokens[ $index ] = $token;
			}
		}

		return $tokens;
	}
}
618