Scrutinizer GitHub App not installed

We could not synchronize checks via GitHub's checks API since Scrutinizer's GitHub App is not installed for this repository.

Install GitHub App

GitHub Access Token became invalid

It seems like the GitHub access token used for retrieving details about this repository from GitHub became invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.
Passed
Pull Request — master (#44)
by Der Mundschenk
02:15
created

Text_Parser::check_policy()   B

Complexity

Conditions 5
Paths 6

Size

Total Lines 13
Code Lines 7

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 13
rs 8.8571
c 0
b 0
f 0
cc 5
eloc 7
nc 6
nop 6
1
<?php
2
/**
3
 *  This file is part of PHP-Typography.
4
 *
5
 *  Copyright 2014-2017 Peter Putzer.
6
 *  Copyright 2012-2013 Marie Hogebrandt.
7
 *  Copyright 2009-2011 KINGdesk, LLC.
8
 *
9
 *  This program is free software; you can redistribute it and/or modify
10
 *  it under the terms of the GNU General Public License as published by
11
 *  the Free Software Foundation; either version 2 of the License, or
12
 *  (at your option) any later version.
13
 *
14
 *  This program is distributed in the hope that it will be useful,
15
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
16
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17
 *  GNU General Public License for more details.
18
 *
19
 *  You should have received a copy of the GNU General Public License along
20
 *  with this program; if not, write to the Free Software Foundation, Inc.,
21
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
22
 *
23
 *  ***
24
 *
25
 *  @package mundschenk-at/php-typography
26
 *  @license http://www.gnu.org/licenses/gpl-2.0.html
27
 */
28
29
namespace PHP_Typography;
30
31
use PHP_Typography\Text_Parser\Token;
32
33
/**
34
 * A class to parse plain text (such as the data of DOMText).
35
 *
36
 * Text_Parser assumes no HTML markup in the text (except for special HTML characters like &gt;).
37
 * If multibyte characters are passed, they must be encoded as UTF-8.
38
 */
39
class Text_Parser {
40
41
	// Policy flags accepted by get_words(). Every flag occupies its own bit.
	// All-letter words: exclude, allow, or require them.
	const NO_ALL_LETTERS      = 1 << 0;
	const ALLOW_ALL_LETTERS   = 1 << 1;
	const REQUIRE_ALL_LETTERS = 1 << 2;
	// All-caps words: exclude, allow, or require them.
	const NO_ALL_CAPS         = 1 << 3;
	const ALLOW_ALL_CAPS      = 1 << 4;
	const REQUIRE_ALL_CAPS    = 1 << 5;
	// Compound (hyphenated) words: exclude, allow, or require them.
	const NO_COMPOUNDS        = 1 << 6;
	const ALLOW_COMPOUNDS     = 1 << 7;
	const REQUIRE_COMPOUNDS   = 1 << 8;
50
51
	/**
	 * Find spacing FIRST (as it is the primary delimiter).
	 *
	 * Matches one spacing character, written as a named entity, a decimal or
	 * hexadecimal character reference, or the literal character itself:
	 *      tab | line feed | carriage return | space | non-breaking space | ethiopic wordspace
	 *      ogham space mark | en quad space | em quad space | en-space | three-per-em space
	 *      four-per-em space | six-per-em space | figure space | punctuation space | em-space
	 *      thin space | hair space | narrow no-break space
	 *      medium mathematical space | ideographic space
	 *
	 * Some characters are used inside words, we will not count these as a space for the purpose
	 * of finding word boundaries:
	 *      zero-width-space ("&#8203;", "&#x200b;")
	 *      zero-width-non-joiner ("&#8204;", "&#x200c;", "&zwnj;")
	 *      zero-width-joiner ("&#8205;", "&#x200d;", "&zwj;")
	 *
	 * The literal-character branch includes U+1680 (ogham space mark) so that it
	 * agrees with the decimal (5760) and hexadecimal (1680) reference branches.
	 */
	const _HTML_SPACING = '
			(?:
				(?:										# alpha matches
					&
					(?: nbsp|ensp|emsp|thinsp )
					;
				)
				|
				(?:										# decimal matches
					&\#
					(?: 09|1[03]|32|160|4961|5760|819[2-9]|820[0-2]|8239|8287|12288 )
					;
				)
				|
				(?:										# hexadecimal matches
					&\#x
					(?: 000[9ad]|0020|00a0|1361|1680|200[0-9a]|202f|205f|3000 )
					;
				)
				|
				(?:										# actual characters
					[\x{0009}\x{000a}\x{000d}\x{0020}\x{00a0}\x{1361}\x{1680}\x{2000}-\x{200a}\x{202f}\x{205f}\x{3000}]
				)
			)
		'; // required modifiers: x (extended syntax) i (case insensitive) u (utf8).
92
93
	const _SPACE = '(?:\s|' . self::_HTML_SPACING . ')+'; // One or more spacing characters. Required modifiers: x (extended syntax) i (case insensitive) u (utf8).
94
95
	/**
	 * Find punctuation and symbols before words (to capture preceding delimiting characters like hyphens or underscores).
	 *
	 * @see http://www.unicode.org/charts/PDF/U2000.pdf
	 *
	 * Matches one punctuation mark or symbol written as a named entity or as a
	 * decimal or hexadecimal character reference.
	 *  dec matches =   33-44|46-47|58-60|62-64|91-94|96|123-126|161-172|174-191|215|247|710|732|977-978|982|8211-8231|8240-8286|8289-8292|8352-8399|8448-8527|8592-9215|9632-9983|11776-11903
	 *  hex matches =   0021-002c|002e-002f|003a-003c|003e-0040|005b-005e|0060|007b-007e|00a1-00ac|00ae-00bf|00d7|00f7|02c6|02dc|03d1-03d2|
	 *                  03d6|2013-2027|2030-205e|2061-2064|20a0-20cf|2100-214f|2190-23ff|25a0-26ff|2e00-2e7f
	 *
	 * Some characters are used inside words, we will not count these as punctuation for the purpose
	 * of finding word boundaries:
	 *      hyphens ("&#45;", "&#173;", "&#8208;", "&#8209;", "&#8210;", "&#x002d;", "&#x00ad;", "&#x2010;", "&#x2011;", "&#x2012;", "&shy;")
	 *      underscore ("&#95;", "&#x005f;")
	 *
	 * The alternations below are kept in line with the ranges documented above:
	 * the entity names empty, nabla, or, cap, sim, cong, sup and nsub are spelled
	 * out correctly, the decimal list covers 8360-8399 and 8500-8509 and stops
	 * short of 8528-8591, and the hexadecimal list covers 21a0-21ff and 2600-26ff.
	 */
	const _HTML_PUNCTUATION = '
			(?:
				(?:										# alpha matches
					&
					(?:quot|amp|frasl|lt|gt|iexcl|cent|pound|curren|yen|brvbar|sect|uml|pound|ordf|laquo|not|reg|macr|deg|plusmn|sup2|sup3|acute|micro|para|middot|cedil|sup1|ordm|raquo|frac14|frac12|frac34|iquest|times|divide|circ|tilde|thetasym|upsih|piv|ndash|mdash|lsquo|rsquo|sbquo|ldquo|rdquo|bdquo|dagger|Dagger|bull|hellip|permil|prime|Prime|lsaquo|rsaquo|oline|frasl|euro|trade|alefsym|larr|uarr|rarr|darr|harr|crarr|lArr|uArr|rArr|dArr|hArr|forall|part|exist|empty|nabla|isin|notin|ni|prod|sum|minus|lowast|radic|prop|infin|ang|and|or|cap|cup|int|there4|sim|cong|asymp|ne|equiv|le|ge|sub|sup|nsub|sube|supe|oplus|otimes|perp|sdot|lceil|rceil|lfloor|rfloor|lang|rang|loz|spades|clubs|hearts|diams)
					;
				)
				|
				(?:										# decimal matches
					&\#
					(?: 3[3-9]|4[0-467]|5[89]|6[02-4]|9[1-46]|12[3-6]|16[1-9]|17[0-24-9]|18[0-9]|19[01]|215|247|710|732|97[78]|982|821[1-9]|822[0-9]|823[01]|82[4-7][0-9]|828[0-6]|8289|829[0-2]|835[2-9]|83[6-9][0-9]|844[89]|84[5-9][0-9]|850[0-9]|851[0-9]|852[0-7]|859[2-9]|8[6-9][0-9][0-9]|9[01][0-9][0-9]|920[0-9]|921[0-5]|963[2-9]|96[4-9][0-9]|9[78][0-9][0-9]|99[0-7][0-9]|998[0-3]|1177[6-9]|117[89][0-9]|118[0-9][0-9]|1190[0-3] )
					;
				)
				|
				(?:										# hexadecimal matches
					&\#x
					(?: 002[1-9a-cef]|003[a-cef]|0040|005[b-e]|0060|007[b-e]|00a[1-9a-cef]|00b[0-9a-f]|00d7|00f7|02c6|02dc|03d[126]|201[3-9a-f]|202[0-7]|20[34][0-9a-f]|205[0-9a-e]|206[1-4]|20[a-c][0-9a-f]|21[0-4][0-9a-f]|219[0-9a-f]|21[a-f][0-9a-f]|2[23][0-9a-f][0-9a-f]|25[a-f][0-9a-f]|26[0-9a-f][0-9a-f]|2e[0-7][0-9a-f] )
					;
				)
			)
		'; // required modifiers: x (extended syntax) i (case insensitive) u (utf8).
131
132
	// One or more punctuation characters, given literally or as HTML character references.
	const _PUNCTUATION = '
	(?:
		(?:
			[^\w\s\&\/\@]  # assume characters that are not word characters or whitespace are punctuation
						   # exclude & as that is an illegal stand-alone character (and would interfere with HTML character representations)
						   # exclude slash \/as to not include the last slash in a URL
						   # exclude @ as to keep twitter names together
			|
			' . self::_HTML_PUNCTUATION . ' # catch any HTML reps of punctuation
		)+
	)
	'; // required modifiers: x (extended syntax) i (case insensitive) u (utf8).
144
145
	/**
	 * Letter connectors allowed in words.
	 *
	 * Matches one connector written as a named entity, a decimal or hexadecimal
	 * character reference, or the literal character:
	 *      hyphens ("&#45;", "&#173;", "&#8208;", "&#8209;", "&#8210;", "&#x002d;", "&#x00ad;", "&#x2010;", "&#x2011;", "&#x2012;", "&shy;")
	 *      underscore ("&#95;", "&#x005f;")
	 *      zero-width-space ("&#8203;", "&#x200b;")
	 *      zero-width-non-joiner ("&#8204;", "&#x200c;", "&zwnj;")
	 *      zero-width-joiner ("&#8205;", "&#x200d;", "&zwj;")
	 */
	const _HTML_LETTER_CONNECTORS = '
		(?:
			(?:												# alpha matches
				&
				(?: shy|zwj|zwnj )
				;
			)
			|
			(?:												# decimal matches
				&\#
				(?: 45|95|173|820[3-589]|8210 )
				;
			)
			|
			(?:												# hexadecimal matches
				&\#x
				(?: 002d|005f|00ad|200[b-d]|201[0-2] )
				;
			)
			|
			(?:												# actual characters
				[\x{002d}\x{005f}\x{00ad}\x{200b}-\x{200d}\x{2010}-\x{2012}]
			)
		)
	'; // required modifiers: x (extended syntax) i (case insensitive) u (utf8).
178
179
	/**
	 * Word character HTML entities.
	 *
	 * Matches one word character written as a named entity, a decimal or
	 * hexadecimal character reference, or the literal character:
	 *   characters  0-9__ A-Z__ a-z___ other_special_chrs_____
	 *   decimal     48-57 65-90 97-122 192-214,216-246,248-255, 256-383
	 *   hex         30-39 41-5a 61-7a  c0-d6   d8-f6   f8-ff    0100-017f
	 *
	 * The reference branches follow the table above: decimal 65-90 (A-Z),
	 * hexadecimal 0030 (the digit zero) and hexadecimal 00da-00df are included.
	 */
	const _HTML_LETTERS = '
		(?:
			(?:												# alpha matches
				&
				(?:Agrave|Aacute|Acirc|Atilde|Auml|Aring|AElig|Ccedil|Egrave|Eacute|Ecirc|Euml|Igrave|Iacute|Icirc|Iuml|ETH|Ntilde|Ograve|Oacute|Ocirc|Otilde|Ouml|Oslash|Ugrave|Uacute|Ucirc|Uuml|Yacute|THORN|szlig|agrave|aacute|acirc|atilde|auml|aring|aelig|ccedil|egrave|eacute|ecirc|euml|igrave|iacute|icirc|iuml|eth|ntilde|ograve|oacute|ocirc|otilde|ouml|oslash|ugrave|uacute|ucirc|uuml|yacute|thorn|yuml)
				;
			)
			|
			(?:												# decimal matches
				&\#
				(?: 4[89]|5[0-7]|6[5-9]|[78][0-9]|90|9[7-9]|1[01][0-9]|12[0-2]|19[2-9]|20[0-9]|21[0-46-9]|2[23][0-9]|24[0-68-9]|2[5-9][0-9]|3[0-7][0-9]|38[0-3] )
				;
			)
			|
			(?:												# hexadecimal matches
				(?:
					&\#x00
					(?: 3[0-9]|4[1-9a-f]|5[0-9a]|6[1-9a-f]|7[0-9a]|c[0-9a-f]|d[0-689a-f]|e[0-9a-f]|f[0-689a-f] )
					;
				)
				|
				(?:
					&\#x01[0-7][0-9a-f];
				)
			)
			|
			(?:												# actual characters
				[0-9A-Za-z\x{00c0}-\x{00d6}\x{00d8}-\x{00f6}\x{00f8}-\x{017f}]
			)
		)
	'; // required modifiers: x (extended syntax) i (case insensitive) u (utf8).
233
234
	// One or more word characters or letter connectors, given literally or as HTML character references.
	const _WORD = '
	(?:
		(?<![\w\&])	 # negative lookbehind to ensure
					 #	1) we are preceded by a non-word-character, and
					 #	2) we are not inside an HTML character def
		(?:
			[\w\-\_\/]
			|
			' . self::_HTML_LETTERS . '
			|
			' . self::_HTML_LETTER_CONNECTORS . '
		)+
	)
	'; // required modifiers: x (extended syntax) u (utf8).

	// Find any text: a run of spacing, a run of punctuation, or a word.
	const _ANY_TEXT = self::_SPACE . '|' . self::_PUNCTUATION . '|' . self::_WORD; // required modifiers: x (extended syntax) i (case insensitive) u (utf8).

	// Complete regular expressions assembled from the components above.
	const _RE_ANY_TEXT               = '/(' . self::_ANY_TEXT . ')/Sxiu';
	const _RE_SPACE                  = '/\A' . self::_SPACE . '\Z/Sxiu';
	const _RE_PUNCTUATION            = '/\A' . self::_PUNCTUATION . '\Z/Ssxiu';
	const _RE_WORD                   = '/\A' . self::_WORD . '\Z/Sxu';
	const _RE_HTML_LETTER_CONNECTORS = '/' . self::_HTML_LETTER_CONNECTORS . '|[0-9\-_&#;\/]/Sxu';
	const _RE_MAX_STRING_LENGTH      = '/\w{500}/Ss'; // A run of 500 consecutive word characters.
259
260
	/**
	 * Upper-casing callable selected by load() for the current text, or null
	 * while no text is loaded.
	 *
	 * @var callable|null
	 */
	private $current_strtoupper;

	/**
	 * The tokens of the currently loaded text, keyed by numeric index.
	 *
	 * @var Token[]
	 */
	private $text = [];

	/**
	 * Regex components (fragments, not complete patterns).
	 *
	 * @var array
	 */
	private $components = [];

	/**
	 * Complete regex patterns.
	 *
	 * @var array
	 */
	private $regex = [];
289
290
	/**
	 * Creates a new, empty parser. Call load() to tokenize a text.
	 */
	public function __construct() {
		// Nothing to set up: the property defaults describe an empty parser.
	}
295
296
	/**
	 * Tokenizes a string and stores the tokens in $this->text.
	 *
	 * The parser state is only replaced once every step has succeeded; on any
	 * failure the previously loaded text (if any) is left untouched.
	 *
	 * @param string $raw_text A text fragment without any HTML markup.
	 *
	 * @return bool Returns `true` on successful completion, `false` otherwise
	 *              (non-string input, an overlong word, an unknown encoding,
	 *              a regex failure or an empty token list).
	 */
	public function load( $raw_text ) {
		if ( ! is_string( $raw_text ) ) {
			return false; // we have an error, abort.
		}

		// Abort if the text contains a run of 500 consecutive word characters (security concern).
		if ( preg_match( self::_RE_MAX_STRING_LENGTH, $raw_text ) ) {
			return false;
		}

		// Detect encoding.
		$str_functions = Strings::functions( $raw_text );
		if ( empty( $str_functions ) ) {
			return false; // unknown encoding.
		}

		// Split into raw parts. preg_split() returns false on failure, which
		// must not be handed to tokenize() as that method requires an array.
		$parts = preg_split( self::_RE_ANY_TEXT, $raw_text, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY );
		if ( false === $parts ) {
			return false;
		}

		// Tokenize the raw text parts.
		$this->current_strtoupper = $str_functions['strtoupper'];
		$this->text               = self::tokenize( $parts );

		// The token array should never be empty.
		return ! empty( $this->text );
	}
326
327
	/**
	 * Converts the split text fragments into Token objects.
	 *
	 * Spaces and punctuation always become tokens of their own. Words and
	 * everything else are passed to parse_ambiguous_token(), which may merge
	 * them with their predecessors so that things like email addresses and
	 * URLs are not broken up.
	 *
	 * @param string[] $parts An array of non-empty strings.
	 *
	 * @return Token[] An array of numerically indexed tokens.
	 */
	protected static function tokenize( array $parts ) {
		$tokens = [];
		$index  = 0;

		foreach ( $parts as $part ) {
			// Classify the fragment; the first matching pattern wins.
			if ( preg_match( self::_RE_SPACE, $part ) ) {
				$type = Token::SPACE;
			} elseif ( preg_match( self::_RE_PUNCTUATION, $part ) ) {
				$type = Token::PUNCTUATION;
			} elseif ( preg_match( self::_RE_WORD, $part ) ) {
				$type = Token::WORD;
			} else {
				$type = Token::OTHER;
			}

			if ( Token::SPACE === $type || Token::PUNCTUATION === $type ) {
				$tokens[ $index ] = new Token( $part, $type );
			} else {
				// May merge with preceding tokens and move $index backwards.
				self::parse_ambiguous_token( $type, $part, $tokens, $index );
			}

			$index++;
		}

		return $tokens;
	}
358
359
	/**
	 * Stores an ambiguous fragment, merging it with its predecessors if needed.
	 *
	 * On return, $index points at the slot that now holds the fragment, which
	 * may be lower than the value passed in when tokens were merged.
	 *
	 * @param int     $expected_type Either Token::WORD or Token::OTHER.
	 * @param string  $part          The string fragment to parse.
	 * @param Token[] $tokens        The token array. Passed by reference.
	 * @param int     $index         The current index. Passed by reference.
	 */
	protected static function parse_ambiguous_token( $expected_type, $part, array &$tokens, &$index ) {
		$follows_other = self::is_preceeded_by( Token::OTHER, $tokens, $index );
		$extends_word  = Token::OTHER === $expected_type && self::is_preceeded_by( Token::WORD, $tokens, $index );

		// Case 1: glue the fragment onto the directly preceding token.
		if ( $follows_other || $extends_word ) {
			$index--;
			$tokens[ $index ] = new Token( $tokens[ $index ]->value . $part, Token::OTHER );

			return;
		}

		// Case 2: <non-space><punctuation><fragment> collapses into a single token.
		$after_punctuation = self::is_preceeded_by( Token::PUNCTUATION, $tokens, $index );
		if ( $after_punctuation && self::is_not_preceeded_by( Token::SPACE, $tokens, $index, 2 ) ) {
			$merged = $tokens[ $index - 2 ]->value . $tokens[ $index - 1 ]->value . $part;

			$index           -= 2;
			$tokens[ $index ] = new Token( $merged, Token::OTHER );
			unset( $tokens[ $index + 1 ] );

			return;
		}

		// Case 3: nothing to merge, store the fragment as it is.
		$tokens[ $index ] = new Token( $part, $expected_type );
	}
388
389
	/**
	 * Tells whether the token a given number of steps before the current
	 * index exists and has the given type.
	 *
	 * @param  int   $type   A valid token type (e.g. Token::WORD).
	 * @param  array $tokens An array of tokens.
	 * @param  int   $index  The current token index.
	 * @param  int   $steps  Optional. The number steps to go back for the check. Default 1.
	 *
	 * @return bool False if stepping back would leave the array.
	 */
	protected static function is_preceeded_by( $type, array $tokens, $index, $steps = 1 ) {
		$position = $index - $steps;

		if ( $position < 0 ) {
			return false;
		}

		return $tokens[ $position ]->type === $type;
	}
402
403
	/**
	 * Tells whether the token a given number of steps before the current
	 * index exists and has a type other than the given one.
	 *
	 * @param  int   $type   A valid token type (e.g. Token::WORD).
	 * @param  array $tokens An array of tokens.
	 * @param  int   $index  The current token index.
	 * @param  int   $steps  Optional. The number steps to go back for the check. Default 1.
	 *
	 * @return bool False if stepping back would leave the array.
	 */
	protected static function is_not_preceeded_by( $type, array $tokens, $index, $steps = 1 ) {
		$position = $index - $steps;

		if ( $position < 0 ) {
			return false;
		}

		return $tokens[ $position ]->type !== $type;
	}
416
417
418
	/**
	 * Re-tokenizes the current text (i.e. captures newly inserted text and
	 * drops tokens whose values have been emptied).
	 *
	 * Warning: Tokens previously acquired through 'get' methods may not match new tokenization.
	 *
	 * @return bool Returns true on successful completion.
	 */
	public function reload() {
		$current_text = $this->unload();

		return $this->load( $current_text );
	}
428
429
	/**
	 * Joins all token values back into one string and clears the parser.
	 *
	 * @return string
	 */
	public function unload() {
		$values = array_map( function( $token ) {
			return $token->value;
		}, $this->text );

		$this->clear();

		return implode( '', $values );
	}
445
446
	/**
	 * Resets the parser to its empty state (no tokens, no upper-casing callable).
	 */
	public function clear() {
		$this->current_strtoupper = null;
		$this->text               = [];
	}
453
454
	/**
	 * Updates the 'value' field for all matching tokens.
	 *
	 * A token matches when its array key is an index of the currently loaded
	 * text. Keys that do not exist in the parser (for example tokens obtained
	 * before a reload changed the tokenization) are skipped.
	 *
	 * @param Token[] $tokens An array of tokens, keyed by their index in the parser.
	 */
	public function update( $tokens ) {
		foreach ( $tokens as $index => $token ) {
			if ( ! isset( $this->text[ $index ] ) ) {
				continue; // No such token in the parser, nothing to update.
			}

			$this->text[ $index ] = $this->text[ $index ]->with_value( $token->value );
		}
	}
464
465
	/**
	 * Returns every token of the currently loaded text, regardless of type.
	 *
	 * @return Token[] An array of numerically indexed tokens.
	 */
	public function get_all() {
		$all_tokens = $this->text;

		return $all_tokens;
	}
473
474
	/**
	 * Returns the tokens of type Token::SPACE, keyed by their original index.
	 *
	 * @return Token[] An array of numerically indexed tokens.
	 */
	public function get_spaces() {
		$spaces = $this->get_type( Token::SPACE );

		return $spaces;
	}
482
483
	/**
	 * Returns the tokens of type Token::PUNCTUATION, keyed by their original index.
	 *
	 * @return Token[] An array of numerically indexed tokens.
	 */
	public function get_punctuation() {
		$punctuation = $this->get_type( Token::PUNCTUATION );

		return $punctuation;
	}
491
492
	/**
	 * Returns the word tokens that satisfy all three given policies.
	 *
	 * @param int $abc   Optional. Handling of all-letter words. Allowed values NO_ALL_LETTERS, ALLOW_ALL_LETTERS, REQUIRE_ALL_LETTERS. Default ALLOW_ALL_LETTERS.
	 * @param int $caps  Optional. Handling of capitalized words (setting does not affect non-letter characters). Allowed values NO_ALL_CAPS, ALLOW_ALL_CAPS, REQUIRE_ALL_CAPS. Default ALLOW_ALL_CAPS.
	 * @param int $comps Optional. Handling of compound words (setting does not affect all-letter words). Allowed values NO_COMPOUNDS, ALLOW_COMPOUNDS, REQUIRE_COMPOUNDS. Default ALLOW_COMPOUNDS.
	 *
	 * @return Token[] An array of numerically indexed tokens (original indices are kept).
	 */
	public function get_words( $abc = self::ALLOW_ALL_LETTERS, $caps = self::ALLOW_ALL_CAPS, $comps = self::ALLOW_COMPOUNDS ) {
		// Without loaded text (or an upper-casing callable) there is nothing to return.
		if ( ! isset( $this->text ) || ! is_callable( $this->current_strtoupper ) ) {
			return [];
		}

		// array_filter() keeps the original keys of the matching tokens.
		return array_filter( $this->get_type( Token::WORD ), function( $word ) use ( $abc, $caps, $comps ) {
			return $this->conforms_to_letters_policy( $word, $abc )
				&& $this->conforms_to_caps_policy( $word, $caps )
				&& $this->conforms_to_compounds_policy( $word, $comps );
		} );
	}
523
524
	/**
	 * Checks the token value against the given all-letters policy.
	 *
	 * The value is compared with a copy from which everything matched by
	 * _RE_HTML_LETTER_CONNECTORS has been removed.
	 *
	 * @param  Token $token  Required.
	 * @param  int   $policy Either ALLOW_ALL_LETTERS, REQUIRE_ALL_LETTERS or NO_ALL_LETTERS.
	 *
	 * @return bool
	 */
	protected function conforms_to_letters_policy( Token $token, $policy ) {
		$strip_connectors = function( $value ) {
			return preg_replace( self::_RE_HTML_LETTER_CONNECTORS, '', $value );
		};

		return $this->check_policy(
			$token,
			$policy,
			self::ALLOW_ALL_LETTERS,
			self::REQUIRE_ALL_LETTERS,
			self::NO_ALL_LETTERS,
			$strip_connectors
		);
	}
537
538
	/**
	 * Checks the token value against the given all-caps policy.
	 *
	 * The value is compared with its upper-cased copy, produced by the
	 * callable stored in $this->current_strtoupper.
	 *
	 * @param  Token $token  Required.
	 * @param  int   $policy Either ALLOW_ALL_CAPS, REQUIRE_ALL_CAPS or NO_ALL_CAPS.
	 *
	 * @return bool
	 */
	protected function conforms_to_caps_policy( Token $token, $policy ) {
		$to_upper = $this->current_strtoupper;

		$uppercase = function( $value ) use ( $to_upper ) {
			return call_user_func( $to_upper, $value );
		};

		return $this->check_policy(
			$token,
			$policy,
			self::ALLOW_ALL_CAPS,
			self::REQUIRE_ALL_CAPS,
			self::NO_ALL_CAPS,
			$uppercase
		);
	}
551
552
	/**
	 * Checks the token value against the given compound-words policy.
	 *
	 * The value is compared with a copy from which all hyphen-minus characters
	 * have been removed: an unchanged value has no compounds.
	 *
	 * @param  Token $token  Required.
	 * @param  int   $policy Either ALLOW_COMPOUNDS, REQUIRE_COMPOUNDS or NO_COMPOUNDS.
	 *
	 * @return bool
	 */
	protected function conforms_to_compounds_policy( Token $token, $policy ) {
		$strip_hyphens = function( $value ) {
			return str_replace( '-', '', $value );
		};

		return $this->check_policy(
			$token,
			$policy,
			self::ALLOW_COMPOUNDS,
			self::NO_COMPOUNDS,
			self::REQUIRE_COMPOUNDS,
			$strip_hyphens
		);
	}
565
566
	/**
	 * Generic policy check shared by the conforms_to_*_policy() methods.
	 *
	 * The token value is transformed and compared (strictly) with the original
	 * value; which policy is satisfied depends on whether the two are identical.
	 *
	 * @param  Token    $token             Required.
	 * @param  int      $policy            The policy to check.
	 * @param  int      $permissive_policy ALLOW_* policy constant; always satisfied.
	 * @param  int      $equal_policy      Policy constant to check when the transformed value is equal to the original token.
	 * @param  int      $non_equal_policy  Policy constant to check when the transformed value is different from the original token.
	 * @param  callable $transform_token   Function to transform the token value.
	 *
	 * @return bool
	 */
	protected function check_policy( Token $token, $policy, $permissive_policy, $equal_policy, $non_equal_policy, callable $transform_token ) {

		// The permissive policy accepts everything, so skip the transformation.
		if ( $permissive_policy === $policy ) {
			return true;
		}

		$unchanged = $transform_token( $token->value ) === $token->value;

		return $unchanged ? $equal_policy === $policy : $non_equal_policy === $policy;
	}
591
592
	/**
	 * Returns the tokens of type Token::OTHER, keyed by their original index.
	 *
	 * @return Token[] An array of numerically indexed tokens.
	 */
	public function get_other() {
		$other = $this->get_type( Token::OTHER );

		return $other;
	}
600
601
	/**
	 * Returns the tokens of the requested type, keyed by their original index.
	 *
	 * @param int $type The type to get.
	 *
	 * @return Token[] An array of numerically indexed tokens.
	 */
	public function get_type( $type ) {
		// array_filter() keeps the original keys of the matching tokens.
		return array_filter( $this->text, function( $candidate ) use ( $type ) {
			return $candidate->type === $type;
		} );
	}
619
}
620