Scrutinizer GitHub App not installed

We could not synchronize checks via GitHub's checks API since Scrutinizer's GitHub App is not installed for this repository.

Install GitHub App

GitHub Access Token became invalid

It seems like the GitHub access token used for retrieving details about this repository from GitHub became invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.

Issues (13)

src/class-text-parser.php (1 issue)

1
<?php
2
/**
3
 *  This file is part of PHP-Typography.
4
 *
5
 *  Copyright 2014-2019 Peter Putzer.
6
 *  Copyright 2012-2013 Marie Hogebrandt.
7
 *  Copyright 2009-2011 KINGdesk, LLC.
8
 *
9
 *  This program is free software; you can redistribute it and/or modify
10
 *  it under the terms of the GNU General Public License as published by
11
 *  the Free Software Foundation; either version 2 of the License, or
12
 *  (at your option) any later version.
13
 *
14
 *  This program is distributed in the hope that it will be useful,
15
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
16
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17
 *  GNU General Public License for more details.
18
 *
19
 *  You should have received a copy of the GNU General Public License along
20
 *  with this program; if not, write to the Free Software Foundation, Inc.,
21
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
22
 *
23
 *  ***
24
 *
25
 *  @package mundschenk-at/php-typography
26
 *  @license http://www.gnu.org/licenses/gpl-2.0.html
27
 */
28
29
namespace PHP_Typography;
30
31
use PHP_Typography\Text_Parser\Token;
32
33
/**
34
 * A class to parse plain text (such as the data of DOMText).
35
 *
36
 * Text_Parser assumes no HTML markup in the text (except for HTML character references like &gt;).
37
 * If multibyte characters are passed, they must be encoded as UTF-8.
38
 */
39
class Text_Parser {
40
41
	const NO_ALL_LETTERS      = 0b000000000001;
42
	const ALLOW_ALL_LETTERS   = 0b000000000010;
43
	const REQUIRE_ALL_LETTERS = 0b000000000100;
44
	const NO_ALL_CAPS         = 0b000000001000;
45
	const ALLOW_ALL_CAPS      = 0b000000010000;
46
	const REQUIRE_ALL_CAPS    = 0b000000100000;
47
	const NO_COMPOUNDS        = 0b000001000000;
48
	const ALLOW_COMPOUNDS     = 0b000010000000;
49
	const REQUIRE_COMPOUNDS   = 0b000100000000;
50
51
	/**
52
	 * Find spacing FIRST (as it is the primary delimiter)
53
	 *
54
	 * Find the HTML character representation for the following characters:
55
	 *      tab | line feed | carriage return | space | non-breaking space | ethiopic wordspace
56
	 *      ogham space mark | en quad space | em quad space | en-space | three-per-em space
57
	 *      four-per-em space | six-per-em space | figure space | punctuation space | em-space
58
	 *      thin space | hair space | narrow no-break space
59
	 *      medium mathematical space | ideographic space
60
	 * Some characters are used inside words, we will not count these as a space for the purpose
61
	 * of finding word boundaries:
62
	 *      zero-width-space ("&#8203;", "&#x200b;")
63
	 *      zero-width-non-joiner ("&#8204;", "&#x200c;", "&zwnj;")
64
	 *      zero-width-joiner ("&#8205;", "&#x200d;", "&zwj;")
65
	 */
66
	const _HTML_SPACING = '
67
			(?:
68
				(?:										# alpha matches
69
					&
70
					(?: nbsp|ensp|emsp|thinsp )
71
					;
72
				)
73
				|
74
				(?:										# decimal matches
75
					&\#
76
					(?: 09|1[03]|32|160|4961|5760|819[2-9]|820[0-2]|8239|8287|12288 )
77
					;
78
				)
79
				|
80
				(?:										# hexidecimal matches
81
					&\#x
82
					(?: 000[9ad]|0020|00a0|1361|1680|200[0-9a]|202f|205f|3000 )
83
					;
84
				)
85
				|
86
				(?:										# actual characters
87
					\x{0009}|\x{000a}|\x{000d}|\x{0020}|\x{00a0}|\x{1361}|\x{2000}|\x{2001}|\x{2002}|\x{2003}|
88
					\x{2004}|\x{2005}|\x{2006}|\x{2007}|\x{2008}|\x{2009}|\x{200a}|\x{202f}|\x{205f}|\x{3000}
89
				)
90
			)
91
		'; // required modifiers: x (multiline pattern) i (case insensitive) u (utf8).
92
93
	const _SPACE = '(?:\s|' . self::_HTML_SPACING . ')+'; // required modifiers: x (multiline pattern) i (case insensitive) $utf8.
94
95
	/**
	 * HTML character references that are treated as punctuation or symbols (used to
	 * capture preceding delimiting characters such as quotes or dashes).
	 *
	 * @see http://www.unicode.org/charts/PDF/U2000.pdf
	 *
	 * Code points covered by the numeric alternatives:
	 *  dec matches =   33-44|46-47|58-60|62-64|91-94|96|123-126|161-172|174-191|215|247|710|732|977-978|982|8211-8231|8240-8286|8289-8292|8352-8399|8448-8527|8592-9215|9632-9983|11776-11903
	 *  hex matches =   0021-002c|002e-002f|003a-003c|003e-0040|005b-005e|0060|007b-007e|00a1-00ac|00ae-00bf|00d7|00f7|02c6|02dc|03d1-03d2|
	 *                  03d6|2013-2027|2030-205e|2061-2064|20a0-20cf|2100-214f|2190-23ff|25a0-26ff|2e00-2e7f
	 *
	 * The decimal and hexadecimal tables below are kept in sync with these ranges
	 * (e.g. "&#8364;" and "&#x20ac;" both match the euro sign).
	 *
	 * Some characters are used inside words, we will not count these as punctuation for the purpose
	 * of finding word boundaries:
	 *      hyphens ("&#45;", "&#173;", "&#8208;", "&#8209;", "&#8210;", "&#x002d;", "&#x00ad;", "&#x2010;", "&#x2011;", "&#x2012;", "&shy;")
	 *      underscore ("&#95;", "&#x005f;")
	 */
	const _HTML_PUNCTUATION = '
			(?:
				(?:										# alpha matches
					&
					(?:quot|amp|frasl|lt|gt|iexcl|cent|pound|curren|yen|brvbar|sect|uml|copy|ordf|laquo|not|reg|macr|deg|plusmn|sup2|sup3|acute|micro|para|middot|cedil|sup1|ordm|raquo|frac14|frac12|frac34|iquest|times|divide|circ|tilde|thetasym|upsih|piv|ndash|mdash|lsquo|rsquo|sbquo|ldquo|rdquo|bdquo|dagger|Dagger|bull|hellip|permil|prime|Prime|lsaquo|rsaquo|oline|frasl|euro|trade|alefsym|larr|uarr|rarr|darr|harr|crarr|lArr|uArr|rArr|dArr|hArr|forall|part|exist|empty|nabla|isin|notin|ni|prod|sum|minus|lowast|radic|prop|infin|ang|and|or|cap|cup|int|there4|sim|cong|asymp|ne|equiv|le|ge|sub|sup|nsub|sube|supe|oplus|otimes|perp|sdot|lceil|rceil|lfloor|rfloor|lang|rang|loz|spades|clubs|hearts|diams)
					;
				)
				|
				(?:										# decimal matches
					&\#
					(?: 3[3-9]|4[0-467]|5[89]|6[02-4]|9[1-46]|12[3-6]|16[1-9]|17[0-24-9]|18[0-9]|19[01]|215|247|710|732|97[78]|982|821[1-9]|822[0-9]|823[01]|82[4-7][0-9]|828[0-6]|8289|829[0-2]|835[2-9]|83[6-9][0-9]|844[89]|84[5-9][0-9]|85[01][0-9]|852[0-7]|859[2-9]|8[6-9][0-9][0-9]|9[01][0-9][0-9]|920[0-9]|921[0-5]|963[2-9]|96[4-9][0-9]|9[78][0-9][0-9]|99[0-7][0-9]|998[0-3]|1177[6-9]|117[89][0-9]|118[0-9][0-9]|1190[0-3] )
					;
				)
				|
				(?:										# hexidecimal matches
					&\#x
					(?: 002[1-9a-cef]|003[a-cef]|0040|005[b-e]|0060|007[b-e]|00a[1-9a-cef]|00b[0-9a-f]|00d7|00f7|02c6|02dc|03d[126]|201[3-9a-f]|202[0-7]|20[34][0-9a-f]|205[0-9a-e]|206[1-4]|20[a-c][0-9a-f]|21[0-4][0-9a-f]|219[0-9a-f]|21[a-f][0-9a-f]|2[23][0-9a-f][0-9a-f]|25[a-f][0-9a-f]|26[0-9a-f][0-9a-f]|2e[0-7][0-9a-f] )
					;
				)
			)
		'; // required modifiers: x (multiline pattern) i (case insensitive) u (utf8).
131
132
	const _PUNCTUATION = '
133
	(?:
134
		(?:
135
			[^\w\s\&\/\@]  # assume characters that are not word spaces or whitespace are punctuation
136
						   # exclude & as that is an illegal stand-alone character (and would interfere with HTML character representations
137
						   # exclude slash \/as to not include the last slash in a URL
138
						   # exclude @ as to keep twitter names together
139
			|
140
			' . self::_HTML_PUNCTUATION . ' # catch any HTML reps of punctuation
141
		)+
142
	)
143
	';// required modifiers: x (multiline pattern) i (case insensitive) u (utf8).
144
145
	/**
146
	 * Letter connectors allowed in words
147
	 *      hyphens ("&#45;", "&#173;", "&#8208;", "&#8209;", "&#8210;", "&#x002d;", "&#x00ad;", "&#x2010;", "&#x2011;", "&#x2012;", "&shy;")
148
	 *      underscore ("&#95;", "&#x005f;")
149
	 *      zero-width-space ("&#8203;", "&#x200b;")
150
	 *      zero-width-non-joiner ("&#8204;", "&#x200c;", "&zwnj;")
151
	 *      zero-width-joiner ("&#8205;", "&#x200d;", "&zwj;")
152
	 */
153
	const _HTML_LETTER_CONNECTORS = '
154
		(?:
155
			(?:												# alpha matches
156
				&
157
				(?: shy|zwj|zwnj )
158
				;
159
			)
160
			|
161
			(?:												# decimal matches
162
				&\#
163
				(?: 45|95|173|820[3-589]|8210 )
164
				;
165
			)
166
			|
167
			(?:												# hexidecimal matches
168
				&\#x
169
				(?: 002d|005f|00ad|200[b-d]|201[0-2] )
170
				;
171
			)
172
			|
173
			(?:												# actual characters
174
				\x{002d}|\x{005f}|\x{00ad}|\x{200b}|\x{200c}|\x{200d}|\x{2010}|\x{2011}|\x{2012}
175
			)
176
		)
177
	'; // required modifiers: x (multiline pattern) i (case insensitive) u (utf8).
178
179
	/**
	 * Word character html entities
	 *   characters  0-9__ A-Z__ a-z___ other_special_chrs_____
	 *   decimal     48-57 65-90 97-122 192-214,216-246,248-255, 256-383
	 *   hex         30-39 41-5a 61-7a  c0-d6   d8-f6   f8-ff    0100-017f
	 *
	 * The decimal and hexadecimal alternatives below cover exactly the ranges listed
	 * above (multiplication sign 215/d7 and division sign 247/f7 are deliberately
	 * excluded, they are punctuation).
	 */
	const _HTML_LETTERS = '
		(?:
			(?:												# alpha matches
				&
				(?:Agrave|Aacute|Acirc|Atilde|Auml|Aring|AElig|Ccedil|Egrave|Eacute|Ecirc|Euml|Igrave|Iacute|Icirc|Iuml|ETH|Ntilde|Ograve|Oacute|Ocirc|Otilde|Ouml|Oslash|Ugrave|Uacute|Ucirc|Uuml|Yacute|THORN|szlig|agrave|aacute|acirc|atilde|auml|aring|aelig|ccedil|egrave|eacute|ecirc|euml|igrave|iacute|icirc|iuml|eth|ntilde|ograve|oacute|ocirc|otilde|ouml|oslash|ugrave|uacute|ucirc|uuml|yacute|thorn|yuml)
				;
			)
			|
			(?:												# decimal matches
				&\#
				(?: 4[89]|5[0-7]|6[5-9]|[78][0-9]|90|9[7-9]|1[01][0-9]|12[0-2]|19[2-9]|20[0-9]|21[0-46-9]|2[23][0-9]|24[0-68-9]|2[5-9][0-9]|3[0-7][0-9]|38[0-3] )
				;
			)
			|
			(?:												# hexidecimal matches
				(?:
					&\#x00
					(?: 3[0-9]|4[1-9a-f]|5[0-9a]|6[1-9a-f]|7[0-9a]|c[0-9a-f]|d[0-689a-f]|e[0-9a-f]|f[0-689a-f] )
					;
				)
				|
				(?:
					&\#x01[0-7][0-9a-f];
				)
			)
			|
			(?:												# actual characters
				[0-9A-Za-z]|\x{00c0}|\x{00c1}|\x{00c2}|\x{00c3}|\x{00c4}|\x{00c5}|\x{00c6}|\x{00c7}|\x{00c8}|\x{00c9}|
				\x{00ca}|\x{00cb}|\x{00cc}|\x{00cd}|\x{00ce}|\x{00cf}|\x{00d0}|\x{00d1}|\x{00d2}|\x{00d3}|\x{00d4}|
				\x{00d5}|\x{00d6}|\x{00d8}|\x{00d9}|\x{00da}|\x{00db}|\x{00dc}|\x{00dd}|\x{00de}|\x{00df}|\x{00e0}|
				\x{00e1}|\x{00e2}|\x{00e3}|\x{00e4}|\x{00e5}|\x{00e6}|\x{00e7}|\x{00e8}|\x{00e9}|\x{00ea}|\x{00eb}|
				\x{00ec}|\x{00ed}|\x{00ee}|\x{00ef}|\x{00f0}|\x{00f1}|\x{00f2}|\x{00f3}|\x{00f4}|\x{00f5}|\x{00f6}|
				\x{00f8}|\x{00f9}|\x{00fa}|\x{00fb}|\x{00fc}|\x{00fd}|\x{00fe}|\x{00ff}|\x{0100}|\x{0101}|\x{0102}|
				\x{0103}|\x{0104}|\x{0105}|\x{0106}|\x{0107}|\x{0108}|\x{0109}|\x{010a}|\x{010b}|\x{010c}|\x{010d}|
				\x{010e}|\x{010f}|\x{0110}|\x{0111}|\x{0112}|\x{0113}|\x{0114}|\x{0115}|\x{0116}|\x{0117}|\x{0118}|
				\x{0119}|\x{011a}|\x{011b}|\x{011c}|\x{011d}|\x{011e}|\x{011f}|\x{0120}|\x{0121}|\x{0122}|\x{0123}|
				\x{0124}|\x{0125}|\x{0126}|\x{0127}|\x{0128}|\x{0129}|\x{012a}|\x{012b}|\x{012c}|\x{012d}|\x{012e}|
				\x{012f}|\x{0130}|\x{0131}|\x{0132}|\x{0133}|\x{0134}|\x{0135}|\x{0136}|\x{0137}|\x{0138}|\x{0139}|
				\x{013a}|\x{013b}|\x{013c}|\x{013d}|\x{013e}|\x{013f}|\x{0140}|\x{0141}|\x{0142}|\x{0143}|\x{0144}|
				\x{0145}|\x{0146}|\x{0147}|\x{0148}|\x{0149}|\x{014a}|\x{014b}|\x{014c}|\x{014d}|\x{014e}|\x{014f}|
				\x{0150}|\x{0151}|\x{0152}|\x{0153}|\x{0154}|\x{0155}|\x{0156}|\x{0157}|\x{0158}|\x{0159}|\x{015a}|
				\x{015b}|\x{015c}|\x{015d}|\x{015e}|\x{015f}|\x{0160}|\x{0161}|\x{0162}|\x{0163}|\x{0164}|\x{0165}|
				\x{0166}|\x{0167}|\x{0168}|\x{0169}|\x{016a}|\x{016b}|\x{016c}|\x{016d}|\x{016e}|\x{016f}|\x{0170}|
				\x{0171}|\x{0172}|\x{0173}|\x{0174}|\x{0175}|\x{0176}|\x{0177}|\x{0178}|\x{0179}|\x{017a}|\x{017b}|
				\x{017c}|\x{017d}|\x{017e}|\x{017f}
			)
		)
	'; // required modifiers: x (multiline pattern) i (case insensitive) u (utf8).
233
234
	const _WORD = '
235
	(?:
236
		(?<![\w\&])	 # negative lookbehind to ensure
237
					 #	1) we are proceeded by a non-word-character, and
238
					 #	2) we are not inside an HTML character def
239
		(?:
240
			[\w\-\_\/]
241
			|
242
			' . self::_HTML_LETTERS . '
243
			|
244
			' . self::_HTML_LETTER_CONNECTORS . '
245
		)+
246
	)
247
	'; // required modifiers: x (multiline pattern) u (utf8).
248
249
	// Find any text.
250
	const _ANY_TEXT = self::_SPACE . '|' . self::_PUNCTUATION . '|' . self::_WORD; // required modifiers: x (multiline pattern) i (case insensitive) u (utf8).
251
252
	// Regular expressions.
253
	const _RE_ANY_TEXT               = '/(' . self::_ANY_TEXT . ')/Sxiu';
254
	const _RE_SPACE                  = '/\A' . self::_SPACE . '\Z/Sxiu';
255
	const _RE_PUNCTUATION            = '/\A' . self::_PUNCTUATION . '\Z/Ssxiu';
256
	const _RE_WORD                   = '/\A' . self::_WORD . '\Z/Sxu';
257
	const _RE_HTML_LETTER_CONNECTORS = '/' . self::_HTML_LETTER_CONNECTORS . '|[0-9\-_&#;\/]/Sxu';
258
	const _RE_MAX_STRING_LENGTH      = '/\w{500}/Ss';
259
260
	/**
261
	 * The current strtoupper function to use (either 'strtoupper' or 'mb_strtoupper').
262
	 *
263
	 * @var callable
264
	 */
265
	private $current_strtoupper = 'strtoupper';
266
267
	/**
268
	 * The tokenized text.
269
	 *
270
	 * @var array $text {
271
	 *      @type Text_Parser\Token $index
272
	 * }
273
	 */
274
	private $text = [];
275
276
	/**
	 * Creates a new, empty parser object.
	 *
	 * No text is loaded until `load()` is called.
	 */
	public function __construct() {
		// Intentionally empty: all properties carry their defaults.
	}
281
282
	/**
	 * Tokenizes a string and stores the tokens in $this->text.
	 *
	 * The parser state ($this->text and the strtoupper callable) is only replaced
	 * after the input has been split successfully. If the input is rejected or the
	 * split fails, the previously loaded tokens are left untouched.
	 *
	 * @param string $raw_text A text fragment without any HTML markup.
	 *
	 * @return bool Returns `true` on successful completion, `false` otherwise
	 *              (non-string input, overlong word, unknown encoding, PCRE
	 *              failure, or an empty token list).
	 */
	public function load( $raw_text ) {
		if ( ! \is_string( $raw_text ) ) {
			return false; // we have an error, abort.
		}

		// Abort if the text contains a run of 500 consecutive word characters (security concern).
		if ( \preg_match( self::_RE_MAX_STRING_LENGTH, $raw_text ) ) {
			return false;
		}

		// Detect encoding.
		$str_functions = Strings::functions( $raw_text );
		if ( empty( $str_functions ) ) {
			return false; // unknown encoding.
		}

		// Split the raw text into its parts. `preg_split` returns `false` when PCRE
		// fails (e.g. malformed UTF-8 or an exhausted backtrack limit); that must
		// not be handed to `tokenize()`, which only accepts arrays.
		$parts = \preg_split( self::_RE_ANY_TEXT, $raw_text, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY );
		if ( ! \is_array( $parts ) ) {
			return false;
		}

		// Everything went fine, commit the new state.
		$this->current_strtoupper = $str_functions['strtoupper'];
		$this->text               = self::tokenize( $parts );

		// The token array should never be empty.
		return ! empty( $this->text );
	}
312
313
	/**
	 * Converts the list of text fragments into a list of tokens.
	 *
	 * Whitespace and punctuation fragments map directly to one token each. Every
	 * other fragment is handed to `parse_ambiguous_token()`, which may merge it
	 * with its predecessors (and adjusts the running index accordingly).
	 *
	 * @param string[] $parts An array of non-empty strings.
	 *
	 * @return Token[] An array of numerically indexed tokens.
	 */
	protected static function tokenize( array $parts ) {
		$result   = [];
		$position = 0;

		foreach ( $parts as $fragment ) {
			$is_space       = (bool) \preg_match( self::_RE_SPACE, $fragment );
			$is_punctuation = ! $is_space && (bool) \preg_match( self::_RE_PUNCTUATION, $fragment );

			if ( $is_space || $is_punctuation ) {
				// Unambiguous fragment, store it as is.
				$result[ $position ] = new Token( $fragment, $is_space ? Token::SPACE : Token::PUNCTUATION );
			} else {
				// Words and everything else may have to be glued to preceding tokens,
				// so that things like email addresses and URLs are not broken up.
				$expected = \preg_match( self::_RE_WORD, $fragment ) ? Token::WORD : Token::OTHER;
				self::parse_ambiguous_token( $expected, $fragment, $result, $position );
			}

			// The position may have been moved back by a merge; always advance by one.
			$position++;
		}

		return $result;
	}
344
345
	/**
	 * Parses an ambiguous token (one that may need to be combined with its predecessors).
	 *
	 * Three outcomes are possible:
	 *  - the fragment is appended to the directly preceding token (index moves back by one),
	 *  - the fragment is merged with the two preceding tokens (index moves back by two), or
	 *  - the fragment is stored as a new token of the expected type (index unchanged).
	 * Merged tokens always get the type Token::OTHER.
	 *
	 * @param int     $expected_type Either Token::WORD or Token::OTHER.
	 * @param string  $part          The string fragment to parse.
	 * @param Token[] $tokens        The token array. Passed by reference.
	 * @param int     $index         The current index. Passed by reference.
	 */
	protected static function parse_ambiguous_token( $expected_type, $part, array &$tokens, &$index ) {

		// Case 1: glue the fragment onto the preceding token, so that things like
		// email addresses and URLs are not broken up incorrectly.
		$append_to_previous = self::is_preceeded_by( Token::OTHER, $tokens, $index )
			|| ( Token::OTHER === $expected_type && self::is_preceeded_by( Token::WORD, $tokens, $index ) );

		if ( $append_to_previous ) {
			$index--;
			$tokens[ $index ] = new Token( $tokens[ $index ]->value . $part, Token::OTHER );

			return;
		}

		// Case 2: preceded by punctuation that itself follows a non-space token;
		// collapse all three into a single token.
		$follows_inner_punctuation = self::is_preceeded_by( Token::PUNCTUATION, $tokens, $index )
			&& self::is_not_preceeded_by( Token::SPACE, $tokens, $index, 2 );

		if ( $follows_inner_punctuation ) {
			$previous = $index - 1;
			$target   = $index - 2;
			$combined = $tokens[ $target ]->value . $tokens[ $previous ]->value . $part;

			$tokens[ $target ] = new Token( $combined, Token::OTHER );
			unset( $tokens[ $previous ] );
			$index = $target;

			return;
		}

		// Case 3: nothing to merge.
		$tokens[ $index ] = new Token( $part, $expected_type );
	}
374
375
	/**
	 * Checks whether the token a given number of steps before the current index
	 * has a certain type.
	 *
	 * @param  int   $type   A valid token type (e.g. Token::WORD).
	 * @param  array $tokens An array of tokens.
	 * @param  int   $index  The current token index.
	 * @param  int   $steps  Optional. The number steps to go back for the check. Default 1.
	 *
	 * @return bool `false` if there is no such predecessor (index would be negative).
	 */
	protected static function is_preceeded_by( $type, array $tokens, $index, $steps = 1 ) {
		$position = $index - $steps;

		if ( $position < 0 ) {
			return false;
		}

		return $type === $tokens[ $position ]->type;
	}
388
389
	/**
	 * Checks whether the token a given number of steps before the current index
	 * exists and is not of a certain type.
	 *
	 * Note that this is not the negation of `is_preceeded_by()`: a missing
	 * predecessor yields `false` in both methods.
	 *
	 * @param  int   $type   A valid token type (e.g. Token::WORD).
	 * @param  array $tokens An array of tokens.
	 * @param  int   $index  The current token index.
	 * @param  int   $steps  Optional. The number steps to go back for the check. Default 1.
	 *
	 * @return bool `false` if there is no such predecessor (index would be negative).
	 */
	protected static function is_not_preceeded_by( $type, array $tokens, $index, $steps = 1 ) {
		$position = $index - $steps;

		if ( $position < 0 ) {
			return false;
		}

		return $type !== $tokens[ $position ]->type;
	}
402
403
404
	/**
	 * Re-tokenizes the current text (i.e. captures newly inserted text and drops
	 * tokens whose values have been emptied).
	 *
	 * The text is reassembled via `unload()` (which clears the parser) and fed
	 * back into `load()`.
	 *
	 * Warning: Tokens previously acquired through 'get' methods may not match new tokenization.
	 *
	 * @return bool Returns true on successful completion.
	 */
	public function reload() {
		$current_text = $this->unload();

		return $this->load( $current_text );
	}
414
415
	/**
	 * Concatenates the values of all tokens into one string and clears the parser.
	 *
	 * @return string The reassembled text (an empty string if nothing was loaded).
	 */
	public function unload() {
		// Collect the token values in order before the parser is emptied.
		$values = \array_map(
			function( $token ) {
				return $token->value;
			},
			$this->text
		);

		$this->clear();

		return \implode( '', $values );
	}
431
432
	/**
	 * Discards all tokens of the currently loaded text.
	 *
	 * Only the token list is reset; the strtoupper callable is kept as is.
	 */
	public function clear() {
		$this->text = [];
	}
438
439
	/**
	 * Updates the 'value' field for all matching tokens.
	 *
	 * Tokens are matched by array index. Entries whose index does not exist in the
	 * currently loaded text (e.g. because the parser was cleared or reloaded since
	 * the tokens were retrieved) are skipped.
	 *
	 * @param Token[] $tokens An array of tokens, indexed like the parser's own token list.
	 */
	public function update( $tokens ) {
		foreach ( $tokens as $index => $token ) {
			if ( ! isset( $this->text[ $index ] ) ) {
				continue; // No matching token, nothing to update.
			}

			$this->text[ $index ] = $this->text[ $index ]->with_value( $token->value );
		}
	}
449
450
	/**
	 * Returns every token of the currently loaded text, regardless of type.
	 *
	 * @return Token[] An array of numerically indexed tokens.
	 */
	public function get_all() {
		return $this->text;
	}
458
459
	/**
	 * Returns the tokens of type Token::SPACE.
	 *
	 * @return Token[] An array of tokens, keyed by their index in the full token list.
	 */
	public function get_spaces() {
		return $this->get_type( Token::SPACE );
	}
467
468
	/**
	 * Returns the tokens of type Token::PUNCTUATION.
	 *
	 * @return Token[] An array of tokens, keyed by their index in the full token list.
	 */
	public function get_punctuation() {
		return $this->get_type( Token::PUNCTUATION );
	}
476
477
	/**
	 * Returns the tokens of type Token::WORD that satisfy all three given policies.
	 *
	 * @param int $abc   Optional. Handling of all-letter words. Allowed values NO_ALL_LETTERS, ALLOW_ALL_LETTERS, REQUIRE_ALL_LETTERS. Default ALLOW_ALL_LETTERS.
	 * @param int $caps  Optional. Handling of capitalized words (setting does not affect non-letter characters). Allowed values NO_ALL_CAPS, ALLOW_ALL_CAPS, REQUIRE_ALL_CAPS. Default ALLOW_ALL_CAPS.
	 * @param int $comps Optional. Handling of compound words (setting does not affect all-letter words). Allowed values NO_COMPOUNDS, ALLOW_COMPOUNDS, REQUIRE_COMPOUNDS. Default ALLOW_COMPOUNDS.
	 *
	 * @return Token[] An array of tokens, keyed by their index in the full token list.
	 */
	public function get_words( $abc = self::ALLOW_ALL_LETTERS, $caps = self::ALLOW_ALL_CAPS, $comps = self::ALLOW_COMPOUNDS ) {
		// Nothing loaded, nothing to filter.
		if ( empty( $this->text ) ) {
			return [];
		}

		// `array_filter` keeps the original keys, so the indices still refer to
		// positions in the full token list.
		return \array_filter(
			$this->get_type( Token::WORD ),
			function( Token $word ) use ( $abc, $caps, $comps ) {
				return $this->conforms_to_letters_policy( $word, $abc )
					&& $this->conforms_to_caps_policy( $word, $caps )
					&& $this->conforms_to_compounds_policy( $word, $comps );
			}
		);
	}
508
509
	/**
	 * Checks whether the token value conforms to the given policy for all-letter words.
	 *
	 * A value counts as "all letters" when stripping everything matched by
	 * _RE_HTML_LETTER_CONNECTORS (letter connectors, digits and the characters
	 * `-`, `_`, `&`, `#`, `;`, `/`) leaves it unchanged.
	 *
	 * @param  Token $token  Required.
	 * @param  int   $policy Either ALLOW_ALL_LETTERS, REQUIRE_ALL_LETTERS or NO_ALL_LETTERS.
	 *
	 * @return bool
	 */
	protected function conforms_to_letters_policy( Token $token, $policy ) {
		$strip_non_letters = static function( $value ) {
			return \preg_replace( self::_RE_HTML_LETTER_CONNECTORS, '', $value );
		};

		return $this->check_policy( $token, $policy, self::ALLOW_ALL_LETTERS, self::REQUIRE_ALL_LETTERS, self::NO_ALL_LETTERS, $strip_non_letters );
	}
529
530
	/**
	 * Checks whether the token value conforms to the given policy for all-caps words.
	 *
	 * A value counts as "all caps" when the strtoupper callable stored on the
	 * parser leaves it unchanged.
	 *
	 * @param  Token $token  Required.
	 * @param  int   $policy Either ALLOW_ALL_CAPS, REQUIRE_ALL_CAPS or NO_ALL_CAPS.
	 *
	 * @return bool
	 */
	protected function conforms_to_caps_policy( Token $token, $policy ) {
		$to_upper = $this->current_strtoupper;

		return $this->check_policy( $token, $policy, self::ALLOW_ALL_CAPS, self::REQUIRE_ALL_CAPS, self::NO_ALL_CAPS, $to_upper );
	}
548
549
	/**
	 * Checks whether the token value conforms to the given policy for compound words.
	 *
	 * A value counts as a compound when it contains at least one hyphen-minus
	 * ("-"), i.e. when removing all of them changes the value.
	 *
	 * @param  Token $token  Required.
	 * @param  int   $policy Either ALLOW_COMPOUNDS, REQUIRE_COMPOUNDS or NO_COMPOUNDS.
	 *
	 * @return bool
	 */
	protected function conforms_to_compounds_policy( Token $token, $policy ) {
		$strip_hyphens = static function( $value ) {
			return \str_replace( '-', '', $value );
		};

		// Note the argument order: an unchanged value means "not a compound".
		return $this->check_policy( $token, $policy, self::ALLOW_COMPOUNDS, self::NO_COMPOUNDS, self::REQUIRE_COMPOUNDS, $strip_hyphens );
	}
569
570
	/**
	 * Checks whether the token value conforms to the given policy.
	 *
	 * The permissive policy always passes. Otherwise the token value is run
	 * through the transformation and compared (strictly) with the original:
	 * an unchanged value passes only the "equal" policy, a changed value only
	 * the "non-equal" policy.
	 *
	 * @param  Token    $token             Required.
	 * @param  int      $policy            The policy to check.
	 * @param  int      $permissive_policy ALLOW_* policy constant.
	 * @param  int      $equal_policy      Policy constant to check when the transformed value is equal to the original token.
	 * @param  int      $non_equal_policy  Policy constant to check when the transformed value is different from the original token.
	 * @param  callable $transform_token   Function to transform the token value.
	 *
	 * @return bool
	 */
	protected function check_policy( Token $token, $policy, $permissive_policy, $equal_policy, $non_equal_policy, callable $transform_token ) {

		// Anything goes, no need to transform the value.
		if ( $permissive_policy === $policy ) {
			return true;
		}

		$unchanged = $transform_token( $token->value ) === $token->value;

		if ( $unchanged ) {
			return $equal_policy === $policy;
		}

		return $non_equal_policy === $policy;
	}
594
595
	/**
	 * Returns the tokens of type Token::OTHER.
	 *
	 * @return Token[] An array of tokens, keyed by their index in the full token list.
	 */
	public function get_other() {
		return $this->get_type( Token::OTHER );
	}
603
604
	/**
	 * Returns the tokens of the given type.
	 *
	 * The type is compared strictly (`===`). Keys of the returned array are the
	 * tokens' indices in the full token list.
	 *
	 * @param int $type The type to get.
	 *
	 * @return Token[] An array of numerically indexed tokens.
	 */
	public function get_type( $type ) {
		// `array_filter` preserves both the keys and the order of $this->text.
		return \array_filter(
			$this->text,
			static function( $token ) use ( $type ) {
				return $token->type === $type;
			}
		);
	}
622
}
623