We could not synchronize checks via GitHub's checks API since Scrutinizer's GitHub App is not installed for this repository.
1 | <?php |
||
2 | /** |
||
3 | * This file is part of PHP-Typography. |
||
4 | * |
||
5 | * Copyright 2014-2019 Peter Putzer. |
||
6 | * Copyright 2012-2013 Marie Hogebrandt. |
||
7 | * Copyright 2009-2011 KINGdesk, LLC. |
||
8 | * |
||
9 | * This program is free software; you can redistribute it and/or modify |
||
10 | * it under the terms of the GNU General Public License as published by |
||
11 | * the Free Software Foundation; either version 2 of the License, or |
||
12 | * (at your option) any later version. |
||
13 | * |
||
14 | * This program is distributed in the hope that it will be useful, |
||
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||
17 | * GNU General Public License for more details. |
||
18 | * |
||
19 | * You should have received a copy of the GNU General Public License along |
||
20 | * with this program; if not, write to the Free Software Foundation, Inc., |
||
21 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
||
22 | * |
||
23 | * *** |
||
24 | * |
||
25 | * @package mundschenk-at/php-typography |
||
26 | * @license http://www.gnu.org/licenses/gpl-2.0.html |
||
27 | */ |
||
28 | |||
29 | namespace PHP_Typography; |
||
30 | |||
31 | use PHP_Typography\Text_Parser\Token; |
||
32 | |||
33 | /** |
||
34 | * A class to parse plain text (such as the data of DOMText). |
||
35 | * |
||
36 | * Text_Parser assumes no HTML markup in the text (except for HTML character references such as &gt;). |
||
37 | * If multibyte characters are passed, they must be encoded as UTF-8. |
||
38 | */ |
||
39 | class Text_Parser { |
||
40 | |||
/*
 * Bit flags accepted by get_words() to select which word tokens are returned.
 * Each group provides a NO_* (exclude such words), an ALLOW_* (do not filter)
 * and a REQUIRE_* (return only such words) variant.
 */

// Handling of all-letter words.
const NO_ALL_LETTERS = 0b000000000001;
const ALLOW_ALL_LETTERS = 0b000000000010;
const REQUIRE_ALL_LETTERS = 0b000000000100;

// Handling of words written in all capital letters.
const NO_ALL_CAPS = 0b000000001000;
const ALLOW_ALL_CAPS = 0b000000010000;
const REQUIRE_ALL_CAPS = 0b000000100000;

// Handling of compound (hyphenated) words.
const NO_COMPOUNDS = 0b000001000000;
const ALLOW_COMPOUNDS = 0b000010000000;
const REQUIRE_COMPOUNDS = 0b000100000000;
||
50 | |||
/**
 * Find spacing FIRST (as it is the primary delimiter).
 *
 * Matches a single spacing character, either as a literal character or as its
 * HTML character reference (named, decimal or hexadecimal), for the following
 * characters:
 *      tab | line feed | carriage return | space | non-breaking space | ethiopic wordspace
 *      ogham space mark | en quad space | em quad space | en-space | three-per-em space
 *      four-per-em space | six-per-em space | figure space | punctuation space | em-space
 *      thin space | hair space | narrow no-break space
 *      medium mathematical space | ideographic space
 *
 * Some characters are used inside words, we will not count these as a space for the purpose
 * of finding word boundaries:
 *      zero-width-space ("&#8203;", "&#x200b;")
 *      zero-width-joiner ("&#8205;", "&#x200d;", "&zwj;")
 *      zero-width-non-joiner ("&#8204;", "&#x200c;", "&zwnj;")
 */
const _HTML_SPACING = '
(?:
(?: # alpha matches
&
(?: nbsp|ensp|emsp|thinsp )
;
)
|
(?: # decimal matches
&\#
(?: 09|1[03]|32|160|4961|5760|819[2-9]|820[0-2]|8239|8287|12288 )
;
)
|
(?: # hexidecimal matches
&\#x
(?: 000[9ad]|0020|00a0|1361|1680|200[0-9a]|202f|205f|3000 )
;
)
|
(?: # actual characters
\x{0009}|\x{000a}|\x{000d}|\x{0020}|\x{00a0}|\x{1361}|\x{2000}|\x{2001}|\x{2002}|\x{2003}|
\x{2004}|\x{2005}|\x{2006}|\x{2007}|\x{2008}|\x{2009}|\x{200a}|\x{202f}|\x{205f}|\x{3000}
)
)
'; // required modifiers: x (extended pattern) i (case insensitive) u (utf8).

// One or more whitespace characters (`\s`) or HTML spacing representations.
const _SPACE = '(?:\s|' . self::_HTML_SPACING . ')+'; // required modifiers: x (extended pattern) i (case insensitive) u (utf8).
||
94 | |||
/**
 * Find punctuation and symbols before words (to capture preceding delimiting
 * characters like hyphens or underscores).
 *
 * Matches a single punctuation mark or symbol written as an HTML character
 * reference (named, decimal or hexadecimal).
 *
 * @see http://www.unicode.org/charts/PDF/U2000.pdf
 *
 * Covered code points:
 *      dec matches = 33-44|46-47|58-60|62-64|91-94|96|123-126|161-172|174-191|215|247|710|732|977-978|982|8211-8231|8240-8286|8289-8292|8352-8399|8448-8527|8592-9215|9632-9983|11776-11903
 *      hex matches = 0021-002c|002e-002f|003a-003c|003e-0040|005b-005e|0060|007b-007e|00a1-00ac|00ae-00bf|00d7|00f7|02c6|02dc|03d1-03d2|
 *                    03d6|2013-2027|2030-205e|2061-2064|20a0-20cf|2100-214f|2190-23ff|25a0-26ff|2e00-2e7f
 *
 * The decimal alternative `85[6-9][0-9]` additionally admits 8560-8591; it is kept
 * unchanged so that references matched so far keep matching.
 *
 * Some characters are used inside words, we will not count these as punctuation for the purpose
 * of finding word boundaries:
 *      hyphens ("-", "&shy;", "&#8208;", "&#8209;", "&#8210;", "&#x002d;", "&#x00ad;", "&#x2010;", "&#x2011;", "&#x2012;")
 *      underscore ("&#95;", "&#x005f;")
 */
const _HTML_PUNCTUATION = '
	(?:
		(?: # alpha matches
			&
			(?:quot|amp|frasl|lt|gt|iexcl|cent|pound|curren|yen|brvbar|sect|uml|pound|ordf|laquo|not|reg|macr|deg|plusmn|sup2|sup3|acute|micro|para|middot|cedil|sup1|ordm|raquo|frac14|frac12|frac34|iquest|times|divide|circ|tilde|thetasym|upsih|piv|ndash|mdash|lsquo|rsquo|sbquo|ldquo|rdquo|bdquo|dagger|Dagger|bull|hellip|permil|prime|Prime|lsaquo|rsaquo|oline|frasl|euro|trade|alefsym|larr|uarr|rarr|darr|harr|crarr|lArr|uArr|rArr|dArr|hArr|forall|part|exist|empty|nabla|isin|notin|ni|prod|sum|minus|lowast|radic|prop|infin|ang|and|or|cap|cup|int|there4|sim|cong|asymp|ne|equiv|le|ge|sub|sup|nsub|sube|supe|oplus|otimes|perp|sdot|lceil|rceil|lfloor|rfloor|lang|rang|loz|spades|clubs|hearts|diams)
			;
		)
		|
		(?: # decimal matches
			&\#
			(?: 3[3-9]|4[0-467]|5[89]|6[02-4]|9[1-46]|12[3-6]|16[1-9]|17[0-24-9]|18[0-9]|19[01]|215|247|710|732|97[78]|982|821[1-9]|822[0-9]|823[01]|82[4-7][0-9]|828[0-6]|8289|829[0-2]|835[2-9]|83[6-9][0-9]|844[89]|84[5-9][0-9]|85[01][0-9]|852[0-7]|859[2-9]|85[6-9][0-9]|8[6-9][0-9][0-9]|9[01][0-9][0-9]|920[0-9]|921[0-5]|963[2-9]|96[4-9][0-9]|9[78][0-9][0-9]|99[0-7][0-9]|998[0-3]|1177[6-9]|117[89][0-9]|118[0-9][0-9]|1190[0-3] )
			;
		)
		|
		(?: # hexidecimal matches
			&\#x
			(?: 002[1-9a-cef]|003[a-cef]|0040|005[b-e]|0060|007[b-e]|00a[1-9a-cef]|00b[0-9a-f]|00d7|00f7|02c6|02dc|03d[126]|201[3-9a-f]|202[0-7]|20[34][0-9a-f]|205[0-9a-e]|206[1-4]|20[a-c][0-9a-f]|21[0-4][0-9a-f]|21[9a-f][0-9a-f]|2[23][0-9a-f][0-9a-f]|25[a-f][0-9a-f]|26[0-9a-f][0-9a-f]|2e[0-7][0-9a-f] )
			;
		)
	)
'; // required modifiers: x (extended pattern) i (case insensitive) u (utf8).
||
131 | |||
/**
 * One or more consecutive punctuation characters, given either as literal
 * characters or as HTML character references (see _HTML_PUNCTUATION).
 */
const _PUNCTUATION = '
(?:
(?:
[^\w\s\&\/\@] # assume characters that are not word spaces or whitespace are punctuation
# exclude & as that is an illegal stand-alone character (and would interfere with HTML character representations
# exclude slash \/as to not include the last slash in a URL
# exclude @ as to keep twitter names together
|
' . self::_HTML_PUNCTUATION . ' # catch any HTML reps of punctuation
)+
)
';// required modifiers: x (extended pattern) i (case insensitive) u (utf8).
||
144 | |||
/**
 * Letter connectors allowed in words.
 *
 * Matches a single connector, either as a literal character or as its HTML
 * character reference (named, decimal or hexadecimal):
 *      hyphens ("-", "&shy;", "&#8208;", "&#8209;", "&#8210;", "&#x002d;", "&#x00ad;", "&#x2010;", "&#x2011;", "&#x2012;")
 *      underscore ("&#95;", "&#x005f;")
 *      zero-width-space ("&#8203;", "&#x200b;")
 *      zero-width-joiner ("&#8205;", "&#x200d;", "&zwj;")
 *      zero-width-non-joiner ("&#8204;", "&#x200c;", "&zwnj;")
 */
const _HTML_LETTER_CONNECTORS = '
(?:
(?: # alpha matches
&
(?: shy|zwj|zwnj )
;
)
|
(?: # decimal matches
&\#
(?: 45|95|173|820[3-589]|8210 )
;
)
|
(?: # hexidecimal matches
&\#x
(?: 002d|005f|00ad|200[b-d]|201[0-2] )
;
)
|
(?: # actual characters
\x{002d}|\x{005f}|\x{00ad}|\x{200b}|\x{200c}|\x{200d}|\x{2010}|\x{2011}|\x{2012}
)
)
'; // required modifiers: x (extended pattern) i (case insensitive) u (utf8).
||
178 | |||
/**
 * Word character HTML entities.
 *
 * Matches a single word character, either as a literal character or as its
 * HTML character reference (named, decimal or hexadecimal):
 *
 *      characters  0-9__ A-Z__ a-z___ other_special_chrs_____
 *      decimal     48-57 65-90 97-122 192-214,216-246,248-255, 256-383
 *      hex         30-39 41-5a 61-7a  c0-d6   d8-f6   f8-ff    0100-017f
 *
 * The multiplication sign (215 / d7) and the division sign (247 / f7) are
 * deliberately left out, as they are not letters.
 */
const _HTML_LETTERS = '
	(?:
		(?: # alpha matches
			&
			(?:Agrave|Aacute|Acirc|Atilde|Auml|Aring|AElig|Ccedil|Egrave|Eacute|Ecirc|Euml|Igrave|Iacute|Icirc|Iuml|ETH|Ntilde|Ograve|Oacute|Ocirc|Otilde|Ouml|Oslash|Ugrave|Uacute|Ucirc|Uuml|Yacute|THORN|szlig|agrave|aacute|acirc|atilde|auml|aring|aelig|ccedil|egrave|eacute|ecirc|euml|igrave|iacute|icirc|iuml|eth|ntilde|ograve|oacute|ocirc|otilde|ouml|oslash|ugrave|uacute|ucirc|uuml|yacute|thorn|yuml)
			;
		)
		|
		(?: # decimal matches
			&\#
			(?: 4[89]|5[0-7]|6[5-9]|[78][0-9]|90|9[7-9]|1[01][0-9]|12[0-2]|19[2-9]|20[0-9]|21[0-46-9]|2[23][0-9]|24[0-68-9]|2[5-9][0-9]|3[0-7][0-9]|38[0-3] )
			;
		)
		|
		(?: # hexidecimal matches
			(?:
				&\#x00
				(?: 3[0-9]|4[1-9a-f]|5[0-9a]|6[1-9a-f]|7[0-9a]|c[0-9a-f]|d[0-689a-f]|e[0-9a-f]|f[0-689a-f] )
				;
			)
			|
			(?:
				&\#x01[0-7][0-9a-f];
			)
		)
		|
		(?: # actual characters (U+00D7 and U+00F7 are skipped on purpose)
			[0-9A-Za-z\x{00c0}-\x{00d6}\x{00d8}-\x{00f6}\x{00f8}-\x{017f}]
		)
	)
'; // required modifiers: x (extended pattern) i (case insensitive) u (utf8).
||
233 | |||
/**
 * A word: one or more word characters, hyphens, underscores, slashes, HTML
 * letter references or HTML letter connectors, not directly preceded by a
 * word character or an ampersand.
 */
const _WORD = '
(?:
(?<![\w\&]) # negative lookbehind to ensure
# 1) we are proceeded by a non-word-character, and
# 2) we are not inside an HTML character def
(?:
[\w\-\_\/]
|
' . self::_HTML_LETTERS . '
|
' . self::_HTML_LETTER_CONNECTORS . '
)+
)
'; // required modifiers: x (extended pattern) u (utf8).
||
248 | |||
// Find any text: a run of spacing, a run of punctuation, or a word.
const _ANY_TEXT = self::_SPACE . '|' . self::_PUNCTUATION . '|' . self::_WORD; // required modifiers: x (extended pattern) i (case insensitive) u (utf8).

// Regular expressions (complete patterns including delimiters and modifiers).
// _RE_ANY_TEXT is the split pattern used by load(); the capture group makes
// preg_split() return the delimiters themselves.
const _RE_ANY_TEXT = '/(' . self::_ANY_TEXT . ')/Sxiu';
// The following three patterns are anchored so that they only match a complete fragment (see tokenize()).
const _RE_SPACE = '/\A' . self::_SPACE . '\Z/Sxiu';
const _RE_PUNCTUATION = '/\A' . self::_PUNCTUATION . '\Z/Ssxiu';
const _RE_WORD = '/\A' . self::_WORD . '\Z/Sxu';
// Everything that is stripped from a word before the all-letters check: connectors, digits and `-_&#;/`.
const _RE_HTML_LETTER_CONNECTORS = '/' . self::_HTML_LETTER_CONNECTORS . '|[0-9\-_&#;\/]/Sxu';
// A run of 500 consecutive word characters; load() rejects any text containing one.
const _RE_MAX_STRING_LENGTH = '/\w{500}/Ss';
||
259 | |||
/**
 * The current strtoupper function to use (either 'strtoupper' or 'mb_strtoupper').
 *
 * Set by load() from the string function table returned by Strings::functions()
 * and used for the all-caps policy check.
 *
 * @var callable
 */
private $current_strtoupper = 'strtoupper';

/**
 * The tokenized text, keyed by token index.
 *
 * @var array $text {
 *      @type Text_Parser\Token $index
 * }
 */
private $text = [];
||
275 | |||
/**
 * Creates a new, empty parser object. Text is supplied afterwards via load().
 */
public function __construct() {}
|
281 | |||
/**
 * Tokenizes a string and stores the tokens in $this->text.
 *
 * Loading fails (and `false` is returned) when
 *  - the argument is not a string,
 *  - the text contains a run of 500 or more consecutive word characters,
 *  - the encoding is not recognized by Strings::functions(),
 *  - the text cannot be split by the tokenizer pattern (PCRE error), or
 *  - tokenization yields no tokens at all.
 *
 * @param string $raw_text A text fragment without any HTML markup.
 *
 * @return bool Returns `true` on successful completion, `false` otherwise.
 */
public function load( $raw_text ) {
	if ( ! \is_string( $raw_text ) ) {
		return false; // we have an error, abort.
	}

	// Abort if a simple string exceeds 500 characters (security concern).
	if ( \preg_match( self::_RE_MAX_STRING_LENGTH, $raw_text ) ) {
		return false;
	}

	// Detect encoding.
	$str_functions = Strings::functions( $raw_text );
	if ( empty( $str_functions ) ) {
		return false; // unknown encoding.
	}

	// Split the raw text; preg_split() yields `false` when PCRE fails, which
	// must not be handed to tokenize() as it only accepts arrays.
	$parts = \preg_split( self::_RE_ANY_TEXT, $raw_text, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY );
	if ( ! \is_array( $parts ) ) {
		return false; // the text could not be split.
	}

	$this->current_strtoupper = $str_functions['strtoupper'];

	// Tokenize the raw text parts.
	$this->text = self::tokenize( $parts );

	// The token array should never be empty.
	return ! empty( $this->text );
}
||
312 | |||
/**
 * Converts the string fragments produced by the splitter into Token objects.
 *
 * Fragments that are neither pure spacing nor pure punctuation may get merged
 * with their predecessors (see parse_ambiguous_token()).
 *
 * @param string[] $parts An array of non-empty strings.
 *
 * @return Token[] An array of numerically indexed tokens.
 */
protected static function tokenize( array $parts ) {
	$result   = [];
	$position = 0;

	foreach ( $parts as $fragment ) {
		// The first case whose expression is truthy wins; later patterns are not evaluated.
		switch ( true ) {
			case \preg_match( self::_RE_SPACE, $fragment ):
				$result[ $position ] = new Token( $fragment, Token::SPACE );
				break;

			case \preg_match( self::_RE_PUNCTUATION, $fragment ):
				$result[ $position ] = new Token( $fragment, Token::PUNCTUATION );
				break;

			case \preg_match( self::_RE_WORD, $fragment ):
				// Keep e-mail addresses, URLs etc. together: a word following
				// an 'other' token is merged into it.
				self::parse_ambiguous_token( Token::WORD, $fragment, $result, $position );
				break;

			default:
				// Same as above, but an 'other' fragment is also merged with a preceding word.
				self::parse_ambiguous_token( Token::OTHER, $fragment, $result, $position );
				break;
		}

		// $position may have been moved backwards by a merge.
		$position++;
	}

	return $result;
}
||
344 | |||
/**
 * Stores an ambiguous fragment, combining it with its predecessors where
 * necessary so that things like e-mail addresses and URLs stay in one token.
 *
 * @param int     $expected_type Either Token::WORD or Token::OTHER.
 * @param string  $part          The string fragment to parse.
 * @param Token[] $tokens        The token array. Passed by reference.
 * @param int     $index         The current index. Passed by reference; moved back
 *                               to the index of the combined token after a merge.
 */
protected static function parse_ambiguous_token( $expected_type, $part, array &$tokens, &$index ) {

	// Case 1: directly after an 'other' token (or, for 'other' fragments,
	// directly after a word) - append to the previous token.
	$append_to_previous = self::is_preceeded_by( Token::OTHER, $tokens, $index )
		|| ( Token::OTHER === $expected_type && self::is_preceeded_by( Token::WORD, $tokens, $index ) );

	if ( $append_to_previous ) {
		$index--;
		$tokens[ $index ] = new Token( $tokens[ $index ]->value . $part, Token::OTHER );

		return;
	}

	// Case 2: after punctuation that itself follows a non-space token -
	// fold all three fragments into the token two steps back.
	$fold_last_two = self::is_preceeded_by( Token::PUNCTUATION, $tokens, $index )
		&& self::is_not_preceeded_by( Token::SPACE, $tokens, $index, 2 );

	if ( $fold_last_two ) {
		$punctuation = $tokens[ $index - 1 ]->value;
		$leading     = $tokens[ $index - 2 ]->value;

		$tokens[ $index - 2 ] = new Token( $leading . $punctuation . $part, Token::OTHER );
		unset( $tokens[ $index - 1 ] );
		$index -= 2;

		return;
	}

	// Case 3: nothing to combine, store the fragment as it is.
	$tokens[ $index ] = new Token( $part, $expected_type );
}
|
374 | |||
/**
 * Tells whether the token `$steps` positions before the current index exists
 * and has the given type.
 *
 * @param  int   $type   A valid token type (e.g. Token::WORD).
 * @param  array $tokens An array of tokens.
 * @param  int   $index  The current token index.
 * @param  int   $steps  Optional. The number steps to go back for the check. Default 1.
 *
 * @return bool `false` if stepping back would leave the array.
 */
protected static function is_preceeded_by( $type, array $tokens, $index, $steps = 1 ) {
	$position = $index - $steps;

	if ( $position < 0 ) {
		return false;
	}

	return $tokens[ $position ]->type === $type;
}
||
388 | |||
/**
 * Tells whether the token `$steps` positions before the current index exists
 * and has a type other than the given one.
 *
 * @param  int   $type   A valid token type (e.g. Token::WORD).
 * @param  array $tokens An array of tokens.
 * @param  int   $index  The current token index.
 * @param  int   $steps  Optional. The number steps to go back for the check. Default 1.
 *
 * @return bool `false` if stepping back would leave the array.
 */
protected static function is_not_preceeded_by( $type, array $tokens, $index, $steps = 1 ) {
	$position = $index - $steps;

	if ( $position < 0 ) {
		return false;
	}

	return $tokens[ $position ]->type !== $type;
}
||
402 | |||
403 | |||
/**
 * Re-tokenizes the current text, i.e. reassembles the token values into a
 * string and loads that string again (to capture newly inserted text, or to
 * drop tokens whose values have been deleted).
 *
 * Warning: Tokens previously acquired through 'get' methods may not match new tokenization.
 *
 * @return bool Returns true on successful completion.
 */
public function reload() {
	$current_text = $this->unload();

	return $this->load( $current_text );
}
||
414 | |||
/**
 * Concatenates the values of all tokens into one string, then empties the parser.
 *
 * @return string
 */
public function unload() {
	$values = \array_map(
		function( $token ) {
			return $token->value;
		},
		$this->text
	);

	$this->clear();

	return \implode( '', $values );
}
||
431 | |||
/**
 * Discards all tokens, leaving the parser without any text.
 */
public function clear() {
	$this->text = [];
}
|
438 | |||
/**
 * Updates the 'value' field for all matching tokens.
 *
 * A token matches when the parser currently holds a token at the same index.
 * Entries without a counterpart (e.g. stale tokens retrieved before the
 * parser was cleared or reloaded) are skipped instead of causing a fatal error.
 *
 * @param Token[] $tokens An array of tokens, keyed by token index.
 */
public function update( $tokens ) {
	foreach ( $tokens as $index => $token ) {
		if ( ! isset( $this->text[ $index ] ) ) {
			continue; // no such token in the current text.
		}

		$this->text[ $index ] = $this->text[ $index ]->with_value( $token->value );
	}
}
|
449 | |||
/**
 * Returns every token of the currently loaded text, regardless of type.
 *
 * @return Token[] An array of numerically indexed tokens.
 */
public function get_all() {
	$all_tokens = $this->text;

	return $all_tokens;
}
||
458 | |||
/**
 * Returns the tokens of type Token::SPACE.
 *
 * @return Token[] An array of numerically indexed tokens.
 */
public function get_spaces() {
	$spaces = $this->get_type( Token::SPACE );

	return $spaces;
}
||
467 | |||
/**
 * Returns the tokens of type Token::PUNCTUATION.
 *
 * @return Token[] An array of numerically indexed tokens.
 */
public function get_punctuation() {
	$punctuation = $this->get_type( Token::PUNCTUATION );

	return $punctuation;
}
||
476 | |||
/**
 * Returns the tokens of type Token::WORD that satisfy all three given policies.
 *
 * @param int $abc   Optional. Handling of all-letter words. Allowed values NO_ALL_LETTERS, ALLOW_ALL_LETTERS, REQUIRE_ALL_LETTERS. Default ALLOW_ALL_LETTERS.
 * @param int $caps  Optional. Handling of capitalized words (setting does not affect non-letter characters). Allowed values NO_ALL_CAPS, ALLOW_ALL_CAPS, REQUIRE_ALL_CAPS. Default ALLOW_ALL_CAPS.
 * @param int $comps Optional. Handling of compound words (setting does not affect all-letter words). Allowed values NO_COMPOUNDS, ALLOW_COMPOUNDS, REQUIRE_COMPOUNDS. Default ALLOW_COMPOUNDS.
 *
 * @return Token[] An array of numerically indexed tokens (original indices are kept).
 */
public function get_words( $abc = self::ALLOW_ALL_LETTERS, $caps = self::ALLOW_ALL_CAPS, $comps = self::ALLOW_COMPOUNDS ) {
	// Nothing to do if no text has been loaded.
	if ( empty( $this->text ) ) {
		return [];
	}

	// array_filter() preserves the keys, so token indices stay intact.
	return \array_filter(
		$this->get_type( Token::WORD ),
		function( $token ) use ( $abc, $caps, $comps ) {
			return $this->conforms_to_letters_policy( $token, $abc )
				&& $this->conforms_to_caps_policy( $token, $caps )
				&& $this->conforms_to_compounds_policy( $token, $comps );
		}
	);
}
||
508 | |||
/**
 * Checks the token against the all-letters policy.
 *
 * A word counts as "all letters" when removing letter connectors, digits and
 * the characters `-_&#;/` leaves its value unchanged.
 *
 * @param  Token $token  Required.
 * @param  int   $policy Either ALLOW_ALL_LETTERS, REQUIRE_ALL_LETTERS or NO_ALL_LETTERS.
 *
 * @return bool
 */
protected function conforms_to_letters_policy( Token $token, $policy ) {
	$strip_non_letters = function( $value ) {
		return \preg_replace( self::_RE_HTML_LETTER_CONNECTORS, '', $value );
	};

	return $this->check_policy( $token, $policy, self::ALLOW_ALL_LETTERS, self::REQUIRE_ALL_LETTERS, self::NO_ALL_LETTERS, $strip_non_letters );
}
||
529 | |||
/**
 * Checks the token against the all-caps policy.
 *
 * A word counts as "all caps" when converting it to upper case (using the
 * strtoupper variant selected by load()) leaves its value unchanged.
 *
 * @param  Token $token  Required.
 * @param  int   $policy Either ALLOW_ALL_CAPS, REQUIRE_ALL_CAPS or NO_ALL_CAPS.
 *
 * @return bool
 */
protected function conforms_to_caps_policy( Token $token, $policy ) {
	$to_upper = $this->current_strtoupper;

	return $this->check_policy( $token, $policy, self::ALLOW_ALL_CAPS, self::REQUIRE_ALL_CAPS, self::NO_ALL_CAPS, $to_upper );
}
||
548 | |||
/**
 * Checks the token against the compound-words policy.
 *
 * A word counts as a compound when it contains at least one hyphen-minus,
 * i.e. when removing all `-` characters changes its value.
 *
 * @param  Token $token  Required.
 * @param  int   $policy Either ALLOW_COMPOUNDS, REQUIRE_COMPOUNDS or NO_COMPOUNDS.
 *
 * @return bool
 */
protected function conforms_to_compounds_policy( Token $token, $policy ) {
	$strip_hyphens = function( $value ) {
		return \preg_replace( '/-/S', '', $value );
	};

	// Note the argument order: an unchanged value means "no compound".
	return $this->check_policy( $token, $policy, self::ALLOW_COMPOUNDS, self::NO_COMPOUNDS, self::REQUIRE_COMPOUNDS, $strip_hyphens );
}
||
569 | |||
/**
 * Generic policy check: transforms the token value and compares the result
 * with the original value to decide whether the requested policy is met.
 *
 * @param  Token    $token             Required.
 * @param  int      $policy            The policy to check.
 * @param  int      $permissive_policy ALLOW_* policy constant.
 * @param  int      $equal_policy      Policy constant to check when the transformed value is equal to the original token.
 * @param  int      $non_equal_policy  Policy constant to check when the transformed value is different from the original token.
 * @param  callable $transform_token   Function to transform the token value.
 *
 * @return bool
 */
protected function check_policy( Token $token, $policy, $permissive_policy, $equal_policy, $non_equal_policy, callable $transform_token ) {

	// The permissive policy accepts everything, no need to transform the value.
	if ( $policy === $permissive_policy ) {
		return true;
	}

	$original  = $token->value;
	$unchanged = ( $transform_token( $original ) === $original );

	if ( $unchanged ) {
		return $equal_policy === $policy;
	}

	return $non_equal_policy === $policy;
}
||
594 | |||
/**
 * Returns the tokens of type Token::OTHER.
 *
 * @return Token[] An array of numerically indexed tokens.
 */
public function get_other() {
	$others = $this->get_type( Token::OTHER );

	return $others;
}
||
603 | |||
/**
 * Returns the tokens of the currently loaded text that have the given type.
 *
 * @param  int $type The type to get.
 *
 * @return Token[] An array of numerically indexed tokens (original indices are kept).
 */
public function get_type( $type ) {
	// array_filter() preserves both the keys and the order of the tokens.
	return \array_filter(
		$this->text,
		function( $token ) use ( $type ) {
			return $token->type === $type;
		}
	);
}
||
622 | } |
||
623 |