1
|
|
|
<?php |
2
|
|
|
/** |
3
|
|
|
* This file is part of PHP-Typography. |
4
|
|
|
* |
5
|
|
|
* Copyright 2014-2019 Peter Putzer. |
6
|
|
|
* Copyright 2012-2013 Marie Hogebrandt. |
7
|
|
|
* Copyright 2009-2011 KINGdesk, LLC. |
8
|
|
|
* |
9
|
|
|
* This program is free software; you can redistribute it and/or modify |
10
|
|
|
* it under the terms of the GNU General Public License as published by |
11
|
|
|
* the Free Software Foundation; either version 2 of the License, or |
12
|
|
|
* (at your option) any later version. |
13
|
|
|
* |
14
|
|
|
* This program is distributed in the hope that it will be useful, |
15
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
16
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
17
|
|
|
* GNU General Public License for more details. |
18
|
|
|
* |
19
|
|
|
* You should have received a copy of the GNU General Public License along |
20
|
|
|
* with this program; if not, write to the Free Software Foundation, Inc., |
21
|
|
|
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
22
|
|
|
* |
23
|
|
|
* *** |
24
|
|
|
* |
25
|
|
|
* @package mundschenk-at/php-typography |
26
|
|
|
* @license http://www.gnu.org/licenses/gpl-2.0.html |
27
|
|
|
*/ |
28
|
|
|
|
29
|
|
|
namespace PHP_Typography; |
30
|
|
|
|
31
|
|
|
use PHP_Typography\Text_Parser\Token; |
32
|
|
|
|
33
|
|
|
/** |
34
|
|
|
* A class to parse plain text (such as the data of DOMText). |
35
|
|
|
* |
36
|
|
|
* Text_Parser assumes no HTML markup in the text (except for HTML character entities such as &gt;). |
37
|
|
|
* If multibyte characters are passed, they must be encoded as UTF-8. |
38
|
|
|
*/ |
39
|
|
|
class Text_Parser { |
40
|
|
|
|
41
|
|
|
const NO_ALL_LETTERS = 0b000000000001; |
42
|
|
|
const ALLOW_ALL_LETTERS = 0b000000000010; |
43
|
|
|
const REQUIRE_ALL_LETTERS = 0b000000000100; |
44
|
|
|
const NO_ALL_CAPS = 0b000000001000; |
45
|
|
|
const ALLOW_ALL_CAPS = 0b000000010000; |
46
|
|
|
const REQUIRE_ALL_CAPS = 0b000000100000; |
47
|
|
|
const NO_COMPOUNDS = 0b000001000000; |
48
|
|
|
const ALLOW_COMPOUNDS = 0b000010000000; |
49
|
|
|
const REQUIRE_COMPOUNDS = 0b000100000000; |
50
|
|
|
|
51
|
|
|
/** |
52
|
|
|
* Find spacing FIRST (as it is the primary delimiter) |
53
|
|
|
* |
54
|
|
|
* Find the HTML character representation for the following characters: |
55
|
|
|
* tab | line feed | carriage return | space | non-breaking space | ethiopic wordspace |
56
|
|
|
* ogham space mark | en quad space | em quad space | en-space | three-per-em space |
57
|
|
|
* four-per-em space | six-per-em space | figure space | punctuation space | em-space |
58
|
|
|
* thin space | hair space | narrow no-break space |
59
|
|
|
* medium mathematical space | ideographic space |
60
|
|
|
* Some characters are used inside words, we will not count these as a space for the purpose |
61
|
|
|
* of finding word boundaries: |
62
|
|
|
* zero-width-space ("​", "​") |
63
|
|
|
* zero-width-joiner ("‌", "‌", "‍") |
64
|
|
|
* zero-width-non-joiner ("‍", "‍", "‌") |
65
|
|
|
*/ |
66
|
|
|
const _HTML_SPACING = ' |
67
|
|
|
(?: |
68
|
|
|
(?: # alpha matches |
69
|
|
|
& |
70
|
|
|
(?: nbsp|ensp|emsp|thinsp ) |
71
|
|
|
; |
72
|
|
|
) |
73
|
|
|
| |
74
|
|
|
(?: # decimal matches |
75
|
|
|
&\# |
76
|
|
|
(?: 09|1[03]|32|160|4961|5760|819[2-9]|820[0-2]|8239|8287|12288 ) |
77
|
|
|
; |
78
|
|
|
) |
79
|
|
|
| |
80
|
|
|
(?: # hexidecimal matches |
81
|
|
|
&\#x |
82
|
|
|
(?: 000[9ad]|0020|00a0|1361|1680|200[0-9a]|202f|205f|3000 ) |
83
|
|
|
; |
84
|
|
|
) |
85
|
|
|
| |
86
|
|
|
(?: # actual characters |
87
|
|
|
\x{0009}|\x{000a}|\x{000d}|\x{0020}|\x{00a0}|\x{1361}|\x{2000}|\x{2001}|\x{2002}|\x{2003}| |
88
|
|
|
\x{2004}|\x{2005}|\x{2006}|\x{2007}|\x{2008}|\x{2009}|\x{200a}|\x{202f}|\x{205f}|\x{3000} |
89
|
|
|
) |
90
|
|
|
) |
91
|
|
|
'; // required modifiers: x (multiline pattern) i (case insensitive) u (utf8). |
92
|
|
|
|
93
|
|
|
const _SPACE = '(?:\s|' . self::_HTML_SPACING . ')+'; // required modifiers: x (multiline pattern) i (case insensitive) $utf8. |
94
|
|
|
|
95
|
|
|
/**
 * Matches HTML character references (named, decimal and hexadecimal) that stand for
 * punctuation marks and symbols.
 *
 * @see http://www.unicode.org/charts/PDF/U2000.pdf
 *
 * Covered code points:
 *   dec = 33-44|46-47|58-60|62-64|91-94|96|123-126|161-172|174-191|215|247|710|732|977-978|982|8211-8231|8240-8286|8289-8292|8352-8399|8448-8527|8592-9215|9632-9983|11776-11903
 *   hex = 0021-002c|002e-002f|003a-003c|003e-0040|005b-005e|0060|007b-007e|00a1-00ac|00ae-00bf|00d7|00f7|02c6|02dc|03d1-03d2|
 *         03d6|2013-2027|2030-205e|2061-2064|20a0-20cf|2100-214f|2190-23ff|25a0-26ff|2e00-2e7f
 *
 * The decimal branch additionally accepts 8560-8599 (via `85[6-9][0-9]`); this is kept
 * unchanged for backward compatibility.
 *
 * Some characters are used inside words, so they are deliberately not matched here and
 * do not count as punctuation for the purpose of finding word boundaries:
 *   hyphens (U+002D, U+00AD, U+2010, U+2011, U+2012)
 *   underscore (U+005F)
 *
 * Fixes compared to the previous pattern:
 *   - named entities: `empty`, `nabla`, `or`, `cap`, `sim`, `cong`, `sup`, `nsub` and `copy`
 *     replace the corrupted names `emptyn`, `abla`, `orc`, `ap`, `simc`, `ong`, `supn`
 *     and a duplicated `pound`.
 *   - decimal: 8360-8399 (e.g. 8364, the euro sign) and 8500-8509 are now matched.
 *   - hexadecimal: 21a0-21ff and 2600-26ff are now matched.
 */
const _HTML_PUNCTUATION = '
	(?:
		(?: # alpha matches
			&
			(?:quot|amp|frasl|lt|gt|iexcl|cent|pound|curren|yen|brvbar|sect|uml|copy|ordf|laquo|not|reg|macr|deg|plusmn|sup2|sup3|acute|micro|para|middot|cedil|sup1|ordm|raquo|frac14|frac12|frac34|iquest|times|divide|circ|tilde|thetasym|upsih|piv|ndash|mdash|lsquo|rsquo|sbquo|ldquo|rdquo|bdquo|dagger|Dagger|bull|hellip|permil|prime|Prime|lsaquo|rsaquo|oline|frasl|euro|trade|alefsym|larr|uarr|rarr|darr|harr|crarr|lArr|uArr|rArr|dArr|hArr|forall|part|exist|empty|nabla|isin|notin|ni|prod|sum|minus|lowast|radic|prop|infin|ang|and|or|cap|cup|int|there4|sim|cong|asymp|ne|equiv|le|ge|sub|sup|nsub|sube|supe|oplus|otimes|perp|sdot|lceil|rceil|lfloor|rfloor|lang|rang|loz|spades|clubs|hearts|diams)
			;
		)
		|
		(?: # decimal matches
			&\#
			(?: 3[3-9]|4[0-467]|5[89]|6[02-4]|9[1-46]|12[3-6]|16[1-9]|17[0-24-9]|18[0-9]|19[01]|215|247|710|732|97[78]|982|821[1-9]|822[0-9]|823[01]|82[4-7][0-9]|828[0-6]|8289|829[0-2]|835[2-9]|83[6-9][0-9]|844[89]|84[5-9][0-9]|85[01][0-9]|852[0-7]|859[2-9]|85[6-9][0-9]|8[6-9][0-9][0-9]|9[01][0-9][0-9]|920[0-9]|921[0-5]|963[2-9]|96[4-9][0-9]|9[78][0-9][0-9]|99[0-7][0-9]|998[0-3]|1177[6-9]|117[89][0-9]|118[0-9][0-9]|1190[0-3] )
			;
		)
		|
		(?: # hexadecimal matches
			&\#x
			(?: 002[1-9a-cef]|003[a-cef]|0040|005[b-e]|0060|007[b-e]|00a[1-9a-cef]|00b[0-9a-f]|00d7|00f7|02c6|02dc|03d[126]|201[3-9a-f]|202[0-7]|20[34][0-9a-f]|205[0-9a-e]|206[1-4]|20[a-c][0-9a-f]|21[0-4][0-9a-f]|219[0-9a-f]|21[a-f][0-9a-f]|2[23][0-9a-f][0-9a-f]|25[a-f][0-9a-f]|26[0-9a-f][0-9a-f]|2e[0-7][0-9a-f] )
			;
		)
	)
'; // required modifiers: x (multiline pattern) i (case insensitive) u (utf8).
131
|
|
|
|
132
|
|
|
const _PUNCTUATION = ' |
133
|
|
|
(?: |
134
|
|
|
(?: |
135
|
|
|
[^\w\s\&\/\@] # assume characters that are not word spaces or whitespace are punctuation |
136
|
|
|
# exclude & as that is an illegal stand-alone character (and would interfere with HTML character representations |
137
|
|
|
# exclude slash \/as to not include the last slash in a URL |
138
|
|
|
# exclude @ as to keep twitter names together |
139
|
|
|
| |
140
|
|
|
' . self::_HTML_PUNCTUATION . ' # catch any HTML reps of punctuation |
141
|
|
|
)+ |
142
|
|
|
) |
143
|
|
|
';// required modifiers: x (multiline pattern) i (case insensitive) u (utf8). |
144
|
|
|
|
145
|
|
|
/** |
146
|
|
|
* Letter connectors allowed in words |
147
|
|
|
* hyphens ("-", "­", "‐", "‑", "‒", "-", "­", "‐", "‑", "‒", "­") |
148
|
|
|
* underscore ("_", "_") |
149
|
|
|
* zero-width-space ("​", "​") |
150
|
|
|
* zero-width-joiner ("‌", "‌", "‍") |
151
|
|
|
* zero-width-non-joiner ("‍", "‍", "‌") |
152
|
|
|
*/ |
153
|
|
|
const _HTML_LETTER_CONNECTORS = ' |
154
|
|
|
(?: |
155
|
|
|
(?: # alpha matches |
156
|
|
|
& |
157
|
|
|
(?: shy|zwj|zwnj ) |
158
|
|
|
; |
159
|
|
|
) |
160
|
|
|
| |
161
|
|
|
(?: # decimal matches |
162
|
|
|
&\# |
163
|
|
|
(?: 45|95|173|820[3-589]|8210 ) |
164
|
|
|
; |
165
|
|
|
) |
166
|
|
|
| |
167
|
|
|
(?: # hexidecimal matches |
168
|
|
|
&\#x |
169
|
|
|
(?: 002d|005f|00ad|200[b-d]|201[0-2] ) |
170
|
|
|
; |
171
|
|
|
) |
172
|
|
|
| |
173
|
|
|
(?: # actual characters |
174
|
|
|
\x{002d}|\x{005f}|\x{00ad}|\x{200b}|\x{200c}|\x{200d}|\x{2010}|\x{2011}|\x{2012} |
175
|
|
|
) |
176
|
|
|
) |
177
|
|
|
'; // required modifiers: x (multiline pattern) i (case insensitive) u (utf8). |
178
|
|
|
|
179
|
|
|
/**
 * Matches a single word character, either as a literal character or as an HTML
 * character reference (named, decimal or four-digit hexadecimal).
 *
 * characters  0-9    A-Z    a-z     other special characters
 * decimal     48-57  65-90  97-122  192-214, 216-246, 248-255, 256-383
 * hex         30-39  41-5a  61-7a   c0-d6, d8-f6, f8-ff, 0100-017f
 *
 * Fixes compared to the previous pattern:
 *   - decimal references for A-Z (65-90) are now matched.
 *   - hexadecimal references 0030 (the digit zero) and 00da-00df are now matched.
 *   - the literal characters are expressed as one character class instead of a long
 *     alternation; the set of accepted characters is unchanged.
 */
const _HTML_LETTERS = '
	(?:
		(?: # alpha matches
			&
			(?:Agrave|Aacute|Acirc|Atilde|Auml|Aring|AElig|Ccedil|Egrave|Eacute|Ecirc|Euml|Igrave|Iacute|Icirc|Iuml|ETH|Ntilde|Ograve|Oacute|Ocirc|Otilde|Ouml|Oslash|Ugrave|Uacute|Ucirc|Uuml|Yacute|THORN|szlig|agrave|aacute|acirc|atilde|auml|aring|aelig|ccedil|egrave|eacute|ecirc|euml|igrave|iacute|icirc|iuml|eth|ntilde|ograve|oacute|ocirc|otilde|ouml|oslash|ugrave|uacute|ucirc|uuml|yacute|thorn|yuml)
			;
		)
		|
		(?: # decimal matches
			&\#
			(?: 4[89]|5[0-7]|6[5-9]|[78][0-9]|90|9[7-9]|1[01][0-9]|12[0-2]|19[2-9]|20[0-9]|21[0-46-9]|2[23][0-9]|24[0-68-9]|2[5-9][0-9]|3[0-7][0-9]|38[0-3] )
			;
		)
		|
		(?: # hexadecimal matches
			(?:
				&\#x00
				(?: 3[0-9]|4[1-9a-f]|5[0-9a]|6[1-9a-f]|7[0-9a]|c[0-9a-f]|d[0-689a-f]|e[0-9a-f]|f[0-689a-f] )
				;
			)
			|
			(?:
				&\#x01[0-7][0-9a-f];
			)
		)
		|
		(?: # actual characters (whitespace inside a character class is significant, so none is used)
			[0-9A-Za-z\x{00c0}-\x{00d6}\x{00d8}-\x{00f6}\x{00f8}-\x{017f}]
		)
	)
'; // required modifiers: x (multiline pattern) i (case insensitive) u (utf8).
233
|
|
|
|
234
|
|
|
const _WORD = ' |
235
|
|
|
(?: |
236
|
|
|
(?<![\w\&]) # negative lookbehind to ensure |
237
|
|
|
# 1) we are proceeded by a non-word-character, and |
238
|
|
|
# 2) we are not inside an HTML character def |
239
|
|
|
(?: |
240
|
|
|
[\w\-\_\/] |
241
|
|
|
| |
242
|
|
|
' . self::_HTML_LETTERS . ' |
243
|
|
|
| |
244
|
|
|
' . self::_HTML_LETTER_CONNECTORS . ' |
245
|
|
|
)+ |
246
|
|
|
) |
247
|
|
|
'; // required modifiers: x (multiline pattern) u (utf8). |
248
|
|
|
|
249
|
|
|
// Find any text. |
250
|
|
|
const _ANY_TEXT = self::_SPACE . '|' . self::_PUNCTUATION . '|' . self::_WORD; // required modifiers: x (multiline pattern) i (case insensitive) u (utf8). |
251
|
|
|
|
252
|
|
|
// Regular expressions. |
253
|
|
|
const _RE_ANY_TEXT = '/(' . self::_ANY_TEXT . ')/Sxiu'; |
254
|
|
|
const _RE_SPACE = '/\A' . self::_SPACE . '\Z/Sxiu'; |
255
|
|
|
const _RE_PUNCTUATION = '/\A' . self::_PUNCTUATION . '\Z/Ssxiu'; |
256
|
|
|
const _RE_WORD = '/\A' . self::_WORD . '\Z/Sxu'; |
257
|
|
|
const _RE_HTML_LETTER_CONNECTORS = '/' . self::_HTML_LETTER_CONNECTORS . '|[0-9\-_&#;\/]/Sxu'; |
258
|
|
|
const _RE_MAX_STRING_LENGTH = '/\w{500}/Ss'; |
259
|
|
|
|
260
|
|
|
/** |
261
|
|
|
* The current strtoupper function to use (either 'strtoupper' or 'mb_strtoupper'). |
262
|
|
|
* |
263
|
|
|
* @var callable |
264
|
|
|
*/ |
265
|
|
|
private $current_strtoupper = 'strtoupper'; |
266
|
|
|
|
267
|
|
|
/** |
268
|
|
|
* The tokenized text. |
269
|
|
|
* |
270
|
|
|
* @var array $text { |
271
|
|
|
* @type Text_Parser\Token $index |
272
|
|
|
* } |
273
|
|
|
*/ |
274
|
|
|
private $text = []; |
275
|
|
|
|
276
|
|
|
/**
 * Sets up an empty parser.
 *
 * No work is done here; the token list and the upper-casing callable keep
 * the defaults from their property declarations.
 */
public function __construct() {
}
281
|
|
|
|
282
|
|
|
/**
 * Tokenizes a string and stores the tokens in $this->text.
 *
 * On any failure the previously loaded tokens are left untouched.
 *
 * @param string $raw_text A text fragment without any HTML markup.
 *
 * @return bool Returns `true` on successful completion, `false` otherwise
 *              (non-string input, an over-long run of word characters, an
 *              unknown encoding, a failed split or an empty token list).
 */
public function load( $raw_text ) {
	if ( ! \is_string( $raw_text ) ) {
		return false; // we have an error, abort.
	}

	// Abort if the text contains a run of 500 consecutive word characters (security concern).
	if ( \preg_match( self::_RE_MAX_STRING_LENGTH, $raw_text ) ) {
		return false;
	}

	// Detect encoding.
	$str_functions = Strings::functions( $raw_text );
	if ( empty( $str_functions ) ) {
		return false; // unknown encoding.
	}
	$this->current_strtoupper = $str_functions['strtoupper'];

	// Split the raw text into its parts. preg_split() reports failure by returning
	// `false`, which must not be handed to tokenize() as that only accepts arrays.
	$parts = \preg_split( self::_RE_ANY_TEXT, $raw_text, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY );
	if ( false === $parts ) {
		return false; // the text could not be split, abort.
	}

	// Tokenize the raw text parts.
	$this->text = self::tokenize( $parts );

	// The token array should never be empty.
	return ! empty( $this->text );
}
312
|
|
|
|
313
|
|
|
/**
 * Converts the split text fragments into Token objects.
 *
 * Each fragment is classified as space, punctuation, word or (if nothing
 * else matches) other. Word and other fragments are handed to
 * parse_ambiguous_token(), which may merge them into earlier tokens.
 *
 * @param string[] $parts An array of non-empty strings.
 *
 * @return Token[] An array of numerically indexed tokens.
 */
protected static function tokenize( array $parts ) {
	$tokens = [];
	$index  = 0;

	foreach ( $parts as $fragment ) {
		// Classify the fragment; the order of the checks matters.
		if ( \preg_match( self::_RE_SPACE, $fragment ) ) {
			$type = Token::SPACE;
		} elseif ( \preg_match( self::_RE_PUNCTUATION, $fragment ) ) {
			$type = Token::PUNCTUATION;
		} elseif ( \preg_match( self::_RE_WORD, $fragment ) ) {
			$type = Token::WORD;
		} else {
			$type = Token::OTHER;
		}

		if ( Token::SPACE === $type || Token::PUNCTUATION === $type ) {
			$tokens[ $index ] = new Token( $fragment, $type );
		} else {
			// Make sure that things like email addresses and URLs are not
			// broken up; this call may move $index backwards.
			self::parse_ambiguous_token( $type, $fragment, $tokens, $index );
		}

		$index++;
	}

	return $tokens;
}
344
|
|
|
|
345
|
|
|
/**
 * Stores an ambiguous fragment, combining it with its predecessors where needed.
 *
 * Three outcomes are possible:
 *  - the fragment is appended to the directly preceding token (which becomes OTHER),
 *  - the fragment, the preceding punctuation token and the token before that are
 *    fused into a single OTHER token,
 *  - the fragment is stored as a new token of the expected type.
 *
 * @param int     $expected_type Either Token::WORD or Token::OTHER.
 * @param string  $part          The string fragment to parse.
 * @param Token[] $tokens        The token array. Passed by reference.
 * @param int     $index         The current index. Passed by reference.
 */
protected static function parse_ambiguous_token( $expected_type, $part, array &$tokens, &$index ) {

	// Make sure that things like email addresses and URLs are not broken up incorrectly.
	$append_to_previous = self::is_preceeded_by( Token::OTHER, $tokens, $index )
		|| ( Token::OTHER === $expected_type && self::is_preceeded_by( Token::WORD, $tokens, $index ) );

	if ( $append_to_previous ) {
		$index            = $index - 1;
		$tokens[ $index ] = new Token( $tokens[ $index ]->value . $part, Token::OTHER );

		return;
	}

	$fuse_with_punctuation = self::is_preceeded_by( Token::PUNCTUATION, $tokens, $index )
		&& self::is_not_preceeded_by( Token::SPACE, $tokens, $index, 2 );

	if ( $fuse_with_punctuation ) {
		// Preceded by punctuation that itself follows a non-space token.
		$punctuation = $index - 1;
		$target      = $index - 2;

		$tokens[ $target ] = new Token( $tokens[ $target ]->value . $tokens[ $punctuation ]->value . $part, Token::OTHER );
		unset( $tokens[ $punctuation ] );
		$index = $target;

		return;
	}

	// All good, no merging necessary.
	$tokens[ $index ] = new Token( $part, $expected_type );
}
374
|
|
|
|
375
|
|
|
/**
 * Tells whether the token a number of steps before the current index has the given type.
 *
 * @param  int   $type   A valid token type (e.g. Token::WORD).
 * @param  array $tokens An array of tokens.
 * @param  int   $index  The current token index.
 * @param  int   $steps  Optional. The number steps to go back for the check. Default 1.
 *
 * @return bool `false` if going back would leave the array (negative position).
 */
protected static function is_preceeded_by( $type, array $tokens, $index, $steps = 1 ) {
	$position = $index - $steps;

	if ( $position < 0 ) {
		return false;
	}

	return $type === $tokens[ $position ]->type;
}
388
|
|
|
|
389
|
|
|
/**
 * Tells whether the token a number of steps before the current index exists
 * and has a type other than the given one.
 *
 * @param  int   $type   A valid token type (e.g. Token::WORD).
 * @param  array $tokens An array of tokens.
 * @param  int   $index  The current token index.
 * @param  int   $steps  Optional. The number steps to go back for the check. Default 1.
 *
 * @return bool `false` if going back would leave the array (negative position).
 */
protected static function is_not_preceeded_by( $type, array $tokens, $index, $steps = 1 ) {
	$position = $index - $steps;

	if ( $position < 0 ) {
		return false;
	}

	return $type !== $tokens[ $position ]->type;
}
402
|
|
|
|
403
|
|
|
|
404
|
|
|
/**
 * Re-tokenizes the current text (i.e. captures newly inserted text, or removes
 * those tokens whose values have been deleted).
 *
 * The text is reassembled via unload() and fed back into load().
 *
 * Warning: Tokens previously acquired through 'get' methods may not match the new tokenization.
 *
 * @return bool Returns true on successful completion.
 */
public function reload() {
	$current_text = $this->unload();

	return $this->load( $current_text );
}
414
|
|
|
|
415
|
|
|
/**
 * Concatenates the values of all tokens, empties the parser and returns the string.
 *
 * @return string
 */
public function unload() {
	$values = \array_map(
		function( $token ) {
			return $token->value;
		},
		$this->text
	);

	$this->clear();

	return \implode( '', $values );
}
431
|
|
|
|
432
|
|
|
/**
 * Discards all tokens by resetting $this->text to an empty array.
 */
public function clear() {
	$this->text = [];
}
438
|
|
|
|
439
|
|
|
/**
 * Copies the 'value' of each given token onto the stored token with the same index.
 *
 * The stored token is replaced by the result of its with_value() method.
 *
 * @param Token[] $tokens An array of tokens, keyed by their index in the parsed text.
 */
public function update( $tokens ) {
	foreach ( $tokens as $position => $updated ) {
		$current                 = $this->text[ $position ];
		$this->text[ $position ] = $current->with_value( $updated->value );
	}
}
449
|
|
|
|
450
|
|
|
/**
 * Returns every token of the currently loaded text.
 *
 * @return Token[] An array of numerically indexed tokens.
 */
public function get_all() {
	return $this->text;
}
458
|
|
|
|
459
|
|
|
/**
 * Returns the tokens of type Token::SPACE.
 *
 * @return Token[] An array of numerically indexed tokens.
 */
public function get_spaces() {
	$spaces = $this->get_type( Token::SPACE );

	return $spaces;
}
467
|
|
|
|
468
|
|
|
/**
 * Returns the tokens of type Token::PUNCTUATION.
 *
 * @return Token[] An array of numerically indexed tokens.
 */
public function get_punctuation() {
	$punctuation = $this->get_type( Token::PUNCTUATION );

	return $punctuation;
}
476
|
|
|
|
477
|
|
|
/**
 * Returns the tokens of type Token::WORD that satisfy all three policies.
 *
 * @param int $abc   Optional. Handling of all-letter words. Allowed values NO_ALL_LETTERS, ALLOW_ALL_LETTERS, REQUIRE_ALL_LETTERS. Default ALLOW_ALL_LETTERS.
 * @param int $caps  Optional. Handling of capitalized words (setting does not affect non-letter characters). Allowed values NO_ALL_CAPS, ALLOW_ALL_CAPS, REQUIRE_ALL_CAPS. Default ALLOW_ALL_CAPS.
 * @param int $comps Optional. Handling of compound words (setting does not affect all-letter words). Allowed values NO_COMPOUNDS, ALLOW_COMPOUNDS, REQUIRE_COMPOUNDS. Default ALLOW_COMPOUNDS.
 *
 * @return Token[] An array of numerically indexed tokens (original indices are kept).
 */
public function get_words( $abc = self::ALLOW_ALL_LETTERS, $caps = self::ALLOW_ALL_CAPS, $comps = self::ALLOW_COMPOUNDS ) {
	// Nothing loaded, nothing to return.
	if ( empty( $this->text ) ) {
		return [];
	}

	$matching = [];

	foreach ( $this->get_type( Token::WORD ) as $position => $word ) {
		// The policies are checked in a fixed order; the first failure rejects the word.
		if ( ! $this->conforms_to_letters_policy( $word, $abc ) ) {
			continue;
		}
		if ( ! $this->conforms_to_caps_policy( $word, $caps ) ) {
			continue;
		}
		if ( ! $this->conforms_to_compounds_policy( $word, $comps ) ) {
			continue;
		}

		$matching[ $position ] = $word;
	}

	return $matching;
}
508
|
|
|
|
509
|
|
|
/**
 * Checks the token value against the policy for all-letter words.
 *
 * The value counts as "all letters" when removing everything matched by
 * _RE_HTML_LETTER_CONNECTORS leaves it unchanged.
 *
 * @param  Token $token  Required.
 * @param  int   $policy Either ALLOW_ALL_LETTERS, REQUIRE_ALL_LETTERS or NO_ALL_LETTERS.
 *
 * @return bool
 */
protected function conforms_to_letters_policy( Token $token, $policy ) {
	$strip_connectors = static function( $value ) {
		return \preg_replace( self::_RE_HTML_LETTER_CONNECTORS, '', $value );
	};

	return $this->check_policy( $token, $policy, self::ALLOW_ALL_LETTERS, self::REQUIRE_ALL_LETTERS, self::NO_ALL_LETTERS, $strip_connectors );
}
529
|
|
|
|
530
|
|
|
/**
 * Checks the token value against the policy for all-caps words.
 *
 * The value counts as "all caps" when the current strtoupper callable
 * leaves it unchanged.
 *
 * @param  Token $token  Required.
 * @param  int   $policy Either ALLOW_ALL_CAPS, REQUIRE_ALL_CAPS or NO_ALL_CAPS.
 *
 * @return bool
 */
protected function conforms_to_caps_policy( Token $token, $policy ) {
	$to_upper = $this->current_strtoupper;

	return $this->check_policy( $token, $policy, self::ALLOW_ALL_CAPS, self::REQUIRE_ALL_CAPS, self::NO_ALL_CAPS, $to_upper );
}
548
|
|
|
|
549
|
|
|
/**
 * Checks the token value against the policy for compound words.
 *
 * The value counts as a compound when removing all "-" characters changes it.
 *
 * @param  Token $token  Required.
 * @param  int   $policy Either ALLOW_COMPOUNDS, REQUIRE_COMPOUNDS or NO_COMPOUNDS.
 *
 * @return bool
 */
protected function conforms_to_compounds_policy( Token $token, $policy ) {
	$strip_hyphens = static function( $value ) {
		return \preg_replace( '/-/S', '', $value );
	};

	// Note the argument order: an unchanged value means "not a compound".
	return $this->check_policy( $token, $policy, self::ALLOW_COMPOUNDS, self::NO_COMPOUNDS, self::REQUIRE_COMPOUNDS, $strip_hyphens );
}
569
|
|
|
|
570
|
|
|
/**
 * Generic policy check shared by the conforms_to_*_policy() methods.
 *
 * The token value is run through the transformation and compared (strictly)
 * with the original value; which policy is satisfied depends on whether the
 * value survived unchanged.
 *
 * @param  Token    $token             Required.
 * @param  int      $policy            The policy to check.
 * @param  int      $permissive_policy ALLOW_* policy constant.
 * @param  int      $equal_policy      Policy constant to check when the transformed value is equal to the original token.
 * @param  int      $non_equal_policy  Policy constant to check when the transformed value is different from the original token.
 * @param  callable $transform_token   Function to transform the token value.
 *
 * @return bool
 */
protected function check_policy( Token $token, $policy, $permissive_policy, $equal_policy, $non_equal_policy, callable $transform_token ) {

	// The permissive policy accepts everything, no transformation needed.
	if ( $permissive_policy === $policy ) {
		return true;
	}

	$unchanged = $transform_token( $token->value ) === $token->value;

	if ( $unchanged ) {
		return $equal_policy === $policy;
	}

	return $non_equal_policy === $policy;
}
594
|
|
|
|
595
|
|
|
/**
 * Returns the tokens of type Token::OTHER.
 *
 * @return Token[] An array of numerically indexed tokens.
 */
public function get_other() {
	$other = $this->get_type( Token::OTHER );

	return $other;
}
603
|
|
|
|
604
|
|
|
/**
 * Returns the tokens whose type is identical to the given type.
 *
 * The original indices of the tokens are preserved.
 *
 * @param  int $type The type to get.
 *
 * @return Token[] An array of numerically indexed tokens.
 */
public function get_type( $type ) {
	// array_filter() keeps the keys of the entries it retains.
	return \array_filter(
		$this->text,
		static function( $token ) use ( $type ) {
			return $token->type === $type;
		}
	);
}
622
|
|
|
} |
623
|
|
|
|