|
1
|
|
|
<?php |
|
2
|
|
|
/** |
|
3
|
|
|
* This file is part of PHP-Typography. |
|
4
|
|
|
* |
|
5
|
|
|
* Copyright 2014-2019 Peter Putzer. |
|
6
|
|
|
* Copyright 2012-2013 Marie Hogebrandt. |
|
7
|
|
|
* Copyright 2009-2011 KINGdesk, LLC. |
|
8
|
|
|
* |
|
9
|
|
|
* This program is free software; you can redistribute it and/or modify |
|
10
|
|
|
* it under the terms of the GNU General Public License as published by |
|
11
|
|
|
* the Free Software Foundation; either version 2 of the License, or |
|
12
|
|
|
* (at your option) any later version. |
|
13
|
|
|
* |
|
14
|
|
|
* This program is distributed in the hope that it will be useful, |
|
15
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
16
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
17
|
|
|
* GNU General Public License for more details. |
|
18
|
|
|
* |
|
19
|
|
|
* You should have received a copy of the GNU General Public License along |
|
20
|
|
|
* with this program; if not, write to the Free Software Foundation, Inc., |
|
21
|
|
|
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
|
22
|
|
|
* |
|
23
|
|
|
* *** |
|
24
|
|
|
* |
|
25
|
|
|
* @package mundschenk-at/php-typography |
|
26
|
|
|
* @license http://www.gnu.org/licenses/gpl-2.0.html |
|
27
|
|
|
*/ |
|
28
|
|
|
|
|
29
|
|
|
namespace PHP_Typography; |
|
30
|
|
|
|
|
31
|
|
|
use PHP_Typography\Text_Parser\Token; |
|
32
|
|
|
|
|
33
|
|
|
/** |
|
34
|
|
|
* A class to parse plain text (such as the data of DOMText). |
|
35
|
|
|
* |
|
36
|
|
|
 * Text_Parser assumes no HTML markup in the text (except for HTML character references such as &gt;).
|
37
|
|
|
* If multibyte characters are passed, they must be encoded as UTF-8. |
|
38
|
|
|
*/ |
|
39
|
|
|
class Text_Parser { |
|
40
|
|
|
|
|
41
|
|
|
// Filter policies accepted by get_words(). Every family has three settings:
// NO_* keeps only words that lack the property, ALLOW_* does not filter at all,
// and REQUIRE_* keeps only words that have the property.

// All-letter words (no digits, connectors or entity syntax).
const NO_ALL_LETTERS      = 1 << 0;
const ALLOW_ALL_LETTERS   = 1 << 1;
const REQUIRE_ALL_LETTERS = 1 << 2;

// All-caps words (value is unchanged by the current strtoupper function).
const NO_ALL_CAPS      = 1 << 3;
const ALLOW_ALL_CAPS   = 1 << 4;
const REQUIRE_ALL_CAPS = 1 << 5;

// Compound words (value contains an ASCII hyphen).
const NO_COMPOUNDS      = 1 << 6;
const ALLOW_COMPOUNDS   = 1 << 7;
const REQUIRE_COMPOUNDS = 1 << 8;
|
50
|
|
|
|
|
51
|
|
|
/**
 * Matches a single spacing character, written either literally or as an
 * HTML character reference. Spacing is looked for FIRST, as it is the
 * primary delimiter.
 *
 * Covered characters (the four branches below are meant to stay in sync):
 *   tab (U+0009) | line feed (U+000A) | carriage return (U+000D) | space (U+0020) |
 *   non-breaking space (U+00A0) | ethiopic wordspace (U+1361) | ogham space mark (U+1680) |
 *   en quad .. hair space (U+2000-U+200A) | narrow no-break space (U+202F) |
 *   medium mathematical space (U+205F) | ideographic space (U+3000)
 *
 * Some characters are used inside words and are deliberately NOT treated as
 * spacing for the purpose of finding word boundaries:
 *   zero-width space (U+200B), zero-width non-joiner (U+200C),
 *   zero-width joiner (U+200D).
 *
 * Required modifiers: x (extended pattern), i (case insensitive), u (UTF-8).
 */
const _HTML_SPACING = '
	(?:
		(?:		# alpha matches
			&
			(?: nbsp|ensp|emsp|thinsp )
			;
		)
		|
		(?:		# decimal matches
			&\#
			(?: 09|1[03]|32|160|4961|5760|819[2-9]|820[0-2]|8239|8287|12288 )
			;
		)
		|
		(?:		# hexidecimal matches
			&\#x
			(?: 000[9ad]|0020|00a0|1361|1680|200[0-9a]|202f|205f|3000 )
			;
		)
		|
		(?:		# actual characters
			\x{0009}|\x{000a}|\x{000d}|\x{0020}|\x{00a0}|\x{1361}|\x{1680}|\x{2000}|\x{2001}|\x{2002}|\x{2003}|
			\x{2004}|\x{2005}|\x{2006}|\x{2007}|\x{2008}|\x{2009}|\x{200a}|\x{202f}|\x{205f}|\x{3000}
		)
	)
'; // required modifiers: x (multiline pattern) i (case insensitive) u (utf8).
|
92
|
|
|
|
|
93
|
|
|
/**
 * Matches a run of one or more spacing characters: anything matched by \s
 * or by the _HTML_SPACING fragment.
 *
 * Required modifiers: x (extended pattern), i (case insensitive), u (UTF-8).
 */
const _SPACE = '(?:\s|' . self::_HTML_SPACING . ')+';
|
94
|
|
|
|
|
95
|
|
|
/**
 * Matches the HTML character reference (named, decimal or hexadecimal) of a
 * single punctuation mark or symbol. Used to capture delimiting characters
 * that precede words.
 *
 * @see http://www.unicode.org/charts/PDF/U2000.pdf
 *
 * Code point ranges the numeric branches are meant to cover:
 *   dec = 33-44|46-47|58-60|62-64|91-94|96|123-126|161-172|174-191|215|247|710|732|977-978|982|
 *         8211-8231|8240-8286|8289-8292|8352-8399|8448-8527|8592-9215|9632-9983|11776-11903
 *   hex = 0021-002c|002e-002f|003a-003c|003e-0040|005b-005e|0060|007b-007e|00a1-00ac|00ae-00bf|
 *         00d7|00f7|02c6|02dc|03d1-03d2|03d6|2013-2027|2030-205e|2061-2064|20a0-20cf|2100-214f|
 *         2190-23ff|25a0-26ff|2e00-2e7f
 *
 * Some characters are used inside words and are deliberately NOT treated as
 * punctuation for the purpose of finding word boundaries:
 *   hyphens:    U+002D (&#45;), U+00AD (&shy;), U+2010, U+2011, U+2012
 *   underscore: U+005F (&#95;)
 *
 * NOTE(review): the decimal alternative 85[6-9][0-9] also matches 8560-8591,
 * which lies outside the documented ranges. It is kept as-is so that existing
 * matches are not narrowed - confirm whether that is intended.
 *
 * Required modifiers: x (extended pattern), i (case insensitive), u (UTF-8).
 */
const _HTML_PUNCTUATION = '
	(?:
		(?:		# alpha matches
			&
			(?:quot|amp|frasl|lt|gt|iexcl|cent|pound|curren|yen|brvbar|sect|uml|copy|ordf|laquo|not|reg|macr|deg|plusmn|sup2|sup3|acute|micro|para|middot|cedil|sup1|ordm|raquo|frac14|frac12|frac34|iquest|times|divide|circ|tilde|thetasym|upsih|piv|ndash|mdash|lsquo|rsquo|sbquo|ldquo|rdquo|bdquo|dagger|Dagger|bull|hellip|permil|prime|Prime|lsaquo|rsaquo|oline|frasl|euro|trade|alefsym|larr|uarr|rarr|darr|harr|crarr|lArr|uArr|rArr|dArr|hArr|forall|part|exist|empty|nabla|isin|notin|ni|prod|sum|minus|lowast|radic|prop|infin|ang|and|or|cap|cup|int|there4|sim|cong|asymp|ne|equiv|le|ge|sub|sup|nsub|sube|supe|oplus|otimes|perp|sdot|lceil|rceil|lfloor|rfloor|lang|rang|loz|spades|clubs|hearts|diams)
			;
		)
		|
		(?:		# decimal matches
			&\#
			(?: 3[3-9]|4[0-467]|5[89]|6[02-4]|9[1-46]|12[3-6]|16[1-9]|17[0-24-9]|18[0-9]|19[01]|215|247|710|732|97[78]|982|821[1-9]|822[0-9]|823[01]|82[4-7][0-9]|828[0-6]|8289|829[0-2]|835[2-9]|83[6-9][0-9]|844[89]|84[5-9][0-9]|850[0-9]|851[0-9]|852[0-7]|859[2-9]|85[6-9][0-9]|8[6-9][0-9][0-9]|9[01][0-9][0-9]|920[0-9]|921[0-5]|963[2-9]|96[4-9][0-9]|9[78][0-9][0-9]|99[0-7][0-9]|998[0-3]|1177[6-9]|117[89][0-9]|118[0-9][0-9]|1190[0-3] )
			;
		)
		|
		(?:		# hexidecimal matches
			&\#x
			(?: 002[1-9a-cef]|003[a-cef]|0040|005[b-e]|0060|007[b-e]|00a[1-9a-cef]|00b[0-9a-f]|00d7|00f7|02c6|02dc|03d[126]|201[3-9a-f]|202[0-7]|20[34][0-9a-f]|205[0-9a-e]|206[1-4]|20[a-c][0-9a-f]|21[0-4][0-9a-f]|21[9a-f][0-9a-f]|2[23][0-9a-f][0-9a-f]|25[a-f][0-9a-f]|26[0-9a-f][0-9a-f]|2e[0-7][0-9a-f] )
			;
		)
	)
'; // required modifiers: x (multiline pattern) i (case insensitive) u (utf8).
|
131
|
|
|
|
|
132
|
|
|
/**
 * Matches a run of one or more punctuation marks, each being either a literal
 * character outside the excluded set or an HTML reference matched by
 * _HTML_PUNCTUATION.
 *
 * Required modifiers: x (extended pattern), i (case insensitive), u (UTF-8).
 */
const _PUNCTUATION = '
	(?:
		(?:
			[^\w\s\&\/\@]  # assume characters that are not word spaces or whitespace are punctuation
						   # exclude & as that is an illegal stand-alone character (and would interfere with HTML character representations
						   # exclude slash \/as to not include the last slash in a URL
						   # exclude @ as to keep twitter names together
			|
			' . self::_HTML_PUNCTUATION . ' # catch any HTML reps of punctuation
		)+
	)
';
|
144
|
|
|
|
|
145
|
|
|
/**
 * Matches a single letter connector that is allowed inside a word, written
 * either literally or as an HTML character reference.
 *
 * Covered characters:
 *   hyphens:               U+002D (&#45;), U+00AD (&shy;), U+2010, U+2011, U+2012
 *   underscore:            U+005F (&#95;)
 *   zero-width space:      U+200B (&#8203;)
 *   zero-width non-joiner: U+200C (&zwnj;)
 *   zero-width joiner:     U+200D (&zwj;)
 *
 * Required modifiers: x (extended pattern), i (case insensitive), u (UTF-8).
 */
const _HTML_LETTER_CONNECTORS = '
	(?:
		(?:		# alpha matches
			&
			(?: shy|zwj|zwnj )
			;
		)
		|
		(?:		# decimal matches
			&\#
			(?: 45|95|173|820[3-589]|8210 )
			;
		)
		|
		(?:		# hexidecimal matches
			&\#x
			(?: 002d|005f|00ad|200[b-d]|201[0-2] )
			;
		)
		|
		(?:		# actual characters
			\x{002d}|\x{005f}|\x{00ad}|\x{200b}|\x{200c}|\x{200d}|\x{2010}|\x{2011}|\x{2012}
		)
	)
';
|
178
|
|
|
|
|
179
|
|
|
/**
 * Matches a single word character, written either literally or as an HTML
 * character reference.
 *
 * Code points the branches are meant to cover:
 *   characters  0-9    A-Z    a-z     other special characters
 *   decimal     48-57  65-90  97-122  192-214, 216-246, 248-255, 256-383
 *   hex         30-39  41-5a  61-7a   c0-d6, d8-f6, f8-ff, 0100-017f
 *
 * The multiplication sign (215 / d7) and the division sign (247 / f7) are
 * intentionally excluded; the punctuation fragment handles them.
 *
 * Required modifiers: x (extended pattern), u (UTF-8).
 */
const _HTML_LETTERS = '
	(?:
		(?:		# alpha matches
			&
			(?:Agrave|Aacute|Acirc|Atilde|Auml|Aring|AElig|Ccedil|Egrave|Eacute|Ecirc|Euml|Igrave|Iacute|Icirc|Iuml|ETH|Ntilde|Ograve|Oacute|Ocirc|Otilde|Ouml|Oslash|Ugrave|Uacute|Ucirc|Uuml|Yacute|THORN|szlig|agrave|aacute|acirc|atilde|auml|aring|aelig|ccedil|egrave|eacute|ecirc|euml|igrave|iacute|icirc|iuml|eth|ntilde|ograve|oacute|ocirc|otilde|ouml|oslash|ugrave|uacute|ucirc|uuml|yacute|thorn|yuml)
			;
		)
		|
		(?:		# decimal matches
			&\#
			(?: 4[89]|5[0-7]|6[5-9]|[78][0-9]|90|9[7-9]|1[01][0-9]|12[0-2]|19[2-9]|20[0-9]|21[0-46-9]|2[23][0-9]|24[0-68-9]|2[5-9][0-9]|3[0-7][0-9]|38[0-3] )
			;
		)
		|
		(?:		# hexidecimal matches
			(?:
				&\#x00
				(?: 3[0-9]|4[1-9a-f]|5[0-9a]|6[1-9a-f]|7[0-9a]|c[0-9a-f]|d[0-689a-f]|e[0-9a-f]|f[0-689a-f] )
				;
			)
			|
			(?:
				&\#x01[0-7][0-9a-f];
			)
		)
		|
		(?:		# actual characters: 0-9, A-Z, a-z, U+00C0-U+017F without U+00D7 and U+00F7
			[0-9A-Za-z\x{00c0}-\x{00d6}\x{00d8}-\x{00f6}\x{00f8}-\x{017f}]
		)
	)
'; // required modifiers: x (multiline pattern) i (case insensitive) u (utf8).
|
233
|
|
|
|
|
234
|
|
|
/**
 * Matches a word: a run of word characters, hyphens, underscores, slashes,
 * HTML letter references and HTML letter-connector references that does not
 * directly follow a word character or an ampersand.
 *
 * Required modifiers: x (extended pattern), u (UTF-8).
 */
const _WORD = '
	(?:
		(?<![\w\&]) # negative lookbehind to ensure
					#	1) we are proceeded by a non-word-character, and
					#	2) we are not inside an HTML character def
		(?:
			[\w\-\_\/]
			|
			' . self::_HTML_LETTERS . '
			|
			' . self::_HTML_LETTER_CONNECTORS . '
		)+
	)
';
|
248
|
|
|
|
|
249
|
|
|
/**
 * Pattern fragment matching any token: spacing, punctuation or a word.
 *
 * Required modifiers: x (extended pattern), i (case insensitive), u (UTF-8).
 */
const _ANY_TEXT = self::_SPACE . '|' . self::_PUNCTUATION . '|' . self::_WORD;

// Complete regular expressions (delimiters and modifiers included).

// Splits raw text into fragments; the capture group makes preg_split() keep the delimiters.
const _RE_ANY_TEXT = '/(' . self::_ANY_TEXT . ')/Sxiu';

// Anchored checks used to classify a single fragment.
const _RE_SPACE       = '/\A' . self::_SPACE . '\Z/Sxiu';
const _RE_PUNCTUATION = '/\A' . self::_PUNCTUATION . '\Z/Ssxiu';
const _RE_WORD        = '/\A' . self::_WORD . '\Z/Sxu';

// Characters stripped from a word before testing whether it consists of letters only.
const _RE_HTML_LETTER_CONNECTORS = '/' . self::_HTML_LETTER_CONNECTORS . '|[0-9\-_&#;\/]/Sxu';

// Detects a run of 500 consecutive word characters.
const _RE_MAX_STRING_LENGTH = '/\w{500}/Ss';
|
259
|
|
|
|
|
260
|
|
|
/**
 * Callable used to upper-case token values when checking the all-caps
 * policy. Defaults to 'strtoupper' and is replaced in load() with the
 * 'strtoupper' entry returned by Strings::functions() for the loaded text.
 *
 * @var callable
 */
private $current_strtoupper = 'strtoupper';

/**
 * The tokens of the currently loaded text, keyed by their position.
 *
 * @var Token[]
 */
private $text = [];
|
275
|
|
|
|
|
276
|
|
|
/**
 * Creates a new, empty parser. Text is supplied afterwards via load().
 */
public function __construct() {
}
|
281
|
|
|
|
|
282
|
|
|
/**
 * Tokenizes a string and stores the tokens in $this->text.
 *
 * On any failure the parser state (tokens and strtoupper callable) is left
 * exactly as it was before the call.
 *
 * @param string $raw_text A text fragment without any HTML markup.
 *
 * @return bool Returns `true` on successful completion, `false` otherwise
 *              (non-string input, a run of 500 consecutive word characters,
 *              unknown encoding, a PCRE failure while splitting, or an
 *              empty token list).
 */
public function load( $raw_text ) {
	if ( ! \is_string( $raw_text ) ) {
		return false; // we have an error, abort.
	}

	// Abort if the text contains a run of 500 consecutive word characters (security concern).
	if ( \preg_match( self::_RE_MAX_STRING_LENGTH, $raw_text ) ) {
		return false;
	}

	// Detect encoding.
	$str_functions = Strings::functions( $raw_text );
	if ( empty( $str_functions ) ) {
		return false; // unknown encoding.
	}

	// Split the raw text into parts. preg_split() returns false when PCRE fails,
	// which must not reach tokenize() as that method only accepts arrays.
	$parts = \preg_split( self::_RE_ANY_TEXT, $raw_text, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY );
	if ( false === $parts ) {
		return false;
	}

	// Only touch the parser state once splitting has succeeded.
	$this->current_strtoupper = $str_functions['strtoupper'];
	$this->text               = self::tokenize( $parts );

	// The token array should never be empty.
	return ! empty( $this->text );
}
|
312
|
|
|
|
|
313
|
|
|
/**
 * Converts the string fragments produced by splitting into Token objects.
 *
 * Spacing and punctuation fragments become tokens directly. Every other
 * fragment is handed to parse_ambiguous_token(), which may merge it with
 * preceding tokens so that things like email addresses and URLs are not
 * broken up.
 *
 * @param string[] $parts An array of non-empty strings.
 *
 * @return Token[] An array of numerically indexed tokens.
 */
protected static function tokenize( array $parts ) {
	$result   = [];
	$position = 0;

	foreach ( $parts as $fragment ) {
		if ( \preg_match( self::_RE_SPACE, $fragment ) ) {
			$result[ $position ] = new Token( $fragment, Token::SPACE );
		} elseif ( \preg_match( self::_RE_PUNCTUATION, $fragment ) ) {
			$result[ $position ] = new Token( $fragment, Token::PUNCTUATION );
		} else {
			// Anything that is not a word is classified as "other".
			$expected = \preg_match( self::_RE_WORD, $fragment ) ? Token::WORD : Token::OTHER;

			// May move $position backwards when the fragment is merged into earlier tokens.
			self::parse_ambiguous_token( $expected, $fragment, $result, $position );
		}

		++$position;
	}

	return $result;
}
|
344
|
|
|
|
|
345
|
|
|
/**
 * Stores an ambiguous fragment, combining it with its predecessors where
 * necessary so that things like email addresses and URLs stay in one token.
 *
 * Three outcomes are possible:
 *  - The previous token is an "other" token (or the fragment itself is
 *    "other" and the previous token is a word): the fragment is appended to
 *    the previous token, which becomes "other". $index moves back by one.
 *  - The previous token is punctuation and the token before that exists and
 *    is not spacing: all three are merged into one "other" token stored two
 *    positions back. $index moves back by two.
 *  - Otherwise the fragment is stored at $index with the expected type.
 *
 * @param int     $expected_type Either Token::WORD or Token::OTHER.
 * @param string  $part          The string fragment to parse.
 * @param Token[] $tokens        The token array. Passed by reference.
 * @param int     $index         The current index. Passed by reference.
 */
protected static function parse_ambiguous_token( $expected_type, $part, array &$tokens, &$index ) {
	$follows_other = self::is_preceeded_by( Token::OTHER, $tokens, $index )
		|| ( Token::OTHER === $expected_type && self::is_preceeded_by( Token::WORD, $tokens, $index ) );

	if ( $follows_other ) {
		// Append to the direct predecessor.
		$target            = $index - 1;
		$tokens[ $target ] = new Token( $tokens[ $target ]->value . $part, Token::OTHER );
		$index             = $target;

		return;
	}

	$follows_glued_punctuation = self::is_preceeded_by( Token::PUNCTUATION, $tokens, $index )
		&& self::is_not_preceeded_by( Token::SPACE, $tokens, $index, 2 );

	if ( $follows_glued_punctuation ) {
		// Preceded by a non-space token followed by punctuation: merge all three.
		$target            = $index - 2;
		$combined          = $tokens[ $target ]->value . $tokens[ $index - 1 ]->value . $part;
		$tokens[ $target ] = new Token( $combined, Token::OTHER );
		unset( $tokens[ $index - 1 ] );
		$index = $target;

		return;
	}

	// Nothing to merge.
	$tokens[ $index ] = new Token( $part, $expected_type );
}
|
374
|
|
|
|
|
375
|
|
|
/**
 * Tells whether the token $steps positions before $index exists and has the
 * given type.
 *
 * @param int   $type   A valid token type (e.g. Token::WORD).
 * @param array $tokens An array of tokens.
 * @param int   $index  The current token index.
 * @param int   $steps  Optional. The number steps to go back for the check. Default 1.
 *
 * @return bool False when there is no such predecessor or its type differs.
 */
protected static function is_preceeded_by( $type, array $tokens, $index, $steps = 1 ) {
	$position = $index - $steps;

	if ( $position < 0 ) {
		return false;
	}

	return $type === $tokens[ $position ]->type;
}
|
388
|
|
|
|
|
389
|
|
|
/**
 * Tells whether the token $steps positions before $index exists and has a
 * type other than the given one.
 *
 * Note that this is not the negation of is_preceeded_by(): both return
 * false when there is no predecessor at that distance.
 *
 * @param int   $type   A valid token type (e.g. Token::WORD).
 * @param array $tokens An array of tokens.
 * @param int   $index  The current token index.
 * @param int   $steps  Optional. The number steps to go back for the check. Default 1.
 *
 * @return bool False when there is no such predecessor or its type matches.
 */
protected static function is_not_preceeded_by( $type, array $tokens, $index, $steps = 1 ) {
	$position = $index - $steps;

	if ( $position < 0 ) {
		return false;
	}

	return $type !== $tokens[ $position ]->type;
}
|
402
|
|
|
|
|
403
|
|
|
|
|
404
|
|
|
/**
 * Re-tokenizes the current text, i.e. reassembles the token values into a
 * string and loads that string again. This picks up newly inserted text and
 * drops tokens whose values have been emptied.
 *
 * Warning: Tokens previously acquired through 'get' methods may not match new tokenization.
 *
 * @return bool Returns true on successful completion.
 */
public function reload() {
	$current_text = $this->unload();

	return $this->load( $current_text );
}
|
414
|
|
|
|
|
415
|
|
|
/**
 * Concatenates the values of all tokens in order, empties the parser and
 * returns the resulting string.
 *
 * @return string
 */
public function unload() {
	$values = [];

	foreach ( $this->text as $token ) {
		$values[] = $token->value;
	}

	$this->clear();

	return \implode( '', $values );
}
|
431
|
|
|
|
|
432
|
|
|
/**
 * Discards all tokens, leaving the parser without any loaded text.
 */
public function clear() {
	$this->text = [];
}
|
438
|
|
|
|
|
439
|
|
|
/**
 * Updates the 'value' field for all matching tokens.
 *
 * A token matches when its array key is the position of a token in the
 * currently loaded text. Entries with unknown keys are skipped, so they
 * no longer trigger a method call on a missing element.
 *
 * @param Token[] $tokens An array of tokens, keyed by their position in the text.
 */
public function update( $tokens ) {
	foreach ( $tokens as $index => $token ) {
		if ( ! isset( $this->text[ $index ] ) ) {
			continue; // No token at this position (e.g. stale index after a reload).
		}

		// with_value() supplies the replacement token carrying the new value.
		$this->text[ $index ] = $this->text[ $index ]->with_value( $token->value );
	}
}
|
449
|
|
|
|
|
450
|
|
|
/**
 * Returns every token of the currently loaded text.
 *
 * @return Token[] An array of numerically indexed tokens.
 */
public function get_all() {
	return $this->text;
}
|
458
|
|
|
|
|
459
|
|
|
/**
 * Returns the "space" tokens of the currently loaded text.
 *
 * @return Token[] An array of numerically indexed tokens (original positions are kept as keys).
 */
public function get_spaces() {
	$spaces = $this->get_type( Token::SPACE );

	return $spaces;
}
|
467
|
|
|
|
|
468
|
|
|
/**
 * Returns the "punctuation" tokens of the currently loaded text.
 *
 * @return Token[] An array of numerically indexed tokens (original positions are kept as keys).
 */
public function get_punctuation() {
	$punctuation = $this->get_type( Token::PUNCTUATION );

	return $punctuation;
}
|
476
|
|
|
|
|
477
|
|
|
/**
 * Returns the "word" tokens of the currently loaded text that satisfy all
 * three filter policies.
 *
 * @param int $abc   Optional. Handling of all-letter words. Allowed values NO_ALL_LETTERS, ALLOW_ALL_LETTERS, REQUIRE_ALL_LETTERS. Default ALLOW_ALL_LETTERS.
 * @param int $caps  Optional. Handling of capitalized words (setting does not affect non-letter characters). Allowed values NO_ALL_CAPS, ALLOW_ALL_CAPS, REQUIRE_ALL_CAPS. Default ALLOW_ALL_CAPS.
 * @param int $comps Optional. Handling of compound words (setting does not affect all-letter words). Allowed values NO_COMPOUNDS, ALLOW_COMPOUNDS, REQUIRE_COMPOUNDS. Default ALLOW_COMPOUNDS.
 *
 * @return Token[] An array of numerically indexed tokens (original positions are kept as keys).
 */
public function get_words( $abc = self::ALLOW_ALL_LETTERS, $caps = self::ALLOW_ALL_CAPS, $comps = self::ALLOW_COMPOUNDS ) {
	$words = [];

	// Nothing to filter when no text has been loaded.
	if ( empty( $this->text ) ) {
		return $words;
	}

	foreach ( $this->get_type( Token::WORD ) as $position => $word ) {
		// Policies are checked in a fixed order; the first failure rejects the word.
		if ( ! $this->conforms_to_letters_policy( $word, $abc ) ) {
			continue;
		}
		if ( ! $this->conforms_to_caps_policy( $word, $caps ) ) {
			continue;
		}
		if ( ! $this->conforms_to_compounds_policy( $word, $comps ) ) {
			continue;
		}

		$words[ $position ] = $word;
	}

	return $words;
}
|
508
|
|
|
|
|
509
|
|
|
/**
 * Checks the token value against the all-letters policy.
 *
 * A value counts as "all letters" when removing everything matched by
 * _RE_HTML_LETTER_CONNECTORS (letter connectors, digits and the characters
 * "-", "_", "&", "#", ";", "/") leaves it unchanged.
 *
 * @param Token $token  Required.
 * @param int   $policy Either ALLOW_ALL_LETTERS, REQUIRE_ALL_LETTERS or NO_ALL_LETTERS.
 *
 * @return bool
 */
protected function conforms_to_letters_policy( Token $token, $policy ) {
	$strip_non_letters = function( $value ) {
		return \preg_replace( self::_RE_HTML_LETTER_CONNECTORS, '', $value );
	};

	return $this->check_policy(
		$token,
		$policy,
		self::ALLOW_ALL_LETTERS,
		self::REQUIRE_ALL_LETTERS, // Satisfied when stripping changes nothing.
		self::NO_ALL_LETTERS,      // Satisfied when stripping removes something.
		$strip_non_letters
	);
}
|
529
|
|
|
|
|
530
|
|
|
/**
 * Checks the token value against the all-caps policy.
 *
 * A value counts as "all caps" when the current strtoupper callable leaves
 * it unchanged.
 *
 * @param Token $token  Required.
 * @param int   $policy Either ALLOW_ALL_CAPS, REQUIRE_ALL_CAPS or NO_ALL_CAPS.
 *
 * @return bool
 */
protected function conforms_to_caps_policy( Token $token, $policy ) {
	$to_upper = $this->current_strtoupper;

	return $this->check_policy(
		$token,
		$policy,
		self::ALLOW_ALL_CAPS,
		self::REQUIRE_ALL_CAPS, // Satisfied when upper-casing changes nothing.
		self::NO_ALL_CAPS,      // Satisfied when upper-casing changes the value.
		$to_upper
	);
}
|
548
|
|
|
|
|
549
|
|
|
/**
 * Checks the token value against the compound-words policy.
 *
 * A value counts as a compound when it contains at least one ASCII hyphen,
 * i.e. when removing all "-" characters changes it.
 *
 * @param Token $token  Required.
 * @param int   $policy Either ALLOW_COMPOUNDS, REQUIRE_COMPOUNDS or NO_COMPOUNDS.
 *
 * @return bool
 */
protected function conforms_to_compounds_policy( Token $token, $policy ) {
	$strip_hyphens = function( $value ) {
		return \preg_replace( '/-/S', '', $value );
	};

	return $this->check_policy(
		$token,
		$policy,
		self::ALLOW_COMPOUNDS,
		self::NO_COMPOUNDS,      // Satisfied when there is no hyphen to remove.
		self::REQUIRE_COMPOUNDS, // Satisfied when removing hyphens changes the value.
		$strip_hyphens
	);
}
|
569
|
|
|
|
|
570
|
|
|
/**
 * Generic policy check shared by the conforms_to_*_policy() methods.
 *
 * The token value is passed through $transform_token and the result is
 * compared (strictly) with the original value. Which outcome is acceptable
 * depends on the requested policy.
 *
 * @param Token    $token             Required.
 * @param int      $policy            The policy to check.
 * @param int      $permissive_policy ALLOW_* policy constant. Always conforms.
 * @param int      $equal_policy      Policy constant to check when the transformed value is equal to the original token.
 * @param int      $non_equal_policy  Policy constant to check when the transformed value is different from the original token.
 * @param callable $transform_token   Function to transform the token value.
 *
 * @return bool True when the token conforms to $policy. False for any policy
 *              value that is none of the three given constants.
 */
protected function check_policy( Token $token, $policy, $permissive_policy, $equal_policy, $non_equal_policy, callable $transform_token ) {
	// The permissive policy accepts everything; skip the transformation.
	if ( $permissive_policy === $policy ) {
		return true;
	}

	$is_unchanged = $transform_token( $token->value ) === $token->value;

	if ( $is_unchanged ) {
		return $equal_policy === $policy;
	}

	return $non_equal_policy === $policy;
}
|
594
|
|
|
|
|
595
|
|
|
/**
 * Returns the "other" tokens of the currently loaded text.
 *
 * @return Token[] An array of numerically indexed tokens (original positions are kept as keys).
 */
public function get_other() {
	$others = $this->get_type( Token::OTHER );

	return $others;
}
|
603
|
|
|
|
|
604
|
|
|
/**
 * Returns the tokens of the currently loaded text that have the given type.
 *
 * @param int $type The type to get.
 *
 * @return Token[] An array of numerically indexed tokens. The keys are the
 *                 tokens' positions in the full text, so they may be sparse.
 */
public function get_type( $type ) {
	// array_filter() preserves the original keys.
	return \array_filter(
		$this->text,
		function( $token ) use ( $type ) {
			return $token->type === $type;
		}
	);
}
|
622
|
|
|
} |
|
623
|
|
|
|