We could not synchronize checks via GitHub's checks API since Scrutinizer's GitHub App is not installed for this repository.
| Total Complexity | 47 |
| Total Lines | 582 |
| Duplicated Lines | 0 % |
| Coverage | 100% |
| Changes | 4 | ||
| Bugs | 0 | Features | 0 |
Complex classes like Text_Parser often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use Text_Parser, and based on these observations, apply Extract Interface, too.
| 1 | <?php |
||
| 39 | class Text_Parser { |
||
| 40 | |||
| 41 | const NO_ALL_LETTERS = 0b000000000001; |
||
| 42 | const ALLOW_ALL_LETTERS = 0b000000000010; |
||
| 43 | const REQUIRE_ALL_LETTERS = 0b000000000100; |
||
| 44 | const NO_ALL_CAPS = 0b000000001000; |
||
| 45 | const ALLOW_ALL_CAPS = 0b000000010000; |
||
| 46 | const REQUIRE_ALL_CAPS = 0b000000100000; |
||
| 47 | const NO_COMPOUNDS = 0b000001000000; |
||
| 48 | const ALLOW_COMPOUNDS = 0b000010000000; |
||
| 49 | const REQUIRE_COMPOUNDS = 0b000100000000; |
||
| 50 | |||
| 51 | /** |
||
| 52 | * Find spacing FIRST (as it is the primary delimiter) |
||
| 53 | * |
||
| 54 | * Find the HTML character representation for the following characters: |
||
| 55 | * tab | line feed | carriage return | space | non-breaking space | ethiopic wordspace |
||
| 56 | * ogham space mark | en quad space | em quad space | en-space | three-per-em space |
||
| 57 | * four-per-em space | six-per-em space | figure space | punctuation space | em-space |
||
| 58 | * thin space | hair space | narrow no-break space |
||
| 59 | * medium mathematical space | ideographic space |
||
| 60 | * Some characters are used inside words, we will not count these as a space for the purpose |
||
| 61 | * of finding word boundaries: |
||
| 62 | * zero-width-space ("&#8203;", "&#x200b;") |
||
| 63 | * zero-width-joiner ("&#8205;", "&#x200d;", "&zwj;") |
||
| 64 | * zero-width-non-joiner ("&#8204;", "&#x200c;", "&zwnj;") |
||
| 65 | */ |
||
| 66 | const _HTML_SPACING = ' |
||
| 67 | (?: |
||
| 68 | (?: # alpha matches |
||
| 69 | & |
||
| 70 | (?: nbsp|ensp|emsp|thinsp ) |
||
| 71 | ; |
||
| 72 | ) |
||
| 73 | | |
||
| 74 | (?: # decimal matches |
||
| 75 | &\# |
||
| 76 | (?: 09|1[03]|32|160|4961|5760|819[2-9]|820[0-2]|8239|8287|12288 ) |
||
| 77 | ; |
||
| 78 | ) |
||
| 79 | | |
||
| 80 | (?: # hexidecimal matches |
||
| 81 | &\#x |
||
| 82 | (?: 000[9ad]|0020|00a0|1361|1680|200[0-9a]|202f|205f|3000 ) |
||
| 83 | ; |
||
| 84 | ) |
||
| 85 | | |
||
| 86 | (?: # actual characters |
||
| 87 | \x{0009}|\x{000a}|\x{000d}|\x{0020}|\x{00a0}|\x{1361}|\x{2000}|\x{2001}|\x{2002}|\x{2003}| |
||
| 88 | \x{2004}|\x{2005}|\x{2006}|\x{2007}|\x{2008}|\x{2009}|\x{200a}|\x{202f}|\x{205f}|\x{3000} |
||
| 89 | ) |
||
| 90 | ) |
||
| 91 | '; // required modifiers: x (multiline pattern) i (case insensitive) u (utf8). |
||
| 92 | |||
| 93 | const _SPACE = '(?:\s|' . self::_HTML_SPACING . ')+'; // required modifiers: x (multiline pattern) i (case insensitive) $utf8. |
||
| 94 | |||
| 95 | /** |
||
| 96 | * Find punctuation and symbols before words (to capture preceding delimiting characters like hyphens or underscores) |
||
| 97 | * |
||
| 98 | * @see http://www.unicode.org/charts/PDF/U2000.pdf |
||
| 99 | * |
||
| 100 | * Find punctuation and symbols |
||
| 101 | * dec matches = 33-44|46-47|58-60|62-64|91-94|96|123-126|161-172|174-191|215|247|710|732|977-978|982|8211-8231|8240-8286|8289-8292|8352-8399|8448-8527|8592-9215|9632-9983|11776-11903 |
||
| 102 | * hex matches = 0021-002c|002e-002f|003a-003c|003e-0040|005b-e|0060|007b-007e|00a1-00ac|00ae-00bf|00d7|00f7|02c6|02dc|03d1-03d2| |
||
| 103 | * 03d6|2013-2027|2030-205e|2061-2064|20a0-20cf|2100-214f|2190-23ff|25a0-26ff|2e00-2e7f |
||
| 104 | * |
||
| 105 | * Some characters are used inside words, we will not count these as a space for the purpose |
||
| 106 | * of finding word boundaries: |
||
| 107 | * hyphens ("&#45;", "&#173;", "&#8208;", "&#8209;", "&#8210;", "&#x002d;", "&#x00ad;", "&#x2010;", "&#x2011;", "&#x2012;", "&shy;") |
||
| 108 | * underscore ("&#95;", "&#x005f;") |
||
| 109 | */ |
||
| 110 | const _HTML_PUNCTUATION = ' |
||
| 111 | (?: |
||
| 112 | (?: # alpha matches |
||
| 113 | & |
||
| 114 | (?:quot|amp|frasl|lt|gt|iexcl|cent|pound|curren|yen|brvbar|sect|uml|pound|ordf|laquo|not|reg|macr|deg|plusmn|sup2|sup3|acute|micro|para|middot|cedil|sup1|ordm|raquo|frac14|frac12|frac34|iquest|times|divide|circ|tilde|thetasym|upsih|piv|ndash|mdash|lsquo|rsquo|sbquo|ldquo|rdquo|bdquo|dagger|Dagger|bull|hellip|permil|prime|Prime|lsaquo|rsaquo|oline|frasl|euro|trade|alefsym|larr|uarr|rarr|darr|harr|crarr|lArr|uArr|rArr|dArr|hArr|forall|part|exist|emptyn|abla|isin|notin|ni|prod|sum|minus|lowast|radic|prop|infin|ang|and|orc|ap|cup|int|there4|simc|ong|asymp|ne|equiv|le|ge|sub|supn|sub|sube|supe|oplus|otimes|perp|sdot|lceil|rceil|lfloor|rfloor|lang|rang|loz|spades|clubs|hearts|diams) |
||
| 115 | ; |
||
| 116 | ) |
||
| 117 | | |
||
| 118 | (?: # decimal matches |
||
| 119 | &\# |
||
| 120 | (?: 3[3-9]|4[0-467]|5[89]|6[02-4]|9[1-46]|12[3-6]|16[1-9]|17[0-24-9]|18[0-9]|19[01]|215|247|710|732|97[78]|982|821[1-9]|822[0-9]|823[01]|82[4-7][0-9]|828[0-6]|8289|829[0-2]|835[2-9]|86[6-9][0-9]|844[89]|84[5-9][0-9]|851[0-9]|852[0-7]|859[2-9]|85[6-9][0-9]|8[6-9][0-9][0-9]|9[01][0-9][0-9]|920[0-9]|921[0-5]|963[2-9]|96[4-9][0-9]|9[78][0-9][0-9]|99[0-7][0-9]|998[0-3]|1177[6-9]|117[89][0-9]|118[0-9][0-9]|1190[0-3] ) |
||
| 121 | ; |
||
| 122 | ) |
||
| 123 | | |
||
| 124 | (?: # hexidecimal matches |
||
| 125 | &\#x |
||
| 126 | (?: 002[1-9a-cef]|003[a-cef]|0040|005[b-e]|0060|007[b-e]|00a[1-9a-cef]|00b[0-9a-f]|00d7|00f7|02c6|02dc|03d[126]|201[3-9a-f]|202[0-7]|20[34][0-9a-f]|205[0-9a-e]|206[1-4]|20[a-c][0-9a-f]|21[0-4][0-9a-f]|219[0-9a-f]|2[23][0-9a-f][0-9a-f]|25[a-f][0-9a-f]|23[0-9a-f][0-9a-f]|2e[0-7][0-9a-f] ) |
||
| 127 | ; |
||
| 128 | ) |
||
| 129 | ) |
||
| 130 | '; // required modifiers: x (multiline pattern) i (case insensitive) u (utf8). |
||
| 131 | |||
| 132 | const _PUNCTUATION = ' |
||
| 133 | (?: |
||
| 134 | (?: |
||
| 135 | [^\w\s\&\/\@] # assume characters that are not word spaces or whitespace are punctuation |
||
| 136 | # exclude & as that is an illegal stand-alone character (and would interfere with HTML character representations |
||
| 137 | # exclude slash \/as to not include the last slash in a URL |
||
| 138 | # exclude @ as to keep twitter names together |
||
| 139 | | |
||
| 140 | ' . self::_HTML_PUNCTUATION . ' # catch any HTML reps of punctuation |
||
| 141 | )+ |
||
| 142 | ) |
||
| 143 | ';// required modifiers: x (multiline pattern) i (case insensitive) u (utf8). |
||
| 144 | |||
| 145 | /** |
||
| 146 | * Letter connectors allowed in words |
||
| 147 | * hyphens ("&#45;", "&#173;", "&#8208;", "&#8209;", "&#8210;", "&#x002d;", "&#x00ad;", "&#x2010;", "&#x2011;", "&#x2012;", "&shy;") |
||
| 148 | * underscore ("&#95;", "&#x005f;") |
||
| 149 | * zero-width-space ("&#8203;", "&#x200b;") |
||
| 150 | * zero-width-joiner ("&#8205;", "&#x200d;", "&zwj;") |
||
| 151 | * zero-width-non-joiner ("&#8204;", "&#x200c;", "&zwnj;") |
||
| 152 | */ |
||
| 153 | const _HTML_LETTER_CONNECTORS = ' |
||
| 154 | (?: |
||
| 155 | (?: # alpha matches |
||
| 156 | & |
||
| 157 | (?: shy|zwj|zwnj ) |
||
| 158 | ; |
||
| 159 | ) |
||
| 160 | | |
||
| 161 | (?: # decimal matches |
||
| 162 | &\# |
||
| 163 | (?: 45|95|173|820[3-589]|8210 ) |
||
| 164 | ; |
||
| 165 | ) |
||
| 166 | | |
||
| 167 | (?: # hexidecimal matches |
||
| 168 | &\#x |
||
| 169 | (?: 002d|005f|00ad|200[b-d]|201[0-2] ) |
||
| 170 | ; |
||
| 171 | ) |
||
| 172 | | |
||
| 173 | (?: # actual characters |
||
| 174 | \x{002d}|\x{005f}|\x{00ad}|\x{200b}|\x{200c}|\x{200d}|\x{2010}|\x{2011}|\x{2012} |
||
| 175 | ) |
||
| 176 | ) |
||
| 177 | '; // required modifiers: x (multiline pattern) i (case insensitive) u (utf8). |
||
| 178 | |||
| 179 | /** |
||
| 180 | * Word character html entities |
||
| 181 | * characters 0-9__ A-Z__ a-z___ other_special_chrs_____ |
||
| 182 | * decimal 48-57 65-90 97-122 192-214,216-246,248-255, 256-383 |
||
| 183 | * hex 31-39 41-5a 61-7a c0-d6 d8-f6 f8-ff 0100-017f |
||
| 184 | */ |
||
| 185 | const _HTML_LETTERS = ' |
||
| 186 | (?: |
||
| 187 | (?: # alpha matches |
||
| 188 | & |
||
| 189 | (?:Agrave|Aacute|Acirc|Atilde|Auml|Aring|AElig|Ccedil|Egrave|Eacute|Ecirc|Euml|Igrave|Iacute|Icirc|Iuml|ETH|Ntilde|Ograve|Oacute|Ocirc|Otilde|Ouml|Oslash|Ugrave|Uacute|Ucirc|Uuml|Yacute|THORN|szlig|agrave|aacute|acirc|atilde|auml|aring|aelig|ccedil|egrave|eacute|ecirc|euml|igrave|iacute|icirc|iuml|eth|ntilde|ograve|oacute|ocirc|otilde|ouml|oslash|ugrave|uacute|ucirc|uuml|yacute|thorn|yuml) |
||
| 190 | ; |
||
| 191 | ) |
||
| 192 | | |
||
| 193 | (?: # decimal matches |
||
| 194 | &\# |
||
| 195 | (?: 4[89]|5[0-7]|9[7-9]|1[01][0-9]|12[0-2]|19[2-9]|20[0-9]|21[0-46-9]|2[23][0-9]|24[0-68-9]|2[5-9][0-9]|3[0-7][0-9]|38[0-3] ) |
||
| 196 | ; |
||
| 197 | ) |
||
| 198 | | |
||
| 199 | (?: # hexidecimal matches |
||
| 200 | (?: |
||
| 201 | &\#x00 |
||
| 202 | (?: 3[1-9]|4[1-9a-f]|5[0-9a]|6[1-9a-f]|7[0-9a]|c[0-9a-f]|d[0-689]|e[0-9a-f]|f[0-689a-f] ) |
||
| 203 | ; |
||
| 204 | ) |
||
| 205 | | |
||
| 206 | (?: |
||
| 207 | &\#x01[0-7][0-9a-f]; |
||
| 208 | ) |
||
| 209 | ) |
||
| 210 | | |
||
| 211 | (?: # actual characters |
||
| 212 | [0-9A-Za-z]|\x{00c0}|\x{00c1}|\x{00c2}|\x{00c3}|\x{00c4}|\x{00c5}|\x{00c6}|\x{00c7}|\x{00c8}|\x{00c9}| |
||
| 213 | \x{00ca}|\x{00cb}|\x{00cc}|\x{00cd}|\x{00ce}|\x{00cf}|\x{00d0}|\x{00d1}|\x{00d2}|\x{00d3}|\x{00d4}| |
||
| 214 | \x{00d5}|\x{00d6}|\x{00d8}|\x{00d9}|\x{00da}|\x{00db}|\x{00dc}|\x{00dd}|\x{00de}|\x{00df}|\x{00e0}| |
||
| 215 | \x{00e1}|\x{00e2}|\x{00e3}|\x{00e4}|\x{00e5}|\x{00e6}|\x{00e7}|\x{00e8}|\x{00e9}|\x{00ea}|\x{00eb}| |
||
| 216 | \x{00ec}|\x{00ed}|\x{00ee}|\x{00ef}|\x{00f0}|\x{00f1}|\x{00f2}|\x{00f3}|\x{00f4}|\x{00f5}|\x{00f6}| |
||
| 217 | \x{00f8}|\x{00f9}|\x{00fa}|\x{00fb}|\x{00fc}|\x{00fd}|\x{00fe}|\x{00ff}|\x{0100}|\x{0101}|\x{0102}| |
||
| 218 | \x{0103}|\x{0104}|\x{0105}|\x{0106}|\x{0107}|\x{0108}|\x{0109}|\x{010a}|\x{010b}|\x{010c}|\x{010d}| |
||
| 219 | \x{010e}|\x{010f}|\x{0110}|\x{0111}|\x{0112}|\x{0113}|\x{0114}|\x{0115}|\x{0116}|\x{0117}|\x{0118}| |
||
| 220 | \x{0119}|\x{011a}|\x{011b}|\x{011c}|\x{011d}|\x{011e}|\x{011f}|\x{0120}|\x{0121}|\x{0122}|\x{0123}| |
||
| 221 | \x{0124}|\x{0125}|\x{0126}|\x{0127}|\x{0128}|\x{0129}|\x{012a}|\x{012b}|\x{012c}|\x{012d}|\x{012e}| |
||
| 222 | \x{012f}|\x{0130}|\x{0131}|\x{0132}|\x{0133}|\x{0134}|\x{0135}|\x{0136}|\x{0137}|\x{0138}|\x{0139}| |
||
| 223 | \x{013a}|\x{013b}|\x{013c}|\x{013d}|\x{013e}|\x{013f}|\x{0140}|\x{0141}|\x{0142}|\x{0143}|\x{0144}| |
||
| 224 | \x{0145}|\x{0146}|\x{0147}|\x{0148}|\x{0149}|\x{014a}|\x{014b}|\x{014c}|\x{014d}|\x{014e}|\x{014f}| |
||
| 225 | \x{0150}|\x{0151}|\x{0152}|\x{0153}|\x{0154}|\x{0155}|\x{0156}|\x{0157}|\x{0158}|\x{0159}|\x{015a}| |
||
| 226 | \x{015b}|\x{015c}|\x{015d}|\x{015e}|\x{015f}|\x{0160}|\x{0161}|\x{0162}|\x{0163}|\x{0164}|\x{0165}| |
||
| 227 | \x{0166}|\x{0167}|\x{0168}|\x{0169}|\x{016a}|\x{016b}|\x{016c}|\x{016d}|\x{016e}|\x{016f}|\x{0170}| |
||
| 228 | \x{0171}|\x{0172}|\x{0173}|\x{0174}|\x{0175}|\x{0176}|\x{0177}|\x{0178}|\x{0179}|\x{017a}|\x{017b}| |
||
| 229 | \x{017c}|\x{017d}|\x{017e}|\x{017f} |
||
| 230 | ) |
||
| 231 | ) |
||
| 232 | '; // required modifiers: x (multiline pattern) i (case insensitive) u (utf8). |
||
| 233 | |||
| 234 | const _WORD = ' |
||
| 235 | (?: |
||
| 236 | (?<![\w\&]) # negative lookbehind to ensure |
||
| 237 | # 1) we are proceeded by a non-word-character, and |
||
| 238 | # 2) we are not inside an HTML character def |
||
| 239 | (?: |
||
| 240 | [\w\-\_\/] |
||
| 241 | | |
||
| 242 | ' . self::_HTML_LETTERS . ' |
||
| 243 | | |
||
| 244 | ' . self::_HTML_LETTER_CONNECTORS . ' |
||
| 245 | )+ |
||
| 246 | ) |
||
| 247 | '; // required modifiers: x (multiline pattern) u (utf8). |
||
| 248 | |||
| 249 | // Find any text. |
||
| 250 | const _ANY_TEXT = self::_SPACE . '|' . self::_PUNCTUATION . '|' . self::_WORD; // required modifiers: x (multiline pattern) i (case insensitive) u (utf8). |
||
| 251 | |||
| 252 | // Regular expressions. |
||
| 253 | const _RE_ANY_TEXT = '/(' . self::_ANY_TEXT . ')/Sxiu'; |
||
| 254 | const _RE_SPACE = '/\A' . self::_SPACE . '\Z/Sxiu'; |
||
| 255 | const _RE_PUNCTUATION = '/\A' . self::_PUNCTUATION . '\Z/Ssxiu'; |
||
| 256 | const _RE_WORD = '/\A' . self::_WORD . '\Z/Sxu'; |
||
| 257 | const _RE_HTML_LETTER_CONNECTORS = '/' . self::_HTML_LETTER_CONNECTORS . '|[0-9\-_&#;\/]/Sxu'; |
||
| 258 | const _RE_MAX_STRING_LENGTH = '/\w{500}/Ss'; |
||
| 259 | |||
| 260 | /** |
||
| 261 | * The current strtoupper function to use (either 'strtoupper' or 'mb_strtoupper'). |
||
| 262 | * |
||
| 263 | * @var callable |
||
| 264 | */ |
||
| 265 | private $current_strtoupper = 'strtoupper'; |
||
| 266 | |||
| 267 | /** |
||
| 268 | * The tokenized text. |
||
| 269 | * |
||
| 270 | * @var array $text { |
||
| 271 | * @type Text_Parser\Token $index |
||
| 272 | * } |
||
| 273 | */ |
||
| 274 | private $text = []; |
||
| 275 | |||
/**
 * Creates a new, empty parser object. Use load() to tokenize a text fragment.
 */
public function __construct() {
	// Nothing to do: the properties are initialized by their declarations.
}
|
| 281 | |||
/**
 * Tokenizes a string and stores the tokens in $this->text.
 *
 * @param string $raw_text A text fragment without any HTML markup.
 *
 * @return bool Returns `true` on successful completion, `false` otherwise
 *              (non-string input, 500 or more consecutive word characters,
 *              no string functions found for the text, a failed split, or
 *              an empty token list).
 */
public function load( $raw_text ) {
	if ( ! \is_string( $raw_text ) ) {
		return false; // we have an error, abort.
	}

	// Abort if the text contains a run of 500 word characters (security concern).
	if ( \preg_match( self::_RE_MAX_STRING_LENGTH, $raw_text ) ) {
		return false;
	}

	// Detect encoding.
	$str_functions = Strings::functions( $raw_text );
	if ( empty( $str_functions ) ) {
		return false; // unknown encoding.
	}
	$this->current_strtoupper = $str_functions['strtoupper'];

	// Split the raw text into its parts. preg_split() returns false on failure
	// (for example when the subject is not valid for the `u` modifier), which
	// must not be handed to tokenize() as that method requires an array.
	$parts = \preg_split( self::_RE_ANY_TEXT, $raw_text, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY );
	if ( ! \is_array( $parts ) ) {
		return false;
	}

	// Tokenize the raw text parts.
	$this->text = self::tokenize( $parts );

	// The token array should never be empty.
	return ! empty( $this->text );
}
||
| 312 | |||
/**
 * Converts a list of text fragments into a list of tokens.
 *
 * Fragments matching the space or punctuation pattern become tokens of that
 * type directly. Everything else is handed to parse_ambiguous_token(), which
 * may merge the fragment with previously created tokens.
 *
 * @param string[] $parts An array of non-empty strings.
 *
 * @return Token[] An array of numerically indexed tokens.
 */
protected static function tokenize( array $parts ) {
	$result   = [];
	$position = 0;

	foreach ( $parts as $fragment ) {
		if ( \preg_match( self::_RE_SPACE, $fragment ) ) {
			$result[ $position ] = new Token( $fragment, Token::SPACE );
			$position++;
			continue;
		}

		if ( \preg_match( self::_RE_PUNCTUATION, $fragment ) ) {
			$result[ $position ] = new Token( $fragment, Token::PUNCTUATION );
			$position++;
			continue;
		}

		// Words and "other" fragments may have to be glued to their predecessors
		// so that things like email addresses and URLs are not broken up.
		$expected_type = \preg_match( self::_RE_WORD, $fragment ) ? Token::WORD : Token::OTHER;
		self::parse_ambiguous_token( $expected_type, $fragment, $result, $position );

		// The helper may have moved the position backwards; continue after it.
		$position++;
	}

	return $result;
}
||
| 344 | |||
/**
 * Parses ambiguous tokens (that may need to be combined with their predecessors).
 *
 * @param int     $expected_type Either Token::WORD or Token::OTHER.
 * @param string  $part          The string fragment to parse.
 * @param Token[] $tokens        The token array. Passed by reference.
 * @param int     $index         The current index. Passed by reference; moved back
 *                               to the merged token when a merge happens.
 */
protected static function parse_ambiguous_token( $expected_type, $part, array &$tokens, &$index ) {

	// Case 1: glue the fragment to the directly preceding token. This happens
	// after an "other" token, or when an "other" fragment follows a word.
	$merge_with_previous = self::is_preceeded_by( Token::OTHER, $tokens, $index )
		|| ( Token::OTHER === $expected_type && self::is_preceeded_by( Token::WORD, $tokens, $index ) );

	if ( $merge_with_previous ) {
		$index--;
		$tokens[ $index ] = new Token( $tokens[ $index ]->value . $part, Token::OTHER );

		return;
	}

	// Case 2: the fragment follows punctuation which itself follows a non-space
	// token. Collapse all three into a single "other" token.
	if ( self::is_preceeded_by( Token::PUNCTUATION, $tokens, $index ) && self::is_not_preceeded_by( Token::SPACE, $tokens, $index, 2 ) ) {
		$punctuation = $tokens[ $index - 1 ]->value;
		$leading     = $tokens[ $index - 2 ]->value;

		$tokens[ $index - 2 ] = new Token( $leading . $punctuation . $part, Token::OTHER );
		unset( $tokens[ $index - 1 ] );
		$index = $index - 2;

		return;
	}

	// Case 3: nothing to merge, store the fragment with its expected type.
	$tokens[ $index ] = new Token( $part, $expected_type );
}
|
| 374 | |||
/**
 * Checks if a predecessor of the current token is of a certain type.
 *
 * @param int   $type   A valid token type (e.g. Token::WORD).
 * @param array $tokens An array of tokens.
 * @param int   $index  The current token index.
 * @param int   $steps  Optional. The number steps to go back for the check. Default 1.
 *
 * @return bool False if stepping back would leave the array (negative index),
 *              otherwise whether the token found there has the given type.
 */
protected static function is_preceeded_by( $type, array $tokens, $index, $steps = 1 ) {
	$position = $index - $steps;

	if ( $position < 0 ) {
		return false;
	}

	return $type === $tokens[ $position ]->type;
}
||
| 388 | |||
/**
 * Checks if a predecessor of the current token exists and is not of a certain type.
 *
 * @param int   $type   A valid token type (e.g. Token::WORD).
 * @param array $tokens An array of tokens.
 * @param int   $index  The current token index.
 * @param int   $steps  Optional. The number steps to go back for the check. Default 1.
 *
 * @return bool False if stepping back would leave the array (negative index),
 *              otherwise whether the token found there has a different type.
 */
protected static function is_not_preceeded_by( $type, array $tokens, $index, $steps = 1 ) {
	// The bounds check matters: parse_ambiguous_token() reads $tokens[ $index - 2 ]
	// right after this method returns true for $steps = 2.
	return $index - $steps >= 0 && $type !== $tokens[ $index - $steps ]->type;
}
||
| 402 | |||
| 403 | |||
/**
 * Re-tokenizes the current text (i.e. captures newly inserted text, or drops
 * tokens whose values have been deleted).
 *
 * Warning: Tokens previously acquired through 'get' methods may not match new tokenization.
 *
 * @return bool Returns true on successful completion.
 */
public function reload() {
	// unload() reassembles the text and clears the parser before it is loaded again.
	$current_text = $this->unload();

	return $this->load( $current_text );
}
||
| 414 | |||
/**
 * Returns the complete text as a string and clears the parser.
 *
 * @return string The concatenated values of all tokens, in token order.
 */
public function unload() {
	$values = [];

	foreach ( $this->text as $token ) {
		$values[] = $token->value;
	}

	$this->clear();

	return \implode( '', $values );
}
||
| 431 | |||
/**
 * Discards all tokens of the currently set text.
 */
public function clear() {
	// Reset to the same empty list the property is declared with.
	$this->text = array();
}
|
| 438 | |||
/**
 * Updates the 'value' field for all matching tokens.
 *
 * Tokens are matched by array index. Indexes that do not exist in the
 * currently loaded text are skipped instead of triggering an error.
 *
 * @param Token[] $tokens An array of tokens.
 */
public function update( $tokens ) {
	foreach ( $tokens as $index => $token ) {
		// Skip tokens without a counterpart in the loaded text.
		if ( ! isset( $this->text[ $index ] ) ) {
			continue;
		}

		$this->text[ $index ] = $this->text[ $index ]->with_value( $token->value );
	}
}
|
| 449 | |||
/**
 * Retrieves all tokens of the currently set text.
 *
 * @return Token[] An array of numerically indexed tokens.
 */
public function get_all() {
	$all_tokens = $this->text;

	return $all_tokens;
}
||
| 458 | |||
/**
 * Retrieves all tokens of the type "space".
 *
 * @return Token[] An array of numerically indexed tokens.
 */
public function get_spaces() {
	return $this->get_type( Token::SPACE );
}
||
| 467 | |||
/**
 * Retrieves all tokens of the type "punctuation".
 *
 * @return Token[] An array of numerically indexed tokens.
 */
public function get_punctuation() {
	return $this->get_type( Token::PUNCTUATION );
}
||
| 476 | |||
/**
 * Retrieves all tokens of the type "word" that satisfy the given policies.
 *
 * @param int $abc   Optional. Handling of all-letter words. Allowed values NO_ALL_LETTERS, ALLOW_ALL_LETTERS, REQUIRE_ALL_LETTERS. Default ALLOW_ALL_LETTERS.
 * @param int $caps  Optional. Handling of capitalized words (setting does not affect non-letter characters). Allowed values NO_ALL_CAPS, ALLOW_ALL_CAPS, REQUIRE_ALL_CAPS. Default ALLOW_ALL_CAPS.
 * @param int $comps Optional. Handling of compound words (setting does not affect all-letter words). Allowed values NO_COMPOUNDS, ALLOW_COMPOUNDS, REQUIRE_COMPOUNDS. Default ALLOW_COMPOUNDS.
 *
 * @return Token[] An array of tokens, keyed by their index in the token list.
 */
public function get_words( $abc = self::ALLOW_ALL_LETTERS, $caps = self::ALLOW_ALL_CAPS, $comps = self::ALLOW_COMPOUNDS ) {
	$matches = [];

	// Nothing loaded, nothing to filter.
	if ( empty( $this->text ) ) {
		return $matches;
	}

	foreach ( $this->get_type( Token::WORD ) as $position => $word ) {
		// Policies are checked in a fixed order; the first failure rejects the word.
		if ( ! $this->conforms_to_letters_policy( $word, $abc ) ) {
			continue;
		}
		if ( ! $this->conforms_to_caps_policy( $word, $caps ) ) {
			continue;
		}
		if ( ! $this->conforms_to_compounds_policy( $word, $comps ) ) {
			continue;
		}

		$matches[ $position ] = $word;
	}

	return $matches;
}
||
| 508 | |||
/**
 * Checks if the value of the token conforms to the given policy for letters.
 *
 * The token value is compared with a copy from which everything matched by
 * _RE_HTML_LETTER_CONNECTORS has been removed: REQUIRE_ALL_LETTERS is satisfied
 * when both are identical, NO_ALL_LETTERS when they differ.
 *
 * @param Token $token  Required.
 * @param int   $policy Either ALLOW_ALL_LETTERS, REQUIRE_ALL_LETTERS or NO_ALL_LETTERS.
 *
 * @return bool
 */
protected function conforms_to_letters_policy( Token $token, $policy ) {
	$strip_connectors = function( $value ) {
		return \preg_replace( self::_RE_HTML_LETTER_CONNECTORS, '', $value );
	};

	return $this->check_policy( $token, $policy, self::ALLOW_ALL_LETTERS, self::REQUIRE_ALL_LETTERS, self::NO_ALL_LETTERS, $strip_connectors );
}
||
| 529 | |||
/**
 * Checks if the value of the token conforms to the given policy for all-caps words.
 *
 * The token value is compared with its upper-cased form (using the strtoupper
 * callable selected by load()): REQUIRE_ALL_CAPS is satisfied when both are
 * identical, NO_ALL_CAPS when they differ.
 *
 * @param Token $token  Required.
 * @param int   $policy Either ALLOW_ALL_CAPS, REQUIRE_ALL_CAPS or NO_ALL_CAPS.
 *
 * @return bool
 */
protected function conforms_to_caps_policy( Token $token, $policy ) {
	$to_upper = $this->current_strtoupper;

	return $this->check_policy( $token, $policy, self::ALLOW_ALL_CAPS, self::REQUIRE_ALL_CAPS, self::NO_ALL_CAPS, $to_upper );
}
||
| 548 | |||
/**
 * Checks if the value of the token conforms to the given policy for compound words.
 *
 * The token value is compared with a copy from which all "-" characters have
 * been removed: NO_COMPOUNDS is satisfied when both are identical,
 * REQUIRE_COMPOUNDS when they differ.
 *
 * @param Token $token  Required.
 * @param int   $policy Either ALLOW_COMPOUNDS, REQUIRE_COMPOUNDS or NO_COMPOUNDS.
 *
 * @return bool
 */
protected function conforms_to_compounds_policy( Token $token, $policy ) {
	$strip_hyphens = function( $value ) {
		return \preg_replace( '/-/S', '', $value );
	};

	return $this->check_policy( $token, $policy, self::ALLOW_COMPOUNDS, self::NO_COMPOUNDS, self::REQUIRE_COMPOUNDS, $strip_hyphens );
}
||
| 569 | |||
/**
 * Checks if the value of the token conforms to the given policy.
 *
 * @param Token    $token             Required.
 * @param int      $policy            The policy to check.
 * @param int      $permissive_policy ALLOW_* policy constant; always conforms.
 * @param int      $equal_policy      Policy constant to check when the transformed value is equal to the original token.
 * @param int      $non_equal_policy  Policy constant to check when the transformed value is different from the original token.
 * @param callable $transform_token   Function to transform the token value.
 *
 * @return bool
 */
protected function check_policy( Token $token, $policy, $permissive_policy, $equal_policy, $non_equal_policy, callable $transform_token ) {

	// The permissive policy accepts everything; skip the transformation entirely.
	if ( $permissive_policy === $policy ) {
		return true;
	}

	$is_unchanged = $transform_token( $token->value ) === $token->value;

	if ( $is_unchanged ) {
		return $equal_policy === $policy;
	}

	return $non_equal_policy === $policy;
}
||
| 594 | |||
/**
 * Retrieves all tokens of the type "other".
 *
 * @return Token[] An array of numerically indexed tokens.
 */
public function get_other() {
	return $this->get_type( Token::OTHER );
}
||
| 603 | |||
| 604 | /** |
||
| 605 | * Retrieves all tokens of the given type. |
||
| 606 | * |
||
| 607 | * @param int $type The type to get. |
||
| 608 | * |
||
| 609 | * @return Token[] An array of numerically indexed tokens. |
||
| 610 | */ |
||
| 611 | 1 | public function get_type( $type ) { |
|
| 612 | 1 | $tokens = []; |
|
| 623 |