1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
/* |
4
|
|
|
* This file is part of the Symfony package. |
5
|
|
|
* |
6
|
|
|
* (c) Fabien Potencier <fabien@symfony.com> |
7
|
|
|
* |
8
|
|
|
* For the full copyright and license information, please view the LICENSE |
9
|
|
|
* file that was distributed with this source code. |
10
|
|
|
*/ |
11
|
|
|
|
12
|
|
|
namespace Symfony\Component\String; |
13
|
|
|
|
14
|
|
|
use Symfony\Component\String\Exception\ExceptionInterface; |
15
|
|
|
use Symfony\Component\String\Exception\InvalidArgumentException; |
16
|
|
|
use Symfony\Component\String\Exception\RuntimeException; |
17
|
|
|
|
18
|
|
|
/** |
19
|
|
|
* Represents a string of abstract Unicode characters. |
20
|
|
|
* |
21
|
|
|
* Unicode defines 3 types of "characters" (bytes, code points and grapheme clusters). |
22
|
|
|
* This class is the abstract type to use as a type-hint when the logic you want to |
23
|
|
|
* implement is Unicode-aware but doesn't care about code points vs grapheme clusters. |
24
|
|
|
* |
25
|
|
|
* @author Nicolas Grekas <p@tchwork.com> |
26
|
|
|
* |
27
|
|
|
* @throws ExceptionInterface |
28
|
|
|
*/ |
29
|
|
|
abstract class AbstractUnicodeString extends AbstractString |
30
|
|
|
{ |
31
|
|
|
public const NFC = \Normalizer::NFC; |
32
|
|
|
public const NFD = \Normalizer::NFD; |
33
|
|
|
public const NFKC = \Normalizer::NFKC; |
34
|
|
|
public const NFKD = \Normalizer::NFKD; |
35
|
|
|
|
36
|
|
|
// all ASCII letters sorted by typical frequency of occurrence |
37
|
|
|
private const ASCII = "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F"; |
38
|
|
|
|
39
|
|
|
// the subset of folded case mappings that is not in lower case mappings |
40
|
|
|
// Precomposed ligatures and digraphs (U+0149, U+FB00..U+FB06) are spelled as \u{...} escapes:
// once they are expanded to their ASCII look-alikes in the source, the entry degenerates into
// an identity mapping (e.g. 'ff' => 'ff') and the real character is never folded.
private const FOLD_FROM = ['İ', 'µ', 'ſ', "\xCD\x85", 'ς', 'ϐ', 'ϑ', 'ϕ', 'ϖ', 'ϰ', 'ϱ', 'ϵ', 'ẛ', "\xE1\xBE\xBE", 'ß', "\u{0149}", 'ǰ', 'ΐ', 'ΰ', 'և', 'ẖ', 'ẗ', 'ẘ', 'ẙ', 'ẚ', 'ẞ', 'ὐ', 'ὒ', 'ὔ', 'ὖ', 'ᾀ', 'ᾁ', 'ᾂ', 'ᾃ', 'ᾄ', 'ᾅ', 'ᾆ', 'ᾇ', 'ᾈ', 'ᾉ', 'ᾊ', 'ᾋ', 'ᾌ', 'ᾍ', 'ᾎ', 'ᾏ', 'ᾐ', 'ᾑ', 'ᾒ', 'ᾓ', 'ᾔ', 'ᾕ', 'ᾖ', 'ᾗ', 'ᾘ', 'ᾙ', 'ᾚ', 'ᾛ', 'ᾜ', 'ᾝ', 'ᾞ', 'ᾟ', 'ᾠ', 'ᾡ', 'ᾢ', 'ᾣ', 'ᾤ', 'ᾥ', 'ᾦ', 'ᾧ', 'ᾨ', 'ᾩ', 'ᾪ', 'ᾫ', 'ᾬ', 'ᾭ', 'ᾮ', 'ᾯ', 'ᾲ', 'ᾳ', 'ᾴ', 'ᾶ', 'ᾷ', 'ᾼ', 'ῂ', 'ῃ', 'ῄ', 'ῆ', 'ῇ', 'ῌ', 'ῒ', 'ῖ', 'ῗ', 'ῢ', 'ῤ', 'ῦ', 'ῧ', 'ῲ', 'ῳ', 'ῴ', 'ῶ', 'ῷ', 'ῼ', "\u{FB00}", "\u{FB01}", "\u{FB02}", "\u{FB03}", "\u{FB04}", "\u{FB05}", "\u{FB06}", 'ﬓ', 'ﬔ', 'ﬕ', 'ﬖ', 'ﬗ'];
41
|
|
|
private const FOLD_TO = ['i̇', 'μ', 's', 'ι', 'σ', 'β', 'θ', 'φ', 'π', 'κ', 'ρ', 'ε', 'ṡ', 'ι', 'ss', 'ʼn', 'ǰ', 'ΐ', 'ΰ', 'եւ', 'ẖ', 'ẗ', 'ẘ', 'ẙ', 'aʾ', 'ss', 'ὐ', 'ὒ', 'ὔ', 'ὖ', 'ἀι', 'ἁι', 'ἂι', 'ἃι', 'ἄι', 'ἅι', 'ἆι', 'ἇι', 'ἀι', 'ἁι', 'ἂι', 'ἃι', 'ἄι', 'ἅι', 'ἆι', 'ἇι', 'ἠι', 'ἡι', 'ἢι', 'ἣι', 'ἤι', 'ἥι', 'ἦι', 'ἧι', 'ἠι', 'ἡι', 'ἢι', 'ἣι', 'ἤι', 'ἥι', 'ἦι', 'ἧι', 'ὠι', 'ὡι', 'ὢι', 'ὣι', 'ὤι', 'ὥι', 'ὦι', 'ὧι', 'ὠι', 'ὡι', 'ὢι', 'ὣι', 'ὤι', 'ὥι', 'ὦι', 'ὧι', 'ὰι', 'αι', 'άι', 'ᾶ', 'ᾶι', 'αι', 'ὴι', 'ηι', 'ήι', 'ῆ', 'ῆι', 'ηι', 'ῒ', 'ῖ', 'ῗ', 'ῢ', 'ῤ', 'ῦ', 'ῧ', 'ὼι', 'ωι', 'ώι', 'ῶ', 'ῶι', 'ωι', 'ff', 'fi', 'fl', 'ffi', 'ffl', 'st', 'st', 'մն', 'մե', 'մի', 'վն', 'մխ']; |
42
|
|
|
|
43
|
|
|
// the subset of https://github.com/unicode-org/cldr/blob/master/common/transforms/Latin-ASCII.xml that is not in NFKD |
44
|
|
|
private const TRANSLIT_FROM = ['Æ', 'Ð', 'Ø', 'Þ', 'ß', 'æ', 'ð', 'ø', 'þ', 'Đ', 'đ', 'Ħ', 'ħ', 'ı', 'ĸ', 'Ŀ', 'ŀ', 'Ł', 'ł', 'ʼn', 'Ŋ', 'ŋ', 'Œ', 'œ', 'Ŧ', 'ŧ', 'ƀ', 'Ɓ', 'Ƃ', 'ƃ', 'Ƈ', 'ƈ', 'Ɖ', 'Ɗ', 'Ƌ', 'ƌ', 'Ɛ', 'Ƒ', 'ƒ', 'Ɠ', 'ƕ', 'Ɩ', 'Ɨ', 'Ƙ', 'ƙ', 'ƚ', 'Ɲ', 'ƞ', 'Ƣ', 'ƣ', 'Ƥ', 'ƥ', 'ƫ', 'Ƭ', 'ƭ', 'Ʈ', 'Ʋ', 'Ƴ', 'ƴ', 'Ƶ', 'ƶ', 'DŽ', 'Dž', 'dž', 'Ǥ', 'ǥ', 'ȡ', 'Ȥ', 'ȥ', 'ȴ', 'ȵ', 'ȶ', 'ȷ', 'ȸ', 'ȹ', 'Ⱥ', 'Ȼ', 'ȼ', 'Ƚ', 'Ⱦ', 'ȿ', 'ɀ', 'Ƀ', 'Ʉ', 'Ɇ', 'ɇ', 'Ɉ', 'ɉ', 'Ɍ', 'ɍ', 'Ɏ', 'ɏ', 'ɓ', 'ɕ', 'ɖ', 'ɗ', 'ɛ', 'ɟ', 'ɠ', 'ɡ', 'ɢ', 'ɦ', 'ɧ', 'ɨ', 'ɪ', 'ɫ', 'ɬ', 'ɭ', 'ɱ', 'ɲ', 'ɳ', 'ɴ', 'ɶ', 'ɼ', 'ɽ', 'ɾ', 'ʀ', 'ʂ', 'ʈ', 'ʉ', 'ʋ', 'ʏ', 'ʐ', 'ʑ', 'ʙ', 'ʛ', 'ʜ', 'ʝ', 'ʟ', 'ʠ', 'ʣ', 'ʥ', 'ʦ', 'ʪ', 'ʫ', 'ᴀ', 'ᴁ', 'ᴃ', 'ᴄ', 'ᴅ', 'ᴆ', 'ᴇ', 'ᴊ', 'ᴋ', 'ᴌ', 'ᴍ', 'ᴏ', 'ᴘ', 'ᴛ', 'ᴜ', 'ᴠ', 'ᴡ', 'ᴢ', 'ᵫ', 'ᵬ', 'ᵭ', 'ᵮ', 'ᵯ', 'ᵰ', 'ᵱ', 'ᵲ', 'ᵳ', 'ᵴ', 'ᵵ', 'ᵶ', 'ᵺ', 'ᵻ', 'ᵽ', 'ᵾ', 'ᶀ', 'ᶁ', 'ᶂ', 'ᶃ', 'ᶄ', 'ᶅ', 'ᶆ', 'ᶇ', 'ᶈ', 'ᶉ', 'ᶊ', 'ᶌ', 'ᶍ', 'ᶎ', 'ᶏ', 'ᶑ', 'ᶒ', 'ᶓ', 'ᶖ', 'ᶙ', 'ẚ', 'ẜ', 'ẝ', 'ẞ', 'Ỻ', 'ỻ', 'Ỽ', 'ỽ', 'Ỿ', 'ỿ', '©', '®', '₠', '₢', '₣', '₤', '₧', '₺', '₹', 'ℌ', '℞', '㎧', '㎮', '㏆', '㏗', '㏞', '㏟', '¼', '½', '¾', '⅓', '⅔', '⅕', '⅖', '⅗', '⅘', '⅙', '⅚', '⅛', '⅜', '⅝', '⅞', '⅟', '〇', '‘', '’', '‚', '‛', '“', '”', '„', '‟', '′', '″', '〝', '〞', '«', '»', '‹', '›', '‐', '‑', '‒', '–', '—', '―', '︱', '︲', '﹘', '‖', '⁄', '⁅', '⁆', '⁎', '、', '。', '〈', '〉', '《', '》', '〔', '〕', '〘', '〙', '〚', '〛', '︑', '︒', '︹', '︺', '︽', '︾', '︿', '﹀', '﹑', '﹝', '﹞', '⦅', '⦆', '。', '、', '×', '÷', '−', '∕', '∖', '∣', '∥', '≪', '≫', '⦅', '⦆']; |
45
|
|
|
private const TRANSLIT_TO = ['AE', 'D', 'O', 'TH', 'ss', 'ae', 'd', 'o', 'th', 'D', 'd', 'H', 'h', 'i', 'q', 'L', 'l', 'L', 'l', '\'n', 'N', 'n', 'OE', 'oe', 'T', 't', 'b', 'B', 'B', 'b', 'C', 'c', 'D', 'D', 'D', 'd', 'E', 'F', 'f', 'G', 'hv', 'I', 'I', 'K', 'k', 'l', 'N', 'n', 'OI', 'oi', 'P', 'p', 't', 'T', 't', 'T', 'V', 'Y', 'y', 'Z', 'z', 'DZ', 'Dz', 'dz', 'G', 'g', 'd', 'Z', 'z', 'l', 'n', 't', 'j', 'db', 'qp', 'A', 'C', 'c', 'L', 'T', 's', 'z', 'B', 'U', 'E', 'e', 'J', 'j', 'R', 'r', 'Y', 'y', 'b', 'c', 'd', 'd', 'e', 'j', 'g', 'g', 'G', 'h', 'h', 'i', 'I', 'l', 'l', 'l', 'm', 'n', 'n', 'N', 'OE', 'r', 'r', 'r', 'R', 's', 't', 'u', 'v', 'Y', 'z', 'z', 'B', 'G', 'H', 'j', 'L', 'q', 'dz', 'dz', 'ts', 'ls', 'lz', 'A', 'AE', 'B', 'C', 'D', 'D', 'E', 'J', 'K', 'L', 'M', 'O', 'P', 'T', 'U', 'V', 'W', 'Z', 'ue', 'b', 'd', 'f', 'm', 'n', 'p', 'r', 'r', 's', 't', 'z', 'th', 'I', 'p', 'U', 'b', 'd', 'f', 'g', 'k', 'l', 'm', 'n', 'p', 'r', 's', 'v', 'x', 'z', 'a', 'd', 'e', 'e', 'i', 'u', 'a', 's', 's', 'SS', 'LL', 'll', 'V', 'v', 'Y', 'y', '(C)', '(R)', 'CE', 'Cr', 'Fr.', 'L.', 'Pts', 'TL', 'Rs', 'x', 'Rx', 'm/s', 'rad/s', 'C/kg', 'pH', 'V/m', 'A/m', ' 1/4', ' 1/2', ' 3/4', ' 1/3', ' 2/3', ' 1/5', ' 2/5', ' 3/5', ' 4/5', ' 1/6', ' 5/6', ' 1/8', ' 3/8', ' 5/8', ' 7/8', ' 1/', '0', '\'', '\'', ',', '\'', '"', '"', ',,', '"', '\'', '"', '"', '"', '<<', '>>', '<', '>', '-', '-', '-', '-', '-', '-', '-', '-', '-', '||', '/', '[', ']', '*', ',', '.', '<', '>', '<<', '>>', '[', ']', '[', ']', '[', ']', ',', '.', '[', ']', '<<', '>>', '<', '>', ',', '[', ']', '((', '))', '.', ',', '*', '/', '-', '/', '\\', '|', '||', '<<', '>>', '((', '))']; |
46
|
|
|
|
47
|
|
|
private static array $transliterators = []; |
48
|
|
|
private static array $tableZero; |
49
|
|
|
private static array $tableWide; |
50
|
|
|
|
51
|
|
|
/**
 * Builds an instance from a list of code points.
 *
 * Every code is reduced modulo 0x200000 and then hand-encoded as a one- to
 * four-byte sequence, the byte count being picked from the value's magnitude.
 */
public static function fromCodePoints(int ...$codes): static
{
    $bytes = '';

    foreach ($codes as $codePoint) {
        $codePoint %= 0x200000;

        $bytes .= match (true) {
            $codePoint < 0x80 => \chr($codePoint),
            $codePoint < 0x800 => \chr(0xC0 | ($codePoint >> 6))
                .\chr(0x80 | ($codePoint & 0x3F)),
            $codePoint < 0x10000 => \chr(0xE0 | ($codePoint >> 12))
                .\chr(0x80 | (($codePoint >> 6) & 0x3F))
                .\chr(0x80 | ($codePoint & 0x3F)),
            default => \chr(0xF0 | ($codePoint >> 18))
                .\chr(0x80 | (($codePoint >> 12) & 0x3F))
                .\chr(0x80 | (($codePoint >> 6) & 0x3F))
                .\chr(0x80 | ($codePoint & 0x3F)),
        };
    }

    return new static($bytes);
}
69
|
|
|
|
70
|
|
|
/**
 * Generic UTF-8 to ASCII transliteration.
 *
 * Install the intl extension for best results.
 *
 * The caller's rules are sandwiched between a leading "nfd" step and the trailing
 * "latin-ascii", "any-latin/bgn" (only when transliterator_transliterate() exists),
 * "nfkd" and "[:nonspacing mark:] remove" steps. Rules are consumed one per iteration
 * for as long as non-ASCII bytes remain; once they run out (or an empty rule is met),
 * what is left goes through iconv() when available, or is replaced by "?".
 *
 * @param string[]|\Transliterator[]|\Closure[] $rules See "*-Latin" rules from Transliterator::listIDs()
 *
 * @throws InvalidArgumentException When \Transliterator::create() does not know a rule
 * @throws \LogicException          When the iconv() fallback returns an empty string even for its "²" probe
 */
public function ascii(array $rules = []): self
{
    $result = clone $this;
    $pending = $result->string;
    $result->string = '';

    array_unshift($rules, 'nfd');
    $rules[] = 'latin-ascii';

    if (\function_exists('transliterator_transliterate')) {
        $rules[] = 'any-latin/bgn';
    }

    $rules[] = 'nfkd';
    $rules[] = '[:nonspacing mark:] remove';

    while (\strlen($pending) - 1 > $asciiRun = strspn($pending, self::ASCII)) {
        // Flush the leading ASCII run, minus its last byte which stays in the pending part
        // (presumably so that a base letter is kept next to the marks that may follow it).
        if (0 < --$asciiRun) {
            $result->string .= substr($pending, 0, $asciiRun);
            $pending = substr($pending, $asciiRun);
        }

        $rule = array_shift($rules);

        if (!$rule) {
            $rules = []; // An empty rule interrupts the next ones
        }

        if ($rule instanceof \Transliterator) {
            $pending = $rule->transliterate($pending);

            continue;
        }

        if ($rule instanceof \Closure) {
            $pending = $rule($pending);

            continue;
        }

        if (!$rule) {
            // No rule left: last-resort conversion of whatever is still not ASCII.
            if (!\function_exists('iconv')) {
                $pending = preg_replace('/[^\x00-\x7F]/u', '?', $pending);

                continue;
            }

            $pending = @preg_replace_callback('/[^\x00-\x7F]/u', static function ($match) {
                $ascii = (string) iconv('UTF-8', 'ASCII//TRANSLIT', $match[0]);

                if ('' === $ascii) {
                    if ('' === iconv('UTF-8', 'ASCII//TRANSLIT', '²')) {
                        throw new \LogicException(sprintf('"%s" requires a translit-able iconv implementation, try installing "gnu-libiconv" if you\'re using Alpine Linux.', static::class));
                    }

                    return '?';
                }

                return 1 < \strlen($ascii) ? ltrim($ascii, '\'`"^~') : $ascii;
            }, $pending);

            continue;
        }

        $rule = strtolower($rule);

        if ('nfd' === $rule) {
            if (!normalizer_is_normalized($pending, self::NFD)) {
                $pending = normalizer_normalize($pending, self::NFD);
            }
        } elseif ('nfkd' === $rule) {
            if (!normalizer_is_normalized($pending, self::NFKD)) {
                $pending = normalizer_normalize($pending, self::NFKD);
            }
        } elseif ('[:nonspacing mark:] remove' === $rule) {
            $pending = preg_replace('/\p{Mn}++/u', '', $pending);
        } elseif ('latin-ascii' === $rule) {
            $pending = str_replace(self::TRANSLIT_FROM, self::TRANSLIT_TO, $pending);
        } elseif ('de-ascii' === $rule) {
            $pending = preg_replace("/([AUO])\u{0308}(?=\p{Ll})/u", '$1e', $pending);
            $pending = str_replace(["a\u{0308}", "o\u{0308}", "u\u{0308}", "A\u{0308}", "O\u{0308}", "U\u{0308}"], ['ae', 'oe', 'ue', 'AE', 'OE', 'UE'], $pending);
        } elseif (\function_exists('transliterator_transliterate')) {
            $transliterator = self::$transliterators[$rule] ??= \Transliterator::create($rule);

            if (null === $transliterator) {
                // "any-latin/bgn" may be unavailable: retry with plain "any-latin".
                if ('any-latin/bgn' === $rule) {
                    $rule = 'any-latin';
                    $transliterator = self::$transliterators[$rule] ??= \Transliterator::create($rule);
                }

                if (null === $transliterator) {
                    throw new InvalidArgumentException(sprintf('Unknown transliteration rule "%s".', $rule));
                }

                self::$transliterators['any-latin/bgn'] = $transliterator;
            }

            $pending = $transliterator->transliterate($pending);
        }
    }

    $result->string .= $pending;

    return $result;
}
154
|
|
|
|
155
|
|
|
/**
 * Returns a camelCase copy of the string.
 *
 * Runs of characters that are neither letters nor ASCII digits are first collapsed to a
 * single space; then every character at a word boundary that is not followed by an
 * upper-case letter is re-cased (the first such match is lower-cased, the following ones
 * are title-cased) and the spaces are finally removed.
 */
public function camel(): static
{
    $spaced = preg_replace('/[^\pL0-9]++/u', ' ', $this->string);

    $matchCount = 0;
    $recased = preg_replace_callback('/\b.(?!\p{Lu})/u', static function ($m) use (&$matchCount) {
        if (1 < ++$matchCount) {
            return mb_convert_case($m[0], \MB_CASE_TITLE, 'UTF-8');
        }

        // "İ" is mapped by hand to "i" followed by a combining dot above
        return 'İ' === $m[0] ? 'i̇' : mb_strtolower($m[0], 'UTF-8');
    }, $spaced);

    $str = clone $this;
    $str->string = str_replace(' ', '', $recased);

    return $str;
}
166
|
|
|
|
167
|
|
|
/**
 * Returns the code points of the one-character slice found at the given offset.
 *
 * @return int[] Empty when the slice is empty
 */
public function codePointsAt(int $offset): array
{
    $char = $this->slice($offset, 1)->string;

    if ('' === $char) {
        return [];
    }

    $result = [];

    foreach (preg_split('//u', $char, -1, \PREG_SPLIT_NO_EMPTY) as $unit) {
        $result[] = mb_ord($unit, 'UTF-8');
    }

    return $result;
}
186
|
|
|
|
187
|
|
|
/**
 * Returns a case-folded copy of the string.
 *
 * When $compat is true and the Normalizer::NFKC_CF constant is defined, folding is fully
 * delegated to the normalizer. Otherwise the string is normalized (NFKC if $compat, NFC if
 * not), passed through the FOLD_FROM/FOLD_TO table and lower-cased.
 */
public function folded(bool $compat = true): static
{
    $str = clone $this;

    if ($compat && \defined('Normalizer::NFKC_CF')) {
        $str->string = normalizer_normalize($str->string, \Normalizer::NFKC_CF);

        return $str;
    }

    $form = $compat ? \Normalizer::NFKC : \Normalizer::NFC;
    $str->string = normalizer_normalize($str->string, $form);
    $str->string = mb_strtolower(str_replace(self::FOLD_FROM, self::FOLD_TO, $str->string), 'UTF-8');

    return $str;
}
200
|
|
|
|
201
|
|
|
/**
 * Joins the given strings, using the current string as glue.
 *
 * When $lastGlue is given and there are at least two strings, the last one is attached
 * with $lastGlue instead of the regular glue.
 *
 * @throws InvalidArgumentException When the result is not valid UTF-8
 */
public function join(array $strings, ?string $lastGlue = null): static
{
    $ending = '';

    if (null !== $lastGlue && \count($strings) > 1) {
        $ending = $lastGlue.array_pop($strings);
    }

    $joined = implode($this->string, $strings).$ending;

    if (!preg_match('//u', $joined)) {
        throw new InvalidArgumentException('Invalid UTF-8 string.');
    }

    $str = clone $this;
    $str->string = $joined;

    return $str;
}
214
|
|
|
|
215
|
|
|
/**
 * Returns a lower-cased copy of the string.
 *
 * "İ" is replaced by "i" followed by a combining dot above before mb_strtolower() runs.
 */
public function lower(): static
{
    $dotted = str_replace('İ', 'i̇', $this->string);

    $str = clone $this;
    $str->string = mb_strtolower($dotted, 'UTF-8');

    return $str;
}
222
|
|
|
|
223
|
|
|
/**
 * Lower-cases the string with the "Lower" transliterator of the given locale, falling
 * back to lower() when no such transliterator is available.
 *
 * @param string $locale In the format language_region (e.g. tr_TR)
 */
public function localeLower(string $locale): static
{
    $transliterator = $this->getLocaleTransliterator($locale, 'Lower');

    if (null === $transliterator) {
        return $this->lower();
    }

    $str = clone $this;
    $str->string = $transliterator->transliterate($str->string);

    return $str;
}
237
|
|
|
|
238
|
|
|
/**
 * Matches the string against a regular expression, always in UTF-8 ("u") mode.
 *
 * preg_match_all() is used when $flags contains PREG_PATTERN_ORDER or PREG_SET_ORDER,
 * preg_match() otherwise; PREG_UNMATCHED_AS_NULL is always added to the flags. The "i"
 * modifier is appended when the string is in ignore-case mode.
 *
 * @throws InvalidArgumentException When PHP raises a warning/notice while matching
 * @throws RuntimeException         When the preg function reports a failure
 */
public function match(string $regexp, int $flags = 0, int $offset = 0): array
{
    $pregFunction = ((\PREG_PATTERN_ORDER | \PREG_SET_ORDER) & $flags) ? 'preg_match_all' : 'preg_match';
    $pattern = $regexp.($this->ignoreCase ? 'iu' : 'u');

    // Turn engine warnings (e.g. an invalid pattern) into exceptions for the duration of the call
    set_error_handler(static function ($type, $message) {
        throw new InvalidArgumentException($message);
    });

    try {
        $outcome = $pregFunction($pattern, $this->string, $found, $flags | \PREG_UNMATCHED_AS_NULL, $offset);

        if (false === $outcome) {
            throw new RuntimeException('Matching failed with error: '.preg_last_error_msg());
        }
    } finally {
        restore_error_handler();
    }

    return $found;
}
258
|
|
|
|
259
|
|
|
/**
 * Returns a copy of the string normalized to the given form.
 *
 * @param int $form One of self::NFC, self::NFD, self::NFKC or self::NFKD
 *
 * @throws InvalidArgumentException When $form is none of the four supported forms
 */
public function normalize(int $form = self::NFC): static
{
    $supported = [self::NFC, self::NFD, self::NFKC, self::NFKD];

    if (!\in_array($form, $supported, true)) {
        throw new InvalidArgumentException('Unsupported normalization form.');
    }

    $str = clone $this;

    // Only pay for the normalization when the string is not already in the requested form
    if (!normalizer_is_normalized($str->string, $form)) {
        $str->string = normalizer_normalize($str->string, $form);
    }

    return $str;
}
270
|
|
|
|
271
|
|
|
/**
 * Pads the string on both sides up to $length, using $padStr as filler.
 *
 * @throws InvalidArgumentException When $padStr is empty or not valid UTF-8
 */
public function padBoth(int $length, string $padStr = ' '): static
{
    $usable = '' !== $padStr && preg_match('//u', $padStr);

    if (!$usable) {
        throw new InvalidArgumentException('Invalid UTF-8 string.');
    }

    // Wrap the filler in an instance of the same class so that pad() can measure and slice it
    $filler = clone $this;
    $filler->string = $padStr;

    return $this->pad($length, $filler, \STR_PAD_BOTH);
}
282
|
|
|
|
283
|
|
|
/**
 * Pads the end of the string up to $length, using $padStr as filler.
 *
 * @throws InvalidArgumentException When $padStr is empty or not valid UTF-8
 */
public function padEnd(int $length, string $padStr = ' '): static
{
    $usable = '' !== $padStr && preg_match('//u', $padStr);

    if (!$usable) {
        throw new InvalidArgumentException('Invalid UTF-8 string.');
    }

    // Wrap the filler in an instance of the same class so that pad() can measure and slice it
    $filler = clone $this;
    $filler->string = $padStr;

    return $this->pad($length, $filler, \STR_PAD_RIGHT);
}
294
|
|
|
|
295
|
|
|
/**
 * Pads the start of the string up to $length, using $padStr as filler.
 *
 * @throws InvalidArgumentException When $padStr is empty or not valid UTF-8
 */
public function padStart(int $length, string $padStr = ' '): static
{
    $usable = '' !== $padStr && preg_match('//u', $padStr);

    if (!$usable) {
        throw new InvalidArgumentException('Invalid UTF-8 string.');
    }

    // Wrap the filler in an instance of the same class so that pad() can measure and slice it
    $filler = clone $this;
    $filler->string = $padStr;

    return $this->pad($length, $filler, \STR_PAD_LEFT);
}
306
|
|
|
|
307
|
|
|
/**
 * Replaces what matches the given regular expression, always in UTF-8 ("u") mode.
 *
 * Arrays and closures are treated as callbacks (preg_replace_callback()) and their return
 * value is checked; any other value is used as a plain replacement string (preg_replace()).
 * The "i" modifier is appended when the string is in ignore-case mode.
 *
 * @throws InvalidArgumentException When the replacement (or what the callback returns) is not
 *                                  valid UTF-8, or when PHP raises a warning while replacing
 * @throws RuntimeException         When the preg function returns null
 */
public function replaceMatches(string $fromRegexp, string|callable $to): static
{
    $pattern = $fromRegexp.($this->ignoreCase ? 'iu' : 'u');

    if (\is_array($to) || $to instanceof \Closure) {
        $pregFunction = 'preg_replace_callback';
        $userCallback = $to;

        // Decorate the callback so that every value it returns is validated
        $to = static function (array $m) use ($userCallback): string {
            $replacement = $userCallback($m);

            if ('' !== $replacement && (!\is_string($replacement) || !preg_match('//u', $replacement))) {
                throw new InvalidArgumentException('Replace callback must return a valid UTF-8 string.');
            }

            return $replacement;
        };
    } elseif ('' !== $to && !preg_match('//u', $to)) {
        throw new InvalidArgumentException('Invalid UTF-8 string.');
    } else {
        $pregFunction = 'preg_replace';
    }

    // Turn engine warnings (e.g. an invalid pattern) into exceptions for the duration of the call
    set_error_handler(static function ($type, $message) {
        throw new InvalidArgumentException($message);
    });

    try {
        $replaced = $pregFunction($pattern, $to, $this->string);

        if (null === $replaced) {
            $errorCode = preg_last_error();

            // Report the failure with the name of the matching PREG_*_ERROR constant
            foreach (get_defined_constants(true)['pcre'] as $constantName => $constantValue) {
                if ($errorCode === $constantValue && str_ends_with($constantName, '_ERROR')) {
                    throw new RuntimeException('Matching failed with '.$constantName.'.');
                }
            }

            throw new RuntimeException('Matching failed with unknown error code.');
        }
    } finally {
        restore_error_handler();
    }

    $str = clone $this;
    $str->string = $replaced;

    return $str;
}
353
|
|
|
|
354
|
|
|
/**
 * Returns a copy of the string with its \X units (as matched by PCRE) in reverse order.
 */
public function reverse(): static
{
    $units = preg_split('/(\X)/u', $this->string, -1, \PREG_SPLIT_DELIM_CAPTURE | \PREG_SPLIT_NO_EMPTY);

    $str = clone $this;
    $str->string = implode('', array_reverse($units));

    return $str;
}
361
|
|
|
|
362
|
|
|
/**
 * Returns a snake_case copy of the string.
 *
 * Starts from the camel() form, inserts "_" between an upper-case run and a following
 * "Upper+lower" pair as well as between a lower-case letter or digit and an upper-case
 * letter, then lower-cases everything.
 */
public function snake(): static
{
    $str = $this->camel();

    $boundaries = [
        '/(\p{Lu}+)(\p{Lu}\p{Ll})/u',
        '/([\p{Ll}0-9])(\p{Lu})/u',
    ];
    $underscored = preg_replace($boundaries, '\1_\2', $str->string);

    $str->string = mb_strtolower($underscored, 'UTF-8');

    return $str;
}
369
|
|
|
|
370
|
|
|
/**
 * Returns a copy of the string with the character at the first word boundary title-cased,
 * or the character at every word boundary when $allWords is true.
 */
public function title(bool $allWords = false): static
{
    $toTitle = static function (array $m): string {
        return mb_convert_case($m[0], \MB_CASE_TITLE, 'UTF-8');
    };

    $str = clone $this;
    // A limit of -1 means "no limit" for preg_replace_callback()
    $str->string = preg_replace_callback('/\b./u', $toTitle, $str->string, $allWords ? -1 : 1);

    return $str;
}
380
|
|
|
|
381
|
|
|
/**
 * Title-cases the string with the "Title" transliterator of the given locale, falling
 * back to title() when no such transliterator is available.
 *
 * @param string $locale In the format language_region (e.g. tr_TR)
 */
public function localeTitle(string $locale): static
{
    $transliterator = $this->getLocaleTransliterator($locale, 'Title');

    if (null === $transliterator) {
        return $this->title();
    }

    $str = clone $this;
    $str->string = $transliterator->transliterate($str->string);

    return $str;
}
395
|
|
|
|
396
|
|
|
/**
 * Strips the given characters from both ends of the string.
 *
 * @throws InvalidArgumentException When a non-default $chars is not valid UTF-8
 */
public function trim(string $chars = " \t\n\r\0\x0B\x0C\u{A0}\u{FEFF}"): static
{
    // The default set is known to be valid, so only custom sets are checked
    if (" \t\n\r\0\x0B\x0C\u{A0}\u{FEFF}" !== $chars && !preg_match('//u', $chars)) {
        throw new InvalidArgumentException('Invalid UTF-8 chars.');
    }

    $set = preg_quote($chars);

    $str = clone $this;
    $str->string = preg_replace('{^['.$set.']++|['.$set.']++$}uD', '', $str->string);

    return $str;
}
408
|
|
|
|
409
|
|
|
/**
 * Strips the given characters from the end of the string.
 *
 * @throws InvalidArgumentException When a non-default $chars is not valid UTF-8
 */
public function trimEnd(string $chars = " \t\n\r\0\x0B\x0C\u{A0}\u{FEFF}"): static
{
    // The default set is known to be valid, so only custom sets are checked
    if (" \t\n\r\0\x0B\x0C\u{A0}\u{FEFF}" !== $chars && !preg_match('//u', $chars)) {
        throw new InvalidArgumentException('Invalid UTF-8 chars.');
    }

    $set = preg_quote($chars);

    $str = clone $this;
    $str->string = preg_replace('{['.$set.']++$}uD', '', $str->string);

    return $str;
}
421
|
|
|
|
422
|
|
|
/**
 * Removes the given prefix (or the first matching one of several) from the start of the string.
 *
 * Case-sensitive strings are handled by the parent implementation; in ignore-case mode
 * the candidates are quoted and matched with a case-insensitive regular expression.
 */
public function trimPrefix($prefix): static
{
    if (!$this->ignoreCase) {
        return parent::trimPrefix($prefix);
    }

    if ($prefix instanceof \Traversable) {
        $prefix = iterator_to_array($prefix, false);
    } elseif ($prefix instanceof parent) {
        $prefix = $prefix->string;
    }

    $alternatives = implode('|', array_map('preg_quote', (array) $prefix));

    $str = clone $this;
    $str->string = preg_replace('{^(?:'.$alternatives.')}iuD', '', $this->string);

    return $str;
}
441
|
|
|
|
442
|
|
|
/**
 * Strips the given characters from the start of the string.
 *
 * @throws InvalidArgumentException When a non-default $chars is not valid UTF-8
 */
public function trimStart(string $chars = " \t\n\r\0\x0B\x0C\u{A0}\u{FEFF}"): static
{
    // The default set is known to be valid, so only custom sets are checked
    if (" \t\n\r\0\x0B\x0C\u{A0}\u{FEFF}" !== $chars && !preg_match('//u', $chars)) {
        throw new InvalidArgumentException('Invalid UTF-8 chars.');
    }

    $set = preg_quote($chars);

    $str = clone $this;
    $str->string = preg_replace('{^['.$set.']++}uD', '', $str->string);

    return $str;
}
454
|
|
|
|
455
|
|
|
/**
 * Removes the given suffix (or the first matching one of several) from the end of the string.
 *
 * Case-sensitive strings are handled by the parent implementation; in ignore-case mode
 * the candidates are quoted and matched with a case-insensitive regular expression.
 */
public function trimSuffix($suffix): static
{
    if (!$this->ignoreCase) {
        return parent::trimSuffix($suffix);
    }

    if ($suffix instanceof \Traversable) {
        $suffix = iterator_to_array($suffix, false);
    } elseif ($suffix instanceof parent) {
        $suffix = $suffix->string;
    }

    $alternatives = implode('|', array_map('preg_quote', (array) $suffix));

    $str = clone $this;
    $str->string = preg_replace('{(?:'.$alternatives.')$}iuD', '', $this->string);

    return $str;
}
474
|
|
|
|
475
|
|
|
/**
 * Returns an upper-cased copy of the string, as computed by mb_strtoupper().
 */
public function upper(): static
{
    $uppercased = mb_strtoupper($this->string, 'UTF-8');

    $str = clone $this;
    $str->string = $uppercased;

    return $str;
}
482
|
|
|
|
483
|
|
|
/**
 * Upper-cases the string with the "Upper" transliterator of the given locale, falling
 * back to upper() when no such transliterator is available.
 *
 * @param string $locale In the format language_region (e.g. tr_TR)
 */
public function localeUpper(string $locale): static
{
    $transliterator = $this->getLocaleTransliterator($locale, 'Upper');

    if (null === $transliterator) {
        return $this->upper();
    }

    $str = clone $this;
    $str->string = $transliterator->transliterate($str->string);

    return $str;
}
497
|
|
|
|
498
|
|
|
/**
 * Returns the width of the widest line of the string, as measured by wcswidth().
 *
 * NUL, ENQ and BEL bytes are dropped and "\r\n"/"\r" are turned into "\n" first. When
 * $ignoreAnsiDecoration is true, escape sequences and control characters are removed from
 * each line before measuring; when false, control characters are removed from the whole
 * string up front and escape sequences are left in place.
 */
public function width(bool $ignoreAnsiDecoration = true): int
{
    $text = str_replace(["\x00", "\x05", "\x07"], '', $this->string);

    if (str_contains($text, "\r")) {
        $text = str_replace(["\r\n", "\r"], "\n", $text);
    }

    if (!$ignoreAnsiDecoration) {
        $text = preg_replace('/[\p{Cc}\x7F]++/u', '', $text);
    }

    $widest = 0;

    foreach (explode("\n", $text) as $line) {
        if ($ignoreAnsiDecoration) {
            $line = preg_replace('/(?:\x1B(?:
                \[ [\x30-\x3F]*+ [\x20-\x2F]*+ [\x40-\x7E]
                | [P\]X^_] .*? \x1B\\\\
                | [\x41-\x7E]
            )|[\p{Cc}\x7F]++)/xu', '', $line);
        }

        // wcswidth() may return -1, which never beats the initial 0
        $widest = max($widest, $this->wcswidth($line));
    }

    return $widest;
}
529
|
|
|
|
530
|
|
|
/**
 * Pads the string up to $len characters with repetitions of $pad.
 *
 * A clone is returned as-is when the string is already long enough. With STR_PAD_BOTH,
 * the missing length is split in two, the right side receiving the extra character when
 * it is odd.
 *
 * @param int $type One of STR_PAD_RIGHT, STR_PAD_LEFT or STR_PAD_BOTH
 *
 * @throws InvalidArgumentException When $type is none of the three constants
 */
private function pad(int $len, self $pad, int $type): static
{
    $currentLen = $this->length();

    if ($len <= $currentLen) {
        return clone $this;
    }

    $padLen = $pad->length();
    $missing = $len - $currentLen;

    // Builds $count characters of filler: whole repetitions of the pad plus a partial one
    $fill = static function (int $count) use ($pad, $padLen): string {
        $partial = $count % $padLen;

        return str_repeat($pad->string, intdiv($count, $padLen)).($partial ? $pad->slice(0, $partial) : '');
    };

    return match ($type) {
        \STR_PAD_RIGHT => $this->append($fill($missing)),
        \STR_PAD_LEFT => $this->prepend($fill($missing)),
        \STR_PAD_BOTH => $this
            ->append($fill(intdiv($missing + 1, 2)))
            ->prepend($fill(intdiv($missing, 2))),
        default => throw new InvalidArgumentException('Invalid padding type.'),
    };
}
565
|
|
|
|
566
|
|
|
/**
 * Based on https://github.com/jquast/wcwidth, a Python implementation of https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c.
 *
 * Sums the width of each code point: 0 for a fixed list of invisible characters and for
 * code points found in the "zero" table, 2 for those found in the "wide" table, 1 otherwise.
 * Both tables are loaded lazily and searched by bisection.
 *
 * @return int The computed width, or -1 as soon as a C0/C1 control character or DEL is met
 */
private function wcswidth(string $string): int
{
    // Tells whether a code point falls in one of the [first, last] ranges of a sorted table
    $inRanges = static function (array $table, int $codePoint): bool {
        $high = \count($table) - 1;

        if ($codePoint < $table[0][0] || $codePoint > $table[$high][1]) {
            return false;
        }

        $low = 0;

        while ($low <= $high) {
            $middle = intdiv($low + $high, 2);

            if ($codePoint > $table[$middle][1]) {
                $low = $middle + 1;
            } elseif ($codePoint < $table[$middle][0]) {
                $high = $middle - 1;
            } else {
                return true;
            }
        }

        return false;
    };

    $total = 0;

    foreach (preg_split('//u', $string, -1, \PREG_SPLIT_NO_EMPTY) as $char) {
        $codePoint = mb_ord($char, 'UTF-8');

        $isInvisible = 0 === $codePoint // NULL
            || 0x034F === $codePoint // COMBINING GRAPHEME JOINER
            || ($codePoint >= 0x200B && $codePoint <= 0x200F) // ZERO WIDTH SPACE to RIGHT-TO-LEFT MARK
            || 0x2028 === $codePoint // LINE SEPARATOR
            || 0x2029 === $codePoint // PARAGRAPH SEPARATOR
            || ($codePoint >= 0x202A && $codePoint <= 0x202E) // LEFT-TO-RIGHT EMBEDDING to RIGHT-TO-LEFT OVERRIDE
            || ($codePoint >= 0x2060 && $codePoint <= 0x2063) // WORD JOINER to INVISIBLE SEPARATOR
        ;

        if ($isInvisible) {
            continue;
        }

        // Non printable characters
        $isControl = $codePoint < 32 // C0 control characters
            || ($codePoint >= 0x07F && $codePoint < 0x0A0) // C1 control characters and DEL
        ;

        if ($isControl) {
            return -1;
        }

        self::$tableZero ??= require __DIR__.'/Resources/data/wcswidth_table_zero.php';

        if ($inRanges(self::$tableZero, $codePoint)) {
            continue;
        }

        self::$tableWide ??= require __DIR__.'/Resources/data/wcswidth_table_wide.php';

        $total += $inRanges(self::$tableWide, $codePoint) ? 2 : 1;
    }

    return $total;
}
635
|
|
|
|
636
|
|
|
/**
 * Returns the "<locale>-<id>" transliterator, trying the parent locale (the part before
 * the first "_") when the full locale has none.
 *
 * Lookups are memoized in self::$transliterators, failures included (stored as null).
 * Null is also returned when the \Transliterator class does not exist (intl extension
 * not loaded), so that localeLower(), localeTitle() and localeUpper() use their
 * locale-agnostic fallback instead of crashing on an unknown class.
 *
 * @param string $locale In the format language_region (e.g. tr_TR)
 * @param string $id     The transliterator kind appended to the locale ("Lower", "Title" or "Upper")
 */
private function getLocaleTransliterator(string $locale, string $id): ?\Transliterator
{
    $rule = $locale.'-'.$id;
    if (\array_key_exists($rule, self::$transliterators)) {
        return self::$transliterators[$rule];
    }

    // Without intl there is nothing to create: let the callers fall back
    if (!class_exists(\Transliterator::class)) {
        return null;
    }

    if (null !== $transliterator = self::$transliterators[$rule] = \Transliterator::create($rule)) {
        return $transliterator;
    }

    // Try to find a parent locale (nl_BE -> nl)
    if (false === $i = strpos($locale, '_')) {
        return null;
    }

    $parentRule = substr_replace($locale, '-'.$id, $i);

    // Parent locale was already cached, return and store as current locale
    if (\array_key_exists($parentRule, self::$transliterators)) {
        return self::$transliterators[$rule] = self::$transliterators[$parentRule];
    }

    // Create transliterator based on parent locale and cache the result on both initial and parent locale values
    $transliterator = \Transliterator::create($parentRule);

    return self::$transliterators[$rule] = self::$transliterators[$parentRule] = $transliterator;
}
664
|
|
|
} |
665
|
|
|
|