1
|
|
|
<?php |
2
|
|
|
/** |
3
|
|
|
* File containing the {@see AppUtils\ConvertHelper_ControlCharacters} class. |
4
|
|
|
* |
5
|
|
|
* @package Application Utils |
6
|
|
|
* @subpackage ConvertHelper |
7
|
|
|
* @see AppUtils\ConvertHelper_ControlCharacters |
8
|
|
|
*/ |
9
|
|
|
|
10
|
|
|
declare(strict_types=1); |
11
|
|
|
|
12
|
|
|
namespace AppUtils; |
13
|
|
|
|
14
|
|
|
/** |
15
|
|
|
* Control characters management class. |
16
|
|
|
* |
17
|
|
|
* @package Application Utils |
18
|
|
|
* @subpackage ConvertHelper |
19
|
|
|
* @author Sebastian Mordziol <[email protected]> |
20
|
|
|
*/ |
21
|
|
|
class ConvertHelper_ControlCharacters
{
    public const ERROR_MALFORMATTED_STRING = 53801;

    /**
     * Inclusive ranges of unicode code points to strip, written
     * in hexadecimal notation as `START-END`. An entry without
     * the `-END` part is treated as a single code point.
     *
     * Code points 0009 to 000D (tab, line feed, vertical tab,
     * form feed, carriage return) are not covered by any range
     * and are therefore left untouched.
     *
     * NOTE(review): 000B and 000C are not valid XML 1.0 characters
     * either; confirm whether leaving them in is intentional.
     *
     * @var string[]
     */
    protected static $controlChars = array(
        '0000-0008', // C0 control characters, NUL up to backspace
        '000E-000F', // shift out / shift in
        '0010-001F', // remaining C0 control characters
        '2000-200F', // typographic spaces, zero-width characters, directional marks
    );

    /**
     * Cached regular expression matching every character to strip.
     * Built on first use, then shared by all instances.
     *
     * @var string|NULL
     */
    protected static $controlCharsRegex;

    /**
     * Hexadecimal digits in ascending order.
     *
     * Not used by this class anymore; retained for backward
     * compatibility in case subclasses reference it.
     *
     * @var string[]
     */
    protected static $hexAlphabet = array('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F');

    /**
     * Cached result of {@see getCharsAsHex()}.
     *
     * @var string[]|NULL
     */
    protected static $charsAsHex;

    /**
     * Makes sure the shared regular expression is available
     * as soon as the first instance has been created.
     */
    public function __construct()
    {
        $this->getControlCharsRegex();
    }

    /**
     * Returns the regular expression used to strip the control
     * characters, creating and caching it on the first call.
     *
     * Building it here instead of only in the constructor means
     * that {@see stripControlCharacters()} also works if a subclass
     * constructor does not call the parent constructor.
     *
     * @return string
     */
    protected function getControlCharsRegex() : string
    {
        if (!isset(self::$controlCharsRegex)) {
            // The \x{0000} notation specifies a unicode code point
            // in the regular expression; it requires the "u" modifier.
            $escaped = array();

            foreach ($this->getCharsAsHex() as $hex) {
                $escaped[] = '\x{'.$hex.'}';
            }

            self::$controlCharsRegex = '/['.implode('', $escaped).']/u';
        }

        return self::$controlCharsRegex;
    }

    /**
     * Retrieves the HEX character codes for all control
     * characters that the {@link stripControlCharacters()}
     * method will remove.
     *
     * Every code is an uppercase hexadecimal string, zero-padded
     * to at least four digits (e.g. "000E", "200B").
     *
     * @return string[]
     */
    public function getCharsAsHex() : array
    {
        if (isset(self::$charsAsHex)) {
            return self::$charsAsHex;
        }

        $result = array();

        foreach (self::$controlChars as $range) {
            $bounds = explode('-', $range, 2);
            $first = (int)hexdec($bounds[0]);

            // An entry without an end value describes a single code point.
            $last = isset($bounds[1]) ? (int)hexdec($bounds[1]) : $first;

            // Iterating numerically works for ranges of any length,
            // not only those that stay within one block of 16 values.
            for ($codePoint = $first; $codePoint <= $last; $codePoint++) {
                $result[] = sprintf('%04X', $codePoint);
            }
        }

        self::$charsAsHex = $result;

        return $result;
    }

    /**
     * Retrieves an array of all control characters that
     * the {@link stripControlCharacters()} method will
     * remove, as the actual UTF-8 characters.
     *
     * The entries are in the same order as those returned
     * by {@see getCharsAsHex()}.
     *
     * @return string[]
     */
    public function getCharsAsUTF8() : array
    {
        $result = array();

        foreach ($this->getCharsAsHex() as $hex) {
            // hex2bin() would only produce the raw bytes of the hex
            // string ("200B" => 0x20 0x0B), not the UTF-8 encoding of
            // the code point, so the code point is encoded explicitly.
            $result[] = self::codePointToUTF8((int)hexdec($hex));
        }

        return $result;
    }

    /**
     * Retrieves all control characters as JSON encoded
     * characters, e.g. "\u200b".
     *
     * @return string[]
     */
    public function getCharsAsJSON() : array
    {
        $result = array();

        foreach ($this->getCharsAsHex() as $hex) {
            $result[] = '\u'.strtolower($hex);
        }

        return $result;
    }

    /**
     * Removes all control characters from the specified string
     * that can cause problems in some cases, like creating
     * valid XML documents. This includes invisible characters
     * like the zero-width space.
     *
     * If the replacement fails, the string is passed through
     * ConvertHelper::string2utf8() and the replacement is
     * attempted a second time before giving up.
     *
     * @param string $string
     * @return string
     * @throws ConvertHelper_Exception With the code {@see ConvertHelper_ControlCharacters::ERROR_MALFORMATTED_STRING}
     *         if the replacement still fails after the conversion attempt.
     * @see https://stackoverflow.com/a/8171868/2298192
     * @see https://unicode-table.com/en
     */
    public function stripControlCharacters(string $string) : string
    {
        if ($string === '') {
            return $string;
        }

        $regex = $this->getControlCharsRegex();

        $result = preg_replace($regex, '', $string);

        if ($result !== null) {
            return $result;
        }

        // preg_replace returns null on error, which can happen
        // if the text contains invalid UTF8: try to repair it.
        $result = preg_replace($regex, '', ConvertHelper::string2utf8($string));

        if ($result === null) {
            throw new ConvertHelper_Exception(
                'Cannot strip control characters: malformatted string encountered.',
                'preg_replace returned null, which happens when a string contains broken unicode characters. Tried to fix the string, but this did not help.',
                self::ERROR_MALFORMATTED_STRING
            );
        }

        return $result;
    }

    /**
     * Encodes a single unicode code point as a UTF-8 byte sequence.
     *
     * Implemented with plain byte operations so that the class does
     * not depend on any optional PHP extension.
     *
     * @param int $codePoint Code point in the range 0 to 0x10FFFF.
     * @return string One to four bytes.
     */
    protected static function codePointToUTF8(int $codePoint) : string
    {
        if ($codePoint < 0x80) {
            return chr($codePoint);
        }

        if ($codePoint < 0x800) {
            return chr(0xC0 | ($codePoint >> 6))
                .chr(0x80 | ($codePoint & 0x3F));
        }

        if ($codePoint < 0x10000) {
            return chr(0xE0 | ($codePoint >> 12))
                .chr(0x80 | (($codePoint >> 6) & 0x3F))
                .chr(0x80 | ($codePoint & 0x3F));
        }

        return chr(0xF0 | ($codePoint >> 18))
            .chr(0x80 | (($codePoint >> 12) & 0x3F))
            .chr(0x80 | (($codePoint >> 6) & 0x3F))
            .chr(0x80 | ($codePoint & 0x3F));
    }
}
200
|
|
|
|