ConvertHelper_ControlCharacters::getCharsAsHex()   B
last analyzed

Complexity

Conditions 8
Paths 20

Size

Total Lines 44
Code Lines 21

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 1 Features 0
Metric Value
eloc 21
c 1
b 1
f 0
dl 0
loc 44
rs 8.4444
cc 8
nc 20
nop 0
1
<?php
2
/**
3
 * File containing the {@see AppUtils\ConvertHelper_ControlCharacters} class.
4
 *
5
 * @package Application Utils
6
 * @subpackage ConvertHelper
7
 * @see AppUtils\ConvertHelper_ControlCharacters
8
 */
9
10
declare(strict_types=1);
11
12
namespace AppUtils;
13
14
/**
 * Control characters management class.
 *
 * Holds the list of unicode code points that are considered
 * problematic control characters, exposes them in several
 * notations (HEX, UTF-8, JSON) and can strip them from strings.
 *
 * The expanded character list and the regular expression built
 * from it are cached in static properties, so they are computed
 * only once per request.
 *
 * @package Application Utils
 * @subpackage ConvertHelper
 * @author Sebastian Mordziol <[email protected]>
 */
class ConvertHelper_ControlCharacters
{
    public const ERROR_MALFORMATTED_STRING = 53801;

   /**
    * Inclusive code point ranges, in the notation `START-END`
    * with hexadecimal code points. A single code point without
    * a dash is accepted as well.
    *
    * Tab (0009), line feed (000A), vertical tab (000B),
    * form feed (000C) and carriage return (000D) are
    * not part of the list.
    *
    * @var string[]
    */
    protected static $controlChars = array(
        '0000-0008', // control chars
        '000E-000F', // control chars
        '0010-001F', // control chars
        '2000-200F', // typographic spaces, zero-width characters, LRM/RLM marks
    );

   /**
    * Cached regular expression matching all control
    * characters. Populated by the constructor.
    *
    * @var string|NULL
    */
    protected static $controlCharsRegex;

   /**
    * Hexadecimal digits. No longer used internally, retained
    * because the property is protected and subclasses may
    * rely on it.
    *
    * @var string[]
    */
    protected static $hexAlphabet = array('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F');

   /**
    * Cached result of {@see getCharsAsHex()}.
    *
    * @var string[]|NULL
    */
    protected static $charsAsHex;

   /**
    * Builds the shared regular expression on first instantiation.
    */
    public function __construct()
    {
        if (isset(self::$controlCharsRegex))
        {
            return;
        }

        // The notation \x{0000} specifies a unicode code point
        // in a PCRE character class (requires the "u" modifier).
        $stack = array();

        foreach ($this->getCharsAsHex() as $char)
        {
            $stack[] = '\x{'.$char.'}';
        }

        self::$controlCharsRegex = '/['.implode('', $stack).']/u';
    }

   /**
    * Retrieves the HEX character codes for all control
    * characters that the {@link stripControlCharacters()}
    * method will remove.
    *
    * Every configured range is expanded numerically, so a
    * range may span any number of code points. The codes are
    * returned as upper case, zero padded, four digit strings
    * (e.g. "000E"), in the order the ranges are declared.
    *
    * @return string[]
    */
    public function getCharsAsHex() : array
    {
        if (isset(self::$charsAsHex))
        {
            return self::$charsAsHex;
        }

        $stack = array();

        foreach (self::$controlChars as $range)
        {
            $tokens = explode('-', $range, 2);

            $first = (int)hexdec($tokens[0]);

            // An entry without a dash denotes a single code point.
            $last = isset($tokens[1]) ? (int)hexdec($tokens[1]) : $first;

            for ($code = $first; $code <= $last; $code++)
            {
                $stack[] = sprintf('%04X', $code);
            }
        }

        self::$charsAsHex = $stack;

        return $stack;
    }

   /**
    * Retrieves an array of all control characters that
    * the {@link stripControlCharacters()} method will
    * remove, as the actual UTF-8 characters.
    *
    * Each entry is the UTF-8 byte sequence of one code
    * point, e.g. "\xE2\x80\x8B" for U+200B.
    *
    * @return string[]
    */
    public function getCharsAsUTF8() : array
    {
        $result = array();

        foreach ($this->getCharsAsHex() as $char)
        {
            // hex2bin() would only yield the raw two bytes of the
            // code point number, which is not its UTF-8 encoding.
            $result[] = (string)mb_chr((int)hexdec($char), 'UTF-8');
        }

        return $result;
    }

   /**
    * Retrieves all control characters as JSON encoded
    * characters, e.g. "\u200b".
    *
    * @return string[]
    */
    public function getCharsAsJSON() : array
    {
        $result = array();

        foreach ($this->getCharsAsHex() as $char)
        {
            $result[] = '\u'.strtolower($char);
        }

        return $result;
    }

   /**
    * Removes all control characters from the specified string
    * that can cause problems in some cases, like creating
    * valid XML documents. This includes invisible spaces
    * and zero-width characters.
    *
    * If the string contains invalid UTF-8, it is converted with
    * {@see ConvertHelper::string2utf8()} and the replacement is
    * attempted a second time.
    *
    * @param string $string
    * @return string
    * @throws ConvertHelper_Exception {@see self::ERROR_MALFORMATTED_STRING}
    *         if the string cannot be processed even after the conversion.
    * @see https://stackoverflow.com/a/8171868/2298192
    * @see https://unicode-table.com/en
    */
    public function stripControlCharacters(string $string) : string
    {
        if ($string === '')
        {
            return $string;
        }

        $result = preg_replace(self::$controlCharsRegex, '', $string);

        if ($result !== null)
        {
            return (string)$result;
        }

        // preg_replace returns null if the text contains invalid
        // UTF-8: try to repair the string, then retry once.
        $string = ConvertHelper::string2utf8($string);

        $result = preg_replace(self::$controlCharsRegex, '', $string);

        if ($result === null)
        {
            throw new ConvertHelper_Exception(
                'Cannot strip control characters: malformatted string encountered.',
                'preg_replace returned null, which happens when a string contains broken unicode characters. Tried to fix the string, but this did not help.',
                self::ERROR_MALFORMATTED_STRING
            );
        }

        return (string)$result;
    }
}
200