Issues (84)

src/URLInfo/URIParser.php (4 issues)

1
<?php
2
/**
3
 * @package Application Utils
4
 * @subpackage URLInfo
5
 * @see \AppUtils\URLInfo\URIParser
6
 */
7
8
declare(strict_types=1);
9
10
namespace AppUtils\URLInfo;
11
12
use AppUtils\ClassHelper;
13
use AppUtils\ConvertHelper;
14
use AppUtils\URLInfo\Parser\BaseURLTypeDetector;
15
use AppUtils\URLInfo\Parser\BaseURLValidator;
16
use AppUtils\URLInfo\Parser\ParsedInfoFilter;
17
use AppUtils\URLInfo\Parser\URLTypeDetector\DetectEmail;
18
use AppUtils\URLInfo\Parser\URLTypeDetector\DetectFragmentLink;
19
use AppUtils\URLInfo\Parser\URLTypeDetector\DetectIPAddress;
20
use AppUtils\URLInfo\Parser\URLTypeDetector\DetectPhoneLink;
21
use AppUtils\URLInfo\Parser\URLTypeDetector\DetectStandardURL;
22
use AppUtils\URLInfo\Parser\URLValidator\ValidateHostIsPresent;
23
use AppUtils\URLInfo\Parser\URLValidator\ValidateIsTypeKnown;
24
use AppUtils\URLInfo\Parser\URLValidator\ValidateSchemeIsKnown;
25
use AppUtils\URLInfo\Parser\URLValidator\ValidateSchemeIsSet;
26
27
/**
28
 * Handles the URL parsing, as replacement for PHP's 
29
 * native parse_url function. It overcomes a number of
30
 * limitations of the function, using pre- and post-processing
31
 * of the URL and its component parts.
32
 *
33
 * @package Application Utils
34
 * @subpackage URLInfo
35
 * @author Sebastian Mordziol <[email protected]>
36
 */
37
class URIParser
38
{
39
    use URLInfoTrait;
40
41
    protected string $url;
42
    protected bool $isValid = false;
43
    protected bool $encodeUTF = false;
44
45
   /**
46
    * @var array{code:int,message:string}|NULL
47
    */
48
    protected ?array $error = null;
49
    
50
   /**
51
    * Stores a list of all unicode characters in the URL
52
    * that have been filtered out before parsing it with
53
    * parse_url.
54
    * 
55
    * @var array<string,string>
56
    */
57
    protected array $unicodeChars = array();
58
59
    /**
60
     * @var class-string<BaseURLTypeDetector>[]
0 ignored issues
show
Documentation Bug introduced by
The doc comment class-string[] at position 0 could not be parsed: Unknown type name 'class-string' at position 0 in class-string[].
Loading history...
61
     */
62
    private static array $detectorClasses = array(
63
        DetectEmail::class,
64
        DetectFragmentLink::class,
65
        DetectPhoneLink::class,
66
        DetectIPAddress::class,
67
        DetectStandardURL::class
68
    );
69
70
    /**
71
     * @var class-string<BaseURLValidator>[]
0 ignored issues
show
Documentation Bug introduced by
The doc comment class-string[] at position 0 could not be parsed: Unknown type name 'class-string' at position 0 in class-string[].
Loading history...
72
     */
73
    private static array $validatorClasses = array(
74
        ValidateIsTypeKnown::class,
75
        ValidateSchemeIsSet::class,
76
        ValidateSchemeIsKnown::class,
77
        ValidateHostIsPresent::class
78
    );
79
80
    /**
     * Stores the URL and the encoding preference, then immediately
     * runs the three processing stages in order: parsing, type
     * detection and validation.
     *
     * @param string $url The target URL.
     * @param bool $encodeUTF Whether to URL encode any plain text unicode characters.
     */
    public function __construct(string $url, bool $encodeUTF)
    {
        $this->encodeUTF = $encodeUTF;
        $this->url = $url;

        $this->parse();
        $this->detectType();
        $this->validate();
    }
94
95
   /**
    * Returns the URL information array: the parse_url result
    * after filtering, as further adjusted by the type detectors
    * and validators that ran on it.
    *
    * @return array<string,mixed>
    */
    public function getInfo() : array
    {
        return $this->info;
    }
105
106
    /**
     * Parses the URL into the info array.
     *
     * The URL is first run through filterUnicodeChars(), then handed
     * to parse_url. If parse_url fails, fixBrokenURL() is applied and
     * parsing is attempted a second time. The result (or an empty
     * array if parsing still failed) is passed through filterParsed().
     * Finally, unless UTF encoding was requested, any unicode
     * characters that were encoded beforehand are restored.
     *
     * @return void
     */
    protected function parse() : void
    {
        $this->filterUnicodeChars();

        $parsed = parse_url($this->url);

        // parse_url could not handle the URL: attempt a repair,
        // and give it a second try.
        if($parsed === false)
        {
            $this->fixBrokenURL();
            $parsed = parse_url($this->url);
        }

        $this->info = is_array($parsed) ? $parsed : array();

        $this->filterParsed();

        // Nothing to restore if the encoded characters are to be
        // kept, or if none were encoded in the first place.
        if($this->encodeUTF || empty($this->unicodeChars))
        {
            return;
        }

        // The URL contained unicode characters, and we do not
        // want them URL encoded: restore them.
        $this->info = $this->restoreUnicodeChars($this->info);
    }
133
134
    /**
     * Tries to fix broken URLs by checking for common user mistakes.
     *
     * Currently handles a single mistake: three or more slashes
     * directly after the first colon (e.g. `http:///host`), which
     * are reduced to exactly two. Slash sequences anywhere else in
     * the URL are left untouched.
     *
     * @return void
     */
    private function fixBrokenURL() : void
    {
        // Split at the first colon only, so that the remainder of
        // the URL is handled as a single string.
        $parts = explode(':', $this->url, 2);

        // No colon at all: nothing to fix.
        if(count($parts) < 2) {
            return;
        }

        // Only act when the part after the colon starts with
        // too many slashes.
        if(strpos($parts[1], '///') !== 0) {
            return;
        }

        // Collapse only the leading run of slashes. A blanket
        // str_replace would also rewrite any `///` found later
        // in the URL.
        $this->url = $parts[0].'://'.ltrim($parts[1], '/');
    }
155
156
   /**
    * Walks through the URL character by character, and URL
    * encodes every letter (as matched by `\p{L}`) whose encoded
    * form differs from the character itself, before the URL is
    * passed to parse_url.
    *
    * Each replaced character is remembered in the unicode chars
    * list, keyed by its encoded form, so that it can be restored
    * after parsing.
    *
    * @return void
    */
    protected function filterUnicodeChars() : void
    {
        $filtered = '';

        foreach(ConvertHelper::string2array($this->url) as $character)
        {
            $replacement = $character;

            if(preg_match('/\p{L}/uix', $character))
            {
                $replacement = rawurlencode($character);

                // Plain ASCII letters come back unchanged from
                // the encoding, and need no bookkeeping.
                if($replacement !== $character)
                {
                    $this->unicodeChars[$replacement] = $character;
                }
            }

            $filtered .= $replacement;
        }

        $this->url = $filtered;
    }
185
186
    /**
     * Runs the registered type detectors one after the other,
     * stopping at the first one that reports a match.
     *
     * The info array of every detector that runs is adopted,
     * whether it matched or not. On a match, the URL is flagged
     * as valid.
     *
     * @return bool Whether any of the detectors matched.
     */
    protected function detectType() : bool
    {
        foreach(self::$detectorClasses as $detectorClass)
        {
            $instance = new $detectorClass($this);

            $detector = ClassHelper::requireObjectInstanceOf(
                BaseURLTypeDetector::class,
                $instance
            );

            $matched = $detector->detect();

            // Use the adjusted data, even without a match
            $this->info = $detector->getInfo();

            if(!$matched) {
                continue;
            }

            $this->isValid = true;
            return true;
        }

        return false;
    }
208
209
    /**
     * Runs the registered validators one after the other.
     *
     * The info array of every validator that runs is adopted.
     * As soon as one of them does not return exactly `true`,
     * the URL is flagged as invalid and the remaining validators
     * are skipped. The URL is flagged as valid only if all of
     * them pass.
     *
     * @return void
     */
    protected function validate() : void
    {
        foreach(self::$validatorClasses as $validatorClass)
        {
            $instance = new $validatorClass($this);

            $validator = ClassHelper::requireObjectInstanceOf(
                BaseURLValidator::class,
                $instance
            );

            $passed = $validator->validate() === true;

            // Use the adjusted data, regardless of the outcome
            $this->info = $validator->getInfo();

            if(!$passed) {
                $this->isValid = false;
                return;
            }
        }

        $this->isValid = true;
    }
230
231
   /**
    * Hands the current URL and info array to a ParsedInfoFilter
    * instance, and replaces the info array with the filtered
    * result. This is meant to recover from user errors in
    * formatting, mostly regarding stray spaces.
    *
    * @return void
    */
    protected function filterParsed() : void
    {
        $filter = new ParsedInfoFilter($this->url, $this->info);

        $this->info = $filter->filter();
    }
240
    
241
   /**
    * Recursively goes through the array, and replaces all previously
    * URL encoded characters with their unicode character counterparts,
    * in both keys and values.
    *
    * Only strings are processed. Values that are neither arrays nor
    * strings, as well as integer keys, are copied over unchanged:
    * restoreUnicodeChar() only accepts strings, and with strict types
    * enabled, passing it anything else would raise a TypeError.
    *
    * @param array<string,mixed> $subject
    * @return array<string,mixed>
    */
    protected function restoreUnicodeChars(array $subject) : array
    {
        $result = array();

        foreach($subject as $key => $val)
        {
            if(is_array($val))
            {
                $val = $this->restoreUnicodeChars($val);
            }
            elseif(is_string($val))
            {
                $val = $this->restoreUnicodeChar($val);
            }

            // PHP converts numeric string keys to integers,
            // so keys are not guaranteed to be strings either.
            if(is_string($key))
            {
                $key = $this->restoreUnicodeChar($key);
            }

            $result[$key] = $val;
        }

        return $result;
    }
270
    
271
   /**
    * Replaces every URL encoded sequence that was recorded in
    * the unicode chars list with its original character.
    * Strings without any percent sign are returned as is.
    *
    * @param string $string
    * @return string
    */
    protected function restoreUnicodeChar(string $string) : string
    {
        // Without a percent sign, there cannot be anything to restore.
        if(strpos($string, '%') === false)
        {
            return $string;
        }

        $encoded = array_keys($this->unicodeChars);
        $originals = array_values($this->unicodeChars);

        return str_replace($encoded, $originals, $string);
    }
287
288
    /**
     * Stores the given error details, and flags the URL
     * as invalid.
     *
     * @param int $code
     * @param string $message
     * @return void
     */
    public function setError(int $code, string $message) : void
    {
        $this->error = array(
            'code' => $code,
            'message' => $message
        );

        $this->isValid = false;
    }
297
   
298
   /**
    * Tells whether the parsed URL is currently flagged
    * as valid.
    *
    * @return bool
    */
    public function isValid() : bool
    {
        return $this->isValid;
    }
306
307
   /**
    * Retrieves the message of the stored error, if any.
    *
    * @return string The error message, or an empty string if no error message is stored.
    */
    public function getErrorMessage() : string
    {
        $message = $this->error['message'] ?? '';

        return $message;
    }
317
    
318
   /**
    * Retrieves the code of the stored error, if any.
    *
    * @return int The error code, or `-1` if no error code is stored.
    */
    public function getErrorCode() : int
    {
        $code = $this->error['code'] ?? -1;

        return $code;
    }
328
}
329