layershifter /
TLDExtract
This project does not seem to handle request data directly; as such, no vulnerable execution paths were found.
include, or for example
via PHP's auto-loading mechanism.
These results are based on our legacy PHP analysis, consider migrating to our new PHP analysis engine instead. Learn more
| 1 | <?php |
||
| 2 | /** |
||
| 3 | * TLDExtract: Library for extraction of domain parts e.g. TLD. Domain parser that uses Public Suffix List. |
||
| 4 | * |
||
| 5 | * @link https://github.com/layershifter/TLDExtract |
||
| 6 | * |
||
| 7 | * @copyright Copyright (c) 2016, Alexander Fedyashov |
||
| 8 | * @license https://raw.githubusercontent.com/layershifter/TLDExtract/master/LICENSE Apache 2.0 License |
||
| 9 | */ |
||
| 10 | |||
| 11 | namespace LayerShifter\TLDExtract; |
||
| 12 | |||
| 13 | use LayerShifter\TLDDatabase\Store; |
||
| 14 | use LayerShifter\TLDExtract\Exceptions\RuntimeException; |
||
| 15 | use LayerShifter\TLDSupport\Helpers\Arr; |
||
| 16 | use LayerShifter\TLDSupport\Helpers\IP; |
||
| 17 | use LayerShifter\TLDSupport\Helpers\Str; |
||
| 18 | |||
/**
 * Extract class accurately extracts subdomain, domain and TLD components from URLs.
 *
 * Typical flow: {@see Extract::parse()} reduces the URL to a bare hostname, looks the hostname up in the
 * Public Suffix List store and hands the three resulting parts to the configured result class.
 *
 * @see Result for more information on the returned data structure.
 */
class Extract
{

    /**
     * @const int If this option provided, extract will consider ICANN suffixes (the constant keeps its
     *            historical "ICCAN" spelling for backward compatibility).
     */
    const MODE_ALLOW_ICCAN = 2;
    /**
     * @const int If this option provided, extract will consider private suffixes.
     */
    const MODE_ALLOW_PRIVATE = 4;
    /**
     * @const int If this option provided, extract will consider custom domains, i.e. the last label of a
     *            hostname is used as suffix when nothing matches in the database.
     */
    const MODE_ALLOW_NOT_EXISTING_SUFFIXES = 8;
    /**
     * @const string RFC 3986 compliant scheme regex pattern. Matches an optional "scheme:" followed by "//".
     *
     * @see https://tools.ietf.org/html/rfc3986#section-3.1
     */
    const SCHEMA_PATTERN = '#^([a-zA-Z][a-zA-Z0-9+\-.]*:)?//#';

    /**
     * @var int Value of extraction options (bitmask of MODE_* constants).
     */
    private $extractionMode;
    /**
     * @var string Name of class that will store results of parsing.
     */
    private $resultClassName;
    /**
     * @var Store Object of TLDDatabase\Store class.
     */
    private $suffixStore;

    /**
     * Factory constructor.
     *
     * @param null|string $databaseFile    Optional, name of file with Public Suffix List database
     * @param null|string $resultClassName Optional, name of class that will store results of parsing
     * @param null|int    $extractionMode  Optional, option that will control extraction process
     *
     * @throws RuntimeException When the result class is not defined, does not implement ResultInterface,
     *                          or when the extraction mode is invalid
     */
    public function __construct($databaseFile = null, $resultClassName = null, $extractionMode = null)
    {
        $this->suffixStore = new Store($databaseFile);
        $this->resultClassName = Result::class;

        // A custom result class is accepted only if it exists and implements ResultInterface.

        if (null !== $resultClassName) {
            if (!class_exists($resultClassName)) {
                throw new RuntimeException(sprintf('Class "%s" is not defined', $resultClassName));
            }

            if (!in_array(ResultInterface::class, class_implements($resultClassName), true)) {
                throw new RuntimeException(sprintf('Class "%s" not implements ResultInterface', $resultClassName));
            }

            $this->resultClassName = $resultClassName;
        }

        $this->setExtractionMode($extractionMode);
    }

    /**
     * Sets extraction mode, option that will control extraction process.
     *
     * Passing null enables every MODE_* option.
     *
     * @param null|int $extractionMode One of MODE_* constants or a bitwise combination of them
     *
     * @throws RuntimeException When the value is not an integer or is not a combination of MODE_* constants
     */
    public function setExtractionMode($extractionMode = null)
    {
        $allModes = static::MODE_ALLOW_ICCAN
            | static::MODE_ALLOW_PRIVATE
            | static::MODE_ALLOW_NOT_EXISTING_SUFFIXES;

        if (null === $extractionMode) {
            $this->extractionMode = $allModes;

            return;
        }

        if (!is_int($extractionMode)) {
            throw new RuntimeException('Invalid argument type, extractionMode must be integer');
        }

        // Valid values are the non-empty subsets of the MODE_* flags: at least one flag must be set and
        // no bit outside of the known flags may be present.

        if ($extractionMode <= 0 || ($extractionMode & ~$allModes) !== 0) {
            throw new RuntimeException(
                'Invalid argument type, extractionMode must be one of defined constants of their combination'
            );
        }

        $this->extractionMode = $extractionMode;
    }

    /**
     * Extract the subdomain, host and gTLD/ccTLD components from a URL.
     *
     * @param string $url URL that will be extracted
     *
     * @return ResultInterface
     */
    public function parse($url)
    {
        $hostname = $this->extractHostname($url);

        // If received hostname is valid IP address, result will be formed from it: the address becomes
        // the host, subdomain and suffix stay empty.

        if (IP::isValid($hostname)) {
            return new $this->resultClassName(null, $hostname, null);
        }

        list($subDomain, $host, $suffix) = $this->extractParts($hostname);

        return new $this->resultClassName($subDomain, $host, $suffix);
    }

    /**
     * Method that extracts the hostname or IP address from a URL.
     *
     * @param string $url URL for extraction
     *
     * @return null|string Hostname or IP address, null when nothing is left after stripping
     */
    private function extractHostname($url)
    {
        $url = trim(Str::lower($url));

        // Removes scheme i.e. "https://github.com/layershifter" to "github.com/layershifter".

        $url = preg_replace(static::SCHEMA_PATTERN, '', $url);

        // Removes path, query and fragment part of URL i.e. "github.com/layershifter" to "github.com".

        $url = $this->fixQueryPart($url);
        $hostname = Arr::first(explode('/', $url, 2));

        // Removes user information from URL i.e. "user@github.com" to "github.com".

        $hostname = Arr::last(explode('@', $hostname));

        // Remove ports from hosts, also check for IPv6 literals like "[3ffe:2a00:100:7031::1]".
        //
        // @see http://www.ietf.org/rfc/rfc2732.txt

        $lastBracketPosition = Str::strrpos($hostname, ']');

        if ($lastBracketPosition !== false && Str::startsWith($hostname, '[')) {
            // Returns the text between the brackets, whatever follows "]" (the port) is dropped.
            return Str::substr($hostname, 1, $lastBracketPosition - 1);
        }

        // This is either a normal hostname or an IPv4 address, just remove the port.

        $hostname = Arr::first(explode(':', $hostname));

        // If string is empty, null will be returned.

        return '' === $hostname ? null : $hostname;
    }

    /**
     * Extracts subdomain, host and suffix from input string. Based on algorithm described in
     * https://publicsuffix.org/list/.
     *
     * @param string $hostname Hostname for extraction
     *
     * @return array|string[] An array that contains subdomain, host and suffix (in that order); parts that
     *                        are absent are null.
     */
    public function extractParts($hostname)
    {
        $suffix = $this->extractSuffix($hostname);

        // The whole input is a suffix itself: it is reported as host without subdomain and suffix.

        if ($suffix === $hostname) {
            return [null, $hostname, null];
        }

        // Cuts the suffix together with the dot in front of it.

        if (null !== $suffix) {
            $hostname = Str::substr($hostname, 0, -Str::length($suffix) - 1);
        }

        $lastDot = Str::strrpos($hostname, '.');

        // No dots left means there is no subdomain.

        if (false === $lastDot) {
            return [null, $hostname, $suffix];
        }

        // Everything before the last dot is the subdomain, the last label is the host.

        $subDomain = Str::substr($hostname, 0, $lastDot);
        $host = Str::substr($hostname, $lastDot + 1);

        return [
            $subDomain,
            $host,
            $suffix
        ];
    }

    /**
     * Extracts suffix from hostname using Public Suffix List database.
     *
     * Punycoded hostnames ("xn--") are converted to their Unicode form for the database lookup and the
     * found suffix is converted back to punycode, so the returned suffix is in the same form as the input.
     *
     * @param string $hostname Hostname for extraction
     *
     * @return null|string Suffix, or null when hostname is invalid or no suffix is allowed by current mode
     */
    private function extractSuffix($hostname)
    {
        // If hostname has leading dot, it's invalid.
        // If hostname is a single label domain, it's invalid.

        if (Str::startsWith($hostname, '.') || Str::strpos($hostname, '.') === false) {
            return null;
        }

        // If domain is in punycode, it will be converted to IDN. The UTS #46 variant is requested
        // explicitly because relying on the default variant is deprecated on PHP 7.2 and 7.3.

        $isPunycoded = Str::strpos($hostname, 'xn--') !== false;

        if ($isPunycoded) {
            $decodedHostname = idn_to_utf8($hostname, IDNA_DEFAULT, INTL_IDNA_VARIANT_UTS46);

            if (false === $decodedHostname) {
                // Malformed punycode cannot be decoded: the lookup continues with the ASCII form and
                // no conversion back is needed.
                $isPunycoded = false;
            } else {
                $hostname = $decodedHostname;
            }
        }

        $suffix = $this->parseSuffix($hostname);

        if (null === $suffix) {
            if (!($this->extractionMode & static::MODE_ALLOW_NOT_EXISTING_SUFFIXES)) {
                return null;
            }

            // Nothing matched in database, the last label is used as suffix.

            $suffix = Str::substr($hostname, Str::strrpos($hostname, '.') + 1);
        }

        if (!$isPunycoded) {
            return $suffix;
        }

        // If domain is punycoded, suffix will be converted to punycode. A failed conversion is reported
        // as "no suffix" instead of leaking boolean false to the caller.

        $encodedSuffix = idn_to_ascii($suffix, IDNA_DEFAULT, INTL_IDNA_VARIANT_UTS46);

        return false === $encodedSuffix ? null : $encodedSuffix;
    }

    /**
     * Finds the suffix of hostname by matching its trailing labels against the Public Suffix List database.
     *
     * Candidates are tried from the longest (whole hostname) to the shortest (last label); for every
     * candidate an exception rule ("!"), an exact rule and a wildcard rule ("*.") are checked in that order
     * and the first match wins.
     *
     * @param string $hostname Hostname for extraction
     *
     * @return null|string Matched suffix or null when no rule matches
     */
    private function parseSuffix($hostname)
    {
        $hostnameParts = explode('.', $hostname);
        $realSuffix = null;

        for ($i = 0, $count = count($hostnameParts); $i < $count; $i++) {
            $possibleSuffix = implode('.', array_slice($hostnameParts, $i));
            $parentSuffix = implode('.', array_slice($hostnameParts, $i + 1));

            // Exception rule: the candidate itself is not a suffix, the suffix is one label shorter.

            if ($this->suffixExists('!' . $possibleSuffix)) {
                $realSuffix = $parentSuffix;

                break;
            }

            // Exact rule.

            if ($this->suffixExists($possibleSuffix)) {
                $realSuffix = $possibleSuffix;

                break;
            }

            // Wildcard rule: any label in front of the parent forms a suffix.

            if ($this->suffixExists('*.' . $parentSuffix)) {
                $realSuffix = $possibleSuffix;

                break;
            }
        }

        return $realSuffix;
    }

    /**
     * Method that checks existence of entry in Public Suffix List database, including provided options.
     *
     * An entry counts as existing only when its type is enabled by the current extraction mode.
     *
     * @param string $entry Entry for check in Public Suffix List database
     *
     * @return bool
     */
    private function suffixExists($entry)
    {
        if (!$this->suffixStore->isExists($entry)) {
            return false;
        }

        $type = $this->suffixStore->getType($entry);

        if (($this->extractionMode & static::MODE_ALLOW_ICCAN) && $type === Store::TYPE_ICCAN) {
            return true;
        }

        return ($this->extractionMode & static::MODE_ALLOW_PRIVATE) && $type === Store::TYPE_PRIVATE;
    }

    /**
     * Fixes URL from "github.com?layershifter" to "github.com/?layershifter" and from
     * "github.com#layershifter" to "github.com/#layershifter", so that splitting by "/" separates the
     * hostname from query and fragment.
     *
     * @see https://github.com/layershifter/TLDExtract/issues/5
     *
     * @param string $url
     *
     * @return string
     */
    private function fixQueryPart($url)
    {
        // Keeps only positions that were actually found (Str::strpos() gives false when there is no match).

        $positions = array_filter(
            [Str::strpos($url, '?'), Str::strpos($url, '#')],
            'is_int'
        );

        if ([] === $positions) {
            return $url;
        }

        // The first delimiter ends the hostname; $position is guaranteed to be an integer here.

        $position = min($positions);

        return Str::substr($url, 0, $position) . '/' . Str::substr($url, $position);
    }
}
||
| 351 |
If a method or function can return multiple different values and unless you are sure that you only can receive a single value in this context, we recommend to add an additional type check:
If this a common case that PHP Analyzer should handle natively, please let us know by opening an issue.