Utf8   A
last analyzed

Complexity

Total Complexity 12

Size/Duplication

Total Lines 88
Duplicated Lines 0 %

Test Coverage

Coverage 100%

Importance

Changes 2
Bugs 0 Features 0
Metric Value
wmc 12
eloc 24
c 2
b 0
f 0
dl 0
loc 88
ccs 27
cts 27
cp 1
rs 10

5 Methods

Rating   Name   Duplication   Size   Complexity  
A __construct() 0 3 1
A charsToCodepointsWithSurrogates() 0 18 3
A cp() 0 17 4
A charsToCodepoints() 0 3 1
A split() 0 8 3
1
<?php declare(strict_types=1);
2
3
/**
4
* @package   s9e\RegexpBuilder
5
* @copyright Copyright (c) 2016-2022 The s9e authors
6
* @license   http://www.opensource.org/licenses/mit-license.php The MIT License
7
*/
8
namespace s9e\RegexpBuilder\Input;
9
10
use InvalidArgumentException;
11
12
class Utf8 extends BaseImplementation
{
	/**
	* @var bool Whether codepoints at or above 0x10000 are emitted as a pair of surrogates
	*/
	protected $useSurrogates;

	/**
	* {@inheritdoc}
	*
	* The only option read here is "useSurrogates", which is evaluated for truthiness
	*
	* @param array $options
	*/
	public function __construct(array $options = [])
	{
		// A missing or null entry counts as false, anything else is cast to bool
		$this->useSurrogates = (bool) ($options['useSurrogates'] ?? false);
	}

	/**
	* {@inheritdoc}
	*
	* Breaks the string into individual characters using a PCRE pattern in UTF-8 mode, then
	* converts each character to its codepoint, optionally expanded into surrogates
	*
	* @param  string $string Original string
	* @return int[]          List of codepoints
	*
	* @throws InvalidArgumentException if preg_match_all() reports a failure on the string
	*/
	public function split(string $string): array
	{
		// The "u" flag makes "." match a whole UTF-8 character, the "s" flag lets it match newlines
		$matched = preg_match_all('(.)us', $string, $matches);
		if ($matched === false)
		{
			throw new InvalidArgumentException('Invalid UTF-8 string');
		}

		$chars = $matches[0];
		if ($this->useSurrogates)
		{
			return $this->charsToCodepointsWithSurrogates($chars);
		}

		return $this->charsToCodepoints($chars);
	}

	/**
	* Map every UTF-8 character of a list to its Unicode codepoint
	*
	* @param  string[] $chars List of UTF-8 characters
	* @return int[]           One codepoint per character, under the same keys
	*/
	protected function charsToCodepoints(array $chars): array
	{
		$codepoints = array_map([$this, 'cp'], $chars);

		return $codepoints;
	}

	/**
	* Map every UTF-8 character of a list to its Unicode codepoint, using surrogates
	*
	* Characters whose codepoint is below 0x10000 produce a single value. Any other character
	* produces two values: a high surrogate followed by a low surrogate.
	*
	* @param  string[] $chars List of UTF-8 characters
	* @return int[]           Flat list of codepoints and surrogates
	*/
	protected function charsToCodepointsWithSurrogates(array $chars): array
	{
		$values = [];
		foreach ($chars as $char)
		{
			$codepoint = $this->cp($char);
			if ($codepoint >= 0x10000)
			{
				// 0xD7C0 is 0xD800 - (0x10000 >> 10): it removes the 0x10000 offset and adds
				// the base of the high surrogates range in one step
				$values[] = 0xD7C0 + ($codepoint >> 10);
				$values[] = 0xDC00 + ($codepoint & 0x3FF);

				continue;
			}

			$values[] = $codepoint;
		}

		return $values;
	}

	/**
	* Decode a single UTF-8 character into its Unicode codepoint
	*
	* The length of the sequence is derived from the lead byte alone and the bytes are not
	* validated here. Each subtracted constant is the sum of the marker bits of the bytes
	* involved, shifted to their position, e.g. 0x3080 is (0xC0 << 6) + 0x80.
	*
	* @param  string $char UTF-8 char
	* @return int          Codepoint
	*/
	protected function cp(string $char): int
	{
		$lead = ord($char[0]);
		if ($lead < 0xC0)
		{
			// Single byte, the value is used as-is
			return $lead;
		}

		if ($lead < 0xE0)
		{
			// Two bytes
			return ($lead << 6) + ord($char[1]) - 0x3080;
		}

		if ($lead < 0xF0)
		{
			// Three bytes
			return ($lead << 12) + (ord($char[1]) << 6) + ord($char[2]) - 0xE2080;
		}

		// Four bytes
		return ($lead << 18) + (ord($char[1]) << 12) + (ord($char[2]) << 6) + ord($char[3]) - 0x3C82080;
	}
}