| 1 |  |  | <?php | 
            
                                                                                                            
                            
            
                                    
            
            
                | 2 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 3 |  |  | /** | 
            
                                                                                                            
                            
            
                                    
            
            
                | 4 |  |  | * @package   s9e\RegexpBuilder | 
            
                                                                                                            
                            
            
                                    
            
            
                | 5 |  |  | * @copyright Copyright (c) 2016 The s9e Authors | 
            
                                                                                                            
                            
            
                                    
            
            
                | 6 |  |  | * @license   http://www.opensource.org/licenses/mit-license.php The MIT License | 
            
                                                                                                            
                            
            
                                    
            
            
                | 7 |  |  | */ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 8 |  |  | namespace s9e\RegexpBuilder\Input; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 9 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 10 |  |  | use InvalidArgumentException; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 11 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 12 |  |  | class Utf8 extends BaseImplementation | 
            
                                                                                                            
                            
            
                                    
            
            
                | 13 |  |  | { | 
            
                                                                                                            
                            
            
                                    
            
            
                | 14 |  |  | 	/** | 
            
                                                                                                            
                            
            
                                    
            
            
                | 15 |  |  | 	* @var bool Whether to use surrogates to represent higher codepoints | 
            
                                                                                                            
                            
            
                                    
            
            
                | 16 |  |  | 	*/ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 17 |  |  | 	protected $useSurrogates; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 18 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 19 |  |  | 	/** | 
            
                                                                                                            
                            
            
                                    
            
            
                | 20 |  |  | 	* {@inheritdoc} | 
            
                                                                                                            
                                                                
            
                                    
            
            
                | 21 |  |  | 	*/ | 
            
                                                                        
                            
            
                                    
            
            
                | 22 | 9 |  | 	public function __construct(array $options = []) | 
            
                                                                        
                            
            
                                    
            
            
                | 23 |  |  | 	{ | 
            
                                                                        
                            
            
                                    
            
            
                | 24 | 9 |  | 		$this->useSurrogates = !empty($options['useSurrogates']); | 
            
                                                                        
                            
            
                                    
            
            
                | 25 | 9 |  | 	} | 
            
                                                                                                            
                            
            
                                    
            
            
                | 26 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 27 |  |  | 	/** | 
            
                                                                                                            
                            
            
                                    
            
            
                | 28 |  |  | 	* {@inheritdoc} | 
            
                                                                                                            
                            
            
                                    
            
            
                | 29 |  |  | 	*/ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 30 | 9 |  | 	public function split($string) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 31 |  |  | 	{ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 32 | 9 |  | 		if (preg_match_all('(.)us', $string, $matches) === false) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 33 |  |  | 		{ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 34 | 1 |  | 			throw new InvalidArgumentException('Invalid UTF-8 string'); | 
            
                                                                                                            
                            
            
                                    
            
            
                | 35 |  |  | 		} | 
            
                                                                                                            
                            
            
                                    
            
            
                | 36 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 37 | 8 |  | 		return ($this->useSurrogates) ? $this->charsToCodepointsWithSurrogates($matches[0]) : $this->charsToCodepoints($matches[0]); | 
            
                                                                                                            
                            
            
                                    
            
            
                | 38 |  |  | 	} | 
            
                                                                                                            
                            
            
                                    
            
            
                | 39 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 40 |  |  | 	/** | 
            
                                                                                                            
                            
            
                                    
            
            
                | 41 |  |  | 	* Convert a list of UTF-8 characters into a list of Unicode codepoint | 
            
                                                                                                            
                            
            
                                    
            
            
                | 42 |  |  | 	* | 
            
                                                                                                            
                            
            
                                    
            
            
                | 43 |  |  | 	* @param  string[]  $chars | 
            
                                                                                                            
                            
            
                                    
            
            
                | 44 |  |  | 	* @return integer[] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 45 |  |  | 	*/ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 46 | 6 |  | 	protected function charsToCodepoints(array $chars) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 47 |  |  | 	{ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 48 | 6 |  | 		return array_map([$this, 'cp'], $chars); | 
            
                                                                                                            
                            
            
                                    
            
            
                | 49 |  |  | 	} | 
            
                                                                                                            
                            
            
                                    
            
            
                | 50 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 51 |  |  | 	/** | 
            
                                                                                                            
                            
            
                                    
            
            
                | 52 |  |  | 	* Convert a list of UTF-8 characters into a list of Unicode codepoint with surrogates | 
            
                                                                                                            
                            
            
                                    
            
            
                | 53 |  |  | 	* | 
            
                                                                                                            
                            
            
                                    
            
            
                | 54 |  |  | 	* @param  string[]  $chars | 
            
                                                                                                            
                            
            
                                    
            
            
                | 55 |  |  | 	* @return integer[] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 56 |  |  | 	*/ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 57 | 2 |  | 	protected function charsToCodepointsWithSurrogates(array $chars) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 58 |  |  | 	{ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 59 | 2 |  | 		$codepoints = []; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 60 | 2 |  | 		foreach ($chars as $char) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 61 |  |  | 		{ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 62 | 2 |  | 			$cp = $this->cp($char); | 
            
                                                                                                            
                            
            
                                    
            
            
                | 63 | 2 |  | 			if ($cp < 0x10000) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 64 |  |  | 			{ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 65 | 2 |  | 				$codepoints[] = $cp; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 66 |  |  | 			} | 
            
                                                                                                            
                            
            
                                    
            
            
                | 67 |  |  | 			else | 
            
                                                                                                            
                            
            
                                    
            
            
                | 68 |  |  | 			{ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 69 | 1 |  | 				$codepoints[] = 0xD7C0 + ($cp >> 10); | 
            
                                                                                                            
                            
            
                                    
            
            
                | 70 | 2 |  | 				$codepoints[] = 0xDC00 + ($cp & 0x3FF); | 
            
                                                                                                            
                            
            
                                    
            
            
                | 71 |  |  | 			} | 
            
                                                                                                            
                            
            
                                    
            
            
                | 72 |  |  | 		} | 
            
                                                                                                            
                            
            
                                    
            
            
                | 73 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 74 | 2 |  | 		return $codepoints; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 75 |  |  | 	} | 
            
                                                                                                            
                            
            
                                    
            
            
                | 76 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 77 |  |  | 	/** | 
            
                                                                                                            
                            
            
                                    
            
            
                | 78 |  |  | 	* Compute and return the Unicode codepoint for given UTF-8 char | 
            
                                                                                                            
                            
            
                                    
            
            
                | 79 |  |  | 	* | 
            
                                                                                                            
                            
            
                                    
            
            
                | 80 |  |  | 	* @param  string  $char UTF-8 char | 
            
                                                                                                            
                            
            
                                    
            
            
                | 81 |  |  | 	* @return integer | 
            
                                                                                                            
                            
            
                                    
            
            
                | 82 |  |  | 	*/ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 83 | 7 |  | 	protected function cp($char) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 84 |  |  | 	{ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 85 | 7 |  | 		$cp = ord($char[0]); | 
            
                                                                                                            
                            
            
                                    
            
            
                | 86 | 7 |  | 		if ($cp >= 0xF0) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 87 |  |  | 		{ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 88 | 3 |  | 			$cp = ($cp << 18) + (ord($char[1]) << 12) + (ord($char[2]) << 6) + ord($char[3]) - 0x3C82080; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 89 |  |  | 		} | 
            
                                                                                                            
                            
            
                                    
            
            
                | 90 | 7 |  | 		elseif ($cp >= 0xE0) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 91 |  |  | 		{ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 92 | 1 |  | 			$cp = ($cp << 12) + (ord($char[1]) << 6) + ord($char[2]) - 0xE2080; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 93 |  |  | 		} | 
            
                                                                                                            
                            
            
                                    
            
            
                | 94 | 6 |  | 		elseif ($cp >= 0xC0) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 95 |  |  | 		{ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 96 | 2 |  | 			$cp = ($cp << 6) + ord($char[1]) - 0x3080; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 97 |  |  | 		} | 
            
                                                                                                            
                            
            
                                    
            
            
                | 98 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 99 | 7 |  | 		return $cp; | 
            
                                                                                                            
                                                                
            
                                    
            
            
                | 100 |  |  | 	} | 
            
                                                        
            
                                    
            
            
                | 101 |  |  | } |