| 1 |  |  | <?php | 
            
                                                                                                            
                            
            
                                    
            
            
                | 2 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 3 |  |  | /** | 
            
                                                                                                            
                            
            
                                    
            
            
                | 4 |  |  |  * @file | 
            
                                                                                                            
                            
            
                                    
            
            
                | 5 |  |  |  * Transliteration processing functions. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 6 |  |  |  */ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 7 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 8 |  |  | /** | 
            
                                                                                                            
                            
            
                                    
            
            
                | 9 |  |  |  * Transliterates UTF-8 encoded text to US-ASCII. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 10 |  |  |  * | 
            
                                                                                                            
                            
            
                                    
            
            
                | 11 |  |  |  * Based on MediaWiki's UtfNormal::quickIsNFCVerify(). | 
            
                                                                                                            
                            
            
                                    
            
            
                | 12 |  |  |  * Based on Drupal 7 transliteration module. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 13 |  |  |  * | 
            
                                                                                                            
                            
            
                                    
            
            
                | 14 |  |  |  * @param $string | 
            
                                                                                                            
                            
            
                                    
            
            
                | 15 |  |  |  *   UTF-8 encoded text input. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 16 |  |  |  * @param $unknown | 
            
                                                                                                            
                            
            
                                    
            
            
                | 17 |  |  |  *   Replacement string for characters that do not have a suitable ASCII | 
            
                                                                                                            
                            
            
                                    
            
            
                | 18 |  |  |  *   equivalent. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 19 |  |  |  * @param $source_langcode | 
            
                                                                                                            
                            
            
                                    
            
            
                | 20 |  |  |  *   Optional ISO 639 language code that denotes the language of the input and | 
            
                                                                                                            
                            
            
                                    
            
            
                | 21 |  |  |  *   is used to apply language-specific variations. If the source language is | 
            
                                                                                                            
                            
            
                                    
            
            
                | 22 |  |  |  *   not known at the time of transliteration, it is recommended to set this | 
            
                                                                                                            
                            
            
                                    
            
            
                | 23 |  |  |  *   argument to the site default language to produce consistent results. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 24 |  |  |  *   Otherwise the current display language will be used. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 25 |  |  |  * @return | 
            
                                                                                                            
                            
            
                                    
            
            
                | 26 |  |  |  *   Transliterated text. | 
            
                                                                                                            
                                                                
            
                                    
            
            
                | 27 |  |  |  */ | 
            
                                                                        
                            
            
                                    
            
            
                | 28 |  |  | function _transliteration_process($string, $unknown = '?', $source_langcode = NULL) { | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                        
                            
            
                                    
            
            
                | 29 |  |  |   // ASCII is always valid NFC! If we're only ever given plain ASCII, we can | 
            
                                                                        
                            
            
                                    
            
            
                | 30 |  |  |   // avoid the overhead of initializing the decomposition tables by skipping | 
            
                                                                        
                            
            
                                    
            
            
                | 31 |  |  |   // out early. | 
            
                                                                        
                            
            
                                    
            
            
                | 32 | 8 |  |   if (!preg_match('/[\x80-\xff]/', $string)) { | 
            
                                                                        
                            
            
                                    
            
            
                | 33 | 7 |  |     return $string; | 
            
                                                                        
                            
            
                                    
            
            
                | 34 |  |  |   } | 
            
                                                                        
                            
            
                                    
            
            
                | 35 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 36 | 8 |  |   static $tail_bytes; | 
            
                                                                        
                            
            
                                    
            
            
                | 37 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 38 | 8 |  |   if (!isset($tail_bytes)) { | 
            
                                                                        
                            
            
                                    
            
            
                | 39 |  |  |     // Each UTF-8 head byte is followed by a certain number of tail bytes. | 
            
                                                                        
                            
            
                                    
            
            
                | 40 | 1 |  |     $tail_bytes = array(); | 
            
                                                                        
                            
            
                                    
            
            
                | 41 | 1 |  |     for ($n = 0; $n < 256; $n++) { | 
            
                                                                        
                            
            
                                    
            
            
                | 42 | 1 |  |       if ($n < 0xc0) { | 
            
                                                                        
                            
            
                                    
            
            
                | 43 | 1 |  |         $remaining = 0; | 
            
                                                                        
                            
            
                                    
            
            
                | 44 | 1 |  |       } | 
            
                                                                        
                            
            
                                    
            
            
                | 45 | 1 |  |       elseif ($n < 0xe0) { | 
            
                                                                        
                            
            
                                    
            
            
                | 46 | 1 |  |         $remaining = 1; | 
            
                                                                        
                            
            
                                    
            
            
                | 47 | 1 |  |       } | 
            
                                                                        
                            
            
                                    
            
            
                | 48 | 1 |  |       elseif ($n < 0xf0) { | 
            
                                                                        
                            
            
                                    
            
            
                | 49 | 1 |  |         $remaining = 2; | 
            
                                                                        
                            
            
                                    
            
            
                | 50 | 1 |  |       } | 
            
                                                                        
                            
            
                                    
            
            
                | 51 | 1 |  |       elseif ($n < 0xf8) { | 
            
                                                                        
                            
            
                                    
            
            
                | 52 | 1 |  |         $remaining = 3; | 
            
                                                                        
                            
            
                                    
            
            
                | 53 | 1 |  |       } | 
            
                                                                        
                            
            
                                    
            
            
                | 54 | 1 |  |       elseif ($n < 0xfc) { | 
            
                                                                        
                            
            
                                    
            
            
                | 55 | 1 |  |         $remaining = 4; | 
            
                                                                        
                            
            
                                    
            
            
                | 56 | 1 |  |       } | 
            
                                                                        
                            
            
                                    
            
            
                | 57 | 1 |  |       elseif ($n < 0xfe) { | 
            
                                                                        
                            
            
                                    
            
            
                | 58 | 1 |  |         $remaining = 5; | 
            
                                                                        
                            
            
                                    
            
            
                | 59 | 1 |  |       } | 
            
                                                                        
                            
            
                                    
            
            
                | 60 |  |  |       else { | 
            
                                                                        
                            
            
                                    
            
            
                | 61 | 1 |  |         $remaining = 0; | 
            
                                                                        
                            
            
                                    
            
            
                | 62 |  |  |       } | 
            
                                                                        
                            
            
                                    
            
            
                | 63 | 1 |  |       $tail_bytes[chr($n)] = $remaining; | 
            
                                                                        
                            
            
                                    
            
            
                | 64 | 1 |  |     } | 
            
                                                                        
                            
            
                                    
            
            
                | 65 | 1 |  |   } | 
            
                                                                        
                            
            
                                    
            
            
                | 66 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 67 |  |  |   // Chop the text into pure-ASCII and non-ASCII areas; large ASCII parts can | 
            
                                                                        
                            
            
                                    
            
            
                | 68 |  |  |   // be handled much more quickly. Don't chop up Unicode areas for punctuation, | 
            
                                                                        
                            
            
                                    
            
            
                | 69 |  |  |   // though; that wastes energy. | 
            
                                                                        
                            
            
                                    
            
            
                | 70 | 8 |  |   preg_match_all('/[\x00-\x7f]+|[\x80-\xff][\x00-\x40\x5b-\x5f\x7b-\xff]*/', $string, $matches); | 
            
                                                                        
                            
            
                                    
            
            
                | 71 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 72 | 8 |  |   $result = ''; | 
            
                                                                        
                            
            
                                    
            
            
                | 73 | 8 |  |   foreach ($matches[0] as $str) { | 
            
                                                                        
                            
            
                                    
            
            
                | 74 | 8 |  |     if ($str[0] < "\x80") { | 
            
                                                                        
                            
            
                                    
            
            
                | 75 |  |  |       // ASCII chunk: guaranteed to be valid UTF-8 and in normal form C, so | 
            
                                                                        
                            
            
                                    
            
            
                | 76 |  |  |       // skip over it. | 
            
                                                                        
                            
            
                                    
            
            
                | 77 | 7 |  |       $result .= $str; | 
            
                                                                        
                            
            
                                    
            
            
                | 78 | 7 |  |       continue; | 
            
                                                                        
                            
            
                                    
            
            
                | 79 |  |  |     } | 
            
                                                                        
                            
            
                                    
            
            
                | 80 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 81 |  |  |     // We'll have to examine the chunk byte by byte to ensure that it consists | 
            
                                                                        
                            
            
                                    
            
            
                | 82 |  |  |     // of valid UTF-8 sequences, and to see if any of them might not be | 
            
                                                                        
                            
            
                                    
            
            
                | 83 |  |  |     // normalized. | 
            
                                                                        
                            
            
                                    
            
            
                | 84 |  |  |     // | 
            
                                                                        
                            
            
                                    
            
            
                | 85 |  |  |     // Since PHP is not the fastest language on earth, some of this code is a | 
            
                                                                        
                            
            
                                    
            
            
                | 86 |  |  |     // little ugly with inner loop optimizations. | 
            
                                                                        
                            
            
                                    
            
            
                | 87 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 88 | 8 |  |     $head = ''; | 
            
                                                                        
                            
            
                                    
            
            
                | 89 | 8 |  |     $chunk = strlen($str); | 
            
                                                                        
                            
            
                                    
            
            
                | 90 |  |  |     // Counting down is faster. I'm *so* sorry. | 
            
                                                                        
                            
            
                                    
            
            
                | 91 | 8 |  |     $len = $chunk + 1; | 
            
                                                                        
                            
            
                                    
            
            
                | 92 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 93 | 8 |  |     for ($i = -1; --$len; ) { | 
            
                                                                        
                            
            
                                    
            
            
                | 94 | 8 |  |       $c = $str[++$i]; | 
            
                                                                        
                            
            
                                    
            
            
                | 95 | 8 |  |       if ($remaining = $tail_bytes[$c]) { | 
            
                                                                        
                            
            
                                    
            
            
                | 96 |  |  |         // UTF-8 head byte! | 
            
                                                                        
                            
            
                                    
            
            
                | 97 | 8 |  |         $sequence = $head = $c; | 
            
                                                                        
                            
            
                                    
            
            
                | 98 |  |  |         do { | 
            
                                                                        
                            
            
                                    
            
            
                | 99 |  |  |           // Look for the defined number of tail bytes... | 
            
                                                                        
                            
            
                                    
            
            
                | 100 | 8 |  |           if (--$len && ($c = $str[++$i]) >= "\x80" && $c < "\xc0") { | 
            
                                                                        
                            
            
                                    
            
            
                | 101 |  |  |             // Legal tail bytes are nice. | 
            
                                                                        
                            
            
                                    
            
            
                | 102 | 8 |  |             $sequence .= $c; | 
            
                                                                        
                            
            
                                    
            
            
                | 103 | 8 |  |           } | 
            
                                                                        
                            
            
                                    
            
            
                | 104 |  |  |           else { | 
            
                                                                        
                            
            
                                    
            
            
                | 105 |  |  |             if ($len == 0) { | 
            
                                                                        
                            
            
                                    
            
            
                | 106 |  |  |               // Premature end of string! Drop a replacement character into | 
            
                                                                        
                            
            
                                    
            
            
                | 107 |  |  |               // output to represent the invalid UTF-8 sequence. | 
            
                                                                        
                            
            
                                    
            
            
                | 108 |  |  |               $result .= $unknown; | 
            
                                                                        
                            
            
                                    
            
            
                | 109 |  |  |               break 2; | 
            
                                                                        
                            
            
                                    
            
            
                | 110 |  |  |             } | 
            
                                                                        
                            
            
                                    
            
            
                | 111 |  |  |             else { | 
            
                                                                        
                            
            
                                    
            
            
                | 112 |  |  |               // Illegal tail byte; abandon the sequence. | 
            
                                                                        
                            
            
                                    
            
            
                | 113 |  |  |               $result .= $unknown; | 
            
                                                                        
                            
            
                                    
            
            
                | 114 |  |  |               // Back up and reprocess this byte; it may itself be a legal | 
            
                                                                        
                            
            
                                    
            
            
                | 115 |  |  |               // ASCII or UTF-8 sequence head. | 
            
                                                                        
                            
            
                                    
            
            
                | 116 |  |  |               --$i; | 
            
                                                                        
                            
            
                                    
            
            
                | 117 |  |  |               ++$len; | 
            
                                                                        
                            
            
                                    
            
            
                | 118 |  |  |               continue 2; | 
            
                                                                        
                            
            
                                    
            
            
                | 119 |  |  |             } | 
            
                                                                        
                            
            
                                    
            
            
                | 120 |  |  |           } | 
            
                                                                        
                            
            
                                    
            
            
                | 121 | 8 |  |         } while (--$remaining); | 
            
                                                                        
                            
            
                                    
            
            
                | 122 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 123 | 8 |  |         $n = ord($head); | 
            
                                                                        
                            
            
                                    
            
            
                | 124 | 8 |  |         if ($n <= 0xdf) { | 
            
                                                                        
                            
            
                                    
            
            
                | 125 | 8 |  |           $ord = ($n - 192) * 64 + (ord($sequence[1]) - 128); | 
            
                                                                        
                            
            
                                    
            
            
                | 126 | 8 |  |         } | 
            
                                                                        
                            
            
                                    
            
            
                | 127 |  |  |         elseif ($n <= 0xef) { | 
            
                                                                        
                            
            
                                    
            
            
                | 128 |  |  |           $ord = ($n - 224) * 4096 + (ord($sequence[1]) - 128) * 64 + (ord($sequence[2]) - 128); | 
            
                                                                        
                            
            
                                    
            
            
                | 129 |  |  |         } | 
            
                                                                        
                            
            
                                    
            
            
                | 130 |  |  |         elseif ($n <= 0xf7) { | 
            
                                                                        
                            
            
                                    
            
            
                | 131 |  |  |           $ord = ($n - 240) * 262144 + (ord($sequence[1]) - 128) * 4096 + (ord($sequence[2]) - 128) * 64 + (ord($sequence[3]) - 128); | 
            
                                                                        
                            
            
                                    
            
            
                | 132 |  |  |         } | 
            
                                                                        
                            
            
                                    
            
            
                | 133 |  |  |         elseif ($n <= 0xfb) { | 
            
                                                                        
                            
            
                                    
            
            
                | 134 |  |  |           $ord = ($n - 248) * 16777216 + (ord($sequence[1]) - 128) * 262144 + (ord($sequence[2]) - 128) * 4096 + (ord($sequence[3]) - 128) * 64 + (ord($sequence[4]) - 128); | 
            
                                                                        
                            
            
                                    
            
            
                | 135 |  |  |         } | 
            
                                                                        
                            
            
                                    
            
            
                | 136 |  |  |         elseif ($n <= 0xfd) { | 
            
                                                                        
                            
            
                                    
            
            
                | 137 |  |  |           $ord = ($n - 252) * 1073741824 + (ord($sequence[1]) - 128) * 16777216 + (ord($sequence[2]) - 128) * 262144 + (ord($sequence[3]) - 128) * 4096 + (ord($sequence[4]) - 128) * 64 + (ord($sequence[5]) - 128); | 
            
                                                                        
                            
            
                                    
            
            
                | 138 |  |  |         } else { | 
            
                                                                        
                            
            
                                    
            
            
                | 139 |  |  |           $ord = $n; | 
            
                                                                        
                            
            
                                    
            
            
                | 140 |  |  |         } | 
            
                                                                        
                            
            
                                    
            
            
                | 141 | 8 |  |         $result .= _transliteration_replace($ord, $unknown, $source_langcode); | 
            
                                                                        
                            
            
                                    
            
            
                | 142 | 8 |  |         $head = ''; | 
            
                                                                        
                            
            
                                    
            
            
                | 143 | 8 |  |       } elseif ($c < "\x80") { | 
            
                                                                        
                            
            
                                    
            
            
                | 144 |  |  |         // ASCII byte. | 
            
                                                                        
                            
            
                                    
            
            
                | 145 |  |  |         $result .= $c; | 
            
                                                                        
                            
            
                                    
            
            
                | 146 |  |  |         $head = ''; | 
            
                                                                        
                            
            
                                    
            
            
                | 147 |  |  |       } elseif ($c < "\xc0") { | 
            
                                                                        
                            
            
                                    
            
            
                | 148 |  |  |         // Illegal tail bytes. | 
            
                                                                        
                            
            
                                    
            
            
                | 149 |  |  |         if ($head == '') { | 
            
                                                                        
                            
            
                                    
            
            
                | 150 |  |  |           $result .= $unknown; | 
            
                                                                        
                            
            
                                    
            
            
                | 151 |  |  |         } | 
            
                                                                        
                            
            
                                    
            
            
                | 152 |  |  |       } else { | 
            
                                                                        
                            
            
                                    
            
            
                | 153 |  |  |         // Miscellaneous freaks. | 
            
                                                                        
                            
            
                                    
            
            
                | 154 |  |  |         $result .= $unknown; | 
            
                                                                        
                            
            
                                    
            
            
                | 155 |  |  |         $head = ''; | 
            
                                                                        
                            
            
                                    
            
            
                | 156 |  |  |       } | 
            
                                                                        
                            
            
                                    
            
            
                | 157 | 8 |  |     } | 
            
                                                                        
                            
            
                                    
            
            
                | 158 | 8 |  |   } | 
            
                                                                        
                            
            
                                    
            
            
                | 159 | 8 |  |   return $result; | 
            
                                                                        
                            
            
                                    
            
            
                | 160 |  |  | } | 
            
                                                                                                            
                            
            
                                    
            
            
                | 161 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                /**
                 * Replaces a Unicode character using the transliteration database.
                 *
                 * Mapping tables are loaded on demand, one data file per bank of 256 code
                 * points, and kept in a static cache keyed by bank and language so each
                 * file is included at most once per bank/language pair.
                 *
                 * @param $ord
                 *   An ordinal Unicode character code.
                 * @param $unknown
                 *   Replacement string for characters that do not have a suitable ASCII
                 *   equivalent.
                 * @param $langcode
                 *   Optional ISO 639 language code that denotes the language of the input and
                 *   is used to apply language-specific variations. When omitted (NULL), no
                 *   language-specific variant is applied and only the base mapping is used.
                 * @return
                 *   The ASCII replacement for the character, or $unknown when the character
                 *   has no entry in the mapping (or its bank has no data file).
                 */
                function _transliteration_replace($ord, $unknown = '?', $langcode = NULL) {
                  static $map = array();

                  // Normalize the language to a string cache key. NULL maps to '' explicitly
                  // instead of relying on PHP's implicit null-to-'' array key coercion.
                  $lang_key = isset($langcode) ? (string) $langcode : '';

                  // The high bits select the 256-character bank (one data file per bank).
                  $bank = $ord >> 8;

                  if (!isset($map[$bank][$lang_key])) {
                    // NOTE: this path is relative to the current working directory.
                    $file = './resources/transliteration-data/' . sprintf('x%02x', $bank) . '.php';
                    $table = array();
                    if (file_exists($file)) {
                      // The included data file is expected to populate $base and, optionally,
                      // $variant; both are pre-initialized so a sparse file is harmless.
                      $base = array();
                      $variant = array();
                      include $file;
                      if ($lang_key !== 'en' && isset($variant[$lang_key])) {
                        // Merge in language specific mappings; variant entries take
                        // precedence over the base ones.
                        $table = $variant[$lang_key] + $base;
                      }
                      else {
                        $table = $base;
                      }
                    }
                    // Missing banks are cached as an empty table so the file system is not
                    // probed again for the same bank and language.
                    $map[$bank][$lang_key] = $table;
                  }

                  // The low byte is the character's index inside its bank.
                  $index = $ord & 255;

                  return isset($map[$bank][$lang_key][$index]) ? $map[$bank][$lang_key][$index] : $unknown;
                }
            
                                                        
            
                                    
            
            
                | 213 |  |  |  | 
            
                        
Learn more about camelCase.