 eddy8    /
                    LightCMS
                      eddy8    /
                    LightCMS
                
                            | 1 | <?php | ||
| 2 | |||
| 3 | namespace App\Foundation; | ||
| 4 | |||
/**
 * Trie-based sensitive-word matcher.
 *
 * Based on https://github.com/whiteCcinn/tire-php, with the following fixes:
 *  - a sensitive word that starts right where a failed partial match ends is
 *    now detected (the walk restarts from the root with the current character)
 *  - ASCII characters are restored correctly when a match is rebuilt
 *  - 2-byte and 4-byte UTF-8 characters are converted to and from their real
 *    Unicode code points (previously everything non-ASCII was treated as a
 *    3-byte sequence)
 *  - adding an empty word no longer marks the trie root as a complete word
 *
 * Trie keys for 1-byte and 3-byte characters are unchanged. A tree that was
 * built by an earlier version and contains 2-byte or 4-byte characters must
 * be rebuilt, because those characters now map to different keys.
 */
class Tire
{
    /**
     * Marker placed in front of every code point in an encoded match,
     * e.g. `\u25935\u24863`. This is a literal backslash followed by "u".
     */
    const CODE_PREFIX = '\u';

    /**
     * Nested arrays keyed by code point; a node that completes a word
     * carries the extra key 'end'.
     */
    public $tree = [];

    /**
     * Encoded match => hit counter. Filled by seek() and never cleared,
     * so it accumulates across calls on the same instance.
     */
    public $indexCode = [];

    /** Value of the $statistics flag passed to the most recent seek() call. */
    private $statistics = false;

    /**
     * Registers a sensitive word in the trie.
     *
     * Words that contain no decodable character (for example the empty
     * string) are ignored, so the trie root is never flagged as a word end.
     *
     * @param string $word UTF-8 encoded word
     * @return $this
     */
    public function add(string $word)
    {
        $node = &$this->tree;
        $hasCharacters = false;

        foreach ($this->split($word) as $character) {
            // Re-bind $node to the child so the next character nests below it.
            $node = &$this->insertNode($node, $this->utf8TransformAscii($character));
            $hasCharacters = true;
        }

        if ($hasCharacters) {
            $node['end'] = true;
        }

        return $this;
    }

    /**
     * Returns (by reference) the child of $tree stored under $node,
     * creating an empty child first when it does not exist yet.
     *
     * @param array $tree current trie node
     * @param int   $node code point used as key
     * @return array
     */
    private function &insertNode(&$tree, $node)
    {
        if (!isset($tree[$node])) {
            $tree[$node] = [];
        }

        return $tree[$node];
    }

    /**
     * Scans $text for registered words.
     *
     * A word is appended to the result only when exist() has not recorded it
     * before; because $indexCode is kept between calls, a word reported by an
     * earlier seek() on this instance is not reported again. Every reported
     * word is then scanned again, minus its first character, to pick up
     * registered words nested inside it.
     *
     * @param string $text       UTF-8 text to scan
     * @param bool   $statistics when true, repeated hits increment the counters
     *                           exposed by statistics()
     * @param bool   $first      when false, the first character of $text is
     *                           skipped (used by the nested re-scan)
     * @return string[] words reported by this call, in discovery order
     */
    public function seek(string $text, $statistics = false, $first = true)
    {
        $match = [];
        $sensitive = '';
        $this->statistics = $statistics;
        $tree = &$this->tree;

        foreach ($this->split($text) as $k => $word) {
            if (!$first && !$k) {
                continue;
            }

            $tree = &$this->beginFind($tree, $this->utf8TransformAscii($word), $sensitive);

            // The current node completes a registered word.
            if (isset($tree['end']) && !$this->exist($sensitive, $statistics)) {
                $match[] = $this->asciiTransformUtf8($sensitive);
            }
        }

        // foreach iterates over the words found above; results of the nested
        // scans are appended but not iterated again at this level.
        foreach ($match as $words) {
            $match = array_merge($match, $this->seek($words, $statistics, false));
        }

        return $match;
    }

    /**
     * Advances the trie walk by one code point.
     *
     * Order of attempts: continue below the current node; otherwise restart
     * from the root with this code point; otherwise reset to the root.
     * $prefix is kept in step: extended, restarted, or cleared respectively.
     *
     * @param array  $tree   current trie node
     * @param int    $node   code point of the current character
     * @param string $prefix encoded characters matched so far (updated in place)
     * @return array the node to continue from, returned by reference
     */
    private function &beginFind(&$tree, $node, &$prefix = '')
    {
        if (isset($tree[$node])) {
            $prefix .= self::CODE_PREFIX . $node;
            return $tree[$node];
        }

        // The partial match failed, but this character may start a new word.
        if (isset($this->tree[$node])) {
            $prefix = self::CODE_PREFIX . $node;
            return $this->tree[$node];
        }

        $prefix = '';
        return $this->tree;
    }

    /**
     * Returns the hit counters collected so far, keyed by the decoded word.
     *
     * @return array|false word => count, or false when the most recent
     *                     seek() call was made without $statistics
     */
    public function statistics()
    {
        if (!$this->statistics) {
            return false;
        }

        $counts = [];
        foreach ($this->indexCode as $sensitive => $count) {
            $counts[$this->asciiTransformUtf8($sensitive)] = $count;
        }

        return $counts;
    }

    /**
     * Records a hit for an encoded word.
     *
     * @param string $sensitive  encoded word (CODE_PREFIX + code point, repeated)
     * @param bool   $statistics whether to count repeated hits
     * @return bool true when the word had already been recorded
     */
    private function exist($sensitive, $statistics = false)
    {
        if (!isset($this->indexCode[$sensitive])) {
            $this->indexCode[$sensitive] = 1;
            return false;
        }

        if ($statistics) {
            $this->indexCode[$sensitive]++;
        }

        return true;
    }

    /**
     * Converts a single UTF-8 character to its Unicode code point.
     *
     * Despite the method name the result is not limited to ASCII.
     *
     * @param string $utf8_str one UTF-8 encoded character (1 to 4 bytes)
     * @return int code point; 0 for an empty string
     */
    public function utf8TransformAscii($utf8_str)
    {
        $utf8_str = (string) $utf8_str;
        $length = strlen($utf8_str);
        if ($length === 0) {
            return 0;
        }

        $lead = ord($utf8_str[0]);
        if ($lead <= 0x7F) {
            return $lead;
        }

        // 1111 0xxx: four-byte sequence
        if (($lead & 0xF8) === 0xF0 && $length >= 4) {
            return (($lead & 0x07) << 18)
                | ((ord($utf8_str[1]) & 0x3F) << 12)
                | ((ord($utf8_str[2]) & 0x3F) << 6)
                | (ord($utf8_str[3]) & 0x3F);
        }

        // 110x xxxx: two-byte sequence
        if (($lead & 0xE0) === 0xC0 && $length >= 2) {
            return (($lead & 0x1F) << 6) | (ord($utf8_str[1]) & 0x3F);
        }

        // Three-byte sequence. Malformed or truncated input also ends up here
        // and is handled the way this method always did: missing bytes count
        // as zero.
        $second = $length > 1 ? ord($utf8_str[1]) : 0;
        $third = $length > 2 ? ord($utf8_str[2]) : 0;

        return (($lead & 0x0F) << 12) | (($second & 0x3F) << 6) | ($third & 0x3F);
    }

    /**
     * Converts code points back to a UTF-8 string.
     *
     * Accepts either an encoded word (each code point preceded by
     * CODE_PREFIX; anything before the first prefix is discarded) or a
     * single bare code point.
     *
     * @param string|int $ascii encoded word or single code point
     * @return string
     */
    public function asciiTransformUtf8($ascii)
    {
        $ascii = (string) $ascii;

        if (strpos($ascii, self::CODE_PREFIX) !== false) {
            $codes = explode(self::CODE_PREFIX, $ascii);
            array_shift($codes);
        } else {
            $codes = [$ascii];
        }

        $utf8_str = '';
        foreach ($codes as $code) {
            $code = (int) $code;

            if ($code <= 0x7F) {
                $utf8_str .= chr($code);
            } elseif ($code <= 0x7FF) {
                $utf8_str .= chr(0xC0 | ($code >> 6))
                    . chr(0x80 | ($code & 0x3F));
            } elseif ($code <= 0xFFFF) {
                $utf8_str .= chr(0xE0 | ($code >> 12))
                    . chr(0x80 | (($code >> 6) & 0x3F))
                    . chr(0x80 | ($code & 0x3F));
            } else {
                $utf8_str .= chr(0xF0 | ($code >> 18))
                    . chr(0x80 | (($code >> 12) & 0x3F))
                    . chr(0x80 | (($code >> 6) & 0x3F))
                    . chr(0x80 | ($code & 0x3F));
            }
        }

        return $utf8_str;
    }

    /**
     * Splits a UTF-8 string into characters, one per yield.
     *
     * The width of each character is taken from its lead byte only. Bytes
     * that are not a lead byte, and lead bytes whose sequence would run past
     * the end of the string, are skipped without yielding anything.
     *
     * @param string $str
     * @return \Generator yields string, keyed 0, 1, 2, ...
     */
    private function split(string $str): \Generator
    {
        $len = strlen($str);

        for ($i = 0; $i < $len; $i++) {
            $lead = ord($str[$i]);

            if ($lead < 0x80) {
                $width = 1; // 0xxx xxxx
            } elseif ($lead >= 0xF0) {
                $width = 4; // 1111 xxxx
            } elseif ($lead >= 0xE0) {
                $width = 3; // 1110 xxxx
            } elseif ($lead >= 0xC0) {
                $width = 2; // 110x xxxx
            } else {
                continue;   // 10xx xxxx: continuation byte without a lead
            }

            if ($i + $width > $len) {
                continue;   // sequence is cut off by the end of the string
            }

            yield substr($str, $i, $width);
            $i += $width - 1;
        }
    }
}
| 193 | 
