Issues (40)

app/Foundation/Tire.php (1 issue)

Severity
1
<?php
2
3
namespace App\Foundation;
4
5
/**
 * Trie-based sensitive-word matcher.
 *
 * Based on https://github.com/whiteCcinn/tire-php, with these changes:
 *
 * - fixed: sensitive words were missed when their first characters ran together
 *   (the search now restarts from the root when the current branch fails)
 * - fixed: ASCII characters were not restored correctly when rebuilding a match
 * - fixed: 2-byte and 4-byte UTF-8 characters were encoded/decoded as if they
 *   were 3-byte sequences, corrupting matches and causing key collisions
 *
 * Trie layout: each node is an array keyed by the integer code point of the
 * next character; the string key 'end' marks the end of a registered word.
 */
class Tire
{
    /** @var array Root node of the trie. */
    public $tree = [];

    /**
     * Words matched so far, keyed by their encoded form ('\u' followed by the
     * decimal code point, repeated per character); the value is the hit count.
     *
     * @var array
     */
    public $indexCode = [];

    /** @var bool Whether the most recent seek() call requested statistics. */
    private $statistics = false;

    /**
     * Register a sensitive word.
     *
     * Words that produce no characters (an empty string, or only bytes that
     * split() discards) are ignored; marking the root as a word end would make
     * seek() report bogus matches.
     *
     * @param string $word
     * @return $this
     */
    public function add(string $word)
    {
        $tree = &$this->tree;
        $inserted = false;

        foreach ($this->split($word) as $node) {
            $node = $this->utf8TransformAscii($node);
            $tree = &$this->insertNode($tree, $node);
            $inserted = true;
        }

        if ($inserted) {
            $tree['end'] = true;
        }

        return $this;
    }

    /**
     * Return (by reference) the child of $tree for $node, creating it if absent.
     *
     * @param array $tree
     * @param int   $node
     * @return array
     */
    private function &insertNode(&$tree, $node)
    {
        if (!isset($tree[$node])) {
            $tree[$node] = [];
        }

        return $tree[$node];
    }

    /**
     * Find registered words inside $text.
     *
     * After the main scan, every match is scanned again without its first
     * character so that words nested inside a match are reported too.
     *
     * NOTE: matches are recorded in $indexCode, which is never cleared here.
     * A word already recorded on this instance is not returned again, neither
     * within one call nor by a later call; assign `$indexCode = []` before
     * reusing the instance if a fresh result is needed.
     *
     * @param string $text
     * @param bool   $statistics Count repeated hits (see statistics()).
     * @param bool   $first      When false, the first character is skipped;
     *                           used by the nested re-scan.
     * @return string[] Matched words, in order of first discovery.
     */
    public function seek(string $text, $statistics = false, $first = true)
    {
        $match = [];
        $this->statistics = $statistics;
        $tree = &$this->tree;
        $sensitive = '';

        foreach ($this->split($text) as $k => $word) {
            if (!$first && !$k) {
                continue;
            }

            $code = $this->utf8TransformAscii($word);
            $tree = &$this->beginFind($tree, $code, $sensitive);

            // Reaching a node flagged 'end' means a whole word was matched.
            // exist() must run on every hit because it also updates the count.
            if (isset($tree['end']) && !$this->exist($sensitive, $statistics)) {
                $match[] = $this->asciiTransformUtf8($sensitive);
            }
        }

        // Iterates over the matches of the main scan only (foreach works on a
        // copy), appending whatever the nested scans discover.
        foreach ($match as $words) {
            $match = array_merge($match, $this->seek($words, $statistics, false));
        }

        return $match;
    }

    /**
     * Advance the search by one character.
     *
     * Tries the current node first, then restarts from the root, and finally
     * resets to the root with an empty prefix when the character starts no word.
     *
     * @param array  $tree   Current node.
     * @param int    $node   Code point of the next character.
     * @param string $prefix Encoded characters matched so far; updated in place.
     * @return array The node to continue from.
     */
    private function &beginFind(&$tree, $node, &$prefix = '')
    {
        // '\u' is a literal backslash + 'u' separator, not a code point escape;
        // asciiTransformUtf8() splits the prefix on it.
        if (isset($tree[$node])) {
            $prefix = $prefix . '\u' . $node;
            return $tree[$node];
        }

        // The current branch failed, but this character may begin another word.
        if (isset($this->tree[$node])) {
            $prefix = '\u' . $node;
            return $this->tree[$node];
        }

        $prefix = '';
        return $this->tree;
    }

    /**
     * Hit counts per matched word.
     *
     * @return array|false Map of word => count, or false when the most recent
     *                     seek() call did not request statistics.
     */
    public function statistics()
    {
        if (!$this->statistics) {
            return false;
        }

        $indexCode = [];
        foreach ($this->indexCode as $sensitive => $count) {
            $indexCode[$this->asciiTransformUtf8($sensitive)] = $count;
        }

        return $indexCode;
    }

    /**
     * Record a hit for $sensitive and report whether it was already known.
     *
     * @param string $sensitive  Encoded word.
     * @param bool   $statistics Increment the counter on repeated hits.
     * @return bool True when the word had been recorded before.
     */
    private function exist($sensitive, $statistics = false)
    {
        if (isset($this->indexCode[$sensitive])) {
            if ($statistics) {
                $this->indexCode[$sensitive]++;
            }

            return true;
        }

        $this->indexCode[$sensitive] = 1;

        return false;
    }

    /**
     * Convert a single UTF-8 character to its integer code point.
     *
     * Handles 1- to 4-byte sequences. An empty string yields 0. Missing
     * trailing bytes contribute zero bits, and a stray continuation byte in
     * lead position is read with the 3-byte layout, as before.
     *
     * @param string $utf8_str One UTF-8 encoded character.
     * @return int
     */
    public function utf8TransformAscii($utf8_str)
    {
        $bytes = (string) $utf8_str;
        if ($bytes === '') {
            return 0;
        }

        $lead = ord($bytes[0]);
        if ($lead <= 0x7F) {
            return $lead;
        }

        if ($lead >= 0xF0) {
            $length = 4;
            $code = $lead & 0x07;
        } elseif ($lead >= 0xC0 && $lead < 0xE0) {
            $length = 2;
            $code = $lead & 0x1F;
        } else {
            $length = 3;
            $code = $lead & 0x0F;
        }

        // Each continuation byte carries six payload bits.
        for ($i = 1; $i < $length; $i++) {
            $byte = isset($bytes[$i]) ? ord($bytes[$i]) : 0;
            $code = ($code << 6) | ($byte & 0x3F);
        }

        return $code;
    }

    /**
     * Convert encoded code points back to a UTF-8 string.
     *
     * Accepts either a single decimal code point or a sequence in which every
     * code point is preceded by a literal '\u' (the format built by beginFind()).
     *
     * @param int|string $ascii
     * @return string
     */
    public function asciiTransformUtf8($ascii)
    {
        $encoded = (string) $ascii;

        if (strpos($encoded, '\u') !== false) {
            $codes = explode('\u', $encoded);
            // Drop whatever precedes the first separator (normally empty).
            array_shift($codes);
        } else {
            $codes = [$encoded];
        }

        $utf8_str = '';
        foreach ($codes as $code) {
            $utf8_str .= $this->encodeCodePoint((int) $code);
        }

        return $utf8_str;
    }

    /**
     * Encode one code point as UTF-8, choosing the sequence length by range.
     *
     * @param int $code
     * @return string
     */
    private function encodeCodePoint(int $code): string
    {
        if ($code <= 0x7F) {
            return chr($code);
        }

        if ($code <= 0x7FF) {
            return chr(0xC0 | ($code >> 6))
                . chr(0x80 | ($code & 0x3F));
        }

        if ($code <= 0xFFFF) {
            return chr(0xE0 | ($code >> 12))
                . chr(0x80 | (($code >> 6) & 0x3F))
                . chr(0x80 | ($code & 0x3F));
        }

        return chr(0xF0 | (($code >> 18) & 0x07))
            . chr(0x80 | (($code >> 12) & 0x3F))
            . chr(0x80 | (($code >> 6) & 0x3F))
            . chr(0x80 | ($code & 0x3F));
    }

    /**
     * Split a UTF-8 string into characters.
     *
     * The sequence length is taken from the lead byte. Stray continuation
     * bytes and sequences truncated by the end of the string are skipped.
     * Keys are sequential from 0; seek() relies on key 0 being the first
     * yielded character.
     *
     * @param string $str
     * @return \Generator
     */
    private function split(string $str)
    {
        $len = strlen($str);
        $i = 0;

        while ($i < $len) {
            $lead = ord($str[$i]);

            if ($lead < 0x80) {
                // 0xxx xxxx: single-byte ASCII
                yield $str[$i];
                $i++;
                continue;
            }

            if ($lead >= 0xF0) {
                $width = 4; // 1111 xxxx
            } elseif ($lead >= 0xE0) {
                $width = 3; // 1110 xxxx
            } elseif ($lead >= 0xC0) {
                $width = 2; // 110x xxxx
            } else {
                // 10xx xxxx: continuation byte without a lead byte
                $i++;
                continue;
            }

            if ($i + $width <= $len) {
                yield substr($str, $i, $width);
                $i += $width;
            } else {
                // Truncated sequence: skip the lead byte only.
                $i++;
            }
        }
    }
}
193