EncodingDetector   A
last analyzed

Complexity

Total Complexity 23

Size/Duplication

Total Lines 231
Duplicated Lines 0 %

Test Coverage

Coverage 96.72%

Importance

Changes 7
Bugs 3 Features 0
Metric Value
eloc 73
c 7
b 3
f 0
dl 0
loc 231
ccs 59
cts 61
cp 0.9672
rs 10
wmc 23

11 Methods

Rating   Name   Duplication   Size   Complexity  
A disableEncoding() 0 4 2
A isUtf() 0 3 1
A iconvXtoEncoding() 0 21 4
A getRange() 0 11 3
A addEncoding() 0 5 3
A getRanges() 0 5 1
A enableEncoding() 0 6 3
A getEncodingList() 0 3 1
A getEncoding() 0 16 3
A getRangeModel() 0 3 1
A __construct() 0 8 1
1
<?php
2
3
/**
4
 * User: onnov
5
 * Date: 27.08.2019
6
 * Time: 21:59
7
 */
8
9
declare(strict_types=1);
10
11
namespace Onnov\DetectEncoding;
12
13
use Exception;
14
use RuntimeException;
15
16
/**
 * Class EncodingDetector
 *
 * Heuristic detector for single-byte Cyrillic encodings. Text that is valid
 * UTF-8 is reported as UTF-8; otherwise every enabled encoding is scored by
 * counting the bytes of the text that fall into that encoding's "upper" and
 * "lower" letter ranges (lowercase hits are weighted by LOWER_FACTOR) and the
 * encoding with the highest score wins.
 *
 * @package Onnov\DetectEncoding
 */
class EncodingDetector
{
    /** Weight applied to lowercase-letter hits when scoring an encoding. */
    public const LOWER_FACTOR = 3;

    public const UTF_8 = 'utf-8';
    public const WINDOWS_1251 = 'windows-1251';
    public const KOI8_R = 'koi8-r';
    public const IBM866 = 'ibm866';
    public const ISO_8859_5 = 'iso-8859-5';
    public const MAC_CYRILLIC = 'MacCyrillic';

    /**
     * Built-in byte-value ranges per encoding, written as comma separated
     * single values and "from-to" spans.
     *
     * @var array<string, array<string, string>>
     */
    protected $rangeModel
        = [
            self::WINDOWS_1251 => [
                'upper' => '168, 192-212, 214-223',
                'lower' => '184, 224-255',
            ],
            self::KOI8_R       => [
                'upper' => '179, 224-231, 233-255',
                'lower' => '163, 192-223',
            ],
            self::ISO_8859_5   => [
                'upper' => '161, 176-196, 198-207',
                'lower' => '208-239, 241',
            ],
            self::IBM866       => [
                'upper' => '128-148, 150-159, 240',
                'lower' => '160-175, 224-239, 241',
            ],
            self::MAC_CYRILLIC => [
                'upper' => '128-148, 150-159, 221',
                'lower' => '222-254',
            ],
        ];

    /**
     * Enabled encodings with their ranges expanded into lookup sets
     * (byte value => position), keyed by encoding name.
     *
     * @var array<string, array<string, array<int, int>>>
     */
    protected $ranges = [];

    /**
     * EncodingDetector constructor.
     *
     * Enables windows-1251, koi8-r and iso-8859-5 by default.
     */
    public function __construct()
    {
        // default setting
        $this->enableEncoding(
            [
                self::WINDOWS_1251,
                self::KOI8_R,
                self::ISO_8859_5,
            ]
        );
    }

    /**
     * Method to enable encoding definition
     * Example:
     * $detector->enableEncoding([
     *      $detector::IBM866,
     *      $detector::MAC_CYRILLIC,
     * ]);
     *
     * Names that are not present in the built-in range model are ignored.
     *
     * @param string[] $encodingList
     */
    public function enableEncoding(array $encodingList): void
    {
        foreach ($encodingList as $encoding) {
            if (isset($this->rangeModel[$encoding])) {
                $this->ranges[$encoding]
                    = $this->getRanges($this->rangeModel[$encoding]);
            }
        }
    }

    /**
     * Method to disable encoding definition
     * Example:
     * $detector->disableEncoding([
     *      $detector::ISO_8859_5,
     * ]);
     *
     * @param string[] $encodingList
     */
    public function disableEncoding(array $encodingList): void
    {
        foreach ($encodingList as $encoding) {
            unset($this->ranges[$encoding]);
        }
    }

    /**
     * Method for adding custom encoding
     * Example:
     * $detector->addEncoding([
     *      'encodingName' => [
     *          'upper' => '1-50,200-250,253', // uppercase character number range
     *          'lower' => '55-100,120-180,199', // lowercase character number range
     *      ],
     * ]);
     *
     * Entries that do not define both 'upper' and 'lower' are skipped.
     *
     * @param mixed[] $ranges
     */
    public function addEncoding(array $ranges): void
    {
        foreach ($ranges as $encoding => $config) {
            if (isset($config['upper'], $config['lower'])) {
                $this->ranges[$encoding] = $this->getRanges($config);
            }
        }
    }

    /**
     * Method for converting text of an unknown encoding into a given encoding, by default in utf-8
     * optional parameters:
     * $extra = '//TRANSLIT' (default setting) , other options: '' or '//IGNORE'
     * $encoding = 'utf-8' (default setting) , other options: any encoding that is available iconv
     *
     * The text is returned unchanged when the detected encoding already
     * equals the target encoding (names are compared case-insensitively).
     *
     * @param string $text
     * @param string $extra
     * @param string $encoding
     *
     * @return string
     * @throws RuntimeException when iconv returns false, or when an Exception
     *                          is raised while iconv runs (for example by an
     *                          error handler that converts notices); that
     *                          exception is attached as "previous".
     */
    public function iconvXtoEncoding(
        string &$text,
        string $extra = '//TRANSLIT',
        string $encoding = EncodingDetector::UTF_8
    ): string {
        $detected = $this->getEncoding($text);
        if (strcasecmp($detected, $encoding) === 0) {
            return $text;
        }

        try {
            $converted = iconv($detected, $encoding . $extra, $text);
        } catch (Exception $error) {
            // Keep the original exception in the chain for debugging.
            throw new RuntimeException($error->getMessage(), 0, $error);
        }

        // Checked outside the try block so this failure is not caught and
        // re-wrapped by the handler above.
        if ($converted === false) {
            throw new RuntimeException('iconv returned false');
        }

        return $converted;
    }

    /**
     * Definition of text encoding
     *
     * Returns 'utf-8' when the text is valid UTF-8 (this includes empty and
     * pure ASCII text). Otherwise returns the name of the enabled encoding
     * with the highest score, or an empty string when no encoding is enabled.
     *
     * @param string $text
     *
     * @return string
     */
    public function getEncoding(string &$text): string
    {
        if ($this->isUtf($text)) {
            return self::UTF_8;
        }

        $scores = [];
        // Byte value => number of occurrences, only for bytes that occur.
        $chars = count_chars($text, 1);
        foreach ($this->ranges as $encoding => $config) {
            $upperHits = array_sum(array_intersect_key($chars, $config['upper']));
            $lowerHits = array_sum(array_intersect_key($chars, $config['lower']));
            $scores[$encoding] = $upperHits + $lowerHits * self::LOWER_FACTOR;
        }

        // Ascending sort that keeps keys: the best candidate ends up last.
        asort($scores);

        return (string)array_key_last($scores);
    }

    /**
     * UTF Encoding Definition Method
     *
     * Uses an empty pattern with the "u" modifier: PCRE validates the whole
     * subject as UTF-8 before matching, so the call yields 1 for any valid
     * UTF-8 string (including an empty or newline-only one) and false for
     * malformed input.
     *
     * @param string $text
     *
     * @return bool
     */
    private function isUtf(string $text): bool
    {
        return preg_match('//u', $text) === 1;
    }

    /**
     * Expands the 'upper' and 'lower' range strings of one encoding.
     *
     * @param mixed[] $config must contain string entries 'upper' and 'lower'
     *
     * @return array<string, array<int, int>>
     */
    private function getRanges(array $config): array
    {
        return [
            'upper' => $this->getRange($config['upper']),
            'lower' => $this->getRange($config['lower']),
        ];
    }

    /**
     * Method to convert a range from a string to an array
     *
     * Accepts comma separated items, each either a single number ("168") or
     * a span ("192-212"); surrounding whitespace is ignored and empty items
     * are skipped. The result is keyed by byte value so it can be used with
     * array_intersect_key().
     *
     * @param string $str
     *
     * @return int[] byte value => position in the expanded list
     */
    private function getRange(string $str): array
    {
        $codes = [];
        foreach (explode(',', $str) as $item) {
            $item = trim($item);
            if ($item === '') {
                continue;
            }

            $bounds = explode('-', $item, 2);
            $first = (int)trim($bounds[0]);
            // A single value is treated as a span of length one.
            $last = isset($bounds[1]) ? (int)trim($bounds[1]) : $first;

            foreach (range($first, $last) as $code) {
                $codes[] = $code;
            }
        }

        return array_flip($codes);
    }

    /**
     * Returns the currently enabled encodings with their expanded ranges.
     *
     * @return array<string, array<string, array<int, int>>>
     */
    public function getEncodingList(): array
    {
        return $this->ranges;
    }

    /**
     * Returns the built-in range model (unexpanded range strings).
     *
     * @return string[][]
     */
    public function getRangeModel(): array
    {
        return $this->rangeModel;
    }
}
254