Passed
Push — dev ( 824cd4...d410ef )
by Greg
12:51
created

AbstractUTF16Encoding::toUtf8()   B

Complexity

Conditions 7
Paths 6

Size

Total Lines 29
Code Lines 17

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 1
Metric Value
cc 7
eloc 17
c 1
b 0
f 1
nc 6
nop 1
dl 0
loc 29
rs 8.8333
1
<?php
2
3
/**
4
 * webtrees: online genealogy
5
 * Copyright (C) 2021 webtrees development team
6
 * This program is free software: you can redistribute it and/or modify
7
 * it under the terms of the GNU General Public License as published by
8
 * the Free Software Foundation, either version 3 of the License, or
9
 * (at your option) any later version.
10
 * This program is distributed in the hope that it will be useful,
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
 * GNU General Public License for more details.
14
 * You should have received a copy of the GNU General Public License
15
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
16
 */
17
18
declare(strict_types=1);
19
20
namespace Fisharebest\Webtrees\Encodings;
21
22
use function chr;
23
use function intdiv;
24
use function ord;
25
use function str_split;
26
use function strlen;
27
28
/**
 * Convert between UTF-8 and a fixed-width, two-bytes-per-character encoding.
 *
 * Every character is handled as a single 16-bit code unit, so only code-points
 * U+0000 - U+FFFF (excluding the surrogate range) can be converted.  Byte order
 * is delegated to the concrete class via characterToCodePoint() and
 * codePointToCharacter().
 */
abstract class AbstractUTF16Encoding implements EncodingInterface
{
    // Appended by fromUtf8() in place of input that cannot be converted.
    // Concrete classes should override this.
    public const REPLACEMENT_CHARACTER = '';

    /**
     * Convert a string from UTF-8 to another encoding.
     *
     * Malformed, truncated, overlong and surrogate sequences, as well as
     * four-byte sequences (code-points above U+FFFF), are replaced with
     * static::REPLACEMENT_CHARACTER.
     *
     * @param string $text
     *
     * @return string
     */
    public function fromUtf8(string $text): string
    {
        $out = '';
        $len = strlen($text);

        for ($n = 0; $n < $len; ++$n) {
            $lead = ord($text[$n]);

            if ($lead <= 0x7F) {
                // 0xxxxxxx - ASCII maps directly to a code-point.
                $out .= $this->codePointToCharacter($lead);
            } elseif ($lead <= 0xBF) {
                // 10xxxxxx - a continuation byte without a lead byte.
                $out .= static::REPLACEMENT_CHARACTER;
            } elseif ($lead <= 0xDF) {
                // 110xxxxx 10xxxxxx - two-byte sequence.
                if ($n + 1 >= $len) {
                    // Truncated input: the continuation byte is missing.
                    $out .= static::REPLACEMENT_CHARACTER;
                    break;
                }

                $byte2 = ord($text[++$n]);

                // Strip the marker bits before combining.  The parentheses are
                // required: "+" and "<<" both bind tighter than "&" and "|".
                $code_point = (($lead & 0x1F) << 6) | ($byte2 & 0x3F);

                if (($byte2 & 0xC0) !== 0x80 || $code_point < 0x80) {
                    // Bad continuation byte, or an overlong encoding (lead 0xC0 / 0xC1).
                    $out .= static::REPLACEMENT_CHARACTER;
                } else {
                    $out .= $this->codePointToCharacter($code_point);
                }
            } elseif ($lead <= 0xEF) {
                // 1110xxxx 10xxxxxx 10xxxxxx - three-byte sequence.
                if ($n + 2 >= $len) {
                    // Truncated input: one or both continuation bytes are missing.
                    $out .= static::REPLACEMENT_CHARACTER;
                    break;
                }

                $byte2 = ord($text[++$n]);
                $byte3 = ord($text[++$n]);

                $code_point = (($lead & 0x0F) << 12) | (($byte2 & 0x3F) << 6) | ($byte3 & 0x3F);

                $malformed = ($byte2 & 0xC0) !== 0x80 || ($byte3 & 0xC0) !== 0x80;
                $overlong  = $code_point < 0x800;
                $surrogate = $code_point >= 0xD800 && $code_point <= 0xDFFF;

                if ($malformed || $overlong || $surrogate) {
                    $out .= static::REPLACEMENT_CHARACTER;
                } else {
                    $out .= $this->codePointToCharacter($code_point);
                }
            } else {
                // 1111xxxx - code-points above U+FFFF do not fit in two bytes.
                // The continuation bytes that follow are each rejected in turn
                // by the 10xxxxxx branch above.
                $out .= static::REPLACEMENT_CHARACTER;
            }
        }

        return $out;
    }

    /**
     * Convert a string from another encoding to UTF-8.
     *
     * Surrogate code units (U+D800 - U+DFFF) and an incomplete trailing byte
     * are replaced with UTF8::REPLACEMENT_CHARACTER.
     *
     * @param string $text
     *
     * @return string
     */
    public function toUtf8(string $text): string
    {
        // Before PHP 8.2, str_split('') returns [''] rather than [].
        if ($text === '') {
            return '';
        }

        $utf8 = '';

        foreach (str_split($text, 2) as $character) {
            if (strlen($character) !== 2) {
                // Odd-length input leaves half a character at the end.
                // Callers can avoid this by trimming to convertibleBytes().
                $utf8 .= UTF8::REPLACEMENT_CHARACTER;
                continue;
            }

            $code_point = $this->characterToCodePoint($character);

            if ($code_point <= 0x7F) {
                // 7 bits => 1 byte
                $utf8 .= chr($code_point);
            } elseif ($code_point <= 0x7FF) {
                // 11 bits (5,6) => 2 bytes.  This includes U+0080 - U+00FF.
                $utf8 .= chr(0xC0 | ($code_point >> 6));
                $utf8 .= chr(0x80 | ($code_point & 0x3F));
            } elseif ($code_point <= 0xD7FF || $code_point >= 0xE000) {
                // 16 bits (4,6,6) => 3 bytes
                $utf8 .= chr(0xE0 | ($code_point >> 12));
                $utf8 .= chr(0x80 | (($code_point >> 6) & 0x3F));
                $utf8 .= chr(0x80 | ($code_point & 0x3F));
            } else {
                // U+D800 - U+DFFF are invalid
                $utf8 .= UTF8::REPLACEMENT_CHARACTER;
            }
        }

        return $utf8;
    }

    /**
     * When reading multi-byte encodings using a stream, we must avoid incomplete characters.
     *
     * Returns the length of $text rounded down to a whole number of two-byte characters.
     *
     * @param string $text
     *
     * @return int
     */
    public function convertibleBytes(string $text): int
    {
        return 2 * intdiv(strlen($text), 2);
    }

    /**
     * Convert two bytes to a code-point, taking care of byte-order.
     *
     * @param string $character
     *
     * @return int
     */
    abstract protected function characterToCodePoint(string $character): int;

    /**
     * Convert a code-point to two bytes, taking care of byte-order.
     *
     * @param int $code_point
     *
     * @return string
     */
    abstract protected function codePointToCharacter(int $code_point): string;
}
152