Passed
Push — dev ( 824cd4...d410ef )
by Greg
12:51
created

EncodingFactory::make()   C

Complexity

Conditions 14
Paths 14

Size

Total Lines 44
Code Lines 29

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 1
Metric Value
cc 14
eloc 29
c 1
b 0
f 1
nc 14
nop 1
dl 0
loc 44
rs 6.2666

How to fix   Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * webtrees: online genealogy
5
 * Copyright (C) 2021 webtrees development team
6
 * This program is free software: you can redistribute it and/or modify
7
 * it under the terms of the GNU General Public License as published by
8
 * the Free Software Foundation, either version 3 of the License, or
9
 * (at your option) any later version.
10
 * This program is distributed in the hope that it will be useful,
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
 * GNU General Public License for more details.
14
 * You should have received a copy of the GNU General Public License
15
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
16
 */
17
18
declare(strict_types=1);
19
20
namespace Fisharebest\Webtrees\Factories;
21
22
use DomainException;
23
use Fisharebest\Webtrees\Contracts\EncodingFactoryInterface;
24
use Fisharebest\Webtrees\Encodings\ANSEL;
25
use Fisharebest\Webtrees\Encodings\ASCII;
26
use Fisharebest\Webtrees\Encodings\CP437;
27
use Fisharebest\Webtrees\Encodings\CP850;
28
use Fisharebest\Webtrees\Encodings\EncodingInterface;
29
use Fisharebest\Webtrees\Encodings\ISO88591;
30
use Fisharebest\Webtrees\Encodings\ISO88592;
31
use Fisharebest\Webtrees\Encodings\MacRoman;
32
use Fisharebest\Webtrees\Encodings\UTF16BE;
33
use Fisharebest\Webtrees\Encodings\UTF16LE;
34
use Fisharebest\Webtrees\Encodings\UTF8;
35
use Fisharebest\Webtrees\Encodings\Windows1250;
36
use Fisharebest\Webtrees\Encodings\Windows1251;
37
use Fisharebest\Webtrees\Encodings\Windows1252;
38
use Fisharebest\Webtrees\Exceptions\InvalidGedcomEncodingException;
39
40
use function explode;
41
use function ltrim;
42
use function preg_match;
43
use function str_contains;
44
use function str_starts_with;
45
use function strstr;
46
47
/**
 * Create an encoding object.
 *
 * Encodings can be detected from the start of a GEDCOM file (detect), created
 * from their internal name (make), or enumerated for display (list).
 */
class EncodingFactory implements EncodingFactoryInterface
{
    /**
     * Detect an encoding from a GEDCOM header record.
     *
     * Detection is attempted in this order:
     *  1. a UTF-8 / UTF-16 byte-order-mark at the start of the data;
     *  2. a NUL byte next to the leading "0" (UTF-16 without a byte-order-mark);
     *  3. the "1 CHAR" (or "1 CHARACTER") line of the header, optionally
     *     qualified by a following "2 VERS" line.
     *
     * @param string $header The first bytes of a GEDCOM file.
     *
     * @return EncodingInterface|null The detected encoding, or null when $header does
     *                                not yet contain a complete header record (i.e. no
     *                                following level 0 record was found).
     * @throws InvalidGedcomEncodingException When the header is complete, but does not
     *                                        specify a recognised character set.
     */
    public function detect(string $header): ?EncodingInterface
    {
        $utf_bom = [
            '/^' . UTF8::BYTE_ORDER_MARK . '/'    => UTF8::NAME,
            '/^' . UTF16BE::BYTE_ORDER_MARK . '/' => UTF16BE::NAME,
            '/^' . UTF16LE::BYTE_ORDER_MARK . '/' => UTF16LE::NAME,
        ];

        foreach ($utf_bom as $regex => $encoding) {
            if (preg_match($regex, $header) === 1) {
                return $this->make($encoding);
            }
        }

        // No byte-order-mark.  The file should start with "0 HEAD", so a NUL byte
        // before/after the "0" indicates big-endian/little-endian UTF-16.
        $utf16 = [
            "\x000" => UTF16BE::NAME,
            "0\x00" => UTF16LE::NAME,
        ];

        foreach ($utf16 as $start => $encoding) {
            if (str_starts_with($header, $start)) {
                return $this->make($encoding);
            }
        }

        // Standardize whitespace to simplify matching.
        $header = strtr(ltrim($header), ["\r\n" => "\n", "\n\r" => "\n", "\r" => "\n"]);

        // Repeat until stable, as each pass can expose new runs of spaces.
        while (str_contains($header, "\n ") || str_contains($header, " \n") || str_contains($header, '  ')) {
            $header = strtr($header, ["\n " => "\n", " \n" => "\n", '  ' => ' ']);
        }

        // We need a complete header record
        $header = strstr($header, "\n0", true);

        if ($header === false) {
            return null;
        }

        // Some of these come from Tamura Jones, the rest from webtrees users.
        //
        // IMPORTANT: names are matched case-insensitively as a *prefix* of the CHAR
        // value, and the first match wins.  Any name that is a prefix of another
        // (ASCII, IBM, WINDOWS, ISO8859) must therefore be listed AFTER the longer,
        // more specific names - otherwise the specific entries can never match.
        //
        // NOTE(review): ANSI/WINDOWS/IBM WINDOWS are mapped to CP850 here.  These
        // names are often used for Windows code pages - confirm this is intended.
        $character_sets = [
            // CHAR + VERS combinations must be tested before plain "ASCII".
            'ASCII/MacOS Roman' => MacRoman::NAME, // GEDitCOM
            'ASCII/MACINTOSH'   => MacRoman::NAME, // MacFamilyTree < 8.3.5
            'ASCII'             => ASCII::NAME,
            'ANSEL'             => ANSEL::NAME,
            'UTF-8'             => UTF8::NAME,
            'UNICODE'           => UTF8::NAME, // If the null byte test failed, this can't be UTF16
            'MACINTOSH'         => MacRoman::NAME, // MacFamilyTree >= 8.3.5
            'CP437'             => CP437::NAME,
            'IBMPC'             => CP437::NAME,
            'IBM-PC'            => CP437::NAME, // CumberlandFamilyTree
            'OEM'               => CP437::NAME, // Généatique
            'CP850'             => CP850::NAME,
            'MSDOS'             => CP850::NAME,
            'IBM-DOS'           => CP850::NAME, // Reunion, EasyTree
            'MS-DOS'            => CP850::NAME, // AbrEdit FTM for Windows
            'ANSI'              => CP850::NAME,
            'IBM WINDOWS'       => CP850::NAME, // EasyTree, Généalogie, Reunion, TribalPages
            'IBM_WINDOWS'       => CP850::NAME, // EasyTree
            // Plain "IBM" must follow every other IBM* name.
            'IBM'               => CP437::NAME, // Reunion
            'CP1250'            => Windows1250::NAME,
            'windows-1250'      => Windows1250::NAME, // GenoPro, Rodokmen Pro
            'CP1251'            => Windows1251::NAME,
            'WINDOWS-1251'      => Windows1251::NAME, // Rodovid
            'CP1252'            => Windows1252::NAME, // Lifelines
            // Plain "WINDOWS" must follow every WINDOWS-nnnn name.
            'WINDOWS'           => CP850::NAME, // Parentele
            'ISO-8859-1'        => ISO88591::NAME, // Cumberland Family Tree, Lifelines
            'ISO8859-1'         => ISO88591::NAME, // Scion Genealogist
            'ISO-8859-2'        => ISO88592::NAME,
            'ISO8859-2'         => ISO88592::NAME,
            // Plain "ISO8859" must follow ISO8859-1 and ISO8859-2.
            'ISO8859'           => ISO88591::NAME, // Genealogica Grafica
            'LATIN-1'           => ISO88591::NAME,
            'LATIN1'            => ISO88591::NAME, // GenealogyJ
            'LATIN-2'           => ISO88592::NAME,
            'LATIN2'            => ISO88592::NAME,
        ];

        foreach ($character_sets as $pattern => $encoding) {
            if (str_contains($pattern, '/')) {
                // "CHAR/VERS" - the character set is qualified by its version.
                [$char, $vers] = explode('/', $pattern);
                $regex = "\n1 CHAR " . $char . "\n2 VERS " . $vers;
            } else {
                // Some applications write CHARACTER instead of CHAR.
                $regex = "\n1 CHAR(?:ACTER)? " . $pattern;
            }

            if (preg_match("/" . $regex . "/i", $header) === 1) {
                return $this->make($encoding);
            }
        }

        // Report the unrecognised character set, using the same CHAR/CHARACTER
        // line format that was accepted above.
        if (preg_match('/\n1 CHAR(?:ACTER)? (.+)/', $header, $match) === 1) {
            $charset = $match[1];
        } else {
            $charset = '???';
        }

        throw new InvalidGedcomEncodingException($charset);
    }

    /**
     * Create a named encoding.
     *
     * @param string $name One of the NAME constants of the supported encoding classes.
     *
     * @return EncodingInterface
     * @throws DomainException When $name is not a supported encoding.
     */
    public function make(string $name): EncodingInterface
    {
        /** @var array<string,class-string<EncodingInterface>> $encodings */
        $encodings = [
            UTF8::NAME        => UTF8::class,
            UTF16BE::NAME     => UTF16BE::class,
            UTF16LE::NAME     => UTF16LE::class,
            ANSEL::NAME       => ANSEL::class,
            ASCII::NAME       => ASCII::class,
            CP437::NAME       => CP437::class,
            CP850::NAME       => CP850::class,
            Windows1250::NAME => Windows1250::class,
            Windows1251::NAME => Windows1251::class,
            Windows1252::NAME => Windows1252::class,
            MacRoman::NAME    => MacRoman::class,
            ISO88591::NAME    => ISO88591::class,
            ISO88592::NAME    => ISO88592::class,
        ];

        if (!isset($encodings[$name])) {
            throw new DomainException('Invalid encoding: ' . $name);
        }

        $class = $encodings[$name];

        return new $class();
    }

    /**
     * A list of supported encodings and their names.
     *
     * The keys are the internal encoding names accepted by make();
     * the values are the corresponding display names.
     *
     * @return array<string,string>
     */
    public function list(): array
    {
        return [
            UTF8::NAME        => 'UTF-8',
            UTF16BE::NAME     => 'UTF-16BE',
            UTF16LE::NAME     => 'UTF-16LE',
            ANSEL::NAME       => 'ANSEL',
            ASCII::NAME       => 'ASCII',
            ISO88591::NAME    => 'ISO-8859-1',
            ISO88592::NAME    => 'ISO-8859-2',
            Windows1250::NAME => 'Windows 1250',
            Windows1251::NAME => 'Windows 1251',
            Windows1252::NAME => 'Windows 1252',
            CP437::NAME       => 'CP437',
            CP850::NAME       => 'CP850',
            MacRoman::NAME    => 'MacOS Roman',
        ];
    }
}
238