PDFDocEncoding   A
last analyzed

Complexity

Total Complexity 2

Size/Duplication

Total Lines 148
Duplicated Lines 0 %

Test Coverage

Coverage 100%

Importance

Changes 1
Bugs 0 Features 0
Metric Value
eloc 140
c 1
b 0
f 0
dl 0
loc 148
ccs 142
cts 142
cp 1
rs 10
wmc 2

2 Methods

Rating   Name   Duplication   Size   Complexity  
A convertPDFDoc2UTF8() 0 3 1
B getCodePage() 0 140 1
1
<?php
2
3
/**
4
 * @file    This file is part of the PdfParser library.
5
 *
6
 * @author  Brian Huisman <[email protected]>
7
 *
8
 * @date    2023-06-28
9
 *
10
 * @license LGPLv3
11
 *
12
 * @url     <https://github.com/smalot/pdfparser>
13
 *
14
 *  PdfParser is a pdf library written in PHP, extraction oriented.
15
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
16
 *
17
 *  This program is free software: you can redistribute it and/or modify
18
 *  it under the terms of the GNU Lesser General Public License as published by
19
 *  the Free Software Foundation, either version 3 of the License, or
20
 *  (at your option) any later version.
21
 *
22
 *  This program is distributed in the hope that it will be useful,
23
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
24
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
25
 *  GNU Lesser General Public License for more details.
26
 *
27
 *  You should have received a copy of the GNU Lesser General Public License
28
 *  along with this program.
29
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
30
 */
31
32
// Source : https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/pdfreference1.2.pdf
33
// Source : https://ia801001.us.archive.org/1/items/pdf1.7/pdf_reference_1-7.pdf
34
35
namespace Smalot\PdfParser\Encoding;
36
37
/**
 * Class PDFDocEncoding
 *
 * Translation table and helper for converting byte strings encoded in
 * PDFDocEncoding (the single-byte encoding the PDF specification defines for
 * text strings outside content streams) into UTF-8.
 */
class PDFDocEncoding
{
    /**
     * Returns the PDFDocEncoding => UTF-8 translation table.
     *
     * Only the bytes whose PDFDocEncoding meaning differs from a plain
     * pass-through are listed: the accent glyphs at 0x18-0x1f, 0x7f and the
     * whole 0x80-0xff range. Bytes that are absent from the table are left
     * untouched by convertPDFDoc2UTF8().
     *
     * Codes mapped to the empty string (0x7f, 0x9f, 0xad) are removed from
     * the converted output.
     *
     * @return array<string, string> single-byte key => UTF-8 encoded replacement
     */
    public static function getCodePage(): array
    {
        return [
            "\x18" => "\u{02d8}", // breve
            "\x19" => "\u{02c7}", // caron
            "\x1a" => "\u{02c6}", // circumflex
            "\x1b" => "\u{02d9}", // dotaccent
            "\x1c" => "\u{02dd}", // hungarumlaut
            "\x1d" => "\u{02db}", // ogonek
            "\x1e" => "\u{02da}", // ring (U+02DA RING ABOVE; U+02DE is the rhotic hook)
            "\x1f" => "\u{02dc}", // tilde
            "\x7f" => '',
            "\x80" => "\u{2022}", // bullet
            "\x81" => "\u{2020}", // dagger
            "\x82" => "\u{2021}", // daggerdbl
            "\x83" => "\u{2026}", // ellipsis
            "\x84" => "\u{2014}", // emdash
            "\x85" => "\u{2013}", // endash
            "\x86" => "\u{0192}", // florin
            "\x87" => "\u{2044}", // fraction
            "\x88" => "\u{2039}", // guilsinglleft
            "\x89" => "\u{203a}", // guilsinglright
            "\x8a" => "\u{2212}", // minus
            "\x8b" => "\u{2030}", // perthousand
            "\x8c" => "\u{201e}", // quotedblbase
            "\x8d" => "\u{201c}", // quotedblleft
            "\x8e" => "\u{201d}", // quotedblright
            "\x8f" => "\u{2018}", // quoteleft
            "\x90" => "\u{2019}", // quoteright
            "\x91" => "\u{201a}", // quotesinglbase
            "\x92" => "\u{2122}", // trademark
            "\x93" => "\u{fb01}", // fi
            "\x94" => "\u{fb02}", // fl
            "\x95" => "\u{0141}", // Lslash
            "\x96" => "\u{0152}", // OE
            "\x97" => "\u{0160}", // Scaron
            "\x98" => "\u{0178}", // Ydieresis
            "\x99" => "\u{017d}", // Zcaron
            "\x9a" => "\u{0131}", // dotlessi
            "\x9b" => "\u{0142}", // lslash
            "\x9c" => "\u{0153}", // oe
            "\x9d" => "\u{0161}", // scaron
            "\x9e" => "\u{017e}", // zcaron
            "\x9f" => '',
            "\xa0" => "\u{20ac}", // Euro
            "\xa1" => "\u{00a1}", // exclamdown
            "\xa2" => "\u{00a2}", // cent
            "\xa3" => "\u{00a3}", // sterling
            "\xa4" => "\u{00a4}", // currency
            "\xa5" => "\u{00a5}", // yen
            "\xa6" => "\u{00a6}", // brokenbar
            "\xa7" => "\u{00a7}", // section
            "\xa8" => "\u{00a8}", // dieresis
            "\xa9" => "\u{00a9}", // copyright
            "\xaa" => "\u{00aa}", // ordfeminine
            "\xab" => "\u{00ab}", // guillemotleft
            "\xac" => "\u{00ac}", // logicalnot
            "\xad" => '',
            "\xae" => "\u{00ae}", // registered
            "\xaf" => "\u{00af}", // macron
            "\xb0" => "\u{00b0}", // degree
            "\xb1" => "\u{00b1}", // plusminus
            "\xb2" => "\u{00b2}", // twosuperior
            "\xb3" => "\u{00b3}", // threesuperior
            "\xb4" => "\u{00b4}", // acute
            "\xb5" => "\u{00b5}", // mu
            "\xb6" => "\u{00b6}", // paragraph
            "\xb7" => "\u{00b7}", // periodcentered
            "\xb8" => "\u{00b8}", // cedilla
            "\xb9" => "\u{00b9}", // onesuperior
            "\xba" => "\u{00ba}", // ordmasculine
            "\xbb" => "\u{00bb}", // guillemotright
            "\xbc" => "\u{00bc}", // onequarter
            "\xbd" => "\u{00bd}", // onehalf
            "\xbe" => "\u{00be}", // threequarters
            "\xbf" => "\u{00bf}", // questiondown
            "\xc0" => "\u{00c0}", // Agrave
            "\xc1" => "\u{00c1}", // Aacute
            "\xc2" => "\u{00c2}", // Acircumflex
            "\xc3" => "\u{00c3}", // Atilde
            "\xc4" => "\u{00c4}", // Adieresis
            "\xc5" => "\u{00c5}", // Aring
            "\xc6" => "\u{00c6}", // AE
            "\xc7" => "\u{00c7}", // Ccedilla
            "\xc8" => "\u{00c8}", // Egrave
            "\xc9" => "\u{00c9}", // Eacute
            "\xca" => "\u{00ca}", // Ecircumflex
            "\xcb" => "\u{00cb}", // Edieresis
            "\xcc" => "\u{00cc}", // Igrave
            "\xcd" => "\u{00cd}", // Iacute
            "\xce" => "\u{00ce}", // Icircumflex
            "\xcf" => "\u{00cf}", // Idieresis
            "\xd0" => "\u{00d0}", // Eth
            "\xd1" => "\u{00d1}", // Ntilde
            "\xd2" => "\u{00d2}", // Ograve
            "\xd3" => "\u{00d3}", // Oacute
            "\xd4" => "\u{00d4}", // Ocircumflex
            "\xd5" => "\u{00d5}", // Otilde
            "\xd6" => "\u{00d6}", // Odieresis
            "\xd7" => "\u{00d7}", // multiply
            "\xd8" => "\u{00d8}", // Oslash
            "\xd9" => "\u{00d9}", // Ugrave
            "\xda" => "\u{00da}", // Uacute
            "\xdb" => "\u{00db}", // Ucircumflex
            "\xdc" => "\u{00dc}", // Udieresis
            "\xdd" => "\u{00dd}", // Yacute
            "\xde" => "\u{00de}", // Thorn
            "\xdf" => "\u{00df}", // germandbls
            "\xe0" => "\u{00e0}", // agrave
            "\xe1" => "\u{00e1}", // aacute
            "\xe2" => "\u{00e2}", // acircumflex
            "\xe3" => "\u{00e3}", // atilde
            "\xe4" => "\u{00e4}", // adieresis
            "\xe5" => "\u{00e5}", // aring
            "\xe6" => "\u{00e6}", // ae
            "\xe7" => "\u{00e7}", // ccedilla
            "\xe8" => "\u{00e8}", // egrave
            "\xe9" => "\u{00e9}", // eacute
            "\xea" => "\u{00ea}", // ecircumflex
            "\xeb" => "\u{00eb}", // edieresis
            "\xec" => "\u{00ec}", // igrave
            "\xed" => "\u{00ed}", // iacute
            "\xee" => "\u{00ee}", // icircumflex
            "\xef" => "\u{00ef}", // idieresis
            "\xf0" => "\u{00f0}", // eth
            "\xf1" => "\u{00f1}", // ntilde
            "\xf2" => "\u{00f2}", // ograve
            "\xf3" => "\u{00f3}", // oacute
            "\xf4" => "\u{00f4}", // ocircumflex
            "\xf5" => "\u{00f5}", // otilde
            "\xf6" => "\u{00f6}", // odieresis
            "\xf7" => "\u{00f7}", // divide
            "\xf8" => "\u{00f8}", // oslash
            "\xf9" => "\u{00f9}", // ugrave
            "\xfa" => "\u{00fa}", // uacute
            "\xfb" => "\u{00fb}", // ucircumflex
            "\xfc" => "\u{00fc}", // udieresis
            "\xfd" => "\u{00fd}", // yacute
            "\xfe" => "\u{00fe}", // thorn
            "\xff" => "\u{00ff}", // ydieresis
        ];
    }

    /**
     * Converts a PDFDocEncoding byte string to UTF-8.
     *
     * Each byte that has an entry in getCodePage() is replaced by its UTF-8
     * sequence (or removed when the entry is the empty string); all other
     * bytes are copied through unchanged.
     *
     * @param string $content byte string encoded in PDFDocEncoding
     *
     * @return string the converted string
     */
    public static function convertPDFDoc2UTF8(string $content): string
    {
        // static:: (late static binding) is kept so a subclass overriding
        // getCodePage() still has its table honoured here.
        return strtr($content, static::getCodePage());
    }
}
190