Test Failed
Pull Request — master (#622)
by
unknown
01:47
created

FilterHelper::decodeFilterASCIIHexDecode()   A

Complexity

Conditions 5
Paths 10

Size

Total Lines 30
Code Lines 15

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 9
CRAP Score 6.6

Importance

Changes 1
Bugs 0 Features 1
Metric Value
cc 5
eloc 15
c 1
b 0
f 1
nc 10
nop 1
dl 0
loc 30
ccs 9
cts 15
cp 0.6
crap 6.6
rs 9.4555
1
<?php
2
3
/**
4
 * This file is based on code of tecnickcom/TCPDF PDF library.
5
 *
6
 * Original author Nicola Asuni ([email protected]) and
7
 * contributors (https://github.com/tecnickcom/TCPDF/graphs/contributors).
8
 *
9
 * @see https://github.com/tecnickcom/TCPDF
10
 *
11
 * Original code was licensed on the terms of the LGPL v3.
12
 *
13
 * ------------------------------------------------------------------------------
14
 *
15
 * @file This file is part of the PdfParser library.
16
 *
17
 * @author  Konrad Abicht <[email protected]>
18
 *
19
 * @date    2020-01-06
20
 *
21
 * @license LGPLv3
22
 *
23
 * @url     <https://github.com/smalot/pdfparser>
24
 *
25
 *  PdfParser is a pdf library written in PHP, extraction oriented.
26
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
27
 *
28
 *  This program is free software: you can redistribute it and/or modify
29
 *  it under the terms of the GNU Lesser General Public License as published by
30
 *  the Free Software Foundation, either version 3 of the License, or
31
 *  (at your option) any later version.
32
 *
33
 *  This program is distributed in the hope that it will be useful,
34
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
35
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
36
 *  GNU Lesser General Public License for more details.
37
 *
38
 *  You should have received a copy of the GNU Lesser General Public License
39
 *  along with this program.
40
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
41
 */
42
43
namespace Smalot\PdfParser\RawData;
44
45
/**
 * Decoders for the standard PDF stream filters.
 *
 * Each supported filter has a dedicated protected method; decodeFilter() is the
 * public entry point which dispatches on the filter name.
 */
class FilterHelper
{
    protected $availableFilters = ['ASCIIHexDecode', 'ASCII85Decode', 'LZWDecode', 'FlateDecode', 'RunLengthDecode'];

    /**
     * Decode data using the specified filter type.
     *
     * Unknown filter names are not an error: the data is returned unchanged.
     *
     * @param string $filter            Filter name
     * @param string $data              Data to decode
     * @param int    $decodeMemoryLimit Maximum decoded length for FlateDecode (0 = unlimited)
     *
     * @return string Decoded data string
     *
     * @throws \Exception if a certain decode function is not implemented yet
     *                    or if the data could not be decoded
     */
    public function decodeFilter(string $filter, string $data, int $decodeMemoryLimit = 0): string
    {
        switch ($filter) {
            case 'ASCIIHexDecode':
                return $this->decodeFilterASCIIHexDecode($data);

            case 'ASCII85Decode':
                return $this->decodeFilterASCII85Decode($data);

            case 'LZWDecode':
                return $this->decodeFilterLZWDecode($data);

            case 'FlateDecode':
                // decodeFilterFlateDecode() is declared as ?string; make sure a null
                // never reaches the non-nullable return type of this method.
                $decoded = $this->decodeFilterFlateDecode($data, $decodeMemoryLimit);
                if (null === $decoded) {
                    throw new \Exception('decodeFilterFlateDecode: invalid code');
                }

                return $decoded;

            case 'RunLengthDecode':
                return $this->decodeFilterRunLengthDecode($data);

            case 'CCITTFaxDecode':
                throw new \Exception('Decode CCITTFaxDecode not implemented yet.');
            case 'JBIG2Decode':
                throw new \Exception('Decode JBIG2Decode not implemented yet.');
            case 'DCTDecode':
                throw new \Exception('Decode DCTDecode not implemented yet.');
            case 'JPXDecode':
                throw new \Exception('Decode JPXDecode not implemented yet.');
            case 'Crypt':
                throw new \Exception('Decode Crypt not implemented yet.');
            default:
                return $data;
        }
    }

    /**
     * ASCIIHexDecode
     *
     * Decodes data encoded in an ASCII hexadecimal representation, reproducing the original binary data.
     *
     * @param string $data Data to decode
     *
     * @return string data string
     *
     * @throws \Exception if the data contains non-hexadecimal characters or has an
     *                    odd number of digits without an EOD marker
     */
    protected function decodeFilterASCIIHexDecode(string $data): string
    {
        // all white-space characters shall be ignored
        $data = preg_replace('/[\s]/', '', $data);

        // check for EOD character: GREATER-THAN SIGN (3Eh)
        $eodPos = strpos($data, '>');
        $hasEod = false !== $eodPos;
        if ($hasEod) {
            // remove EOD and extra data (if any)
            $data = substr($data, 0, $eodPos);
        }

        if (0 !== \strlen($data) % 2) {
            // odd number of hexadecimal digits
            if (!$hasEod) {
                throw new \Exception('decodeFilterASCIIHexDecode: invalid code');
            }
            // EOD shall behave as if a 0 (zero) followed the last digit,
            // i.e. "ABC>" is decoded like "ABC0".
            $data .= '0';
        }

        // check for invalid characters
        if (preg_match('/[^a-fA-F\d]/', $data) > 0) {
            throw new \Exception('decodeFilterASCIIHexDecode: invalid code');
        }

        // get one byte of binary data for each pair of ASCII hexadecimal digits
        return pack('H*', $data);
    }

    /**
     * ASCII85Decode
     *
     * Decodes data encoded in an ASCII base-85 representation, reproducing the original binary data.
     *
     * @param string $data Data to decode
     *
     * @return string data string
     *
     * @throws \Exception if the data contains characters outside the base-85 alphabet,
     *                    a 'z' inside a group, or a final group of a single character
     */
    protected function decodeFilterASCII85Decode(string $data): string
    {
        // initialize string to return
        $decoded = '';
        // all white-space characters shall be ignored
        $data = preg_replace('/[\s]/', '', $data);
        // remove start sequence 2-character sequence <~ (3Ch)(7Eh)
        if (0 === strpos($data, '<~')) {
            $data = substr($data, 2);
        }
        // check for EOD: 2-character sequence ~> (7Eh)(3Eh)
        $eod = strpos($data, '~>');
        if (false !== $eod) {
            // remove EOD and extra data (if any)
            $data = substr($data, 0, $eod);
        }
        // data length
        $data_length = \strlen($data);
        // check for invalid characters: only '!' (21h) to 'u' (75h) and the
        // zero-group shortcut 'z' (7Ah) are allowed
        if (preg_match('/[^\x21-\x75\x7a]/', $data) > 0) {
            throw new \Exception('decodeFilterASCII85Decode: invalid code');
        }
        // z sequence
        $zseq = "\x00\x00\x00\x00";
        // position inside a group of 5 characters (0-4)
        $group_pos = 0;
        $tuple = 0;
        $pow85 = [85 * 85 * 85 * 85, 85 * 85 * 85, 85 * 85, 85, 1];

        // for each byte
        for ($i = 0; $i < $data_length; ++$i) {
            // get char value
            $char = \ord($data[$i]);
            if (122 === $char) { // 'z'
                // 'z' stands for four zero bytes and is only valid between groups
                if (0 !== $group_pos) {
                    throw new \Exception('decodeFilterASCII85Decode: invalid code');
                }
                $decoded .= $zseq;
                continue;
            }

            // the value represented by a group of 5 characters should never be greater than 2^32 - 1
            $tuple += ($char - 33) * $pow85[$group_pos];
            if (4 === $group_pos) {
                // complete group: emit the 4 bytes in big-endian order
                $decoded .= pack('N', $tuple & 0xFFFFFFFF);
                $tuple = 0;
                $group_pos = 0;
            } else {
                ++$group_pos;
            }
        }

        // last tuple (if any)
        if (1 === $group_pos) {
            // a single trailing character cannot encode a full byte
            throw new \Exception('decodeFilterASCII85Decode: invalid code');
        }
        if ($group_pos > 1) {
            // compensate for the missing low-order digits, then keep only the
            // (number of characters - 1) high-order bytes
            $tuple += $pow85[$group_pos - 1];
            $decoded .= substr(pack('N', $tuple & 0xFFFFFFFF), 0, $group_pos - 1);
        }

        return $decoded;
    }

    /**
     * FlateDecode
     *
     * Decompresses data encoded using the zlib/deflate compression method, reproducing the original text or binary data.
     *
     * @param string $data              Data to decode
     * @param int    $decodeMemoryLimit Memory limit on deflation
     *
     * @return string data string
     *
     * @throws \Exception if the data can neither be inflated by gzuncompress()
     *                    nor by the gzip-stream fallback
     */
    protected function decodeFilterFlateDecode(string $data, int $decodeMemoryLimit): ?string
    {
        /*
         * gzuncompress may raise an E_WARNING in case of an error (like $data is empty).
         * The following handler turns every E_WARNING into an exception, which is catchable.
         */
        set_error_handler(static function ($errNo, $errStr) {
            if (\E_WARNING === $errNo) {
                throw new \Exception($errStr);
            }

            // fallback to default php error handler
            return false;
        });

        try {
            try {
                $decoded = gzuncompress($data, $decodeMemoryLimit);
                if (false === $decoded) {
                    throw new \Exception('decodeFilterFlateDecode: invalid code');
                }

                return $decoded;
            } catch (\Exception $e) {
                // If gzuncompress() failed with a data error, try again
                // allowing for a CRC32 checksum instead of Adler-32.
                // See: https://www.php.net/manual/en/function.gzuncompress.php#79042
                // Issue: https://github.com/smalot/pdfparser/issues/592
                $decoded = $this->decodeFlateViaGzipStream($data);

                // No output means the fallback failed as well: report the original error.
                // Compared strictly so that a legitimate payload such as "0" is kept.
                if (null === $decoded || '' === $decoded) {
                    throw $e;
                }

                return $decoded;
            }
        } finally {
            // Restore old handler just in case it was customized outside of PDFParser.
            restore_error_handler();
        }
    }

    /**
     * Inflates a zlib stream by prefixing a gzip header and reading it through the
     * compress.zlib:// stream wrapper.
     *
     * The temporary file is always removed, even if reading it throws.
     *
     * @param string $data zlib encoded data
     *
     * @return string|null decoded data, or null if no temporary file could be
     *                     created or the stream could not be read
     */
    private function decodeFlateViaGzipStream(string $data): ?string
    {
        $tmpFile = @tempnam(sys_get_temp_dir(), 'gz_fix');
        if (false === $tmpFile) {
            return null;
        }

        try {
            file_put_contents($tmpFile, "\x1f\x8b\x08\x00\x00\x00\x00\x00".$data);
            $decoded = file_get_contents('compress.zlib://'.$tmpFile);
        } finally {
            unlink($tmpFile);
        }

        return false === $decoded ? null : $decoded;
    }

    /**
     * LZWDecode
     *
     * Decompresses data encoded using the LZW (Lempel-Ziv-Welch) adaptive compression method, reproducing the original text or binary data.
     *
     * Codes are read most-significant-bit first, starting at 9 bits and growing up
     * to 12 bits. Code 256 clears the table and code 257 marks the end of data.
     *
     * @param string $data Data to decode
     *
     * @return string Data string
     */
    protected function decodeFilterLZWDecode(string $data): string
    {
        // initialize string to return
        $decoded = '';

        // convert string to binary string
        $bitstring = '';
        $byte_count = \strlen($data);
        for ($i = 0; $i < $byte_count; ++$i) {
            $bitstring .= sprintf('%08b', \ord($data[$i]));
        }
        // number of bits available and current read position; reading through an
        // offset avoids re-copying the remaining bit string for every code
        $bit_count = \strlen($bitstring);
        $offset = 0;

        // initialize code length in bits
        $bitlen = 9;
        // initialize dictionary index
        $dix = 258;
        // initialize the dictionary (with the first 256 entries).
        $dictionary = array_map('chr', range(0, 255));
        // previous val
        $prev_index = 0;

        // read codes until the EOD marker (257) or until no complete code is left
        while ($offset + $bitlen <= $bit_count) {
            $index = (int) bindec(substr($bitstring, $offset, $bitlen));
            if (257 === $index) {
                break;
            }
            $offset += $bitlen;

            if (256 === $index) { // clear-table marker
                // reset code length in bits
                $bitlen = 9;
                // reset dictionary index
                $dix = 258;
                $prev_index = 256;
                // reset the dictionary (with the first 256 entries).
                $dictionary = array_map('chr', range(0, 255));
            } elseif (256 === $prev_index) {
                // first entry after a clear-table marker
                $decoded .= $dictionary[$index];
                $prev_index = $index;
            } else {
                // check if index exist in the dictionary
                if ($index < $dix) {
                    // index exist on dictionary
                    $decoded .= $dictionary[$index];
                    $dic_val = $dictionary[$prev_index].$dictionary[$index][0];
                } else {
                    // index do not exist on dictionary yet: it is the entry about to be
                    // created, i.e. the previous string plus its own first character
                    $dic_val = $dictionary[$prev_index].$dictionary[$prev_index][0];
                    $decoded .= $dic_val;
                }
                // the current code always becomes the prefix of the next new entry
                $prev_index = $index;
                // update dictionary
                $dictionary[$dix] = $dic_val;
                ++$dix;
                // change bit length by case
                if (2047 === $dix) {
                    $bitlen = 12;
                } elseif (1023 === $dix) {
                    $bitlen = 11;
                } elseif (511 === $dix) {
                    $bitlen = 10;
                }
            }
        }

        return $decoded;
    }

    /**
     * RunLengthDecode
     *
     * Decompresses data encoded using a byte-oriented run-length encoding algorithm.
     *
     * @param string $data Data to decode
     *
     * @return string Data string
     */
    protected function decodeFilterRunLengthDecode(string $data): string
    {
        // initialize string to return
        $decoded = '';
        // data length
        $data_length = \strlen($data);
        $i = 0;
        while ($i < $data_length) {
            // get current byte value
            $byte = \ord($data[$i]);
            if (128 === $byte) {
                // a length value of 128 denote EOD
                break;
            }

            if ($byte < 128) {
                // if the length byte is in the range 0 to 127
                // the following length + 1 (1 to 128) bytes shall be copied literally during decompression
                $decoded .= substr($data, $i + 1, $byte + 1);
                // move to next block
                $i += ($byte + 2);
                continue;
            }

            // if length is in the range 129 to 255,
            // the following single byte shall be copied 257 - length (2 to 128) times during decompression
            if ($i + 1 >= $data_length) {
                // truncated stream: the byte to repeat is missing
                break;
            }
            $decoded .= str_repeat($data[$i + 1], 257 - $byte);
            // move to next block
            $i += 2;
        }

        return $decoded;
    }

    /**
     * @return array list of available filters
     */
    public function getAvailableFilters(): array
    {
        return $this->availableFilters;
    }
}
415