Passed
Pull Request — master (#4162)
by Owen
12:32
created

Csv::guessEncodingTestBom()   A

Complexity

Conditions 3
Paths 3

Size

Total Lines 5
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 4
CRAP Score 3

Importance

Changes 0
Metric Value
cc 3
eloc 3
nc 3
nop 4
dl 0
loc 5
ccs 4
cts 4
cp 1
crap 3
rs 10
c 0
b 0
f 0
1
<?php
2
3
namespace PhpOffice\PhpSpreadsheet\Reader;
4
5
use PhpOffice\PhpSpreadsheet\Calculation\Calculation;
6
use PhpOffice\PhpSpreadsheet\Cell\Cell;
7
use PhpOffice\PhpSpreadsheet\Cell\Coordinate;
8
use PhpOffice\PhpSpreadsheet\Reader\Csv\Delimiter;
9
use PhpOffice\PhpSpreadsheet\Reader\Exception as ReaderException;
10
use PhpOffice\PhpSpreadsheet\Shared\StringHelper;
11
use PhpOffice\PhpSpreadsheet\Spreadsheet;
12
use PhpOffice\PhpSpreadsheet\Worksheet\Worksheet;
13
14
class Csv extends BaseReader
15
{
16
    const DEFAULT_FALLBACK_ENCODING = 'CP1252';
17
    const GUESS_ENCODING = 'guess';
18
    const UTF8_BOM = "\xEF\xBB\xBF";
19
    const UTF8_BOM_LEN = 3;
20
    const UTF16BE_BOM = "\xfe\xff";
21
    const UTF16BE_BOM_LEN = 2;
22
    const UTF16BE_LF = "\x00\x0a";
23
    const UTF16LE_BOM = "\xff\xfe";
24
    const UTF16LE_BOM_LEN = 2;
25
    const UTF16LE_LF = "\x0a\x00";
26
    const UTF32BE_BOM = "\x00\x00\xfe\xff";
27
    const UTF32BE_BOM_LEN = 4;
28
    const UTF32BE_LF = "\x00\x00\x00\x0a";
29
    const UTF32LE_BOM = "\xff\xfe\x00\x00";
30
    const UTF32LE_BOM_LEN = 4;
31
    const UTF32LE_LF = "\x0a\x00\x00\x00";
32
33
    /**
34
     * Input encoding.
35
     */
36
    private string $inputEncoding = 'UTF-8';
37
38
    /**
39
     * Fallback encoding if guess strikes out.
40
     */
41
    private string $fallbackEncoding = self::DEFAULT_FALLBACK_ENCODING;
42
43
    /**
44
     * Delimiter.
45
     */
46
    private ?string $delimiter = null;
47
48
    /**
49
     * Enclosure.
50
     */
51
    private string $enclosure = '"';
52
53
    /**
54
     * Sheet index to read.
55
     */
56
    private int $sheetIndex = 0;
57
58
    /**
59
     * Load rows contiguously.
60
     */
61
    private bool $contiguous = false;
62
63
    /**
64
     * The character that can escape the enclosure.
65
     */
66
    private string $escapeCharacter = '\\';
67
68
    /**
69
     * Callback for setting defaults in construction.
70
     *
71
     * @var ?callable
72
     */
73
    private static $constructorCallback;
74
75
    /**
76
     * Attempt autodetect line endings (deprecated after PHP8.1)?
77
     */
78
    private bool $testAutodetect = true;
79
80
    protected bool $castFormattedNumberToNumeric = false;
81
82
    protected bool $preserveNumericFormatting = false;
83
84
    private bool $preserveNullString = false;
85
86
    private bool $sheetNameIsFileName = false;
87
88
    private string $getTrue = 'true';
89
90
    private string $getFalse = 'false';
91
92
    private string $thousandsSeparator = ',';
93
94
    private string $decimalSeparator = '.';
95
96
    /**
     * Build a CSV reader, then hand it to the registered constructor
     * callback (if one is set) so that defaults can be overridden.
     */
    public function __construct()
    {
        parent::__construct();

        $configure = self::$constructorCallback;
        if ($configure === null) {
            return;
        }

        // The callback's return value is deliberately ignored.
        $configure($this);
    }
107
108
    /**
     * Register the callback that every new Csv reader runs at the end of
     * its constructor; pass null to remove a previously registered one.
     *
     * The callback receives the freshly built Csv reader as its only
     * argument. Whatever it returns is discarded.
     */
    public static function setConstructorCallback(?callable $callback): void
    {
        self::$constructorCallback = $callback;
    }
118
119 1
    /**
     * Return the currently registered constructor callback, or null when
     * none is set.
     */
    public static function getConstructorCallback(): ?callable
    {
        return self::$constructorCallback;
    }
123
124 46
    /**
     * Declare the encoding of the files this reader will load.
     *
     * Anything other than 'UTF-8' makes openFileOrMemory() convert the file
     * to UTF-8 in memory; self::GUESS_ENCODING asks it to detect the encoding.
     *
     * @return $this
     */
    public function setInputEncoding(string $encoding): self
    {
        $this->inputEncoding = $encoding;

        return $this;
    }
130
131 1
    /**
     * Return the input encoding currently stored on this reader.
     */
    public function getInputEncoding(): string
    {
        return $this->inputEncoding;
    }
135
136 5
    /**
     * Set the encoding handed to guessEncoding() as its default, i.e. the
     * one used when encoding detection finds nothing.
     *
     * @return $this
     */
    public function setFallbackEncoding(string $fallbackEncoding): self
    {
        $this->fallbackEncoding = $fallbackEncoding;

        return $this;
    }
142
143 1
    /**
     * Return the encoding used when encoding detection finds nothing.
     */
    public function getFallbackEncoding(): string
    {
        return $this->fallbackEncoding;
    }
147
148
    /**
     * Leave the file pointer just after a leading UTF-8 BOM, or at the very
     * start of the stream when the stream does not begin with one.
     */
    protected function skipBOM(): void
    {
        $stream = $this->fileHandle;
        rewind($stream);

        // fgets() reads at most (length - 1) bytes, i.e. exactly the BOM width.
        $leadingBytes = fgets($stream, self::UTF8_BOM_LEN + 1);
        if ($leadingBytes === self::UTF8_BOM) {
            return;
        }

        // No BOM: undo the read so parsing starts from the first byte.
        rewind($stream);
    }
159
160
    /**
     * Honour an explicit "sep=X" directive on the line at the current file
     * position.
     *
     * When the line is exactly "sep=" plus one character (line ending
     * excluded), that character becomes the delimiter and the pointer stays
     * after the directive line. Otherwise the pointer is reset through
     * skipBOM(). Nothing happens when no line can be read.
     */
    protected function checkSeparator(): void
    {
        $firstLine = fgets($this->fileHandle);
        if ($firstLine === false) {
            return;
        }

        $isDirective = stripos($firstLine, 'sep=') === 0
            && strlen(trim($firstLine, "\r\n")) === 5;
        if (!$isDirective) {
            // Ordinary data line: put the pointer back for the parser.
            $this->skipBOM();

            return;
        }

        $this->delimiter = substr($firstLine, 4, 1);
    }
178
179
    /**
     * Work out the delimiter when neither the file nor the caller has set one.
     *
     * Delegates to the Delimiter inference engine and falls back to the
     * engine's default delimiter when the engine counted no lines or could
     * not decide. The file pointer is reset with skipBOM() afterwards.
     */
    protected function inferSeparator(): void
    {
        if ($this->delimiter !== null) {
            // Already chosen explicitly; leave the file pointer untouched.
            return;
        }

        $engine = new Delimiter($this->fileHandle, $this->escapeCharacter, $this->enclosure);

        // With no lines there is nothing to infer from.
        $detected = ($engine->linesCounted() === 0) ? null : $engine->infer();

        $this->delimiter = $detected ?? $engine->getDefaultDelimiter();
        $this->skipBOM();
    }
207
208
    /**
     * Scan a CSV file and report its dimensions without loading any cells.
     *
     * @return array a single-element list whose entry holds the keys
     *               worksheetName, lastColumnLetter, lastColumnIndex
     *               (zero-based), totalRows and totalColumns
     */
    public function listWorksheetInfo(string $filename): array
    {
        $this->openFileOrMemory($filename);
        $stream = $this->fileHandle;

        // Position the pointer on the first data row and settle the delimiter.
        $this->skipBOM();
        $this->checkSeparator();
        $this->inferSeparator();
        $separator = $this->delimiter ?? '';

        $rowCount = 0;
        $widestIndex = 0;
        for (
            $fields = self::getCsv($stream, 0, $separator, $this->enclosure, $this->escapeCharacter);
            is_array($fields);
            $fields = self::getCsv($stream, 0, $separator, $this->enclosure, $this->escapeCharacter)
        ) {
            ++$rowCount;
            $widestIndex = max($widestIndex, count($fields) - 1);
        }

        $lastLetter = Coordinate::stringFromColumnIndex($widestIndex + 1);

        fclose($stream);

        return [
            [
                'worksheetName' => 'Worksheet',
                'lastColumnLetter' => $lastLetter,
                'lastColumnIndex' => $widestIndex,
                'totalRows' => $rowCount,
                'totalColumns' => $widestIndex + 1,
            ],
        ];
    }
246
247
    /**
     * Load a CSV file into a brand-new Spreadsheet and return it.
     */
    protected function loadSpreadsheetFromFile(string $filename): Spreadsheet
    {
        return $this->loadIntoExisting($filename, new Spreadsheet());
    }
258
259
    /**
     * Load CSV text held in a string into a brand-new Spreadsheet.
     *
     * The text is wrapped in a url-encoded "data://text/plain," URI and read
     * through the data-URI path of loadStringOrFile().
     */
    public function loadSpreadsheetFromString(string $contents): Spreadsheet
    {
        $dataUri = 'data://text/plain,' . urlencode($contents);

        return $this->loadStringOrFile($dataUri, new Spreadsheet(), true);
    }
270
271 123
    /**
     * Open $filename for reading and store the handle in $this->fileHandle.
     *
     * A file whose input encoding is 'UTF-8' is read directly. Any other
     * encoding is converted to UTF-8 up front and served from a php://memory
     * stream. When the input encoding is self::GUESS_ENCODING it is first
     * replaced by the result of guessEncoding(); that result stays stored on
     * the reader afterwards.
     *
     * @throws ReaderException when canRead() rejects the file, or when the
     *                         file cannot be read into / served from memory
     */
    private function openFileOrMemory(string $filename): void
    {
        if (!$this->canRead($filename)) {
            throw new ReaderException($filename . ' is an Invalid Spreadsheet file.');
        }
        if ($this->inputEncoding === self::GUESS_ENCODING) {
            $this->inputEncoding = self::guessEncoding($filename, $this->fallbackEncoding);
        }

        $this->openFile($filename);
        if ($this->inputEncoding === 'UTF-8') {
            return;
        }

        // Non-UTF-8 input: swap the on-disk handle for a converted in-memory copy.
        fclose($this->fileHandle);

        $entireFile = file_get_contents($filename);
        if ($entireFile === false) {
            // Without this guard the reader would be left holding a closed handle.
            throw new ReaderException('Could not read file ' . $filename . ' for encoding conversion.');
        }

        $memoryHandle = fopen('php://memory', 'r+b');
        if ($memoryHandle === false) {
            throw new ReaderException('Could not open memory stream to convert file ' . $filename . '.');
        }

        $this->fileHandle = $memoryHandle;
        $data = StringHelper::convertEncoding($entireFile, 'UTF-8', $this->inputEncoding);
        fwrite($this->fileHandle, $data);
        $this->skipBOM();
    }
294
295 3
    /**
     * Choose whether loading may toggle the 'auto_detect_line_endings' ini
     * setting (see setAutoDetect()).
     *
     * @return $this
     */
    public function setTestAutoDetect(bool $value): self
    {
        $this->testAutodetect = $value;

        return $this;
    }
301
302 115
    /**
     * Set the 'auto_detect_line_endings' ini option and report its old value.
     *
     * Does nothing and returns null when $value is null or when the feature
     * has been switched off through setTestAutoDetect(false).
     *
     * @return ?string the previous ini value, or null when ini_set() did not
     *                 return a string
     */
    private function setAutoDetect(?string $value): ?string
    {
        if ($value === null || !$this->testAutodetect) {
            return null;
        }

        // Errors are suppressed on purpose, as in the rest of this reader's ini handling.
        $previous = @ini_set('auto_detect_line_endings', $value);

        return is_string($previous) ? $previous : null;
    }
314
315 14
    /**
     * Configure conversion of formatted numeric strings during load.
     *
     * @param bool $castFormattedNumberToNumeric convert strings that are numeric
     *                                           once the thousands separator is
     *                                           removed into int/float values
     * @param bool $preserveNumericFormatting    additionally apply a number format
     *                                           mask derived from the source text
     */
    public function castFormattedNumberToNumeric(
        bool $castFormattedNumberToNumeric,
        bool $preserveNumericFormatting = false
    ): void {
        $this->preserveNumericFormatting = $preserveNumericFormatting;
        $this->castFormattedNumberToNumeric = $castFormattedNumberToNumeric;
    }
322
323
    /**
     * Open a data URI in binary read mode and keep the handle in
     * $this->fileHandle.
     *
     * @throws ReaderException when the URI cannot be opened
     */
    private function openDataUri(string $filename): void
    {
        $stream = fopen($filename, 'rb');
        if ($stream !== false) {
            $this->fileHandle = $stream;

            return;
        }

        // @codeCoverageIgnoreStart
        throw new ReaderException('Could not open file ' . $filename . ' for reading.');
        // @codeCoverageIgnoreEnd
    }
337
338
    /**
     * Load a CSV file from disk into the given Spreadsheet and return that
     * same Spreadsheet.
     */
    public function loadIntoExisting(string $filename, Spreadsheet $spreadsheet): Spreadsheet
    {
        $isDataUri = false;

        return $this->loadStringOrFile($filename, $spreadsheet, $isDataUri);
    }
345
346
    /**
     * Parse CSV input into the sheet at $this->sheetIndex of $spreadsheet.
     *
     * The ini setting toggled for the duration of the load is restored, and
     * the input handle closed, even when opening or parsing throws.
     *
     * @param string      $filename    file path, or a data URI when $dataUri is true
     * @param Spreadsheet $spreadsheet workbook that receives the cells
     * @param bool        $dataUri     true to open $filename through openDataUri()
     *
     * @return Spreadsheet the same $spreadsheet instance
     */
    private function loadStringOrFile(string $filename, Spreadsheet $spreadsheet, bool $dataUri): Spreadsheet
    {
        // Deprecated in Php8.1
        $iniset = $this->setAutoDetect('1');
        $fileHandle = null;

        try {
            // Open file
            if ($dataUri) {
                $this->openDataUri($filename);
            } else {
                $this->openFileOrMemory($filename);
            }
            $fileHandle = $this->fileHandle;

            // Skip BOM, if any
            $this->skipBOM();
            $this->checkSeparator();
            $this->inferSeparator();

            // Make sure the target sheet exists, creating sheets as needed
            while ($spreadsheet->getSheetCount() <= $this->sheetIndex) {
                $spreadsheet->createSheet();
            }
            $sheet = $spreadsheet->setActiveSheetIndex($this->sheetIndex);
            if ($this->sheetNameIsFileName) {
                $sheet->setTitle(substr(basename($filename, '.csv'), 0, Worksheet::SHEET_TITLE_MAXIMUM_LENGTH));
            }

            // $currentRow follows the input; $outRow is where cells are written
            // (it only differs from $currentRow in contiguous mode)
            $currentRow = 1;
            $outRow = 0;

            // Loop through each line of the file in turn
            $delimiter = $this->delimiter ?? '';
            $rowData = self::getCsv($fileHandle, 0, $delimiter, $this->enclosure, $this->escapeCharacter);
            $valueBinder = Cell::getValueBinder();
            $preserveBooleanString = method_exists($valueBinder, 'getBooleanConversion') && $valueBinder->getBooleanConversion();
            $this->getTrue = Calculation::getTRUE();
            $this->getFalse = Calculation::getFALSE();
            $this->thousandsSeparator = StringHelper::getThousandsSeparator();
            $this->decimalSeparator = StringHelper::getDecimalSeparator();
            while (is_array($rowData)) {
                $noOutputYet = true;
                $columnLetter = 'A';
                foreach ($rowData as $rowDatum) {
                    if ($preserveBooleanString) {
                        // The value binder does its own boolean conversion; only normalise null
                        $rowDatum = $rowDatum ?? '';
                    } else {
                        $this->convertBoolean($rowDatum);
                    }
                    $numberFormatMask = $this->castFormattedNumberToNumeric ? $this->convertFormattedNumber($rowDatum) : '';
                    if (($rowDatum !== '' || $this->preserveNullString) && $this->readFilter->readCell($columnLetter, $currentRow)) {
                        if ($this->contiguous) {
                            // Advance the output row once per input row that stores a cell
                            if ($noOutputYet) {
                                $noOutputYet = false;
                                ++$outRow;
                            }
                        } else {
                            $outRow = $currentRow;
                        }
                        // Set basic styling for the value (Note that this could be overloaded by styling in a value binder)
                        if ($numberFormatMask !== '') {
                            $sheet->getStyle($columnLetter . $outRow)
                                ->getNumberFormat()
                                ->setFormatCode($numberFormatMask);
                        }
                        // Set cell value
                        $sheet->getCell($columnLetter . $outRow)->setValue($rowDatum);
                    }
                    ++$columnLetter;
                }
                $rowData = self::getCsv($fileHandle, 0, $delimiter, $this->enclosure, $this->escapeCharacter);
                ++$currentRow;
            }
        } finally {
            // Close file (only if it was actually opened), then restore the ini setting
            if ($fileHandle !== null) {
                fclose($fileHandle);
            }
            $this->setAutoDetect($iniset);
        }

        // Return
        return $spreadsheet;
    }
431
432
    /**
     * Normalise one CSV field in place.
     *
     * A string equal (ignoring case) to $this->getTrue or to 'true' becomes
     * boolean true; one equal to $this->getFalse or to 'false' becomes
     * boolean false. A null becomes the empty string. Anything else is left
     * as it is.
     */
    private function convertBoolean(mixed &$rowDatum): void
    {
        if (!is_string($rowDatum)) {
            $rowDatum ??= '';

            return;
        }

        $isTrue = strcasecmp($this->getTrue, $rowDatum) === 0
            || strcasecmp('true', $rowDatum) === 0;
        if ($isTrue) {
            $rowDatum = true;

            return;
        }

        $isFalse = strcasecmp($this->getFalse, $rowDatum) === 0
            || strcasecmp('false', $rowDatum) === 0;
        if ($isFalse) {
            $rowDatum = false;
        }
    }
447
448
    /**
     * Turn a formatted numeric string into an int or float, in place.
     *
     * The value is converted only when casting is enabled, it is a string,
     * and it is numeric once thousands separators are stripped and the
     * decimal separator is replaced by '.'. The result is a float when the
     * original text contained the decimal separator, otherwise an int.
     *
     * @return string a number format mask built from the source text (at most
     *                six decimal places) when preserveNumericFormatting is on
     *                and a conversion happened; otherwise ''
     */
    private function convertFormattedNumber(mixed &$rowDatum): string
    {
        if ($this->castFormattedNumberToNumeric !== true || !is_string($rowDatum)) {
            return '';
        }

        $normalized = str_replace(
            [$this->thousandsSeparator, $this->decimalSeparator],
            ['', '.'],
            $rowDatum
        );
        if (!is_numeric($normalized)) {
            return '';
        }

        $decimalPos = strpos($rowDatum, $this->decimalSeparator);
        $hasDecimals = $decimalPos !== false;

        $mask = '';
        if ($this->preserveNumericFormatting === true) {
            $mask = str_contains($rowDatum, $this->thousandsSeparator) ? '#,##0' : '0';
            if ($hasDecimals) {
                // Digits after the decimal separator, capped at six places.
                $decimals = strlen($rowDatum) - $decimalPos - 1;
                $mask .= '.' . str_repeat('0', min($decimals, 6));
            }
        }

        $rowDatum = $hasDecimals ? (float) $normalized : (int) $normalized;

        return $mask;
    }
478
479 14
    /**
     * Return the field delimiter, or null when none has been set or
     * determined yet.
     */
    public function getDelimiter(): ?string
    {
        return $this->delimiter;
    }
483
484 10
    /**
     * Set the field delimiter; null lets inferSeparator() determine it
     * during load.
     *
     * @return $this
     */
    public function setDelimiter(?string $delimiter): self
    {
        $this->delimiter = $delimiter;

        return $this;
    }
490
491 2
    /**
     * Return the field enclosure character.
     */
    public function getEnclosure(): string
    {
        return $this->enclosure;
    }
495
496 9
    /**
     * Set the field enclosure character; an empty string selects the
     * double quote.
     *
     * @return $this
     */
    public function setEnclosure(string $enclosure): self
    {
        $this->enclosure = ($enclosure === '') ? '"' : $enclosure;

        return $this;
    }
505
506 1
    /**
     * Return the index of the sheet that loads write into.
     */
    public function getSheetIndex(): int
    {
        return $this->sheetIndex;
    }
510
511 5
    /**
     * Set the index of the sheet that loads write into. Missing sheets up
     * to that index are created at load time.
     *
     * @return $this
     */
    public function setSheetIndex(int $indexValue): self
    {
        $this->sheetIndex = $indexValue;

        return $this;
    }
517
518 3
    /**
     * Switch contiguous mode on or off. In contiguous mode the output row
     * advances only for input rows that store at least one cell, instead of
     * mirroring the input row number.
     *
     * @return $this
     */
    public function setContiguous(bool $contiguous): self
    {
        $this->contiguous = $contiguous;

        return $this;
    }
524
525 1
    /**
     * Report whether contiguous mode is enabled.
     */
    public function getContiguous(): bool
    {
        return $this->contiguous;
    }
529
530 8
    /**
     * Set the escape character passed to the CSV parser and to delimiter
     * inference.
     *
     * @return $this
     */
    public function setEscapeCharacter(string $escapeCharacter): self
    {
        $this->escapeCharacter = $escapeCharacter;

        return $this;
    }
536
537 1
    /**
     * Return the escape character passed to the CSV parser.
     */
    public function getEscapeCharacter(): string
    {
        return $this->escapeCharacter;
    }
541
542
    /**
     * Decide whether this reader accepts the given file.
     *
     * The file must be openable. A ".csv" or ".tsv" extension (any case) is
     * trusted outright; otherwise the detected MIME type must be one of the
     * supported text-like types.
     *
     * @return bool false when the file cannot be opened or its type is not
     *              supported
     */
    public function canRead(string $filename): bool
    {
        // Check if file exists
        try {
            $this->openFile($filename);
        } catch (ReaderException) {
            return false;
        }

        // The handle was only needed to prove the file can be opened.
        fclose($this->fileHandle);

        // Trust file extension if any
        $extension = strtolower(pathinfo($filename, PATHINFO_EXTENSION));
        if (in_array($extension, ['csv', 'tsv'], true)) {
            return true;
        }

        // Attempt to guess mimetype
        $type = mime_content_type($filename);
        $supportedTypes = [
            'application/csv',
            'text/csv',
            'text/plain',
            'inode/x-empty',
            'text/html',
        ];

        return in_array($type, $supportedTypes, true);
    }
574
575 20
    /**
     * Set $encoding to $setEncoding when it is still empty and the first
     * occurrence of $compare in $contents starts at an offset that is a
     * multiple of strlen($compare).
     *
     * @param string $encoding    detection result so far; only written while it is ''
     * @param string $contents    raw bytes to search (not modified here)
     * @param string $compare     byte sequence to look for
     * @param string $setEncoding encoding name to record on a match
     */
    private static function guessEncodingTestNoBom(string &$encoding, string &$contents, string $compare, string $setEncoding): void
    {
        if ($encoding !== '') {
            return;
        }

        $offset = strpos($contents, $compare);
        if ($offset === false) {
            return;
        }

        // Only an occurrence aligned to the sequence width counts.
        if ($offset % strlen($compare) === 0) {
            $encoding = $setEncoding;
        }
    }
584
585 20
    /**
     * Guess the encoding of a file that carries no BOM.
     *
     * Looks for a line feed as encoded in UTF-32BE, UTF-32LE, UTF-16BE and
     * UTF-16LE (in that order); failing that, reports 'UTF-8' when the
     * content passes a '//u' regex match.
     *
     * @return string the guessed encoding, or '' when nothing matched or the
     *                file could not be read
     */
    private static function guessEncodingNoBom(string $filename): string
    {
        $contents = file_get_contents($filename);
        if ($contents === false) {
            // An unreadable file must not be reported as UTF-8; let the caller fall back.
            return '';
        }

        $encoding = '';
        self::guessEncodingTestNoBom($encoding, $contents, self::UTF32BE_LF, 'UTF-32BE');
        self::guessEncodingTestNoBom($encoding, $contents, self::UTF32LE_LF, 'UTF-32LE');
        self::guessEncodingTestNoBom($encoding, $contents, self::UTF16BE_LF, 'UTF-16BE');
        self::guessEncodingTestNoBom($encoding, $contents, self::UTF16LE_LF, 'UTF-16LE');
        if ($encoding === '' && preg_match('//u', $contents) === 1) {
            $encoding = 'UTF-8';
        }

        return $encoding;
    }
599
600 30
    /**
     * Set $encoding to $setEncoding when it is still empty and $first4
     * begins with the byte sequence $compare.
     *
     * @param string $encoding    detection result so far; only written while it is ''
     * @param string $first4      leading bytes of the file
     * @param string $compare     BOM byte sequence to test for
     * @param string $setEncoding encoding name to record on a match
     */
    private static function guessEncodingTestBom(string &$encoding, string $first4, string $compare, string $setEncoding): void
    {
        if ($encoding === '' && str_starts_with($first4, $compare)) {
            $encoding = $setEncoding;
        }
    }
608
609 30
    /**
     * Identify a file's encoding from a byte order mark in its first four
     * bytes.
     *
     * @return string 'UTF-8', 'UTF-16BE', 'UTF-32BE', 'UTF-32LE' or 'UTF-16LE'
     *                when the matching BOM is present; '' when none matches or
     *                the file cannot be read
     */
    private static function guessEncodingBom(string $filename): string
    {
        $first4 = file_get_contents($filename, false, null, 0, 4);
        if ($first4 === false) {
            return '';
        }

        // Order matters: the UTF-32LE BOM begins with the UTF-16LE BOM bytes,
        // so UTF-32LE has to be tried first. The first match wins.
        $candidates = [
            [self::UTF8_BOM, 'UTF-8'],
            [self::UTF16BE_BOM, 'UTF-16BE'],
            [self::UTF32BE_BOM, 'UTF-32BE'],
            [self::UTF32LE_BOM, 'UTF-32LE'],
            [self::UTF16LE_BOM, 'UTF-16LE'],
        ];

        $encoding = '';
        foreach ($candidates as [$bom, $name]) {
            self::guessEncodingTestBom($encoding, $first4, $bom, $name);
        }

        return $encoding;
    }
623
624 30
    /**
     * Guess the encoding of a file: first from a BOM, then from its content.
     *
     * @param string $filename file to inspect
     * @param string $dflt     encoding to return when neither check finds one
     */
    public static function guessEncoding(string $filename, string $dflt = self::DEFAULT_FALLBACK_ENCODING): string
    {
        $detected = self::guessEncodingBom($filename);
        if ($detected === '') {
            $detected = self::guessEncodingNoBom($filename);
        }
        if ($detected === '') {
            return $dflt;
        }

        return $detected;
    }
633
634 1
    /**
     * Choose whether empty-string fields are still written to cells during
     * load (true) or skipped (false).
     *
     * @return $this
     */
    public function setPreserveNullString(bool $value): self
    {
        $this->preserveNullString = $value;

        return $this;
    }
640
641 1
    /**
     * Report whether empty-string fields are written to cells during load.
     */
    public function getPreserveNullString(): bool
    {
        return $this->preserveNullString;
    }
645
646 4
    /**
     * Choose whether the loaded sheet is titled after the file: its base
     * name without a ".csv" suffix, truncated to
     * Worksheet::SHEET_TITLE_MAXIMUM_LENGTH characters.
     *
     * @return $this
     */
    public function setSheetNameIsFileName(bool $sheetNameIsFileName): self
    {
        $this->sheetNameIsFileName = $sheetNameIsFileName;

        return $this;
    }
652
653
    /**
     * Thin wrapper around fgetcsv() that always passes the escape character
     * explicitly.
     *
     * On PHP 8.4 and later, a non-empty escape character routes the call
     * through the error-suppression operator. NOTE(review): the original
     * author's note says PHP 8.4 deprecates any escape character other than
     * the empty string; confirm the exact deprecation against the PHP manual.
     *
     * @param resource $stream
     * @param null|int<0, max> $length
     *
     * @return array<int,?string>|false
     */
    private static function getCsv(
        $stream,
        ?int $length = null,
        string $separator = ',',
        string $enclosure = '"',
        string $escape = '\\'
    ): array|false {
        $silence = PHP_VERSION_ID >= 80400 && $escape !== '';
        if (!$silence) {
            return fgetcsv($stream, $length, $separator, $enclosure, $escape);
        }

        return @fgetcsv($stream, $length, $separator, $enclosure, $escape);
    }
675
}
676