Passed
Push — develop ( adf95b...46eabb )
by Adrien
27:26 queued 14:05
created

Html::securityScan()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 8
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 4
CRAP Score 2.032

Importance

Changes 0
Metric Value
cc 2
eloc 4
nc 2
nop 1
dl 0
loc 8
ccs 4
cts 5
cp 0.8
crap 2.032
rs 10
c 0
b 0
f 0
1
<?php
2
3
namespace PhpOffice\PhpSpreadsheet\Reader;
4
5
use DOMDocument;
6
use DOMElement;
7
use DOMNode;
8
use DOMText;
9
use PhpOffice\PhpSpreadsheet\Cell\Coordinate;
10
use PhpOffice\PhpSpreadsheet\Spreadsheet;
11
use PhpOffice\PhpSpreadsheet\Style\Border;
12
use PhpOffice\PhpSpreadsheet\Style\Color;
13
use PhpOffice\PhpSpreadsheet\Style\Fill;
14
use PhpOffice\PhpSpreadsheet\Worksheet\Worksheet;
15
16
/** Reader that loads the content of an HTML file into a Spreadsheet. */
17
class Html extends BaseReader
18
{
19
    /**
     * Sample size to read to determine if it's HTML or not.
     *
     * Used as the number of bytes read from the beginning and from the end of the file.
     */
    const TEST_SAMPLE_SIZE = 2048;

    /**
     * Input encoding.
     *
     * @var string
     */
    protected $inputEncoding = 'ANSI';

    /**
     * Sheet index to read.
     *
     * loadIntoExisting() activates the sheet with this index and writes the HTML content to it.
     *
     * @var int
     */
    protected $sheetIndex = 0;

    /**
     * Formats.
     *
     * Style arrays keyed by HTML tag name; processDomElement() passes the matching entry
     * to applyFromArray() on the style of the cell it is currently writing.
     *
     * @var array
     */
    protected $formats = [
        'h1' => [
            'font' => [
                'bold' => true,
                'size' => 24,
            ],
        ], //    Bold, 24pt
        'h2' => [
            'font' => [
                'bold' => true,
                'size' => 18,
            ],
        ], //    Bold, 18pt
        'h3' => [
            'font' => [
                'bold' => true,
                'size' => 13.5,
            ],
        ], //    Bold, 13.5pt
        'h4' => [
            'font' => [
                'bold' => true,
                'size' => 12,
            ],
        ], //    Bold, 12pt
        'h5' => [
            'font' => [
                'bold' => true,
                'size' => 10,
            ],
        ], //    Bold, 10pt
        'h6' => [
            'font' => [
                'bold' => true,
                'size' => 7.5,
            ],
        ], //    Bold, 7.5pt
        'a' => [
            'font' => [
                'underline' => true,
                'color' => [
                    'argb' => Color::COLOR_BLUE,
                ],
            ],
        ], //    Blue underlined
        'hr' => [
            'borders' => [
                'bottom' => [
                    'borderStyle' => Border::BORDER_THIN,
                    // Keyed by 'argb' like the 'a' entry above; the value was previously a bare list element
                    'color' => [
                        'argb' => Color::COLOR_BLACK,
                    ],
                ],
            ],
        ], //    Bottom border
    ];

    /**
     * Cell coordinates (e.g. 'B2') covered by a rowspan merge, stored as coordinate => true.
     *
     * Filled while processing td/th elements so that later cells skip the occupied columns;
     * reset at the start of every loadIntoExisting() call.
     *
     * @var array
     */
    protected $rowspan = [];
101
102
    /**
103
     * Create a new HTML Reader instance.
104
     */
105 18
    public function __construct()
106
    {
107 18
        $this->readFilter = new DefaultReadFilter();
108 18
    }
109
110
    /**
     * Validate that the current file is an HTML file.
     *
     * The file is accepted when, ignoring surrounding whitespace, the sampled beginning
     * starts with '<' and contains at least one tag, and the sampled ending finishes with '>'.
     *
     * @param string $pFilename
     *
     * @return bool
     */
    public function canRead($pFilename)
    {
        // Check if file exists
        try {
            $this->openFile($pFilename);
        } catch (Exception $e) {
            return false;
        }

        // The handle may be false when opening did not yield a stream;
        // never sample from, or close, something that is not a resource
        if (!is_resource($this->fileHandle)) {
            return false;
        }

        $beginning = $this->readBeginning();
        $startWithTag = self::startsWithTag($beginning);
        $containsTags = self::containsTags($beginning);
        $endsWithTag = self::endsWithTag($this->readEnding());

        fclose($this->fileHandle);

        return $startWithTag && $containsTags && $endsWithTag;
    }
135
136 16
    /**
     * Read up to TEST_SAMPLE_SIZE bytes from the start of the currently opened file.
     *
     * @return false|string Whatever fread() returns for the sample
     */
    private function readBeginning()
    {
        $handle = $this->fileHandle;

        // Move the file pointer back to the first byte before sampling
        rewind($handle);

        return fread($handle, self::TEST_SAMPLE_SIZE);
    }
142
143 16
    /**
     * Read up to TEST_SAMPLE_SIZE bytes from the end of the currently opened file.
     *
     * The size is taken from the open handle itself rather than from the stream URI,
     * so a size that cannot be determined yields an empty sample instead of passing
     * an invalid length to fread().
     *
     * @return false|string The trailing sample; '' when the file is empty or its size is unknown
     */
    private function readEnding()
    {
        $stat = fstat($this->fileHandle);
        $size = ($stat !== false && isset($stat['size'])) ? (int) $stat['size'] : 0;
        if ($size <= 0) {
            return '';
        }

        // Files shorter than the sample size are read in full
        $blockSize = min($size, self::TEST_SAMPLE_SIZE);

        fseek($this->fileHandle, $size - $blockSize);

        return fread($this->fileHandle, $blockSize);
    }
162
163 16
    /**
     * Tell whether the data, ignoring surrounding whitespace, begins with '<'.
     *
     * @param string $data
     *
     * @return bool
     */
    private static function startsWithTag($data)
    {
        $trimmed = trim($data);
        $firstCharacter = substr($trimmed, 0, 1);

        return $firstCharacter === '<';
    }
167
168 16
    /**
     * Tell whether the data, ignoring surrounding whitespace, finishes with '>'.
     *
     * @param string $data
     *
     * @return bool
     */
    private static function endsWithTag($data)
    {
        $trimmed = trim($data);
        $lastCharacter = substr($trimmed, -1, 1);

        return $lastCharacter === '>';
    }
172
173 16
    /**
     * Tell whether strip_tags() removes anything from the data, i.e. whether it holds markup.
     *
     * @param string $data
     *
     * @return bool
     */
    private static function containsTags($data)
    {
        $originalLength = strlen($data);
        $strippedLength = strlen(strip_tags($data));

        return $originalLength !== $strippedLength;
    }
177
178
    /**
     * Loads Spreadsheet from file.
     *
     * A fresh Spreadsheet is created and handed to loadIntoExisting().
     *
     * @param string $pFilename
     *
     * @throws Exception
     *
     * @return Spreadsheet
     */
    public function load($pFilename)
    {
        return $this->loadIntoExisting($pFilename, new Spreadsheet());
    }
195
196
    /**
     * Set input encoding.
     *
     * @param string $pValue Input encoding, eg: 'ANSI'
     *
     * @return Html This reader, to allow method chaining
     */
    public function setInputEncoding($pValue)
    {
        $this->inputEncoding = $pValue;

        return $this;
    }
209
210
    /**
     * Get input encoding.
     *
     * @return string The encoding last given to setInputEncoding(), 'ANSI' by default
     */
    public function getInputEncoding()
    {
        return $this->inputEncoding;
    }
219
220
    //    Data Array used for testing only, should write to Spreadsheet object on completion of tests
    //    flushCell() stores each value it handles here as $dataArray[$row][$column]
221
    protected $dataArray = [];
222
223
    //    Current table nesting depth; 0 means the cursor is not inside any table
    protected $tableLevel = 0;
224
225
    //    Start column of each open table, indexed by table nesting level
    protected $nestedColumn = ['A'];
226
227 9
    /**
     * Enter a table: increase the nesting level and record the column the table starts in.
     *
     * A table opened at nesting level 0 always starts in column 'A'; a nested table
     * starts in the column that is passed in.
     *
     * @param string $column Column the cursor is in when the table opens
     *
     * @return string The start column recorded for the new nesting level
     */
    protected function setTableStartColumn($column)
    {
        $startColumn = ($this->tableLevel == 0) ? 'A' : $column;

        $level = ++$this->tableLevel;
        $this->nestedColumn[$level] = $startColumn;

        return $startColumn;
    }
237
238 9
    /**
     * Get the start column recorded for the current table nesting level.
     *
     * @return string
     */
    protected function getTableStartColumn()
    {
        $level = $this->tableLevel;

        return $this->nestedColumn[$level];
    }
242
243 9
    /**
     * Leave a table: drop the most recently recorded start column and decrease the nesting level.
     *
     * @return string The start column that was removed
     */
    protected function releaseTableStartColumn()
    {
        $released = array_pop($this->nestedColumn);
        --$this->tableLevel;

        return $released;
    }
249
250 9
    /**
     * Write the pending cell content to the worksheet and reset it to an empty string.
     *
     * @param Worksheet $sheet Worksheet receiving the value
     * @param string $column Column of the target cell
     * @param int $row Row of the target cell
     * @param string $cellContent Pending content (passed by reference, emptied on return)
     */
    protected function flushCell(Worksheet $sheet, $column, $row, &$cellContent)
    {
        if (!is_string($cellContent)) {
            //    Rich Text run: not written to the sheet yet, only recorded
            //    TODO
            $this->dataArray[$row][$column] = 'RICH TEXT: ' . $cellContent;
        } elseif (trim($cellContent) > '') {
            //    Plain string: only written when something is left after trimming
            $sheet->setCellValue($column . $row, $cellContent);
            $this->dataArray[$row][$column] = $cellContent;
        }

        $cellContent = '';
    }
268
269
    /**
     * Recursively walk the children of a DOM node and write their content to the worksheet.
     *
     * Text nodes are appended to the pending cell content. Element nodes are dispatched on
     * their tag name to move the row/column cursor, flush content into cells, apply styles,
     * set hyperlinks/comments or merge cells; unknown tags are simply descended into.
     *
     * @param DOMNode $element Node whose children are processed
     * @param Worksheet $sheet Worksheet receiving the content
     * @param int $row Row cursor (passed by reference, updated in place)
     * @param string $column Column cursor (passed by reference, updated in place)
     * @param string $cellContent Content collected for the current cell but not yet written (passed by reference)
     */
    protected function processDomElement(DOMNode $element, Worksheet $sheet, &$row, &$column, &$cellContent)
    {
        foreach ($element->childNodes as $child) {
            if ($child instanceof DOMText) {
                // Collapse every run of whitespace into a single space
                $domText = preg_replace('/\s+/u', ' ', trim($child->nodeValue));
                if (is_string($cellContent)) {
                    //    simply append the text if the cell content is a plain text string
                    $cellContent .= $domText;
                }
                //    but if we have a rich text run instead, we need to append it correctly
                //    TODO
            } elseif ($child instanceof DOMElement) {
                // Collect the element's attributes as name => value
                $attributeArray = [];
                foreach ($child->attributes as $attribute) {
                    $attributeArray[$attribute->name] = $attribute->value;
                }

                switch ($child->nodeName) {
                    case 'meta':
                        //    TODO
                        //    Extract character set from the 'content' attribute, so we can convert to UTF-8 if required
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);

                        break;
                    case 'title':
                        // The document title becomes the worksheet title
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        $sheet->setTitle($cellContent, true, false);
                        $cellContent = '';

                        break;
                    case 'span':
                    case 'div':
                    case 'font':
                    case 'i':
                    case 'em':
                    case 'strong':
                    case 'b':
                        if (isset($attributeArray['class']) && $attributeArray['class'] === 'comment') {
                            // Elements with class "comment" become a comment on the current cell
                            $sheet->getComment($column . $row)
                                ->getText()
                                ->createTextRun($child->textContent);

                            break;
                        }

                        // Separate inline content from surrounding text with single spaces
                        if ($cellContent > '') {
                            $cellContent .= ' ';
                        }
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        if ($cellContent > '') {
                            $cellContent .= ' ';
                        }

                        break;
                    case 'hr':
                        $this->flushCell($sheet, $column, $row, $cellContent);
                        ++$row;
                        if (isset($this->formats[$child->nodeName])) {
                            $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
                        } else {
                            $cellContent = '----------';
                            $this->flushCell($sheet, $column, $row, $cellContent);
                        }
                        ++$row;
                        // Add a break after a horizontal rule, simply by allowing the code to dropthru
                        // no break
                    case 'br':
                        if ($this->tableLevel > 0) {
                            //    If we're inside a table, replace with a \n
                            $cellContent .= "\n";
                        } else {
                            //    Otherwise flush our existing content and move the row cursor on
                            $this->flushCell($sheet, $column, $row, $cellContent);
                            ++$row;
                        }

                        break;
                    case 'a':
                        // A link target becomes the hyperlink of the current cell; other
                        // attributes (e.g. class "comment-indicator") are ignored
                        if (isset($attributeArray['href'])) {
                            $sheet->getCell($column . $row)->getHyperlink()->setUrl($attributeArray['href']);
                            if (isset($this->formats[$child->nodeName])) {
                                $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
                            }
                        }
                        $cellContent .= ' ';
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);

                        break;
                    case 'h1':
                    case 'h2':
                    case 'h3':
                    case 'h4':
                    case 'h5':
                    case 'h6':
                    case 'ol':
                    case 'ul':
                    case 'p':
                        if ($this->tableLevel > 0) {
                            //    If we're inside a table, replace with a \n
                            $cellContent .= "\n";
                            $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        } else {
                            // Outside a table the block gets a row of its own
                            if ($cellContent > '') {
                                $this->flushCell($sheet, $column, $row, $cellContent);
                                ++$row;
                            }
                            $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                            $this->flushCell($sheet, $column, $row, $cellContent);

                            if (isset($this->formats[$child->nodeName])) {
                                $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
                            }

                            ++$row;
                            $column = 'A';
                        }

                        break;
                    case 'li':
                        if ($this->tableLevel > 0) {
                            //    If we're inside a table, replace with a \n
                            $cellContent .= "\n";
                            $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        } else {
                            // Outside a table every list item starts on a new row
                            if ($cellContent > '') {
                                $this->flushCell($sheet, $column, $row, $cellContent);
                            }
                            ++$row;
                            $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                            $this->flushCell($sheet, $column, $row, $cellContent);
                            $column = 'A';
                        }

                        break;
                    case 'table':
                        $this->flushCell($sheet, $column, $row, $cellContent);
                        $column = $this->setTableStartColumn($column);
                        if ($this->tableLevel > 1) {
                            --$row;
                        }
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        $column = $this->releaseTableStartColumn();
                        if ($this->tableLevel > 1) {
                            ++$column;
                        } else {
                            ++$row;
                        }

                        break;
                    case 'thead':
                    case 'tbody':
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);

                        break;
                    case 'tr':
                        // Every table row restarts at the start column of the enclosing table
                        $column = $this->getTableStartColumn();
                        $cellContent = '';
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        ++$row;

                        break;
                    case 'th':
                    case 'td':
                        // Skip columns occupied by a rowspan merge before anything is written, so that
                        // nested content, the inline style and the value all target the same cell
                        while (isset($this->rowspan[$column . $row])) {
                            ++$column;
                        }

                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);

                        // apply inline style
                        $this->applyInlineStyle($sheet, $row, $column, $attributeArray);

                        $this->flushCell($sheet, $column, $row, $cellContent);

                        // The bgcolor attribute applies whether or not the cell is merged
                        if (isset($attributeArray['bgcolor'])) {
                            $sheet->getStyle($column . $row)->applyFromArray(
                                [
                                    'fill' => [
                                        'fillType' => Fill::FILL_SOLID,
                                        'color' => ['rgb' => $attributeArray['bgcolor']],
                                    ],
                                ]
                            );
                        }

                        if (isset($attributeArray['rowspan'], $attributeArray['colspan'])) {
                            //create merging rowspan and colspan
                            $columnTo = $column;
                            for ($i = 0; $i < $attributeArray['colspan'] - 1; ++$i) {
                                ++$columnTo;
                            }
                            $range = $column . $row . ':' . $columnTo . ($row + $attributeArray['rowspan'] - 1);
                            // Remember every covered coordinate so later cells skip it
                            foreach (Coordinate::extractAllCellReferencesInRange($range) as $value) {
                                $this->rowspan[$value] = true;
                            }
                            $sheet->mergeCells($range);
                            $column = $columnTo;
                        } elseif (isset($attributeArray['rowspan'])) {
                            //create merging rowspan
                            $range = $column . $row . ':' . $column . ($row + $attributeArray['rowspan'] - 1);
                            foreach (Coordinate::extractAllCellReferencesInRange($range) as $value) {
                                $this->rowspan[$value] = true;
                            }
                            $sheet->mergeCells($range);
                        } elseif (isset($attributeArray['colspan'])) {
                            //create merging colspan
                            $columnTo = $column;
                            for ($i = 0; $i < $attributeArray['colspan'] - 1; ++$i) {
                                ++$columnTo;
                            }
                            $sheet->mergeCells($column . $row . ':' . $columnTo . $row);
                            $column = $columnTo;
                        }
                        ++$column;

                        break;
                    case 'body':
                        // The body resets the cursor to A1 and leaves any table context
                        $row = 1;
                        $column = 'A';
                        $cellContent = '';
                        $this->tableLevel = 0;
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);

                        break;
                    default:
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                }
            }
        }
    }
519
520
    /**
     * Loads PhpSpreadsheet from file into PhpSpreadsheet instance.
     *
     * The sheet at the configured sheet index is created if necessary and made active,
     * then the HTML document is parsed and its content written to that sheet.
     *
     * @param string $pFilename
     * @param Spreadsheet $spreadsheet
     *
     * @throws Exception When the file is not recognised as HTML or cannot be parsed
     *
     * @return Spreadsheet
     */
    public function loadIntoExisting($pFilename, Spreadsheet $spreadsheet)
    {
        // Validate
        if (!$this->canRead($pFilename)) {
            throw new Exception($pFilename . ' is an Invalid HTML file.');
        }

        // Add sheets until the requested index exists, then select it
        $targetIndex = $this->sheetIndex;
        while ($spreadsheet->getSheetCount() <= $targetIndex) {
            $spreadsheet->createSheet();
        }
        $spreadsheet->setActiveSheetIndex($targetIndex);

        //    Load the security-scanned file content into a new DOM object
        $document = new DOMDocument();
        $scannedHtml = $this->securityScanFile($pFilename);
        $encodedHtml = mb_convert_encoding($scannedHtml, 'HTML-ENTITIES', 'UTF-8');
        if ($document->loadHTML($encodedHtml) === false) {
            throw new Exception('Failed to load ' . $pFilename . ' as a DOM Document');
        }

        //    Discard white space
        $document->preserveWhiteSpace = false;

        // Walk the document with a fresh cursor and no remembered rowspans
        $this->rowspan = [];
        $currentRow = 0;
        $currentColumn = 'A';
        $pendingContent = '';
        $worksheet = $spreadsheet->getActiveSheet();
        $this->processDomElement($document, $worksheet, $currentRow, $currentColumn, $pendingContent);

        return $spreadsheet;
    }
563
564
    /**
     * Get sheet index.
     *
     * @return int Index of the sheet that loaded content is written to
     */
    public function getSheetIndex()
    {
        return $this->sheetIndex;
    }
573
574
    /**
     * Set sheet index.
     *
     * @param int $pValue Sheet index
     *
     * @return Html This reader, to allow method chaining
     */
    public function setSheetIndex($pValue)
    {
        $this->sheetIndex = $pValue;

        return $this;
    }
587
588
    /**
     * Scan the XML for use of <!ENTITY to prevent XXE/XEE attacks.
     *
     * The search also matches when the characters of "<!ENTITY" are surrounded
     * or separated by NUL bytes.
     *
     * @param string $xml
     *
     * @throws Exception When an ENTITY declaration is found
     *
     * @return string The unchanged input when nothing suspicious is found
     */
    public function securityScan($xml)
    {
        // Allow an optional NUL byte before, between and after the characters
        $characters = str_split('<!ENTITY');
        $pattern = '/\\0?' . implode('\\0?', $characters) . '\\0?/';

        if (!preg_match($pattern, $xml)) {
            return $xml;
        }

        throw new Exception('Detected use of ENTITY in XML, spreadsheet file load() aborted to prevent XXE/XEE attacks');
    }
604
605
    /**
     * Apply the inline CSS style of an element to a cell.
     *
     * NOTES :
     * Currently only intended for td & th elements,
     * and only takes the 'background-color' and 'color' properties with a '#'-prefixed HEX color.
     * Declarations without a value, with another property or with another color notation are skipped.
     *
     * TODO :
     * - Implement other properties, such as border
     *
     * @param Worksheet $sheet
     * @param int $row
     * @param string $column
     * @param array $attributeArray Attributes of the element as name => value; only 'style' is read
     */
    private function applyInlineStyle(&$sheet, $row, $column, $attributeArray)
    {
        if (!isset($attributeArray['style'])) {
            return;
        }

        $supported_styles = ['background-color', 'color'];

        // add color styles (background & text) from dom element,currently support : td & th, using ONLY inline css style with RGB color
        $styles = explode(';', $attributeArray['style']);
        foreach ($styles as $st) {
            $value = explode(':', $st);

            // A declaration without a colon has no value part to read
            if (!isset($value[1])) {
                continue;
            }

            $property = trim($value[0]);
            if (!in_array($property, $supported_styles, true)) {
                continue;
            }

            // The color is determined afresh for every declaration, so a value that is
            // not '#'-prefixed can no longer inherit the color of an earlier declaration
            $rawColor = trim($value[1]);
            if (substr($rawColor, 0, 1) !== '#') {
                continue;
            }

            //strip the #, so we get clean hex
            $style_color = substr($rawColor, 1);
            if (empty($style_color)) {
                continue;
            }

            switch ($property) {
                case 'background-color':
                    $sheet->getStyle($column . $row)->applyFromArray(['fill' => ['fillType' => Fill::FILL_SOLID, 'color' => ['rgb' => $style_color]]]);

                    break;
                case 'color':
                    // Pass the bare hex value; a stray '}' used to be appended here
                    $sheet->getStyle($column . $row)->applyFromArray(['font' => ['color' => ['rgb' => $style_color]]]);

                    break;
            }
        }
    }
658
}
659