Passed
Push — develop ( 3bea6f...0f8f07 )
by Mark
36:13
created

Html::securityScan()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 8
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 4
CRAP Score 2.032

Importance

Changes 0
Metric Value
cc 2
eloc 4
nc 2
nop 1
dl 0
loc 8
ccs 4
cts 5
cp 0.8
crap 2.032
rs 10
c 0
b 0
f 0
1
<?php
2
3
namespace PhpOffice\PhpSpreadsheet\Reader;
4
5
use DOMDocument;
6
use DOMElement;
7
use DOMNode;
8
use DOMText;
9
use PhpOffice\PhpSpreadsheet\Cell\Coordinate;
10
use PhpOffice\PhpSpreadsheet\Reader\Security\XmlScanner;
11
use PhpOffice\PhpSpreadsheet\Spreadsheet;
12
use PhpOffice\PhpSpreadsheet\Style\Border;
13
use PhpOffice\PhpSpreadsheet\Style\Color;
14
use PhpOffice\PhpSpreadsheet\Style\Fill;
15
use PhpOffice\PhpSpreadsheet\Worksheet\Worksheet;
16
17
/** Reader that loads an HTML document into a Spreadsheet. */
18
class Html extends BaseReader
19
{
20
    /**
21
     * @var XmlScanner
22
     */
23
    private $securityScanner;
24
25
    /**
26
     * Sample size to read to determine if it's HTML or not.
27
     */
28
    const TEST_SAMPLE_SIZE = 2048;
29
30
    /**
31
     * Input encoding.
32
     *
33
     * @var string
34
     */
35
    protected $inputEncoding = 'ANSI';
36
37
    /**
38
     * Sheet index to read.
39
     *
40
     * @var int
41
     */
42
    protected $sheetIndex = 0;
43
44
    /**
45
     * Formats.
46
     *
47
     * @var array
48
     */
49
    protected $formats = [
50
        'h1' => [
51
            'font' => [
52
                'bold' => true,
53
                'size' => 24,
54
            ],
55
        ], //    Bold, 24pt
56
        'h2' => [
57
            'font' => [
58
                'bold' => true,
59
                'size' => 18,
60
            ],
61
        ], //    Bold, 18pt
62
        'h3' => [
63
            'font' => [
64
                'bold' => true,
65
                'size' => 13.5,
66
            ],
67
        ], //    Bold, 13.5pt
68
        'h4' => [
69
            'font' => [
70
                'bold' => true,
71
                'size' => 12,
72
            ],
73
        ], //    Bold, 12pt
74
        'h5' => [
75
            'font' => [
76
                'bold' => true,
77
                'size' => 10,
78
            ],
79
        ], //    Bold, 10pt
80
        'h6' => [
81
            'font' => [
82
                'bold' => true,
83
                'size' => 7.5,
84
            ],
85
        ], //    Bold, 7.5pt
86
        'a' => [
87
            'font' => [
88
                'underline' => true,
89
                'color' => [
90
                    'argb' => Color::COLOR_BLUE,
91
                ],
92
            ],
93
        ], //    Blue underlined
94
        'hr' => [
95
            'borders' => [
96
                'bottom' => [
97
                    'borderStyle' => Border::BORDER_THIN,
98
                    'color' => [
99
                        Color::COLOR_BLACK,
100
                    ],
101
                ],
102
            ],
103
        ], //    Bottom border
104
    ];
105
106
    protected $rowspan = [];
107
108
    /**
     * Build an HTML reader with its default collaborators.
     *
     * Installs an XmlScanner constructed with the '<!ENTITY' pattern and a
     * DefaultReadFilter as the read filter.
     */
    public function __construct()
    {
        $this->securityScanner = new XmlScanner('<!ENTITY');
        $this->readFilter = new DefaultReadFilter();
    }
116
117
    /**
     * Validate that the current file is an HTML file.
     *
     * The file is opened, a sample is read from its beginning and from its
     * end, and the file is accepted only when the beginning starts with '<',
     * the beginning contains at least one tag, and the ending finishes with '>'.
     *
     * @param string $pFilename
     *
     * @return bool false when the file cannot be opened or does not look like HTML
     */
    public function canRead($pFilename)
    {
        // A file that cannot be opened is simply "not readable by this reader"
        try {
            $this->openFile($pFilename);
        } catch (Exception $e) {
            return false;
        }

        $beginning = $this->readBeginning();
        $startWithTag = self::startsWithTag($beginning);
        $containsTags = self::containsTags($beginning);
        $endsWithTag = self::endsWithTag($this->readEnding());

        // Close the handle exactly once; a second fclose() on the same handle is an error
        fclose($this->fileHandle);

        return $startWithTag && $containsTags && $endsWithTag;
    }
142
143 16
    /**
     * Read the leading sample (up to TEST_SAMPLE_SIZE bytes) of the open file.
     *
     * @return false|string whatever fread() returns for the open handle
     */
    private function readBeginning()
    {
        $handle = $this->fileHandle;

        // Always sample from the very start, whatever the current position is
        fseek($handle, 0, SEEK_SET);

        return fread($handle, self::TEST_SAMPLE_SIZE);
    }
149
150 16
    /**
     * Read the trailing sample (up to TEST_SAMPLE_SIZE bytes) of the open file.
     *
     * The file size is looked up through the URI reported by the stream
     * metadata of the open handle.
     *
     * @return false|string empty string when the size is zero or cannot be
     *                      determined, otherwise whatever fread() returns
     */
    private function readEnding()
    {
        $meta = stream_get_meta_data($this->fileHandle);
        $size = filesize($meta['uri']);

        // filesize() yields false on failure; without this guard false would be
        // used as a seek offset and as a read length below
        if ($size === false || $size === 0) {
            return '';
        }

        // Never read more than the file actually holds
        $blockSize = min($size, self::TEST_SAMPLE_SIZE);

        fseek($this->fileHandle, $size - $blockSize);

        return fread($this->fileHandle, $blockSize);
    }
169
170 16
    /**
     * Tell whether the data, ignoring surrounding whitespace, begins with '<'.
     *
     * @param string $data
     *
     * @return bool
     */
    private static function startsWithTag($data)
    {
        $trimmed = trim($data);
        $firstCharacter = substr($trimmed, 0, 1);

        return $firstCharacter === '<';
    }
174
175 16
    /**
     * Tell whether the data, ignoring surrounding whitespace, finishes with '>'.
     *
     * @param string $data
     *
     * @return bool
     */
    private static function endsWithTag($data)
    {
        $trimmed = trim($data);
        $lastCharacter = substr($trimmed, -1, 1);

        return $lastCharacter === '>';
    }
179
180 16
    /**
     * Tell whether strip_tags() removes anything from the data.
     *
     * @param string $data
     *
     * @return bool true when the stripped text is shorter or longer than the input
     */
    private static function containsTags($data)
    {
        $withoutTags = strip_tags($data);

        return strlen($withoutTags) !== strlen($data);
    }
184
185
    /**
     * Load a file into a freshly created Spreadsheet.
     *
     * Delegates to loadIntoExisting() with a new, empty Spreadsheet.
     *
     * @param string $pFilename
     *
     * @throws Exception
     *
     * @return Spreadsheet
     */
    public function load($pFilename)
    {
        return $this->loadIntoExisting($pFilename, new Spreadsheet());
    }
202
203
    /**
     * Store the input encoding on this reader.
     *
     * @param string $pValue Input encoding, eg: 'ANSI'
     *
     * @return Html this reader, to allow call chaining
     */
    public function setInputEncoding($pValue)
    {
        $this->inputEncoding = $pValue;

        return $this;
    }
216
217
    /**
     * Return the input encoding currently stored on this reader.
     *
     * @return string
     */
    public function getInputEncoding()
    {
        return $this->inputEncoding;
    }
226
227
    //    Data Array used for testing only, should write to Spreadsheet object on completion of tests
228
    protected $dataArray = [];
229
230
    protected $tableLevel = 0;
231
232
    protected $nestedColumn = ['A'];
233
234 9
    /**
     * Enter a table: bump the nesting level and record its start column.
     *
     * An outermost table (entered while the level is 0) always starts in
     * column 'A'; a nested table starts in the column passed in.
     *
     * @param string $column column the parser is currently at
     *
     * @return string start column recorded for the new nesting level
     */
    protected function setTableStartColumn($column)
    {
        $startColumn = ($this->tableLevel == 0) ? 'A' : $column;

        $level = ++$this->tableLevel;
        $this->nestedColumn[$level] = $startColumn;

        return $this->nestedColumn[$level];
    }
244
245 9
    /**
     * Return the start column recorded for the current table nesting level.
     *
     * @return string
     */
    protected function getTableStartColumn()
    {
        $level = $this->tableLevel;

        return $this->nestedColumn[$level];
    }
249
250 9
    /**
     * Leave a table: drop the last recorded start column and lower the level.
     *
     * @return string the start column that was removed from the stack
     */
    protected function releaseTableStartColumn()
    {
        $released = array_pop($this->nestedColumn);
        --$this->tableLevel;

        return $released;
    }
256
257 9
    /**
     * Write the buffered cell content to the worksheet and empty the buffer.
     *
     * String content is written only when it is not blank after trimming.
     * Non-string content (a rich text run) is not written to the sheet yet;
     * it is only recorded in the debug data array.
     *
     * @param Worksheet $sheet
     * @param string $column
     * @param int $row
     * @param string $cellContent buffer, reset to '' on return (by reference)
     */
    protected function flushCell(Worksheet $sheet, $column, $row, &$cellContent)
    {
        if (!is_string($cellContent)) {
            //    Rich Text run - TODO
            $this->dataArray[$row][$column] = 'RICH TEXT: ' . $cellContent;
        } elseif (trim($cellContent) !== '') {
            //    Plain string with real content: write it and mirror it in the data array
            $sheet->setCellValue($column . $row, $cellContent);
            $this->dataArray[$row][$column] = $cellContent;
        }

        $cellContent = '';
    }
275
276
    /**
     * Walk the children of a DOM node and translate them into worksheet cells.
     *
     * Text nodes are appended (whitespace-collapsed) to the cell buffer;
     * element nodes are dispatched on their tag name. The row, column and
     * buffer are shared with the recursive calls through references.
     *
     * @param DOMNode $element node whose children are processed
     * @param Worksheet $sheet
     * @param int $row current row (by reference)
     * @param string $column current column (by reference)
     * @param string $cellContent pending cell content (by reference)
     */
    protected function processDomElement(DOMNode $element, Worksheet $sheet, &$row, &$column, &$cellContent)
    {
        foreach ($element->childNodes as $child) {
            if ($child instanceof DOMText) {
                $domText = preg_replace('/\s+/u', ' ', trim($child->nodeValue));
                if (is_string($cellContent)) {
                    //    simply append the text if the cell content is a plain text string
                    $cellContent .= $domText;
                }
                //    but if we have a rich text run instead, we need to append it correctly
                //    TODO

                continue;
            }

            if (!$child instanceof DOMElement) {
                continue;
            }

            $attributeArray = [];
            foreach ($child->attributes as $attribute) {
                $attributeArray[$attribute->name] = $attribute->value;
            }

            switch ($child->nodeName) {
                case 'meta':
                    //    TODO
                    //    Extract character set from the 'content' attribute, so we can convert to UTF-8 if required
                    $this->processDomElement($child, $sheet, $row, $column, $cellContent);

                    break;
                case 'title':
                    $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                    $sheet->setTitle($cellContent, true, false);
                    $cellContent = '';

                    break;
                case 'span':
                case 'div':
                case 'font':
                case 'i':
                case 'em':
                case 'strong':
                case 'b':
                    if (isset($attributeArray['class']) && $attributeArray['class'] === 'comment') {
                        //    Elements marked as comments become a cell comment, not cell content
                        $sheet->getComment($column . $row)
                            ->getText()
                            ->createTextRun($child->textContent);

                        break;
                    }

                    if ($cellContent > '') {
                        $cellContent .= ' ';
                    }
                    $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                    if ($cellContent > '') {
                        $cellContent .= ' ';
                    }

                    break;
                case 'hr':
                    $this->flushCell($sheet, $column, $row, $cellContent);
                    ++$row;
                    if (isset($this->formats[$child->nodeName])) {
                        $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
                    } else {
                        $cellContent = '----------';
                        $this->flushCell($sheet, $column, $row, $cellContent);
                    }
                    ++$row;
                    // Add a break after a horizontal rule, simply by allowing the code to dropthru
                    // no break
                case 'br':
                    if ($this->tableLevel > 0) {
                        //    If we're inside a table, replace with a \n
                        $cellContent .= "\n";
                    } else {
                        //    Otherwise flush our existing content and move the row cursor on
                        $this->flushCell($sheet, $column, $row, $cellContent);
                        ++$row;
                    }

                    break;
                case 'a':
                    //    Only 'href' has an effect; a 'comment-indicator' class is deliberately ignored
                    if (isset($attributeArray['href'])) {
                        $sheet->getCell($column . $row)->getHyperlink()->setUrl($attributeArray['href']);
                        if (isset($this->formats[$child->nodeName])) {
                            $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
                        }
                    }
                    $cellContent .= ' ';
                    $this->processDomElement($child, $sheet, $row, $column, $cellContent);

                    break;
                case 'h1':
                case 'h2':
                case 'h3':
                case 'h4':
                case 'h5':
                case 'h6':
                case 'ol':
                case 'ul':
                case 'p':
                    if ($this->tableLevel > 0) {
                        //    If we're inside a table, replace with a \n
                        $cellContent .= "\n";
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                    } else {
                        if ($cellContent > '') {
                            $this->flushCell($sheet, $column, $row, $cellContent);
                            ++$row;
                        }
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        $this->flushCell($sheet, $column, $row, $cellContent);

                        if (isset($this->formats[$child->nodeName])) {
                            $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
                        }

                        ++$row;
                        $column = 'A';
                    }

                    break;
                case 'li':
                    if ($this->tableLevel > 0) {
                        //    If we're inside a table, replace with a \n
                        $cellContent .= "\n";
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                    } else {
                        if ($cellContent > '') {
                            $this->flushCell($sheet, $column, $row, $cellContent);
                        }
                        ++$row;
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        $this->flushCell($sheet, $column, $row, $cellContent);
                        $column = 'A';
                    }

                    break;
                case 'table':
                    $this->flushCell($sheet, $column, $row, $cellContent);
                    $column = $this->setTableStartColumn($column);
                    if ($this->tableLevel > 1) {
                        --$row;
                    }
                    $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                    $column = $this->releaseTableStartColumn();
                    if ($this->tableLevel > 1) {
                        ++$column;
                    } else {
                        ++$row;
                    }

                    break;
                case 'thead':
                case 'tbody':
                    $this->processDomElement($child, $sheet, $row, $column, $cellContent);

                    break;
                case 'tr':
                    $column = $this->getTableStartColumn();
                    $cellContent = '';
                    $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                    ++$row;

                    break;
                case 'th':
                case 'td':
                    $this->processDomElementThTd($child, $sheet, $row, $column, $cellContent, $attributeArray);

                    break;
                case 'body':
                    $row = 1;
                    $column = 'A';
                    $cellContent = '';
                    $this->tableLevel = 0;
                    $this->processDomElement($child, $sheet, $row, $column, $cellContent);

                    break;
                default:
                    $this->processDomElement($child, $sheet, $row, $column, $cellContent);
            }
        }
    }

    /**
     * Handle one table cell (th/td): content, inline style, bgcolor and merges.
     *
     * @param DOMElement $child the th/td element
     * @param Worksheet $sheet
     * @param int $row current row (by reference)
     * @param string $column current column (by reference); left on the column after the cell
     * @param string $cellContent pending cell content (by reference)
     * @param array $attributeArray attributes of the th/td element, name => value
     */
    private function processDomElementThTd(DOMElement $child, Worksheet $sheet, &$row, &$column, &$cellContent, array $attributeArray)
    {
        // Step over cells already occupied by a rowspan from an earlier row. This has to happen
        // before the children and styles are processed, otherwise they target the occupied cell.
        while (isset($this->rowspan[$column . $row])) {
            ++$column;
        }

        $this->processDomElement($child, $sheet, $row, $column, $cellContent);

        // apply inline style
        $this->applyInlineStyle($sheet, $row, $column, $attributeArray);

        $this->flushCell($sheet, $column, $row, $cellContent);

        // bgcolor is independent of merging, so it is applied whether or not the cell spans
        if (isset($attributeArray['bgcolor'])) {
            $sheet->getStyle($column . $row)->applyFromArray(
                [
                    'fill' => [
                        'fillType' => Fill::FILL_SOLID,
                        'color' => ['rgb' => $attributeArray['bgcolor']],
                    ],
                ]
            );
        }

        // Span attributes arrive as strings; anything below 1 or non-numeric counts as 1
        $colspan = isset($attributeArray['colspan']) ? max(1, (int) $attributeArray['colspan']) : 1;
        $rowspan = isset($attributeArray['rowspan']) ? max(1, (int) $attributeArray['rowspan']) : 1;

        if ($colspan > 1 || $rowspan > 1) {
            $columnTo = $column;
            for ($i = 1; $i < $colspan; ++$i) {
                ++$columnTo;
            }

            $range = $column . $row . ':' . $columnTo . ($row + $rowspan - 1);
            if ($rowspan > 1) {
                // Remember every covered cell so that following rows skip over them
                foreach (Coordinate::extractAllCellReferencesInRange($range) as $value) {
                    $this->rowspan[$value] = true;
                }
            }
            $sheet->mergeCells($range);
            $column = $columnTo;
        }

        ++$column;
    }
526
527
    /**
     * Load an HTML file into the given Spreadsheet.
     *
     * Sheets are created until the configured sheet index exists; that sheet
     * is made active and receives the parsed content.
     *
     * @param string $pFilename
     * @param Spreadsheet $spreadsheet
     *
     * @throws Exception when the file is not recognised as HTML or cannot be loaded as a DOM document
     *
     * @return Spreadsheet the same instance that was passed in
     */
    public function loadIntoExisting($pFilename, Spreadsheet $spreadsheet)
    {
        // Validate
        if (!$this->canRead($pFilename)) {
            throw new Exception($pFilename . ' is an Invalid HTML file.');
        }

        // Make sure the target sheet exists, then select it
        $targetIndex = $this->sheetIndex;
        while ($targetIndex >= $spreadsheet->getSheetCount()) {
            $spreadsheet->createSheet();
        }
        $spreadsheet->setActiveSheetIndex($targetIndex);

        //    Scan the file, convert it to HTML entities and load it into a DOM object
        $dom = new DOMDocument();
        $scannedMarkup = $this->securityScanner->scanFile($pFilename);
        $encodedMarkup = mb_convert_encoding($scannedMarkup, 'HTML-ENTITIES', 'UTF-8');
        if ($dom->loadHTML($encodedMarkup) === false) {
            throw new Exception('Failed to load ' . $pFilename . ' as a DOM Document');
        }

        //    Discard white space
        $dom->preserveWhiteSpace = false;

        // Start from a clean cursor and an empty rowspan registry
        $this->rowspan = [];
        $row = 0;
        $column = 'A';
        $content = '';
        $activeSheet = $spreadsheet->getActiveSheet();
        $this->processDomElement($dom, $activeSheet, $row, $column, $content);

        return $spreadsheet;
    }
570
571
    /**
     * Return the index of the sheet this reader loads into.
     *
     * @return int
     */
    public function getSheetIndex()
    {
        return $this->sheetIndex;
    }
580
581
    /**
     * Choose the index of the sheet this reader loads into.
     *
     * @param int $pValue Sheet index
     *
     * @return Html this reader, to allow call chaining
     */
    public function setSheetIndex($pValue)
    {
        $this->sheetIndex = $pValue;

        return $this;
    }
594
595
    /**
     * Apply an inline CSS style attribute to a cell.
     *
     * NOTES :
     * Currently only intended for td & th elements,
     * and only honours the 'background-color' and 'color' properties
     * when their value is a '#'-prefixed HEX color.
     *
     * TODO :
     * - Implement other properties, such as border
     *
     * @param Worksheet $sheet
     * @param int $row
     * @param string $column
     * @param array $attributeArray element attributes, name => value; only 'style' is read
     */
    private function applyInlineStyle(&$sheet, $row, $column, $attributeArray)
    {
        if (!isset($attributeArray['style'])) {
            return;
        }

        $supportedStyles = ['background-color', 'color'];

        foreach (explode(';', $attributeArray['style']) as $declaration) {
            // Split into property and value; a fragment without ':' is not a declaration
            $parts = explode(':', $declaration, 2);
            if (count($parts) !== 2) {
                continue;
            }

            $property = trim($parts[0]);
            if (!in_array($property, $supportedStyles, true)) {
                continue;
            }

            // Only '#RRGGBB'-style values are understood. The color is derived afresh for each
            // declaration so that an unsupported value never reuses the previous declaration's color.
            $rawValue = trim($parts[1]);
            if (strlen($rawValue) < 2 || $rawValue[0] !== '#') {
                continue;
            }
            $styleColor = substr($rawValue, 1);

            switch ($property) {
                case 'background-color':
                    $sheet->getStyle($column . $row)->applyFromArray(
                        ['fill' => ['fillType' => Fill::FILL_SOLID, 'color' => ['rgb' => $styleColor]]]
                    );

                    break;
                case 'color':
                    $sheet->getStyle($column . $row)->applyFromArray(
                        ['font' => ['color' => ['rgb' => $styleColor]]]
                    );

                    break;
            }
        }
    }
648
}
649