Completed
Push — develop ( cdbf33...653adf )
by Adrien
26:38
created

Html::endsWithTag()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
cc 1
eloc 1
nc 1
nop 1
dl 0
loc 3
ccs 2
cts 2
cp 1
crap 1
rs 10
c 0
b 0
f 0
1
<?php
2
3
namespace PhpOffice\PhpSpreadsheet\Reader;
4
5
use DOMDocument;
6
use DOMElement;
7
use DOMNode;
8
use DOMText;
9
use PhpOffice\PhpSpreadsheet\Cell\Coordinate;
10
use PhpOffice\PhpSpreadsheet\Spreadsheet;
11
use PhpOffice\PhpSpreadsheet\Style\Border;
12
use PhpOffice\PhpSpreadsheet\Style\Color;
13
use PhpOffice\PhpSpreadsheet\Style\Fill;
14
use PhpOffice\PhpSpreadsheet\Worksheet\Worksheet;
15
16
/** Reader that loads an HTML file into a Spreadsheet. */
17
class Html extends BaseReader
18
{
19
    /**
     * Sample size to read to determine if it's HTML or not.
     */
    const TEST_SAMPLE_SIZE = 2048;

    /**
     * Input encoding.
     *
     * @var string
     */
    protected $inputEncoding = 'ANSI';

    /**
     * Sheet index to read.
     *
     * @var int
     */
    protected $sheetIndex = 0;

    /**
     * Style arrays keyed by HTML tag name.
     *
     * processDomElement() looks a tag up here and, when an entry exists,
     * passes it to applyFromArray() on the style of the current cell.
     *
     * @var array
     */
    protected $formats = [
        'h1' => [
            'font' => [
                'bold' => true,
                'size' => 24,
            ],
        ], //    Bold, 24pt
        'h2' => [
            'font' => [
                'bold' => true,
                'size' => 18,
            ],
        ], //    Bold, 18pt
        'h3' => [
            'font' => [
                'bold' => true,
                'size' => 13.5,
            ],
        ], //    Bold, 13.5pt
        'h4' => [
            'font' => [
                'bold' => true,
                'size' => 12,
            ],
        ], //    Bold, 12pt
        'h5' => [
            'font' => [
                'bold' => true,
                'size' => 10,
            ],
        ], //    Bold, 10pt
        'h6' => [
            'font' => [
                'bold' => true,
                'size' => 7.5,
            ],
        ], //    Bold, 7.5pt
        'a' => [
            'font' => [
                'underline' => true,
                'color' => [
                    'argb' => Color::COLOR_BLUE,
                ],
            ],
        ], //    Blue underlined
        'hr' => [
            'borders' => [
                'bottom' => [
                    'borderStyle' => Border::BORDER_THIN,
                    // Keyed with 'argb' like the 'a' format above; the colour
                    // was previously given without a key.
                    'color' => [
                        'argb' => Color::COLOR_BLACK,
                    ],
                ],
            ],
        ], //    Bottom border
    ];

    /**
     * Cell coordinates covered by a rowspan merge, stored as coordinate => true.
     *
     * Filled while table cells are processed so that later cells can skip
     * columns that are already occupied.
     *
     * @var array
     */
    protected $rowspan = [];
101
102
    /**
     * Build an HTML reader that starts out with the default read filter.
     */
    public function __construct()
    {
        $defaultFilter = new DefaultReadFilter();
        $this->readFilter = $defaultFilter;
    }
109
110
    /**
     * Decide whether the given file looks like an HTML document.
     *
     * The leading sample of the file must start with a tag and contain at
     * least one tag, and the trailing sample must end with a tag.
     *
     * @param string $pFilename
     *
     * @return bool
     */
    public function canRead($pFilename)
    {
        // A file that cannot be opened is treated as not readable
        try {
            $this->openFile($pFilename);
        } catch (Exception $openFailure) {
            return false;
        }

        // Take both samples first so the handle can be released right away
        $head = $this->readBeginning();
        $tail = $this->readEnding();
        fclose($this->fileHandle);

        $headStartsWithTag = self::startsWithTag($head);
        $headContainsTags = self::containsTags($head);
        $tailEndsWithTag = self::endsWithTag($tail);

        return $headStartsWithTag && $headContainsTags && $tailEndsWithTag;
    }
135
136 15
    /**
     * Read the leading sample of the currently open file.
     *
     * @return false|string up to TEST_SAMPLE_SIZE bytes taken from the start of the file
     */
    private function readBeginning()
    {
        $handle = $this->fileHandle;
        fseek($handle, 0);
        $sample = fread($handle, self::TEST_SAMPLE_SIZE);

        return $sample;
    }
142
143 15
    /**
     * Read the trailing sample of the currently open file.
     *
     * The file size is looked up through the URI reported by the stream's
     * metadata; an empty file yields an empty string.
     *
     * @return false|string up to TEST_SAMPLE_SIZE bytes taken from the end of the file
     */
    private function readEnding()
    {
        $handle = $this->fileHandle;
        $streamInfo = stream_get_meta_data($handle);
        $length = filesize($streamInfo['uri']);
        if ($length === 0) {
            return '';
        }

        // Never ask for more bytes than the file holds
        $sampleLength = ($length < self::TEST_SAMPLE_SIZE) ? $length : self::TEST_SAMPLE_SIZE;
        fseek($handle, $length - $sampleLength);

        return fread($handle, $sampleLength);
    }
162
163 15
    /**
     * Tell whether the data, ignoring surrounding whitespace, begins with '<'.
     *
     * @param string $data
     *
     * @return bool
     */
    private static function startsWithTag($data)
    {
        $trimmed = trim($data);

        // Comparing one byte only; an empty string never matches
        return strncmp($trimmed, '<', 1) === 0;
    }
167
168 15
    /**
     * Tell whether the data, ignoring surrounding whitespace, ends with '>'.
     *
     * @param string $data
     *
     * @return bool
     */
    private static function endsWithTag($data)
    {
        $trimmed = trim($data);
        $lastIndex = strlen($trimmed) - 1;

        // A negative index means nothing is left after trimming
        return $lastIndex >= 0 && $trimmed[$lastIndex] === '>';
    }
172
173 15
    /**
     * Tell whether stripping tags from the data makes it shorter.
     *
     * @param string $data
     *
     * @return bool
     */
    private static function containsTags($data)
    {
        $withoutTags = strip_tags($data);
        $originalLength = strlen($data);

        return $originalLength !== strlen($withoutTags);
    }
177
178
    /**
     * Loads Spreadsheet from file.
     *
     * @param string $pFilename
     *
     * @throws Exception
     *
     * @return Spreadsheet
     */
    public function load($pFilename)
    {
        // Start from an empty workbook and let loadIntoExisting() fill it
        return $this->loadIntoExisting($pFilename, new Spreadsheet());
    }
195
196
    /**
     * Store the input encoding on this reader.
     *
     * @param string $pValue Input encoding, eg: 'ANSI'
     *
     * @return $this
     */
    public function setInputEncoding($pValue)
    {
        $encoding = $pValue;
        $this->inputEncoding = $encoding;

        return $this;
    }
207
208
    /**
     * Return the input encoding stored on this reader.
     *
     * @return string
     */
    public function getInputEncoding()
    {
        $encoding = $this->inputEncoding;

        return $encoding;
    }
217
218
    /**
     * Data Array used for testing only, should write to Spreadsheet object on completion of tests.
     *
     * flushCell() stores content here indexed by row, then column.
     *
     * @var array
     */
    protected $dataArray = [];

    /**
     * Table nesting counter: incremented by setTableStartColumn() and
     * decremented by releaseTableStartColumn().
     *
     * @var int
     */
    protected $tableLevel = 0;

    /**
     * Start column recorded for each table nesting level.
     *
     * @var array
     */
    protected $nestedColumn = ['A'];
222
223 8
    /**
     * Enter a table: bump the nesting level and record its start column.
     *
     * @param string $column column the table was met in
     *
     * @return string the start column recorded for the new nesting level
     */
    protected function setTableStartColumn($column)
    {
        // A table met while no other table is open always starts in column A
        $startColumn = ($this->tableLevel == 0) ? 'A' : $column;

        ++$this->tableLevel;
        $this->nestedColumn[$this->tableLevel] = $startColumn;

        return $startColumn;
    }
233
234 8
    /**
     * Return the start column recorded for the current table nesting level.
     *
     * @return string
     */
    protected function getTableStartColumn()
    {
        $level = $this->tableLevel;

        return $this->nestedColumn[$level];
    }
238
239 8
    /**
     * Leave a table: drop the most recently recorded start column and lower
     * the nesting level by one.
     *
     * @return string the start column that was removed
     */
    protected function releaseTableStartColumn()
    {
        $releasedColumn = array_pop($this->nestedColumn);
        --$this->tableLevel;

        return $releasedColumn;
    }
245
246 8
    /**
     * Write the pending cell content to the worksheet and reset it.
     *
     * String content is written only when it is not blank after trimming.
     * Anything else is treated as a rich text run, which is not written to
     * the sheet yet (TODO) and is only recorded in the test data array.
     *
     * @param Worksheet $sheet
     * @param string $column
     * @param int $row
     * @param mixed $cellContent pending content; reset to an empty string on return
     */
    protected function flushCell(Worksheet $sheet, $column, $row, &$cellContent)
    {
        if (!is_string($cellContent)) {
            //    We have a Rich Text run
            //    TODO
            $this->dataArray[$row][$column] = 'RICH TEXT: ' . $cellContent;
        } elseif (trim($cellContent) > '') {
            //    Simple string with actual content: write it to the worksheet
            $sheet->setCellValue($column . $row, $cellContent);
            $this->dataArray[$row][$column] = $cellContent;
        }

        $cellContent = '';
    }
264
265
    /**
     * Recursively walk the children of a DOM node and write their content to the worksheet.
     *
     * Text nodes are appended to the pending cell content; element nodes are
     * dispatched on their tag name. $row, $column and $cellContent are taken by
     * reference because they act as a shared cursor that nested calls advance.
     *
     * @param DOMNode $element node whose children are processed
     * @param Worksheet $sheet worksheet being filled
     * @param int $row current row of the cursor
     * @param string $column current column of the cursor
     * @param string $cellContent content collected for the current cell
     */
    protected function processDomElement(DOMNode $element, Worksheet $sheet, &$row, &$column, &$cellContent)
    {
        foreach ($element->childNodes as $child) {
            if ($child instanceof DOMText) {
                // Collapse every run of whitespace into a single space
                $domText = preg_replace('/\s+/u', ' ', trim($child->nodeValue));
                if (is_string($cellContent)) {
                    //    simply append the text if the cell content is a plain text string
                    $cellContent .= $domText;
                }
                //    but if we have a rich text run instead, we need to append it correctly
                //    TODO
            } elseif ($child instanceof DOMElement) {
                $attributeArray = [];
                foreach ($child->attributes as $attribute) {
                    $attributeArray[$attribute->name] = $attribute->value;
                }

                switch ($child->nodeName) {
                    case 'meta':
                        foreach ($attributeArray as $attributeName => $attributeValue) {
                            switch ($attributeName) {
                                case 'content':
                                    //    TODO
                                    //    Extract character set, so we can convert to UTF-8 if required
                                    break;
                            }
                        }
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);

                        break;
                    case 'title':
                        // The document title becomes the worksheet title, not cell content
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        $sheet->setTitle($cellContent, true, false);
                        $cellContent = '';

                        break;
                    case 'span':
                    case 'div':
                    case 'font':
                    case 'i':
                    case 'em':
                    case 'strong':
                    case 'b':
                        if (isset($attributeArray['class']) && $attributeArray['class'] === 'comment') {
                            // Elements with class "comment" become a comment on the current cell
                            $sheet->getComment($column . $row)
                                ->getText()
                                ->createTextRun($child->textContent);

                            break;
                        }

                        if ($cellContent > '') {
                            $cellContent .= ' ';
                        }
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        if ($cellContent > '') {
                            $cellContent .= ' ';
                        }

                        break;
                    case 'hr':
                        $this->flushCell($sheet, $column, $row, $cellContent);
                        ++$row;
                        if (isset($this->formats[$child->nodeName])) {
                            $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
                        } else {
                            $cellContent = '----------';
                            $this->flushCell($sheet, $column, $row, $cellContent);
                        }
                        ++$row;
                        // Add a break after a horizontal rule, simply by allowing the code to dropthru
                        // no break
                    case 'br':
                        if ($this->tableLevel > 0) {
                            //    If we're inside a table, replace with a \n
                            $cellContent .= "\n";
                        } else {
                            //    Otherwise flush our existing content and move the row cursor on
                            $this->flushCell($sheet, $column, $row, $cellContent);
                            ++$row;
                        }

                        break;
                    case 'a':
                        foreach ($attributeArray as $attributeName => $attributeValue) {
                            switch ($attributeName) {
                                case 'href':
                                    $sheet->getCell($column . $row)->getHyperlink()->setUrl($attributeValue);
                                    if (isset($this->formats[$child->nodeName])) {
                                        $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
                                    }

                                    break;
                                case 'class':
                                    if ($attributeValue === 'comment-indicator') {
                                        break; // Ignore - it's just a red square.
                                    }
                            }
                        }
                        $cellContent .= ' ';
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);

                        break;
                    case 'h1':
                    case 'h2':
                    case 'h3':
                    case 'h4':
                    case 'h5':
                    case 'h6':
                    case 'ol':
                    case 'ul':
                    case 'p':
                        if ($this->tableLevel > 0) {
                            //    If we're inside a table, replace with a \n
                            $cellContent .= "\n";
                            $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        } else {
                            if ($cellContent > '') {
                                $this->flushCell($sheet, $column, $row, $cellContent);
                                ++$row;
                            }
                            $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                            $this->flushCell($sheet, $column, $row, $cellContent);

                            if (isset($this->formats[$child->nodeName])) {
                                $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
                            }

                            ++$row;
                            $column = 'A';
                        }

                        break;
                    case 'li':
                        if ($this->tableLevel > 0) {
                            //    If we're inside a table, replace with a \n
                            $cellContent .= "\n";
                            $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        } else {
                            if ($cellContent > '') {
                                $this->flushCell($sheet, $column, $row, $cellContent);
                            }
                            ++$row;
                            $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                            $this->flushCell($sheet, $column, $row, $cellContent);
                            $column = 'A';
                        }

                        break;
                    case 'table':
                        $this->flushCell($sheet, $column, $row, $cellContent);
                        $column = $this->setTableStartColumn($column);
                        if ($this->tableLevel > 1) {
                            --$row;
                        }
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        $column = $this->releaseTableStartColumn();
                        if ($this->tableLevel > 1) {
                            ++$column;
                        } else {
                            ++$row;
                        }

                        break;
                    case 'thead':
                    case 'tbody':
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);

                        break;
                    case 'tr':
                        // Every table row restarts at the start column of the current table
                        $column = $this->getTableStartColumn();
                        $cellContent = '';
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        ++$row;

                        break;
                    case 'th':
                    case 'td':
                        // Skip columns already taken by a rowspan from an earlier row
                        // BEFORE touching the cell, so that comments, hyperlinks and
                        // inline styles end up on the same cell as the value.
                        while (isset($this->rowspan[$column . $row])) {
                            ++$column;
                        }

                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);

                        // apply inline style
                        $this->applyInlineStyle($sheet, $row, $column, $attributeArray);

                        $this->flushCell($sheet, $column, $row, $cellContent);

                        // bgcolor does not depend on merging, so it is applied on its own
                        // and before a colspan moves the column cursor.
                        if (isset($attributeArray['bgcolor'])) {
                            $sheet->getStyle($column . $row)->applyFromArray(
                                [
                                    'fill' => [
                                        'fillType' => Fill::FILL_SOLID,
                                        'color' => ['rgb' => $attributeArray['bgcolor']],
                                    ],
                                ]
                            );
                        }

                        if (isset($attributeArray['rowspan'], $attributeArray['colspan'])) {
                            //create merging rowspan and colspan
                            $columnTo = $column;
                            for ($i = 0; $i < $attributeArray['colspan'] - 1; ++$i) {
                                ++$columnTo;
                            }
                            $range = $column . $row . ':' . $columnTo . ($row + $attributeArray['rowspan'] - 1);
                            foreach (Coordinate::extractAllCellReferencesInRange($range) as $value) {
                                $this->rowspan[$value] = true;
                            }
                            $sheet->mergeCells($range);
                            $column = $columnTo;
                        } elseif (isset($attributeArray['rowspan'])) {
                            //create merging rowspan
                            $range = $column . $row . ':' . $column . ($row + $attributeArray['rowspan'] - 1);
                            foreach (Coordinate::extractAllCellReferencesInRange($range) as $value) {
                                $this->rowspan[$value] = true;
                            }
                            $sheet->mergeCells($range);
                        } elseif (isset($attributeArray['colspan'])) {
                            //create merging colspan
                            $columnTo = $column;
                            for ($i = 0; $i < $attributeArray['colspan'] - 1; ++$i) {
                                ++$columnTo;
                            }
                            $sheet->mergeCells($column . $row . ':' . $columnTo . $row);
                            $column = $columnTo;
                        }
                        ++$column;

                        break;
                    case 'body':
                        // Reset the cursor and table state at the start of the body
                        $row = 1;
                        $column = 'A';
                        $cellContent = '';
                        $this->tableLevel = 0;
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);

                        break;
                    default:
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                }
            }
        }
    }
515
516
    /**
     * Read an HTML file into a sheet of an existing Spreadsheet.
     *
     * The sheet at the configured sheet index is used; missing sheets are
     * created until that index exists.
     *
     * @param string $pFilename
     * @param Spreadsheet $spreadsheet
     *
     * @throws Exception when the file does not pass canRead() or cannot be parsed
     *
     * @return Spreadsheet the same instance that was passed in
     */
    public function loadIntoExisting($pFilename, Spreadsheet $spreadsheet)
    {
        // Refuse anything that does not look like HTML
        if (!$this->canRead($pFilename)) {
            throw new Exception($pFilename . ' is an Invalid HTML file.');
        }

        // Make sure the target sheet exists, then select it
        while ($spreadsheet->getSheetCount() <= $this->sheetIndex) {
            $spreadsheet->createSheet();
        }
        $spreadsheet->setActiveSheetIndex($this->sheetIndex);

        // Parse the scanned file content into a DOM tree
        // (securityScanFile() is inherited; presumably it returns the file content)
        $html = $this->securityScanFile($pFilename);
        $document = new DOMDocument();
        if ($document->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8')) === false) {
            throw new Exception('Failed to load ' . $pFilename . ' as a DOM Document');
        }

        //    Discard white space
        $document->preserveWhiteSpace = false;

        // Cursor handed to the recursive walk by reference
        $row = 0;
        $column = 'A';
        $content = '';
        $this->processDomElement($document, $spreadsheet->getActiveSheet(), $row, $column, $content);

        return $spreadsheet;
    }
558
559
    /**
     * Return the index of the sheet this reader loads into.
     *
     * @return int
     */
    public function getSheetIndex()
    {
        $index = $this->sheetIndex;

        return $index;
    }
568
569
    /**
     * Choose the index of the sheet this reader loads into.
     *
     * @param int $pValue Sheet index
     *
     * @return $this
     */
    public function setSheetIndex($pValue)
    {
        $index = $pValue;
        $this->sheetIndex = $index;

        return $this;
    }
582
583
    /**
     * Scan the XML for use of <!ENTITY to prevent XXE/XEE attacks.
     *
     * @param string $xml
     *
     * @throws Exception when an entity declaration is found
     *
     * @return string the unchanged input when nothing was found
     */
    public function securityScan($xml)
    {
        // Allow an optional NUL byte before, between and after the characters
        // of the marker so that it is also found when padded with NUL bytes
        $markerCharacters = str_split('<!ENTITY');
        $pattern = '/\\0?' . implode('\\0?', $markerCharacters) . '\\0?/';

        if (preg_match($pattern, $xml)) {
            throw new Exception('Detected use of ENTITY in XML, spreadsheet file load() aborted to prevent XXE/XEE attacks');
        }

        return $xml;
    }
599
600
    /**
     * Apply the colours found in an element's inline `style` attribute to a cell.
     *
     * NOTES :
     * Currently only intended for td & th element,
     * and only takes the 'background-color' and 'color' properties with a HEX
     * color written with a leading '#'. Every other declaration is skipped.
     *
     * TODO :
     * - Implement other properties, such as border
     *
     * @param Worksheet $sheet
     * @param int $row
     * @param string $column
     * @param array $attributeArray attributes of the element, keyed by attribute name
     */
    private function applyInlineStyle(&$sheet, $row, $column, $attributeArray)
    {
        if (!isset($attributeArray['style'])) {
            return;
        }

        $supported_styles = ['background-color', 'color'];

        // add color styles (background & text) from dom element,currently support : td & th, using ONLY inline css style with RGB color
        $styles = explode(';', $attributeArray['style']);
        foreach ($styles as $st) {
            $value = explode(':', $st);
            $property = trim($value[0]);

            // Skip empty fragments, declarations without a value, and unsupported properties
            if (!isset($value[1]) || !in_array($property, $supported_styles, true)) {
                continue;
            }

            // Start from scratch for every declaration so that a colour found in
            // an earlier declaration is never reused for this one
            $style_color = null;

            //check if has #, so we can get clean hex
            $rawColor = trim($value[1]);
            if (substr($rawColor, 0, 1) === '#') {
                $style_color = substr($rawColor, 1);
            }

            if (empty($style_color)) {
                continue;
            }

            switch ($property) {
                case 'background-color':
                    $sheet->getStyle($column . $row)->applyFromArray(['fill' => ['fillType' => Fill::FILL_SOLID, 'color' => ['rgb' => "{$style_color}"]]]);

                    break;
                case 'color':
                    // The value previously carried a stray trailing '}' here
                    $sheet->getStyle($column . $row)->applyFromArray(['font' => ['color' => ['rgb' => "{$style_color}"]]]);

                    break;
            }
        }
    }
653
}
654