Completed
Push — develop ( dfcab0...962367 )
by Adrien
28:38
created

Html::releaseTableStartColumn()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 5
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 3
CRAP Score 1

Importance

Changes 0
Metric Value
cc 1
eloc 2
nc 1
nop 0
dl 0
loc 5
ccs 3
cts 3
cp 1
crap 1
rs 9.4285
c 0
b 0
f 0
1
<?php
2
3
namespace PhpOffice\PhpSpreadsheet\Reader;
4
5
use DOMDocument;
6
use DOMElement;
7
use DOMNode;
8
use DOMText;
9
use PhpOffice\PhpSpreadsheet\Cell\Coordinate;
10
use PhpOffice\PhpSpreadsheet\Spreadsheet;
11
use PhpOffice\PhpSpreadsheet\Style\Border;
12
use PhpOffice\PhpSpreadsheet\Style\Color;
13
use PhpOffice\PhpSpreadsheet\Style\Fill;
14
use PhpOffice\PhpSpreadsheet\Worksheet\Worksheet;
15
16
/** Reader that loads the content of an HTML document into a Spreadsheet worksheet. */
17
class Html extends BaseReader
18
{
19
    /**
20
     * Sample size to read to determine if it's HTML or not.
21
     */
22
    const TEST_SAMPLE_SIZE = 2048;
23
24
    /**
25
     * Input encoding.
26
     *
27
     * @var string
28
     */
29
    protected $inputEncoding = 'ANSI';
30
31
    /**
32
     * Sheet index to read.
33
     *
34
     * @var int
35
     */
36
    protected $sheetIndex = 0;
37
38
    /**
     * Default styles applied per HTML tag name.
     *
     * Each key is a tag name and each value is a style array that is handed to
     * Style::applyFromArray() when that tag is processed.
     *
     * @var array
     */
    protected $formats = [
        'h1' => [
            'font' => [
                'bold' => true,
                'size' => 24,
            ],
        ], //    Bold, 24pt
        'h2' => [
            'font' => [
                'bold' => true,
                'size' => 18,
            ],
        ], //    Bold, 18pt
        'h3' => [
            'font' => [
                'bold' => true,
                'size' => 13.5,
            ],
        ], //    Bold, 13.5pt
        'h4' => [
            'font' => [
                'bold' => true,
                'size' => 12,
            ],
        ], //    Bold, 12pt
        'h5' => [
            'font' => [
                'bold' => true,
                'size' => 10,
            ],
        ], //    Bold, 10pt
        'h6' => [
            'font' => [
                'bold' => true,
                'size' => 7.5,
            ],
        ], //    Bold, 7.5pt
        'a' => [
            'font' => [
                'underline' => true,
                'color' => [
                    'argb' => Color::COLOR_BLUE,
                ],
            ],
        ], //    Blue underlined
        'hr' => [
            'borders' => [
                'bottom' => [
                    'borderStyle' => Border::BORDER_THIN,
                    // The colour must be keyed ('argb'), exactly like the 'a' entry above;
                    // a bare list element carries no key for the style array to read.
                    'color' => [
                        'argb' => Color::COLOR_BLACK,
                    ],
                ],
            ],
        ], //    Bottom border
    ];
99
100
    /**
     * Cells covered by a merged range that was created from a rowspan attribute.
     *
     * Keys are the cell references returned by Coordinate::extractAllCellReferencesInRange()
     * for the merged range, values are always true. processDomElement() looks keys up
     * as $column . $row to find out whether a position is already taken.
     *
     * @var array
     */
    protected $rowspan = [];
101
102
    /**
     * Build an HTML reader that uses the default read filter.
     */
    public function __construct()
    {
        $defaultFilter = new DefaultReadFilter();
        $this->readFilter = $defaultFilter;
    }
109
110
    /**
     * Check whether the given file looks like an HTML document.
     *
     * The file qualifies when its first sample starts with a tag and contains
     * at least one tag, and its last sample ends with a tag.
     *
     * @param string $pFilename
     *
     * @return bool
     */
    public function canRead($pFilename)
    {
        // A file that cannot be opened cannot be read
        try {
            $this->openFile($pFilename);
        } catch (Exception $e) {
            return false;
        }

        // Take both samples first so the handle can be released before any checks
        $head = $this->readBeginning();
        $tail = $this->readEnding();
        fclose($this->fileHandle);

        if (!self::startsWithTag($head)) {
            return false;
        }
        if (!self::containsTags($head)) {
            return false;
        }

        return self::endsWithTag($tail);
    }
135
136 8
    /**
     * Read the first TEST_SAMPLE_SIZE bytes of the currently opened file.
     *
     * @return false|string
     */
    private function readBeginning()
    {
        // Same as seeking to offset 0
        rewind($this->fileHandle);

        return fread($this->fileHandle, self::TEST_SAMPLE_SIZE);
    }
142
143 8
    /**
     * Read the last TEST_SAMPLE_SIZE bytes of the currently opened file,
     * or the whole file when it is smaller than that.
     *
     * @return false|string Empty string for a zero-length file
     */
    private function readEnding()
    {
        // The size is looked up by path, which is taken from the stream metadata
        $streamInfo = stream_get_meta_data($this->fileHandle);
        $size = filesize($streamInfo['uri']);
        if ($size === 0) {
            return '';
        }

        // Never try to read more than the file holds
        $blockSize = ($size < self::TEST_SAMPLE_SIZE) ? $size : self::TEST_SAMPLE_SIZE;

        fseek($this->fileHandle, $size - $blockSize);

        return fread($this->fileHandle, $blockSize);
    }
162
163 8
    /**
     * Tell whether the data, ignoring surrounding whitespace, begins with '<'.
     *
     * @param string $data
     *
     * @return bool
     */
    private static function startsWithTag($data)
    {
        $trimmed = trim($data);

        return strncmp($trimmed, '<', 1) === 0;
    }
167
168 8
    /**
     * Tell whether the data, ignoring surrounding whitespace, finishes with '>'.
     *
     * @param string $data
     *
     * @return bool
     */
    private static function endsWithTag($data)
    {
        $trimmed = trim($data);
        $lastChar = substr($trimmed, -1, 1);

        return $lastChar === '>';
    }
172
173 8
    /**
     * Tell whether stripping tags from the data makes it shorter,
     * i.e. whether it holds anything strip_tags() removes.
     *
     * @param string $data
     *
     * @return bool
     */
    private static function containsTags($data)
    {
        $withoutTags = strip_tags($data);

        return strlen($withoutTags) !== strlen($data);
    }
177
178
    /**
     * Read the given file into a freshly created Spreadsheet.
     *
     * @param string $pFilename
     *
     * @throws Exception
     *
     * @return Spreadsheet
     */
    public function load($pFilename)
    {
        // Delegate to loadIntoExisting() with a brand new workbook as the target
        return $this->loadIntoExisting($pFilename, new Spreadsheet());
    }
195
196
    /**
     * Store the encoding of the input.
     *
     * @param string $pValue Input encoding, eg: 'ANSI'
     *
     * @return $this
     */
    public function setInputEncoding($pValue)
    {
        $this->inputEncoding = $pValue;

        // Returned so calls can be chained
        return $this;
    }
207
208
    /**
     * Return the encoding currently configured for the input.
     *
     * @return string
     */
    public function getInputEncoding()
    {
        $encoding = $this->inputEncoding;

        return $encoding;
    }
217
218
    //    Data Array used for testing only, should write to Spreadsheet object on completion of tests
219
    /**
     * Copy of the values passed through flushCell(), indexed as [$row][$column].
     *
     * @var array
     */
    protected $dataArray = [];
220
    /**
     * Current table nesting depth: incremented by setTableStartColumn(),
     * decremented by releaseTableStartColumn(), and reset to 0 when
     * processDomElement() reaches a body element.
     *
     * @var int
     */
    protected $tableLevel = 0;
221
    /**
     * Start column recorded for each table nesting level, indexed by $tableLevel.
     *
     * @var array
     */
    protected $nestedColumn = ['A'];
222
223 1
    /**
     * Enter a new table nesting level and record the column it starts at.
     *
     * An outermost table (entered while the level is 0) always starts at 'A',
     * whatever column is passed in.
     *
     * @param string $column Column the cursor is at when the table begins
     *
     * @return string The start column recorded for the new level
     */
    protected function setTableStartColumn($column)
    {
        $startColumn = ($this->tableLevel == 0) ? 'A' : $column;

        ++$this->tableLevel;
        $this->nestedColumn[$this->tableLevel] = $startColumn;

        return $startColumn;
    }
233
234 1
    /**
     * Return the start column recorded for the current table nesting level.
     *
     * @return string
     */
    protected function getTableStartColumn()
    {
        $level = $this->tableLevel;

        return $this->nestedColumn[$level];
    }
238
239 1
    /**
     * Leave the current table nesting level.
     *
     * @return string The most recently recorded start column, which is removed from the stack
     */
    protected function releaseTableStartColumn()
    {
        $releasedColumn = array_pop($this->nestedColumn);
        --$this->tableLevel;

        return $releasedColumn;
    }
245
246 1
    /**
     * Write the pending cell content to the worksheet and reset it.
     *
     * A plain string is only written when it holds something other than whitespace.
     * Anything that is not a string is treated as a rich text run, which is not
     * supported yet and is only recorded in $dataArray.
     *
     * @param Worksheet $sheet
     * @param string $column
     * @param int $row
     * @param mixed $cellContent Pending content; always an empty string on return
     */
    protected function flushCell(Worksheet $sheet, $column, $row, &$cellContent)
    {
        if (!is_string($cellContent)) {
            //    We have a Rich Text run
            //    TODO
            $this->dataArray[$row][$column] = 'RICH TEXT: ' . $cellContent;
        } elseif (trim($cellContent) !== '') {
            //    Simple string content: only written when there is something in it
            $sheet->setCellValue($column . $row, $cellContent);
            $this->dataArray[$row][$column] = $cellContent;
        }

        $cellContent = '';
    }
264
265
    /**
     * Walk the children of a DOM node and transfer their content to the worksheet.
     *
     * Text nodes are appended to the pending cell content; element nodes are
     * dispatched on their tag name. $row, $column and $cellContent are passed by
     * reference and act as a cursor shared by all recursive calls.
     *
     * @param DOMNode $element Node whose children are processed
     * @param Worksheet $sheet Worksheet receiving values, styles and merges
     * @param int $row Current row, updated in place
     * @param string $column Current column letter, updated in place
     * @param string $cellContent Text collected for the current cell, updated in place
     */
    protected function processDomElement(DOMNode $element, Worksheet $sheet, &$row, &$column, &$cellContent)
    {
        foreach ($element->childNodes as $child) {
            if ($child instanceof DOMText) {
                // Collapse every run of whitespace into a single space
                $domText = preg_replace('/\s+/u', ' ', trim($child->nodeValue));
                if (is_string($cellContent)) {
                    //    simply append the text if the cell content is a plain text string
                    $cellContent .= $domText;
                }
                //    but if we have a rich text run instead, we need to append it correctly
                //    TODO
            } elseif ($child instanceof DOMElement) {
                $attributeArray = [];
                foreach ($child->attributes as $attribute) {
                    $attributeArray[$attribute->name] = $attribute->value;
                }

                switch ($child->nodeName) {
                    case 'meta':
                        //    TODO
                        //    Extract character set from the 'content' attribute, so we can convert to UTF-8 if required
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);

                        break;
                    case 'title':
                        // The document title becomes the worksheet title and is not written to a cell
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        $sheet->setTitle($cellContent, true, false);
                        $cellContent = '';

                        break;
                    case 'span':
                    case 'div':
                    case 'font':
                    case 'i':
                    case 'em':
                    case 'strong':
                    case 'b':
                        // Inline containers: keep their text separated from neighbouring text by a space
                        if ($cellContent > '') {
                            $cellContent .= ' ';
                        }
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        if ($cellContent > '') {
                            $cellContent .= ' ';
                        }

                        break;
                    case 'hr':
                        $this->flushCell($sheet, $column, $row, $cellContent);
                        ++$row;
                        if (isset($this->formats[$child->nodeName])) {
                            $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
                        } else {
                            $cellContent = '----------';
                            $this->flushCell($sheet, $column, $row, $cellContent);
                        }
                        ++$row;
                        // Add a break after a horizontal rule, simply by allowing the code to dropthru
                        // no break
                    case 'br':
                        if ($this->tableLevel > 0) {
                            //    If we're inside a table, replace with a \n
                            $cellContent .= "\n";
                        } else {
                            //    Otherwise flush our existing content and move the row cursor on
                            $this->flushCell($sheet, $column, $row, $cellContent);
                            ++$row;
                        }

                        break;
                    case 'a':
                        if (isset($attributeArray['href'])) {
                            $sheet->getCell($column . $row)->getHyperlink()->setUrl($attributeArray['href']);
                            if (isset($this->formats[$child->nodeName])) {
                                $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
                            }
                        }
                        $cellContent .= ' ';
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);

                        break;
                    case 'h1':
                    case 'h2':
                    case 'h3':
                    case 'h4':
                    case 'h5':
                    case 'h6':
                    case 'ol':
                    case 'ul':
                    case 'p':
                        if ($this->tableLevel > 0) {
                            //    If we're inside a table, replace with a \n
                            $cellContent .= "\n";
                            $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        } else {
                            // Block elements outside a table get a row of their own
                            if ($cellContent > '') {
                                $this->flushCell($sheet, $column, $row, $cellContent);
                                ++$row;
                            }
                            $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                            $this->flushCell($sheet, $column, $row, $cellContent);

                            if (isset($this->formats[$child->nodeName])) {
                                $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
                            }

                            ++$row;
                            $column = 'A';
                        }

                        break;
                    case 'li':
                        if ($this->tableLevel > 0) {
                            //    If we're inside a table, replace with a \n
                            $cellContent .= "\n";
                            $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        } else {
                            if ($cellContent > '') {
                                $this->flushCell($sheet, $column, $row, $cellContent);
                            }
                            ++$row;
                            $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                            $this->flushCell($sheet, $column, $row, $cellContent);
                            $column = 'A';
                        }

                        break;
                    case 'table':
                        $this->flushCell($sheet, $column, $row, $cellContent);
                        $column = $this->setTableStartColumn($column);
                        if ($this->tableLevel > 1) {
                            // A nested table starts on the row of the cell that contains it
                            --$row;
                        }
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        $column = $this->releaseTableStartColumn();
                        if ($this->tableLevel > 1) {
                            ++$column;
                        } else {
                            ++$row;
                        }

                        break;
                    case 'thead':
                    case 'tbody':
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);

                        break;
                    case 'tr':
                        // Every table row restarts at the start column of its table
                        $column = $this->getTableStartColumn();
                        $cellContent = '';
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        ++$row;

                        break;
                    case 'th':
                    case 'td':
                        // Move past positions already taken by a rowspan merge BEFORE touching
                        // the cell, so that content, inline style and value all land on the same cell
                        while (isset($this->rowspan[$column . $row])) {
                            ++$column;
                        }

                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);

                        // apply inline style
                        $this->applyInlineStyle($sheet, $row, $column, $attributeArray);

                        $this->flushCell($sheet, $column, $row, $cellContent);

                        // The background colour is independent of any span attributes, and is
                        // applied while $column still points at the first cell of the span
                        if (isset($attributeArray['bgcolor'])) {
                            $sheet->getStyle($column . $row)->applyFromArray(
                                [
                                    'fill' => [
                                        'fillType' => Fill::FILL_SOLID,
                                        'color' => ['rgb' => $attributeArray['bgcolor']],
                                    ],
                                ]
                            );
                        }

                        $hasRowspan = isset($attributeArray['rowspan']);
                        $hasColspan = isset($attributeArray['colspan']);
                        if ($hasRowspan || $hasColspan) {
                            // Last column of the merged range
                            $columnTo = $column;
                            if ($hasColspan) {
                                for ($i = 0; $i < $attributeArray['colspan'] - 1; ++$i) {
                                    ++$columnTo;
                                }
                            }
                            // Last row of the merged range
                            $rowTo = $hasRowspan ? $row + $attributeArray['rowspan'] - 1 : $row;
                            $range = $column . $row . ':' . $columnTo . $rowTo;

                            if ($hasRowspan) {
                                // Remember every covered cell so that later rows skip over it
                                foreach (Coordinate::extractAllCellReferencesInRange($range) as $value) {
                                    $this->rowspan[$value] = true;
                                }
                            }
                            $sheet->mergeCells($range);
                            $column = $columnTo;
                        }
                        ++$column;

                        break;
                    case 'body':
                        // Content starts at A1 with no table open
                        $row = 1;
                        $column = 'A';
                        $cellContent = '';
                        $this->tableLevel = 0;
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);

                        break;
                    default:
                        // Unknown tags are transparent: only their children matter
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                }
            }
        }
    }
503
504
    /**
     * Loads an HTML file into the given Spreadsheet instance.
     *
     * The sheet at $this->sheetIndex is created if needed, made active, and filled
     * with the content of the document.
     *
     * @param string $pFilename
     * @param Spreadsheet $spreadsheet
     *
     * @throws Exception When the file is not valid HTML, cannot be read, or cannot be parsed
     *
     * @return Spreadsheet
     */
    public function loadIntoExisting($pFilename, Spreadsheet $spreadsheet)
    {
        // Validate
        if (!$this->canRead($pFilename)) {
            throw new Exception($pFilename . ' is an Invalid HTML file.');
        }

        // Create new sheet
        while ($spreadsheet->getSheetCount() <= $this->sheetIndex) {
            $spreadsheet->createSheet();
        }
        $spreadsheet->setActiveSheetIndex($this->sheetIndex);

        // The scan may hand back a non-string (e.g. false when the file could not be read);
        // fail with a clear message instead of passing that on to the encoder and parser
        $html = $this->securityScanFile($pFilename);
        if (!is_string($html)) {
            throw new Exception('Failed to read ' . $pFilename);
        }

        //    Create a new DOM object
        $dom = new DOMDocument();
        //    Reload the HTML file into the DOM object
        $loaded = $dom->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'));
        if ($loaded === false) {
            throw new Exception('Failed to load ' . $pFilename . ' as a DOM Document');
        }

        //    Discard white space
        $dom->preserveWhiteSpace = false;

        // Cursor shared by the recursive DOM walk
        $row = 0;
        $column = 'A';
        $content = '';
        $this->processDomElement($dom, $spreadsheet->getActiveSheet(), $row, $column, $content);

        // Return
        return $spreadsheet;
    }
546
547
    /**
     * Return the index of the sheet that content is read into.
     *
     * @return int
     */
    public function getSheetIndex()
    {
        $index = $this->sheetIndex;

        return $index;
    }
556
557
    /**
     * Choose the index of the sheet that content is read into.
     *
     * @param int $pValue Sheet index
     *
     * @return Html
     */
    public function setSheetIndex($pValue)
    {
        $this->sheetIndex = $pValue;

        // Returned so calls can be chained
        return $this;
    }
570
571
    /**
     * Scan the XML for use of <!ENTITY to prevent XXE/XEE attacks.
     *
     * The search pattern allows an optional NUL byte before, between and after
     * the characters of "<!ENTITY".
     *
     * @param string $xml
     *
     * @throws Exception When an entity declaration is found
     *
     * @return string The unchanged input
     */
    public function securityScan($xml)
    {
        $characters = str_split('<!ENTITY');
        $pattern = '/\\0?' . implode('\\0?', $characters) . '\\0?/';

        if (!preg_match($pattern, $xml)) {
            return $xml;
        }

        throw new Exception('Detected use of ENTITY in XML, spreadsheet file load() aborted to prevent XXE/XEE attacks');
    }
587
588
    /**
     * Apply the colours found in an inline CSS style attribute to a cell.
     *
     * NOTES :
     * Currently only intended for td & th elements, and only the
     * 'background-color' and 'color' properties with a HEX colour (#RRGGBB) are used.
     * Every other declaration is ignored.
     *
     * TODO :
     * - Implement other properties, such as border
     *
     * @param Worksheet $sheet
     * @param int $row
     * @param string $column
     * @param array $attributeArray Attributes of the element, keyed by attribute name
     */
    private function applyInlineStyle(&$sheet, $row, $column, $attributeArray)
    {
        if (!isset($attributeArray['style'])) {
            return;
        }

        $supportedStyles = ['background-color', 'color'];

        foreach (explode(';', $attributeArray['style']) as $declaration) {
            $parts = explode(':', $declaration);
            // A declaration without a ':' has no value to read
            if (!isset($parts[1])) {
                continue;
            }

            $property = trim($parts[0]);
            if (!in_array($property, $supportedStyles, true)) {
                continue;
            }

            // Only hex colours are understood. The colour is derived afresh for every
            // declaration so that one declaration can never reuse the colour of another.
            $rawValue = trim($parts[1]);
            if (substr($rawValue, 0, 1) !== '#') {
                continue;
            }
            $styleColor = substr($rawValue, 1);
            if (empty($styleColor)) {
                continue;
            }

            switch ($property) {
                case 'background-color':
                    $sheet->getStyle($column . $row)->applyFromArray(
                        ['fill' => ['fillType' => Fill::FILL_SOLID, 'color' => ['rgb' => $styleColor]]]
                    );

                    break;
                case 'color':
                    $sheet->getStyle($column . $row)->applyFromArray(
                        ['font' => ['color' => ['rgb' => $styleColor]]]
                    );

                    break;
            }
        }
    }
641
}
642