Completed
Push — develop ( d3e769...440bfe )
by Adrien
22:10
created

Html::readBeginning()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 6
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 3
CRAP Score 1

Importance

Changes 0
Metric Value
cc 1
eloc 3
nc 1
nop 0
dl 0
loc 6
rs 9.4285
c 0
b 0
f 0
ccs 3
cts 3
cp 1
crap 1
1
<?php
2
3
namespace PhpOffice\PhpSpreadsheet\Reader;
4
5
use DOMDocument;
6
use DOMElement;
7
use DOMNode;
8
use DOMText;
9
use PhpOffice\PhpSpreadsheet\Cell;
10
use PhpOffice\PhpSpreadsheet\Spreadsheet;
11
use PhpOffice\PhpSpreadsheet\Style\Border;
12
use PhpOffice\PhpSpreadsheet\Style\Color;
13
use PhpOffice\PhpSpreadsheet\Style\Fill;
14
use PhpOffice\PhpSpreadsheet\Worksheet;
15
16
/** Reader that loads the content of an HTML document into a Spreadsheet worksheet. */
17
class Html extends BaseReader implements IReader
18
{
19
    /**
20
     * Sample size to read to determine if it's HTML or not.
21
     */
22
    const TEST_SAMPLE_SIZE = 2048;
23
24
    /**
25
     * Input encoding.
26
     *
27
     * @var string
28
     */
29
    protected $inputEncoding = 'ANSI';
30
31
    /**
32
     * Sheet index to read.
33
     *
34
     * @var int
35
     */
36
    protected $sheetIndex = 0;
37
38
    /**
39
     * Formats.
40
     *
41
     * @var array
42
     */
43
    protected $formats = [
        //    Headings: bold, with a fixed font size per heading level
        'h1' => [
            'font' => [
                'bold' => true,
                'size' => 24,
            ],
        ],
        'h2' => [
            'font' => [
                'bold' => true,
                'size' => 18,
            ],
        ],
        'h3' => [
            'font' => [
                'bold' => true,
                'size' => 13.5,
            ],
        ],
        'h4' => [
            'font' => [
                'bold' => true,
                'size' => 12,
            ],
        ],
        'h5' => [
            'font' => [
                'bold' => true,
                'size' => 10,
            ],
        ],
        'h6' => [
            'font' => [
                'bold' => true,
                'size' => 7.5,
            ],
        ],
        //    Hyperlinks: blue and underlined
        'a' => [
            'font' => [
                'underline' => true,
                'color' => [
                    'argb' => Color::COLOR_BLUE,
                ],
            ],
        ],
        //    Horizontal rule: thin bottom border
        'hr' => [
            'borders' => [
                'bottom' => [
                    'borderStyle' => Border::BORDER_THIN,
                    //    Keyed by 'argb', the same way the 'a' format addresses its colour
                    'color' => [
                        'argb' => Color::COLOR_BLACK,
                    ],
                ],
            ],
        ],
    ];
99
100
    protected $rowspan = [];
101
102
    /**
     * Create a new HTML Reader instance.
     *
     * The reader starts out with a DefaultReadFilter as its read filter.
     */
    public function __construct()
    {
        $defaultFilter = new DefaultReadFilter();
        $this->readFilter = $defaultFilter;
    }
109
110
    /**
     * Check whether the given file looks like an HTML document.
     *
     * A sample is read from the start and from the end of the file. The file is
     * accepted only when the leading sample begins with '<' and contains at least
     * one tag, and the trailing sample ends with '>' (surrounding whitespace is
     * ignored for the begin/end checks).
     *
     * @param string $pFilename
     *
     * @return bool False when the file cannot be opened or does not look like HTML
     */
    public function canRead($pFilename)
    {
        try {
            $this->openFile($pFilename);
        } catch (Exception $e) {
            // The file could not be opened, so it cannot be read
            return false;
        }

        // Take both samples first (beginning, then ending), then release the handle
        $head = $this->readBeginning();
        $tail = $this->readEnding();
        fclose($this->fileHandle);

        if (!self::startsWithTag($head)) {
            return false;
        }
        if (!self::containsTags($head)) {
            return false;
        }

        return self::endsWithTag($tail);
    }
137
138 2
    /**
     * Read the leading sample (up to TEST_SAMPLE_SIZE bytes) of the opened file.
     *
     * @return false|string Result of fread() on the file handle
     */
    private function readBeginning()
    {
        // Always sample from the very start, wherever the pointer currently is
        fseek($this->fileHandle, 0, SEEK_SET);
        $sample = fread($this->fileHandle, self::TEST_SAMPLE_SIZE);

        return $sample;
    }
144
145 2
    /**
     * Read the trailing sample (up to TEST_SAMPLE_SIZE bytes) of the opened file.
     *
     * The sample size is capped at the file size, so that files shorter than
     * TEST_SAMPLE_SIZE are read from their first byte instead of seeking to a
     * negative offset.
     *
     * @return false|string Trailing bytes of the file; '' when the size is zero or unknown
     */
    private function readEnding()
    {
        $meta = stream_get_meta_data($this->fileHandle);
        $size = filesize($meta['uri']);

        // Nothing to sample (empty file, or size could not be determined)
        if (!$size) {
            return '';
        }

        $blockSize = min($size, self::TEST_SAMPLE_SIZE);
        fseek($this->fileHandle, $size - $blockSize);

        return fread($this->fileHandle, $blockSize);
    }
157
158 2
    /**
     * Tell whether the data, ignoring surrounding whitespace, begins with '<'.
     *
     * @param string $data
     *
     * @return bool
     */
    private static function startsWithTag($data)
    {
        $trimmed = trim($data);
        if ($trimmed === '') {
            return false;
        }

        return $trimmed[0] === '<';
    }
162
163 2
    /**
     * Tell whether the data, ignoring surrounding whitespace, ends with '>'.
     *
     * @param string $data
     *
     * @return bool
     */
    private static function endsWithTag($data)
    {
        $trimmed = trim($data);
        if ($trimmed === '') {
            return false;
        }

        return substr($trimmed, -1) === '>';
    }
167
168 2
    /**
     * Tell whether stripping tags from the data makes it shorter,
     * i.e. whether strip_tags() found something to remove.
     *
     * @param string $data
     *
     * @return bool
     */
    private static function containsTags($data)
    {
        $withoutTags = strip_tags($data);

        return strlen($withoutTags) !== strlen($data);
    }
172
173
    /**
     * Load the given file into a freshly created Spreadsheet.
     *
     * @param string $pFilename
     *
     * @throws Exception
     *
     * @return Spreadsheet
     */
    public function load($pFilename)
    {
        // Delegate to loadIntoExisting() with a brand new, empty Spreadsheet
        return $this->loadIntoExisting($pFilename, new Spreadsheet());
    }
190
191
    /**
     * Set input encoding.
     *
     * @param string $pValue Input encoding, eg: 'ANSI'
     *
     * @return $this
     */
    public function setInputEncoding($pValue)
    {
        $encoding = $pValue;
        $this->inputEncoding = $encoding;

        return $this;
    }
202
203
    /**
     * Get the input encoding currently configured on this reader.
     *
     * @return string
     */
    public function getInputEncoding()
    {
        $encoding = $this->inputEncoding;

        return $encoding;
    }
212
213
    //    Data Array used for testing only, should write to Spreadsheet object on completion of tests
214
    protected $dataArray = [];
215
    protected $tableLevel = 0;
216
    protected $nestedColumn = ['A'];
217
218 1
    /**
     * Enter a (possibly nested) table and remember the column it starts in.
     *
     * A table opened while no other table is open always starts in column 'A';
     * a nested table starts in the column passed in.
     *
     * @param string $column Column the cursor is in when the table opens
     *
     * @return string Start column recorded for the new table level
     */
    protected function setTableStartColumn($column)
    {
        $startColumn = ($this->tableLevel == 0) ? 'A' : $column;

        $level = ++$this->tableLevel;
        $this->nestedColumn[$level] = $startColumn;

        return $startColumn;
    }
228
229 1
    /**
     * Get the start column recorded for the current table nesting level.
     *
     * @return string
     */
    protected function getTableStartColumn()
    {
        $level = $this->tableLevel;

        return $this->nestedColumn[$level];
    }
233
234 1
    /**
     * Leave the current table: drop one nesting level and remove the last
     * recorded start column.
     *
     * @return string The start column that was removed
     */
    protected function releaseTableStartColumn()
    {
        $released = array_pop($this->nestedColumn);
        --$this->tableLevel;

        return $released;
    }
240
241 1
    /**
     * Write the accumulated cell content to the worksheet and clear the buffer.
     *
     * String content is written only when it is not blank after trimming.
     * Non-string content (rich text, not implemented yet) is only recorded in
     * the internal data array. In every case $cellContent is reset to ''.
     *
     * @param Worksheet $sheet
     * @param string $column
     * @param int $row
     * @param mixed $cellContent Buffer holding the cell content (reset in place)
     */
    protected function flushCell($sheet, $column, $row, &$cellContent)
    {
        if (!is_string($cellContent)) {
            //    We have a Rich Text run
            //    TODO
            $this->dataArray[$row][$column] = 'RICH TEXT: ' . $cellContent;
        } elseif (trim($cellContent) !== '') {
            //    Simple string content: only write it if there is something in the string
            $sheet->setCellValue($column . $row, $cellContent);
            $this->dataArray[$row][$column] = $cellContent;
        }

        $cellContent = '';
    }
259
260
    /**
     * Recursively walk the child nodes of a DOM node and transfer them to the worksheet.
     *
     * Text nodes are appended (whitespace-collapsed) to the cell content buffer;
     * element nodes are dispatched on their tag name. $row, $column and
     * $cellContent are passed by reference and serve as the shared cursor and
     * text buffer for the whole recursion.
     *
     * @param DOMNode $element Node whose child nodes are processed
     * @param Worksheet $sheet Worksheet receiving values, styles, hyperlinks and merges
     * @param int $row Current row (updated in place)
     * @param string $column Current column letter (updated in place)
     * @param string $cellContent Content accumulated for the current cell (updated in place)
     */
    protected function processDomElement(DOMNode $element, Worksheet $sheet, &$row, &$column, &$cellContent)
    {
        foreach ($element->childNodes as $child) {
            if ($child instanceof DOMText) {
                $domText = preg_replace('/\s+/u', ' ', trim($child->nodeValue));
                if (is_string($cellContent)) {
                    //    simply append the text if the cell content is a plain text string
                    $cellContent .= $domText;
                }
                //    but if we have a rich text run instead, we need to append it correctly
                //    TODO
            } elseif ($child instanceof DOMElement) {
                $attributeArray = [];
                foreach ($child->attributes as $attribute) {
                    $attributeArray[$attribute->name] = $attribute->value;
                }

                switch ($child->nodeName) {
                    case 'meta':
                        //    TODO
                        //    Extract character set from the 'content' attribute, so we can convert to UTF-8 if required
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);

                        break;
                    case 'title':
                        //    The document title becomes the worksheet title and is not written to a cell
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        $sheet->setTitle($cellContent, true, false);
                        $cellContent = '';

                        break;
                    case 'span':
                    case 'div':
                    case 'font':
                    case 'i':
                    case 'em':
                    case 'strong':
                    case 'b':
                        //    Inline content: separate it from its neighbours with a space
                        if ($cellContent > '') {
                            $cellContent .= ' ';
                        }
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        if ($cellContent > '') {
                            $cellContent .= ' ';
                        }

                        break;
                    case 'hr':
                        $this->flushCell($sheet, $column, $row, $cellContent);
                        ++$row;
                        if (isset($this->formats[$child->nodeName])) {
                            $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
                        } else {
                            $cellContent = '----------';
                            $this->flushCell($sheet, $column, $row, $cellContent);
                        }
                        ++$row;
                        // Add a break after a horizontal rule, simply by allowing the code to dropthru
                        // no break
                    case 'br':
                        if ($this->tableLevel > 0) {
                            //    If we're inside a table, replace with a \n
                            $cellContent .= "\n";
                        } else {
                            //    Otherwise flush our existing content and move the row cursor on
                            $this->flushCell($sheet, $column, $row, $cellContent);
                            ++$row;
                        }

                        break;
                    case 'a':
                        foreach ($attributeArray as $attributeName => $attributeValue) {
                            switch ($attributeName) {
                                case 'href':
                                    $sheet->getCell($column . $row)->getHyperlink()->setUrl($attributeValue);
                                    if (isset($this->formats[$child->nodeName])) {
                                        $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
                                    }

                                    break;
                            }
                        }
                        $cellContent .= ' ';
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);

                        break;
                    case 'h1':
                    case 'h2':
                    case 'h3':
                    case 'h4':
                    case 'h5':
                    case 'h6':
                    case 'ol':
                    case 'ul':
                    case 'p':
                        if ($this->tableLevel > 0) {
                            //    If we're inside a table, replace with a \n
                            $cellContent .= "\n";
                            $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        } else {
                            //    Outside a table a block element gets a row of its own
                            if ($cellContent > '') {
                                $this->flushCell($sheet, $column, $row, $cellContent);
                                ++$row;
                            }
                            $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                            $this->flushCell($sheet, $column, $row, $cellContent);

                            if (isset($this->formats[$child->nodeName])) {
                                $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
                            }

                            ++$row;
                            $column = 'A';
                        }

                        break;
                    case 'li':
                        if ($this->tableLevel > 0) {
                            //    If we're inside a table, replace with a \n
                            $cellContent .= "\n";
                            $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        } else {
                            if ($cellContent > '') {
                                $this->flushCell($sheet, $column, $row, $cellContent);
                            }
                            ++$row;
                            $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                            $this->flushCell($sheet, $column, $row, $cellContent);
                            $column = 'A';
                        }

                        break;
                    case 'table':
                        $this->flushCell($sheet, $column, $row, $cellContent);
                        $column = $this->setTableStartColumn($column);
                        if ($this->tableLevel > 1) {
                            --$row;
                        }
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        $column = $this->releaseTableStartColumn();
                        if ($this->tableLevel > 1) {
                            ++$column;
                        } else {
                            ++$row;
                        }

                        break;
                    case 'thead':
                    case 'tbody':
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);

                        break;
                    case 'tr':
                        //    Every table row restarts at the start column of the current table
                        $column = $this->getTableStartColumn();
                        $cellContent = '';
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        ++$row;

                        break;
                    case 'th':
                    case 'td':
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);

                        // apply inline style
                        $this->applyInlineStyle($sheet, $row, $column, $attributeArray);

                        //    Skip cells already claimed by a rowspan/colspan merge
                        while (isset($this->rowspan[$column . $row])) {
                            ++$column;
                        }

                        $this->flushCell($sheet, $column, $row, $cellContent);

                        if (isset($attributeArray['rowspan'], $attributeArray['colspan'])) {
                            //create merging rowspan and colspan
                            $columnTo = $column;
                            for ($i = 0; $i < $attributeArray['colspan'] - 1; ++$i) {
                                ++$columnTo;
                            }
                            $range = $column . $row . ':' . $columnTo . ($row + $attributeArray['rowspan'] - 1);
                            foreach (Cell::extractAllCellReferencesInRange($range) as $value) {
                                $this->rowspan[$value] = true;
                            }
                            $sheet->mergeCells($range);
                            $column = $columnTo;
                        } elseif (isset($attributeArray['rowspan'])) {
                            //create merging rowspan
                            $range = $column . $row . ':' . $column . ($row + $attributeArray['rowspan'] - 1);
                            foreach (Cell::extractAllCellReferencesInRange($range) as $value) {
                                $this->rowspan[$value] = true;
                            }
                            $sheet->mergeCells($range);
                        } elseif (isset($attributeArray['colspan'])) {
                            //create merging colspan
                            $columnTo = $column;
                            for ($i = 0; $i < $attributeArray['colspan'] - 1; ++$i) {
                                ++$columnTo;
                            }
                            $sheet->mergeCells($column . $row . ':' . $columnTo . $row);
                            $column = $columnTo;
                        } elseif (isset($attributeArray['bgcolor'])) {
                            //    NOTE(review): bgcolor is only honoured on cells without rowspan/colspan - confirm this is intended
                            $sheet->getStyle($column . $row)->applyFromArray(
                                [
                                    'fill' => [
                                        'fillType' => Fill::FILL_SOLID,
                                        'color' => ['rgb' => $attributeArray['bgcolor']],
                                    ],
                                ]
                            );
                        }
                        ++$column;

                        break;
                    case 'body':
                        //    The body resets the cursor to the first cell and the table nesting level
                        $row = 1;
                        $column = 'A';
                        $this->tableLevel = 0;
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);

                        break;
                    default:
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                }
            }
        }
    }
498
499
    /**
     * Load an HTML file into a worksheet of an existing Spreadsheet.
     *
     * The target is the sheet at the configured sheet index; missing sheets are
     * created until that index exists. The file content returned by
     * securityScanFile() is converted from UTF-8 to HTML entities before being
     * parsed (the configured input encoding is not consulted here).
     *
     * @param string $pFilename
     * @param Spreadsheet $spreadsheet
     *
     * @throws Exception When the file is not readable as HTML or cannot be parsed
     *
     * @return Spreadsheet
     */
    public function loadIntoExisting($pFilename, Spreadsheet $spreadsheet)
    {
        // Validate
        if (!$this->canRead($pFilename)) {
            throw new Exception($pFilename . ' is an Invalid HTML file.');
        }

        // Create new sheet
        while ($spreadsheet->getSheetCount() <= $this->sheetIndex) {
            $spreadsheet->createSheet();
        }
        $spreadsheet->setActiveSheetIndex($this->sheetIndex);

        //    Create a new DOM object
        $dom = new DOMDocument();
        //    Reload the HTML file into the DOM object
        $loaded = $dom->loadHTML(mb_convert_encoding($this->securityScanFile($pFilename), 'HTML-ENTITIES', 'UTF-8'));
        if ($loaded === false) {
            throw new Exception('Failed to load ' . $pFilename . ' as a DOM Document');
        }

        //    Discard white space
        $dom->preserveWhiteSpace = false;

        //    Merged-cell markers belong to a single document: forget those of any
        //    previous load so that reusing the reader does not shift cells
        $this->rowspan = [];

        $row = 0;
        $column = 'A';
        $content = '';
        $this->processDomElement($dom, $spreadsheet->getActiveSheet(), $row, $column, $content);

        // Return
        return $spreadsheet;
    }
541
542
    /**
     * Get the index of the sheet this reader loads into.
     *
     * @return int
     */
    public function getSheetIndex()
    {
        $index = $this->sheetIndex;

        return $index;
    }
551
552
    /**
     * Set the index of the sheet this reader loads into.
     *
     * @param int $pValue Sheet index
     *
     * @return $this
     */
    public function setSheetIndex($pValue)
    {
        $index = $pValue;
        $this->sheetIndex = $index;

        return $this;
    }
565
566
    /**
     * Scan the XML for use of <!ENTITY to prevent XXE/XEE attacks.
     *
     * The search pattern allows an optional NUL byte before, between and after
     * the characters of '<!ENTITY'.
     *
     * @param string $xml
     *
     * @throws Exception When an ENTITY declaration is detected
     *
     * @return string The unchanged input when nothing was detected
     */
    public function securityScan($xml)
    {
        $optionalNul = '\\0?';
        $entityChars = str_split('<!ENTITY');
        $pattern = '/' . $optionalNul . implode($optionalNul, $entityChars) . $optionalNul . '/';

        if (!preg_match($pattern, $xml)) {
            return $xml;
        }

        throw new Exception('Detected use of ENTITY in XML, spreadsheet file load() aborted to prevent XXE/XEE attacks');
    }
582
583
    /**
     * Apply the inline CSS style of an element to a cell.
     *
     * NOTES :
     * Currently only intended for td & th elements,
     * and only takes the 'background-color' and 'color' properties with a HEX color ('#' prefixed).
     * Declarations that are unsupported, have no value or are not a '#' color are skipped.
     *
     * TODO :
     * - Implement other properties, such as border
     *
     * @param Worksheet $sheet
     * @param int $row
     * @param string $column
     * @param array $attributeArray Attributes of the element, keyed by attribute name
     */
    private function applyInlineStyle(&$sheet, $row, $column, $attributeArray)
    {
        if (!isset($attributeArray['style'])) {
            return;
        }

        $supported_styles = ['background-color', 'color'];

        // add color styles (background & text) from dom element,currently support : td & th, using ONLY inline css style with RGB color
        $styles = explode(';', $attributeArray['style']);
        foreach ($styles as $st) {
            $value = explode(':', $st);

            // skip declarations without a value part (e.g. the empty string after a trailing ';')
            if (!isset($value[1])) {
                continue;
            }

            $style_name = trim($value[0]);
            if (!in_array($style_name, $supported_styles, true)) {
                continue;
            }

            //check if has #, so we can get clean hex
            //the color is determined per declaration, so it can never leak from a previous one
            $style_value = trim($value[1]);
            if (substr($style_value, 0, 1) !== '#') {
                continue;
            }

            $style_color = substr($style_value, 1);
            if (empty($style_color)) {
                continue;
            }

            switch ($style_name) {
                case 'background-color':
                    $sheet->getStyle($column . $row)->applyFromArray(['fill' => ['fillType' => Fill::FILL_SOLID, 'color' => ['rgb' => $style_color]]]);

                    break;
                case 'color':
                    $sheet->getStyle($column . $row)->applyFromArray(['font' => ['color' => ['rgb' => $style_color]]]);

                    break;
            }
        }
    }
636
}
637