Completed
Push — develop ( 8c5838...f74fde )
by Adrien
20:53
created

HTML::canRead()   A

Complexity

Conditions 4
Paths 4

Size

Total Lines 18
Code Lines 11

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 8
CRAP Score 4.128

Importance

Changes 0
Metric Value
cc 4
eloc 11
nc 4
nop 1
dl 0
loc 18
ccs 8
cts 10
cp 0.8
crap 4.128
rs 9.2
c 0
b 0
f 0
1
<?php
2
3
namespace PhpOffice\PhpSpreadsheet\Reader;
4
5
use DOMDocument;
6
use DOMElement;
7
use DOMNode;
8
use DOMText;
9
use PhpOffice\PhpSpreadsheet\Spreadsheet;
10
11
/**
12
 * Copyright (c) 2006 - 2016 PhpSpreadsheet
13
 *
14
 * This library is free software; you can redistribute it and/or
15
 * modify it under the terms of the GNU Lesser General Public
16
 * License as published by the Free Software Foundation; either
17
 * version 2.1 of the License, or (at your option) any later version.
18
 *
19
 * This library is distributed in the hope that it will be useful,
20
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
22
 * Lesser General Public License for more details.
23
 *
24
 * You should have received a copy of the GNU Lesser General Public
25
 * License along with this library; if not, write to the Free Software
26
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
27
 *
28
 * @category   PhpSpreadsheet
29
 * @copyright  Copyright (c) 2006 - 2016 PhpSpreadsheet (https://github.com/PHPOffice/PhpSpreadsheet)
30
 * @license    http://www.gnu.org/licenses/old-licenses/lgpl-2.1.txt    LGPL
31
 * @version    ##VERSION##, ##DATE##
32
 */
33
/** Reader that loads the content of an HTML file into a Spreadsheet */
34
class HTML extends BaseReader implements IReader
35
{
36
    /**
     * Number of bytes sampled from the start and from the end of a file
     * when deciding whether it looks like HTML
     */
    const TEST_SAMPLE_SIZE = 2048;

    /**
     * Input encoding
     *
     * @var string
     */
    protected $inputEncoding = 'ANSI';

    /**
     * Index of the sheet that receives the loaded content
     *
     * @var int
     */
    protected $sheetIndex = 0;

    /**
     * Style arrays applied to cells, keyed by the HTML tag name that triggers them
     *
     * @var array
     */
    protected $formats = [
        'h1' => [
            'font' => [
                'bold' => true,
                'size' => 24,
            ],
        ], //    Bold, 24pt
        'h2' => [
            'font' => [
                'bold' => true,
                'size' => 18,
            ],
        ], //    Bold, 18pt
        'h3' => [
            'font' => [
                'bold' => true,
                'size' => 13.5,
            ],
        ], //    Bold, 13.5pt
        'h4' => [
            'font' => [
                'bold' => true,
                'size' => 12,
            ],
        ], //    Bold, 12pt
        'h5' => [
            'font' => [
                'bold' => true,
                'size' => 10,
            ],
        ], //    Bold, 10pt
        'h6' => [
            'font' => [
                'bold' => true,
                'size' => 7.5,
            ],
        ], //    Bold, 7.5pt
        'a' => [
            'font' => [
                'underline' => true,
                'color' => [
                    'argb' => \PhpOffice\PhpSpreadsheet\Style\Color::COLOR_BLUE,
                ],
            ],
        ], //    Blue underlined
        'hr' => [
            'borders' => [
                'bottom' => [
                    'style' => \PhpOffice\PhpSpreadsheet\Style\Border::BORDER_THIN,
                    'color' => [
                        //    Keyed by 'argb' like the 'a' entry above; the colour used to be
                        //    a bare list element with no key
                        'argb' => \PhpOffice\PhpSpreadsheet\Style\Color::COLOR_BLACK,
                    ],
                ],
            ],
        ], //    Bottom border
    ];

    /**
     * Coordinates (column letter followed by row number) of cells covered by a
     * merged range that spans rows; used as a set, the value is always true
     *
     * @var array
     */
    protected $rowspan = [];
118
119
    /**
     * Build an HTML reader whose read filter is a DefaultReadFilter
     */
    public function __construct()
    {
        $this->readFilter = new DefaultReadFilter();
    }
126
127
    /**
     * Decide whether the given file looks like an HTML document
     *
     * The file has to open successfully, its leading sample has to start with
     * '<' and contain at least one tag, and its trailing sample has to end with '>'.
     *
     * @param     string         $pFilename
     * @return bool false when the file cannot be opened or any of the checks fails
     */
    public function canRead($pFilename)
    {
        try {
            $this->openFile($pFilename);
        } catch (Exception $openFailure) {
            // A file that cannot be opened is reported as not readable rather than as an error
            return false;
        }

        // Take both samples first (beginning, then ending), then release the handle
        $head = $this->readBeginning();
        $tail = $this->readEnding();
        fclose($this->fileHandle);

        if (!self::startsWithTag($head)) {
            return false;
        }
        if (!self::containsTags($head)) {
            return false;
        }

        return self::endsWithTag($tail);
    }
152
153 1
    /**
     * Read the leading sample (up to TEST_SAMPLE_SIZE bytes) of the open file
     *
     * @return string|false whatever fread() yields for the sample
     */
    private function readBeginning()
    {
        $handle = $this->fileHandle;

        // Always sample from the very first byte, wherever the pointer currently is
        fseek($handle, 0, SEEK_SET);
        $sample = fread($handle, self::TEST_SAMPLE_SIZE);

        return $sample;
    }
159
160 1
    /**
     * Read the trailing sample of the open file: its last TEST_SAMPLE_SIZE
     * bytes, or the whole file when it is shorter than that
     *
     * @return string|false '' for an empty file, otherwise whatever fread() yields
     */
    private function readEnding()
    {
        $meta = stream_get_meta_data($this->fileHandle);
        $filename = $meta['uri'];

        // filesize() yields false on failure; treat that like an empty file
        $size = (int) filesize($filename);
        if ($size <= 0) {
            return '';
        }

        // Never ask for more bytes than the file holds. With a fixed block size a
        // file shorter than the sample produced a negative offset, fseek() failed,
        // and the following fread() came back empty, so small files were rejected.
        $blockSize = min(self::TEST_SAMPLE_SIZE, $size);

        fseek($this->fileHandle, $size - $blockSize);

        return fread($this->fileHandle, $blockSize);
    }
172
173 1
    /**
     * Tell whether the data, ignoring surrounding whitespace, begins with '<'
     *
     * @param string $data
     * @return bool
     */
    private static function startsWithTag($data)
    {
        $trimmed = trim($data);

        return isset($trimmed[0]) && $trimmed[0] === '<';
    }
177
178 1
    /**
     * Tell whether the data, ignoring surrounding whitespace, finishes with '>'
     *
     * @param string $data
     * @return bool
     */
    private static function endsWithTag($data)
    {
        $trimmed = trim($data);
        if ($trimmed === '') {
            return false;
        }

        return $trimmed[strlen($trimmed) - 1] === '>';
    }
182
183 1
    /**
     * Tell whether strip_tags() removes anything from the data, i.e. whether
     * the stripped text is shorter or longer than the original
     *
     * @param string $data
     * @return bool
     */
    private static function containsTags($data)
    {
        $originalLength = strlen($data);
        $strippedLength = strlen(strip_tags($data));

        return $strippedLength !== $originalLength;
    }
187
188
    /**
     * Read an HTML file into a freshly created Spreadsheet
     *
     * @param  string                    $pFilename
     * @throws Exception
     * @return Spreadsheet
     */
    public function load($pFilename)
    {
        // Static analysis reports that IReader::load is declared as returning
        // PhpOffice\PhpSpreadsheet\Reader\PhpSpreadsheet, which the Spreadsheet
        // returned here does not match.
        // NOTE(review): presumably a stale annotation on the interface - confirm there.
        return $this->loadIntoExisting($pFilename, new Spreadsheet());
    }
203
204
    /**
     * Remember the encoding of the input
     *
     * @param string $pValue Input encoding
     * @return HTML this reader, so calls can be chained
     */
    public function setInputEncoding($pValue = 'ANSI')
    {
        $this->inputEncoding = $pValue;

        return $this;
    }
215
216
    /**
     * Report the encoding previously stored for the input
     *
     * @return string
     */
    public function getInputEncoding()
    {
        return $this->inputEncoding;
    }
225
226
    /**
     * Cell values recorded while flushing cells, indexed as [row][column].
     * Used for testing only, should write to Spreadsheet object on completion of tests
     *
     * @var array
     */
    protected $dataArray = [];

    /**
     * Current table nesting depth; 0 means not inside any table
     *
     * @var int
     */
    protected $tableLevel = 0;

    /**
     * Start column of each table, indexed by nesting depth
     *
     * @var array
     */
    protected $nestedColumn = ['A'];
230
231
    /**
     * Enter a table: go one nesting level deeper and record its start column
     *
     * An outermost table always starts in column 'A', whatever column is passed in.
     *
     * @param string $column Column the cursor is in when the table opens
     * @return string Start column recorded for the new nesting level
     */
    protected function setTableStartColumn($column)
    {
        $startColumn = ($this->tableLevel == 0) ? 'A' : $column;

        $level = ++$this->tableLevel;
        $this->nestedColumn[$level] = $startColumn;

        return $startColumn;
    }
241
242
    /**
     * Look up the start column recorded for the current table nesting level
     *
     * @return string
     */
    protected function getTableStartColumn()
    {
        $level = $this->tableLevel;

        return $this->nestedColumn[$level];
    }
246
247
    /**
     * Leave a table: drop the most recently recorded start column and go one
     * nesting level up
     *
     * @return string|null The start column that was removed
     */
    protected function releaseTableStartColumn()
    {
        $released = array_pop($this->nestedColumn);
        --$this->tableLevel;

        return $released;
    }
253
254
    /**
     * Write the pending cell content to the worksheet and reset it
     *
     * @param mixed $sheet Worksheet receiving the value (only setCellValue() is called on it)
     * @param string $column Column letter of the target cell
     * @param int $row Row number of the target cell
     * @param string $cellContent Pending content; always reset to '' (by reference)
     */
    protected function flushCell($sheet, $column, $row, &$cellContent)
    {
        if (!is_string($cellContent)) {
            //    We have a Rich Text run: not written to the sheet yet, only recorded
            //    TODO
            $this->dataArray[$row][$column] = 'RICH TEXT: ' . $cellContent;
        } elseif ('' !== trim($cellContent)) {
            //    Simple string with actual content: write it and record it;
            //    whitespace-only strings are dropped
            $sheet->setCellValue($column . $row, $cellContent, true);
            $this->dataArray[$row][$column] = $cellContent;
        }

        $cellContent = '';
    }
272
273
    /**
     * Walk the child nodes of a DOM node and write their content to the worksheet
     *
     * Text nodes are appended to the pending cell content; element nodes are
     * dispatched on their tag name and normally recurse into their own children.
     * The row, column and pending content are passed by reference, so the write
     * cursor is shared across the whole recursion.
     *
     * @param DOMNode $element Node whose child nodes are processed
     * @param \PhpOffice\PhpSpreadsheet\Worksheet $sheet Worksheet receiving the cells
     * @param int $row Current row (by reference)
     * @param string $column Current column letter (by reference)
     * @param string $cellContent Content collected for the current cell (by reference)
     */
    protected function processDomElement(DOMNode $element, \PhpOffice\PhpSpreadsheet\Worksheet $sheet, &$row, &$column, &$cellContent)
    {
        foreach ($element->childNodes as $child) {
            if ($child instanceof DOMText) {
                $domText = preg_replace('/\s+/u', ' ', trim($child->nodeValue));
                if (is_string($cellContent)) {
                    //    simply append the text if the cell content is a plain text string
                    $cellContent .= $domText;
                }
                //    TODO: if we have a rich text run instead, we need to append it correctly
            } elseif ($child instanceof DOMElement) {
                $attributeArray = [];
                foreach ($child->attributes as $attribute) {
                    $attributeArray[$attribute->name] = $attribute->value;
                }

                switch ($child->nodeName) {
                    case 'meta':
                        //    TODO
                        //    Extract character set from the 'content' attribute, so we can convert to UTF-8 if required
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        break;
                    case 'title':
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        $sheet->setTitle($cellContent);
                        $cellContent = '';
                        break;
                    case 'span':
                    case 'div':
                    case 'font':
                    case 'i':
                    case 'em':
                    case 'strong':
                    case 'b':
                        //    Inline content: keep it in the current cell, separated by spaces
                        if ($cellContent > '') {
                            $cellContent .= ' ';
                        }
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        if ($cellContent > '') {
                            $cellContent .= ' ';
                        }
                        break;
                    case 'hr':
                        $this->flushCell($sheet, $column, $row, $cellContent);
                        ++$row;
                        if (isset($this->formats[$child->nodeName])) {
                            $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
                        } else {
                            $cellContent = '----------';
                            $this->flushCell($sheet, $column, $row, $cellContent);
                        }
                        ++$row;
                        // Add a break after a horizontal rule, simply by allowing the code to dropthru
                        // (intentional fallthrough, no break here)
                    case 'br':
                        if ($this->tableLevel > 0) {
                            //    If we're inside a table, replace with a \n
                            $cellContent .= "\n";
                        } else {
                            //    Otherwise flush our existing content and move the row cursor on
                            $this->flushCell($sheet, $column, $row, $cellContent);
                            ++$row;
                        }
                        break;
                    case 'a':
                        if (isset($attributeArray['href'])) {
                            $sheet->getCell($column . $row)->getHyperlink()->setUrl($attributeArray['href']);
                            $this->applyHtmlTagFormat($sheet, $child->nodeName, $column, $row);
                        }
                        $cellContent .= ' ';
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        break;
                    case 'h1':
                    case 'h2':
                    case 'h3':
                    case 'h4':
                    case 'h5':
                    case 'h6':
                    case 'ol':
                    case 'ul':
                    case 'p':
                        if ($this->tableLevel > 0) {
                            //    If we're inside a table, replace with a \n
                            $cellContent .= "\n";
                            $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        } else {
                            //    Block element outside a table: it gets a row of its own
                            if ($cellContent > '') {
                                $this->flushCell($sheet, $column, $row, $cellContent);
                                ++$row;
                            }
                            $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                            $this->flushCell($sheet, $column, $row, $cellContent);

                            $this->applyHtmlTagFormat($sheet, $child->nodeName, $column, $row);

                            ++$row;
                            $column = 'A';
                        }
                        break;
                    case 'li':
                        if ($this->tableLevel > 0) {
                            //    If we're inside a table, replace with a \n
                            $cellContent .= "\n";
                            $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        } else {
                            if ($cellContent > '') {
                                $this->flushCell($sheet, $column, $row, $cellContent);
                            }
                            ++$row;
                            $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                            $this->flushCell($sheet, $column, $row, $cellContent);
                            $column = 'A';
                        }
                        break;
                    case 'table':
                        $this->flushCell($sheet, $column, $row, $cellContent);
                        $column = $this->setTableStartColumn($column);
                        if ($this->tableLevel > 1) {
                            --$row;
                        }
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        $column = $this->releaseTableStartColumn();
                        if ($this->tableLevel > 1) {
                            ++$column;
                        } else {
                            ++$row;
                        }
                        break;
                    case 'thead':
                    case 'tbody':
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        break;
                    case 'tr':
                        $column = $this->getTableStartColumn();
                        $cellContent = '';
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        ++$row;
                        break;
                    case 'th':
                    case 'td':
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);

                        //    Skip cells already claimed by a merged range that spans rows
                        while (isset($this->rowspan[$column . $row])) {
                            ++$column;
                        }

                        $this->flushCell($sheet, $column, $row, $cellContent);

                        if (isset($attributeArray['rowspan']) && isset($attributeArray['colspan'])) {
                            //create merging rowspan and colspan
                            $columnTo = $column;
                            for ($i = 0; $i < $attributeArray['colspan'] - 1; ++$i) {
                                ++$columnTo;
                            }
                            $range = $column . $row . ':' . $columnTo . ($row + $attributeArray['rowspan'] - 1);
                            foreach (\PhpOffice\PhpSpreadsheet\Cell::extractAllCellReferencesInRange($range) as $value) {
                                $this->rowspan[$value] = true;
                            }
                            $sheet->mergeCells($range);
                            $column = $columnTo;
                        } elseif (isset($attributeArray['rowspan'])) {
                            //create merging rowspan
                            $range = $column . $row . ':' . $column . ($row + $attributeArray['rowspan'] - 1);
                            foreach (\PhpOffice\PhpSpreadsheet\Cell::extractAllCellReferencesInRange($range) as $value) {
                                $this->rowspan[$value] = true;
                            }
                            $sheet->mergeCells($range);
                        } elseif (isset($attributeArray['colspan'])) {
                            //create merging colspan
                            $columnTo = $column;
                            for ($i = 0; $i < $attributeArray['colspan'] - 1; ++$i) {
                                ++$columnTo;
                            }
                            $sheet->mergeCells($column . $row . ':' . $columnTo . $row);
                            $column = $columnTo;
                        }
                        ++$column;
                        break;
                    case 'body':
                        //    Start of the document body: reset the write cursor and table depth
                        $row = 1;
                        $column = 'A';
                        $this->tableLevel = 0;
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        break;
                    default:
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                }
            }
        }
    }

    /**
     * Apply the predefined style for an HTML tag to one cell, when such a style exists
     *
     * @param \PhpOffice\PhpSpreadsheet\Worksheet $sheet Worksheet holding the cell
     * @param string $tagName HTML tag name used as key into $this->formats
     * @param string $column Column letter of the cell
     * @param int $row Row number of the cell
     */
    private function applyHtmlTagFormat(\PhpOffice\PhpSpreadsheet\Worksheet $sheet, $tagName, $column, $row)
    {
        if (isset($this->formats[$tagName])) {
            $sheet->getStyle($column . $row)->applyFromArray($this->formats[$tagName]);
        }
    }
486
487
    /**
     * Read an HTML file into the given Spreadsheet, using the sheet at the
     * configured sheet index (missing sheets are created first)
     *
     * @param  string                    $pFilename
     * @param  Spreadsheet                  $spreadsheet
     * @throws Exception when the file fails canRead() or cannot be parsed as a DOM document
     * @return Spreadsheet The same instance that was passed in
     */
    public function loadIntoExisting($pFilename, Spreadsheet $spreadsheet)
    {
        // Refuse anything that does not look like HTML
        if (!$this->canRead($pFilename)) {
            throw new Exception($pFilename . ' is an Invalid HTML file.');
        }

        // Add sheets until the requested index exists, then make it the active one
        $targetIndex = $this->sheetIndex;
        while ($spreadsheet->getSheetCount() <= $targetIndex) {
            $spreadsheet->createSheet();
        }
        $spreadsheet->setActiveSheetIndex($targetIndex);

        // Parse the security-scanned file content, with its characters turned into HTML entities
        $dom = new DOMDocument();
        $html = mb_convert_encoding($this->securityScanFile($pFilename), 'HTML-ENTITIES', 'UTF-8');
        if ($dom->loadHTML($html) === false) {
            throw new Exception('Failed to load ' . $pFilename . ' as a DOM Document');
        }

        //    Discard white space
        $dom->preserveWhiteSpace = false;

        // Cursor and pending text are passed by reference through the whole DOM walk
        $row = 0;
        $column = 'A';
        $content = '';
        $this->processDomElement($dom, $spreadsheet->getActiveSheet(), $row, $column, $content);

        return $spreadsheet;
    }
527
528
    /**
     * Report the index of the sheet that receives loaded content
     *
     * @return int
     */
    public function getSheetIndex()
    {
        return $this->sheetIndex;
    }
537
538
    /**
     * Choose the index of the sheet that receives loaded content
     *
     * @param  int                  $pValue Sheet index
     * @return HTML this reader, so calls can be chained
     */
    public function setSheetIndex($pValue = 0)
    {
        $this->sheetIndex = $pValue;

        return $this;
    }
550
551
    /**
     * Scan the XML for use of <!ENTITY to prevent XXE/XEE attacks
     *
     * The marker is also detected when a NUL byte precedes, follows or sits
     * between any of its characters.
     *
     * @param     string         $xml
     * @throws Exception when the marker is found
     * @return string The input, unchanged
     */
    public function securityScan($xml)
    {
        // Optional NUL byte allowed around every character of the marker
        $optionalNul = '\\0?';
        $marker = $optionalNul . implode($optionalNul, str_split('<!ENTITY')) . $optionalNul;

        if (preg_match('/' . $marker . '/', $xml)) {
            throw new Exception('Detected use of ENTITY in XML, spreadsheet file load() aborted to prevent XXE/XEE attacks');
        }

        return $xml;
    }
566
}
567