Completed
Push — develop ( 4b4831...408da0 )
by Adrien
28:25 queued 17:13
created

HTML::endsWithTag()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 2

Importance

Changes 0
Metric Value
cc 1
eloc 2
nc 1
nop 1
dl 0
loc 4
ccs 0
cts 2
cp 0
crap 2
rs 10
c 0
b 0
f 0
1
<?php
2
3
namespace PhpOffice\PhpSpreadsheet\Reader;
4
5
use DOMDocument;
6
use DOMElement;
7
use DOMNode;
8
use DOMText;
9
use PhpOffice\PhpSpreadsheet\Spreadsheet;
10
11
/**
12
 * Copyright (c) 2006 - 2016 PhpSpreadsheet
13
 *
14
 * This library is free software; you can redistribute it and/or
15
 * modify it under the terms of the GNU Lesser General Public
16
 * License as published by the Free Software Foundation; either
17
 * version 2.1 of the License, or (at your option) any later version.
18
 *
19
 * This library is distributed in the hope that it will be useful,
20
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
22
 * Lesser General Public License for more details.
23
 *
24
 * You should have received a copy of the GNU Lesser General Public
25
 * License along with this library; if not, write to the Free Software
26
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
27
 *
28
 * @category   PhpSpreadsheet
29
 * @copyright  Copyright (c) 2006 - 2016 PhpSpreadsheet (https://github.com/PHPOffice/PhpSpreadsheet)
30
 * @license    http://www.gnu.org/licenses/old-licenses/lgpl-2.1.txt    LGPL
31
 * @version    ##VERSION##, ##DATE##
32
 */
33
/** Reader that loads an HTML document into a Spreadsheet */
34
class HTML extends BaseReader implements IReader
35
{
36
    /**
37
     * Sample size to read to determine if it's HTML or not
38
     */
39
    const TEST_SAMPLE_SIZE = 2048;
40
41
    /**
42
     * Input encoding
43
     *
44
     * @var string
45
     */
46
    protected $inputEncoding = 'ANSI';
47
48
    /**
49
     * Sheet index to read
50
     *
51
     * @var int
52
     */
53
    protected $sheetIndex = 0;
54
55
    /**
     * Formats
     *
     * Style arrays keyed by HTML tag name; processDomElement() hands the
     * matching entry to applyFromArray() when it meets that tag.
     *
     * @var array
     */
    protected $formats = [
        'h1' => [
            'font' => [
                'bold' => true,
                'size' => 24,
            ],
        ], //    Bold, 24pt
        'h2' => [
            'font' => [
                'bold' => true,
                'size' => 18,
            ],
        ], //    Bold, 18pt
        'h3' => [
            'font' => [
                'bold' => true,
                'size' => 13.5,
            ],
        ], //    Bold, 13.5pt
        'h4' => [
            'font' => [
                'bold' => true,
                'size' => 12,
            ],
        ], //    Bold, 12pt
        'h5' => [
            'font' => [
                'bold' => true,
                'size' => 10,
            ],
        ], //    Bold, 10pt
        'h6' => [
            'font' => [
                'bold' => true,
                'size' => 7.5,
            ],
        ], //    Bold, 7.5pt
        'a' => [
            'font' => [
                'underline' => true,
                'color' => [
                    'argb' => \PhpOffice\PhpSpreadsheet\Style\Color::COLOR_BLUE,
                ],
            ],
        ], //    Blue underlined
        'hr' => [
            'borders' => [
                'bottom' => [
                    'style' => \PhpOffice\PhpSpreadsheet\Style\Border::BORDER_THIN,
                    'color' => [
                        //    Keyed like the 'a' entry above; the value used to be a bare
                        //    list item, which presumably was ignored by applyFromArray()
                        //    NOTE(review): confirm against Style\Color::applyFromArray()
                        'argb' => \PhpOffice\PhpSpreadsheet\Style\Color::COLOR_BLACK,
                    ],
                ],
            ],
        ], //    Bottom border
    ];
116
117
    protected $rowspan = [];
118
119
    /**
     * Create a new HTML Reader instance
     *
     * Installs a DefaultReadFilter as this reader's read filter.
     */
    public function __construct()
    {
        $this->readFilter = new DefaultReadFilter();
    }
126
127
    /**
     * Validate that the current file is an HTML file
     *
     * The leading sample must start with a tag and contain markup that
     * strip_tags() removes, and the trailing sample must end with a tag.
     * The checks short-circuit, so the ending is only read once the
     * beginning has passed both of its tests.
     *
     * @return bool
     */
    protected function isValidFormat()
    {
        $beginning = $this->readBeginning();

        return self::startsWithTag($beginning)
            && self::containsTags($beginning)
            && self::endsWithTag($this->readEnding());
    }
150
151 1
    /**
     * Read the first TEST_SAMPLE_SIZE bytes of the open file
     *
     * @return string|false Whatever fread() yields for the leading sample
     */
    private function readBeginning()
    {
        //    Always sample from the very start, wherever the pointer currently is
        fseek($this->fileHandle, 0);

        return fread($this->fileHandle, self::TEST_SAMPLE_SIZE);
    }
157
158
    /**
     * Read the last bytes of the open file (at most TEST_SAMPLE_SIZE of them)
     *
     * The sample length is clamped to the file size. Without the clamp, a file
     * shorter than TEST_SAMPLE_SIZE produced a negative seek offset, fseek()
     * failed, and the subsequent read no longer covered the end of the file,
     * so small but valid HTML files were rejected.
     *
     * @return string|false Trailing sample; '' when the file is empty or its
     *                      size cannot be determined
     */
    private function readEnding()
    {
        $meta = stream_get_meta_data($this->fileHandle);
        $size = filesize($meta['uri']);

        //    Nothing to sample (and a zero-length fread() is not allowed on PHP 8)
        if ($size === false || $size === 0) {
            return '';
        }

        $blockSize = min(self::TEST_SAMPLE_SIZE, $size);
        fseek($this->fileHandle, $size - $blockSize);

        return fread($this->fileHandle, $blockSize);
    }
170
171 1
    /**
     * Tell whether the data, ignoring surrounding whitespace, begins with '<'
     *
     * @param  string $data
     * @return bool
     */
    private static function startsWithTag($data)
    {
        return '<' === substr(trim($data), 0, 1);
    }
175
176
    /**
     * Tell whether the data, ignoring surrounding whitespace, ends with '>'
     *
     * @param  string $data
     * @return bool
     */
    private static function endsWithTag($data)
    {
        return '>' === substr(trim($data), -1, 1);
    }
180
181
    /**
     * Tell whether strip_tags() would remove anything from the data
     *
     * @param  string $data
     * @return bool   True when the stripped copy is shorter than the original
     */
    private static function containsTags($data)
    {
        return strlen($data) !== strlen(strip_tags($data));
    }
185
186
    /**
187
     * Loads Spreadsheet from file
188
     *
189
     * @param  string                    $pFilename
190
     * @throws Exception
191
     * @return Spreadsheet
192
     */
193
    public function load($pFilename)
194
    {
195
        // Create new Spreadsheet
196
        $spreadsheet = new Spreadsheet();
197
198
        // Load into this instance
199
        return $this->loadIntoExisting($pFilename, $spreadsheet);
0 ignored issues
show
Bug Best Practice introduced by
The return type of return $this->loadIntoEx...ilename, $spreadsheet); (PhpOffice\PhpSpreadsheet\Spreadsheet) is incompatible with the return type declared by the interface PhpOffice\PhpSpreadsheet\Reader\IReader::load of type PhpOffice\PhpSpreadsheet\Reader\PhpSpreadsheet.

If you return a value from a function or method, it should be a sub-type of the type that is given by the parent type, e.g. an interface or abstract method. This is more formally defined by the Liskov substitution principle, and guarantees that classes that depend on the parent type can use any instance of a child type interchangeably. This principle also belongs to the SOLID principles for object-oriented design.

Let’s take a look at an example:

class Author {
    private $name;

    public function __construct($name) {
        $this->name = $name;
    }

    public function getName() {
        return $this->name;
    }
}

abstract class Post {
    public function getAuthor() {
        return 'Johannes';
    }
}

class BlogPost extends Post {
    public function getAuthor() {
        return new Author('Johannes');
    }
}

class ForumPost extends Post { /* ... */ }

function my_function(Post $post) {
    echo strtoupper($post->getAuthor());
}

Our function my_function expects a Post object, and outputs the author of the post. The base class Post returns a simple string and outputting a simple string will work just fine. However, the child class BlogPost which is a sub-type of Post instead decided to return an object, and is therefore violating the SOLID principles. If a BlogPost were passed to my_function, PHP would not complain, but ultimately fail when executing the strtoupper call in its body.

Loading history...
200
    }
201
202
    /**
     * Set input encoding
     *
     * @param  string $pValue Input encoding
     * @return $this  This reader, so calls can be chained
     */
    public function setInputEncoding($pValue = 'ANSI')
    {
        $this->inputEncoding = $pValue;

        return $this;
    }
213
214
    /**
     * Get input encoding
     *
     * @return string The value last given to setInputEncoding(), 'ANSI' by default
     */
    public function getInputEncoding()
    {
        return $this->inputEncoding;
    }
223
224
    //    Data Array used for testing only, should write to Spreadsheet object on completion of tests
225
    protected $dataArray = [];
226
    protected $tableLevel = 0;
227
    protected $nestedColumn = ['A'];
228
229
    /**
     * Enter a table: go one nesting level deeper and record its start column
     *
     * An outermost table (entered while the nesting level is 0) always starts
     * at column 'A', whatever column was passed in.
     *
     * @param  string $column Column the cursor is on when the table opens
     * @return string The start column recorded for the new nesting level
     */
    protected function setTableStartColumn($column)
    {
        if ($this->tableLevel === 0) {
            $column = 'A';
        }
        ++$this->tableLevel;
        $this->nestedColumn[$this->tableLevel] = $column;

        return $column;
    }
239
240
    /**
     * Get the start column recorded for the current table nesting level
     *
     * @return string
     */
    protected function getTableStartColumn()
    {
        return $this->nestedColumn[$this->tableLevel];
    }
244
245
    /**
     * Leave a table: go one nesting level up and drop its recorded start column
     *
     * @return string|null The last entry popped off the nested-column list
     */
    protected function releaseTableStartColumn()
    {
        --$this->tableLevel;

        return array_pop($this->nestedColumn);
    }
251
252
    /**
     * Write the buffered cell content to the sheet, then empty the buffer
     *
     * @param mixed  $sheet       Object receiving the setCellValue() call
     * @param string $column      Column of the target cell
     * @param int    $row         Row of the target cell
     * @param mixed  $cellContent Buffered content; passed by reference and
     *                            always reset to '' before returning
     */
    protected function flushCell($sheet, $column, $row, &$cellContent)
    {
        if (!is_string($cellContent)) {
            //    We have a Rich Text run
            //    TODO: not written to the sheet yet, only recorded in the data array
            $this->dataArray[$row][$column] = 'RICH TEXT: ' . $cellContent;
        } elseif (trim($cellContent) > '') {
            //    Simple string content: only write it if there is more than whitespace
            $sheet->setCellValue($column . $row, $cellContent, true);
            $this->dataArray[$row][$column] = $cellContent;
        }

        $cellContent = '';
    }
270
271
    /**
272
     * @param DOMNode $element
273
     * @param \PhpOffice\PhpSpreadsheet\Worksheet $sheet
274
     * @param int $row
275
     * @param string $column
276
     * @param string $cellContent
277
     */
278
    protected function processDomElement(DOMNode $element, \PhpOffice\PhpSpreadsheet\Worksheet $sheet, &$row, &$column, &$cellContent)
279
    {
280
        foreach ($element->childNodes as $child) {
281
            if ($child instanceof DOMText) {
282
                $domText = preg_replace('/\s+/u', ' ', trim($child->nodeValue));
283
                if (is_string($cellContent)) {
284
                    //    simply append the text if the cell content is a plain text string
285
                    $cellContent .= $domText;
286
                } else {
0 ignored issues
show
Unused Code introduced by
This else statement is empty and can be removed.

This check looks for the else branches of if statements that have no statements or where all statements have been commented out. This may be the result of changes for debugging or the code may simply be obsolete.

These else branches can be removed.

if (rand(1, 6) > 3) {
print "Check failed";
} else {
    //print "Check succeeded";
}

could be turned into

if (rand(1, 6) > 3) {
    print "Check failed";
}

This is much more concise to read.

Loading history...
287
                    //    but if we have a rich text run instead, we need to append it correctly
288
                    //    TODO
289
                }
290
            } elseif ($child instanceof DOMElement) {
291
                $attributeArray = [];
292
                foreach ($child->attributes as $attribute) {
293
                    $attributeArray[$attribute->name] = $attribute->value;
294
                }
295
296
                switch ($child->nodeName) {
297
                    case 'meta':
298
                        foreach ($attributeArray as $attributeName => $attributeValue) {
299
                            switch ($attributeName) {
300
                                case 'content':
301
                                    //    TODO
302
                                    //    Extract character set, so we can convert to UTF-8 if required
303
                                    break;
304
                            }
305
                        }
306
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
307
                        break;
308
                    case 'title':
309
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
310
                        $sheet->setTitle($cellContent);
311
                        $cellContent = '';
312
                        break;
313
                    case 'span':
314
                    case 'div':
315
                    case 'font':
316
                    case 'i':
317
                    case 'em':
318
                    case 'strong':
319
                    case 'b':
320
                        if ($cellContent > '') {
321
                            $cellContent .= ' ';
322
                        }
323
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
324
                        if ($cellContent > '') {
325
                            $cellContent .= ' ';
326
                        }
327
                        break;
328
                    case 'hr':
329
                        $this->flushCell($sheet, $column, $row, $cellContent);
330
                        ++$row;
331
                        if (isset($this->formats[$child->nodeName])) {
332
                            $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
333
                        } else {
334
                            $cellContent = '----------';
335
                            $this->flushCell($sheet, $column, $row, $cellContent);
336
                        }
337
                        ++$row;
338
                        // Add a break after a horizontal rule, simply by allowing the code to dropthru
339
                    case 'br':
340
                        if ($this->tableLevel > 0) {
341
                            //    If we're inside a table, replace with a \n
342
                            $cellContent .= "\n";
343
                        } else {
344
                            //    Otherwise flush our existing content and move the row cursor on
345
                            $this->flushCell($sheet, $column, $row, $cellContent);
346
                            ++$row;
347
                        }
348
                        break;
349
                    case 'a':
350
                        foreach ($attributeArray as $attributeName => $attributeValue) {
351
                            switch ($attributeName) {
352
                                case 'href':
353
                                    $sheet->getCell($column . $row)->getHyperlink()->setUrl($attributeValue);
354 View Code Duplication
                                    if (isset($this->formats[$child->nodeName])) {
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated across your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
355
                                        $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
356
                                    }
357
                                    break;
358
                            }
359
                        }
360
                        $cellContent .= ' ';
361
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
362
                        break;
363
                    case 'h1':
364
                    case 'h2':
365
                    case 'h3':
366
                    case 'h4':
367
                    case 'h5':
368
                    case 'h6':
369
                    case 'ol':
370
                    case 'ul':
371
                    case 'p':
372
                        if ($this->tableLevel > 0) {
373
                            //    If we're inside a table, replace with a \n
374
                            $cellContent .= "\n";
375
                            $this->processDomElement($child, $sheet, $row, $column, $cellContent);
376
                        } else {
377
                            if ($cellContent > '') {
378
                                $this->flushCell($sheet, $column, $row, $cellContent);
379
                                ++$row;
380
                            }
381
                            $this->processDomElement($child, $sheet, $row, $column, $cellContent);
382
                            $this->flushCell($sheet, $column, $row, $cellContent);
383
384 View Code Duplication
                            if (isset($this->formats[$child->nodeName])) {
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated across your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
385
                                $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
386
                            }
387
388
                            ++$row;
389
                            $column = 'A';
390
                        }
391
                        break;
392
                    case 'li':
393
                        if ($this->tableLevel > 0) {
394
                            //    If we're inside a table, replace with a \n
395
                            $cellContent .= "\n";
396
                            $this->processDomElement($child, $sheet, $row, $column, $cellContent);
397
                        } else {
398
                            if ($cellContent > '') {
399
                                $this->flushCell($sheet, $column, $row, $cellContent);
400
                            }
401
                            ++$row;
402
                            $this->processDomElement($child, $sheet, $row, $column, $cellContent);
403
                            $this->flushCell($sheet, $column, $row, $cellContent);
404
                            $column = 'A';
405
                        }
406
                        break;
407
                    case 'table':
408
                        $this->flushCell($sheet, $column, $row, $cellContent);
409
                        $column = $this->setTableStartColumn($column);
410
                        if ($this->tableLevel > 1) {
411
                            --$row;
412
                        }
413
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
414
                        $column = $this->releaseTableStartColumn();
415
                        if ($this->tableLevel > 1) {
416
                            ++$column;
417
                        } else {
418
                            ++$row;
419
                        }
420
                        break;
421
                    case 'thead':
422
                    case 'tbody':
423
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
424
                        break;
425
                    case 'tr':
426
                        $column = $this->getTableStartColumn();
427
                        $cellContent = '';
428
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
429
                        ++$row;
430
                        break;
431
                    case 'th':
432
                    case 'td':
433
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
434
435
                        while (isset($this->rowspan[$column . $row])) {
436
                            ++$column;
437
                        }
438
439
                        $this->flushCell($sheet, $column, $row, $cellContent);
440
441
                        if (isset($attributeArray['rowspan']) && isset($attributeArray['colspan'])) {
442
                            //create merging rowspan and colspan
443
                            $columnTo = $column;
444
                            for ($i = 0; $i < $attributeArray['colspan'] - 1; ++$i) {
445
                                ++$columnTo;
446
                            }
447
                            $range = $column . $row . ':' . $columnTo . ($row + $attributeArray['rowspan'] - 1);
448
                            foreach (\PhpOffice\PhpSpreadsheet\Cell::extractAllCellReferencesInRange($range) as $value) {
449
                                $this->rowspan[$value] = true;
450
                            }
451
                            $sheet->mergeCells($range);
452
                            $column = $columnTo;
453
                        } elseif (isset($attributeArray['rowspan'])) {
454
                            //create merging rowspan
455
                            $range = $column . $row . ':' . $column . ($row + $attributeArray['rowspan'] - 1);
456
                            foreach (\PhpOffice\PhpSpreadsheet\Cell::extractAllCellReferencesInRange($range) as $value) {
457
                                $this->rowspan[$value] = true;
458
                            }
459
                            $sheet->mergeCells($range);
460
                        } elseif (isset($attributeArray['colspan'])) {
461
                            //create merging colspan
462
                            $columnTo = $column;
463
                            for ($i = 0; $i < $attributeArray['colspan'] - 1; ++$i) {
464
                                ++$columnTo;
465
                            }
466
                            $sheet->mergeCells($column . $row . ':' . $columnTo . $row);
467
                            $column = $columnTo;
468
                        }
469
                        ++$column;
470
                        break;
471
                    case 'body':
472
                        $row = 1;
473
                        $column = 'A';
474
                        $content = '';
0 ignored issues
show
Unused Code introduced by
$content is not used, you could remove the assignment.

This check looks for variable assignments that are either overwritten by other assignments or where the variable is not used subsequently.

$myVar = 'Value';
$higher = false;

if (rand(1, 6) > 3) {
    $higher = true;
} else {
    $higher = false;
}

Both the $myVar assignment in line 1 and the $higher assignment in line 2 are dead. The first because $myVar is never used and the second because $higher is always overwritten on every possible execution path.

Loading history...
475
                        $this->tableLevel = 0;
476
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
477
                        break;
478
                    default:
479
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
480
                }
481
            }
482
        }
483
    }
484
485
    /**
     * Loads an HTML file into an existing Spreadsheet instance
     *
     * @param  string      $pFilename   File to read
     * @param  Spreadsheet $spreadsheet Workbook that receives the content
     * @throws Exception   When the file fails the HTML checks or cannot be
     *                     loaded as a DOM document
     * @return Spreadsheet The instance that was passed in
     */
    public function loadIntoExisting($pFilename, Spreadsheet $spreadsheet)
    {
        // Open the file only for as long as validation needs it
        $this->openFile($pFilename);
        $isHtml = $this->isValidFormat();
        fclose($this->fileHandle);
        if (!$isHtml) {
            throw new Exception($pFilename . ' is an Invalid HTML file.');
        }

        // Create sheets until the requested index exists, then select it
        while ($spreadsheet->getSheetCount() <= $this->sheetIndex) {
            $spreadsheet->createSheet();
        }
        $spreadsheet->setActiveSheetIndex($this->sheetIndex);

        //    Reload the HTML file into a new DOM object
        $dom = new DOMDocument();
        $html = mb_convert_encoding($this->securityScanFile($pFilename), 'HTML-ENTITIES', 'UTF-8');
        if ($dom->loadHTML($html) === false) {
            throw new Exception('Failed to load ' . $pFilename . ' as a DOM Document');
        }

        //    Discard white space
        $dom->preserveWhiteSpace = false;

        //    Cursor and text buffer, all updated by reference while walking the DOM
        $row = 0;
        $column = 'A';
        $content = '';
        $this->processDomElement($dom, $spreadsheet->getActiveSheet(), $row, $column, $content);

        return $spreadsheet;
    }
529
530
    /**
     * Get sheet index
     *
     * @return int Index of the sheet that loadIntoExisting() fills, 0 by default
     */
    public function getSheetIndex()
    {
        return $this->sheetIndex;
    }
539
540
    /**
     * Set sheet index
     *
     * @param  int  $pValue Sheet index
     * @return HTML This reader, so calls can be chained
     */
    public function setSheetIndex($pValue = 0)
    {
        $this->sheetIndex = $pValue;

        return $this;
    }
552
553
    /**
554
     * Scan the XML for use of <!ENTITY to prevent XXE/XEE attacks
555
     *
556
     * @param     string         $xml
557
     * @throws Exception
558
     */
559 View Code Duplication
    public function securityScan($xml)
0 ignored issues
show
Duplication introduced by
This method seems to be duplicated in your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
560
    {
561
        $pattern = '/\\0?' . implode('\\0?', str_split('<!ENTITY')) . '\\0?/';
562
        if (preg_match($pattern, $xml)) {
563
            throw new Exception('Detected use of ENTITY in XML, spreadsheet file load() aborted to prevent XXE/XEE attacks');
564
        }
565
566
        return $xml;
567
    }
568
}
569