Completed
Push — develop ( 03f96a...8c66af )
by Adrien
19:36
created

HTML::getTableStartColumn()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 2

Importance

Changes 0
Metric Value
cc 1
eloc 2
nc 1
nop 0
dl 0
loc 4
ccs 0
cts 2
cp 0
crap 2
rs 10
c 0
b 0
f 0
1
<?php
2
3
namespace PhpOffice\PhpSpreadsheet\Reader;
4
5
use DOMDocument;
6
use DOMElement;
7
use DOMNode;
8
use DOMText;
9
use PhpOffice\PhpSpreadsheet\Spreadsheet;
10
11
/**
12
 * Copyright (c) 2006 - 2016 PhpSpreadsheet.
13
 *
14
 * This library is free software; you can redistribute it and/or
15
 * modify it under the terms of the GNU Lesser General Public
16
 * License as published by the Free Software Foundation; either
17
 * version 2.1 of the License, or (at your option) any later version.
18
 *
19
 * This library is distributed in the hope that it will be useful,
20
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
22
 * Lesser General Public License for more details.
23
 *
24
 * You should have received a copy of the GNU Lesser General Public
25
 * License along with this library; if not, write to the Free Software
26
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
27
 *
28
 * @category   PhpSpreadsheet
29
 *
30
 * @copyright  Copyright (c) 2006 - 2016 PhpSpreadsheet (https://github.com/PHPOffice/PhpSpreadsheet)
31
 * @license    http://www.gnu.org/licenses/old-licenses/lgpl-2.1.txt    LGPL
32
 */
33
/** Reader that loads the contents of an HTML file into a worksheet of a Spreadsheet. */
34
class HTML extends BaseReader implements IReader
35
{
36
    /**
37
     * Sample size to read to determine if it's HTML or not.
38
     */
39
    const TEST_SAMPLE_SIZE = 2048;
40
41
    /**
42
     * Input encoding.
43
     *
44
     * @var string
45
     */
46
    protected $inputEncoding = 'ANSI';
47
48
    /**
49
     * Sheet index to read.
50
     *
51
     * @var int
52
     */
53
    protected $sheetIndex = 0;
54
55
    /**
     * Style arrays applied to cells for specific HTML elements, keyed by tag name.
     *
     * Each entry is passed as-is to applyFromArray() on the style of the cell
     * that receives the element's content.
     *
     * @var array
     */
    protected $formats = [
        'h1' => [
            'font' => [
                'bold' => true,
                'size' => 24,
            ],
        ], //    Bold, 24pt
        'h2' => [
            'font' => [
                'bold' => true,
                'size' => 18,
            ],
        ], //    Bold, 18pt
        'h3' => [
            'font' => [
                'bold' => true,
                'size' => 13.5,
            ],
        ], //    Bold, 13.5pt
        'h4' => [
            'font' => [
                'bold' => true,
                'size' => 12,
            ],
        ], //    Bold, 12pt
        'h5' => [
            'font' => [
                'bold' => true,
                'size' => 10,
            ],
        ], //    Bold, 10pt
        'h6' => [
            'font' => [
                'bold' => true,
                'size' => 7.5,
            ],
        ], //    Bold, 7.5pt
        'a' => [
            'font' => [
                'underline' => true,
                'color' => [
                    'argb' => \PhpOffice\PhpSpreadsheet\Style\Color::COLOR_BLUE,
                ],
            ],
        ], //    Blue underlined
        'hr' => [
            'borders' => [
                'bottom' => [
                    'style' => \PhpOffice\PhpSpreadsheet\Style\Border::BORDER_THIN,
                    'color' => [
                        // Keyed like the 'a' entry above so the colour value is addressable by name
                        'argb' => \PhpOffice\PhpSpreadsheet\Style\Color::COLOR_BLACK,
                    ],
                ],
            ],
        ], //    Bottom border
    ];
116
117
    protected $rowspan = [];
118
119
    /**
     * Create the reader with a DefaultReadFilter installed as its read filter.
     */
    public function __construct()
    {
        $this->readFilter = new DefaultReadFilter();
    }
126
127
    /**
     * Check whether the given file looks like an HTML document.
     *
     * The file is accepted only when the sample read from its start begins
     * with "<" and contains markup, and the sample read from its end
     * finishes with ">" (surrounding whitespace is ignored in both cases).
     *
     * @param string $pFilename
     *
     * @throws Exception
     *
     * @return bool
     */
    public function canRead($pFilename)
    {
        // A file that cannot be opened is simply reported as not readable
        try {
            $this->openFile($pFilename);
        } catch (Exception $e) {
            return false;
        }

        // Take both samples first so the handle can be closed before any decision is made
        $head = $this->readBeginning();
        $tail = $this->readEnding();
        fclose($this->fileHandle);

        if (!self::startsWithTag($head)) {
            return false;
        }
        if (!self::containsTags($head)) {
            return false;
        }

        return self::endsWithTag($tail);
    }
154
155 1
    /**
     * Read up to TEST_SAMPLE_SIZE bytes from the start of the currently opened file.
     *
     * @return string|false the result of fread()
     */
    private function readBeginning()
    {
        $handle = $this->fileHandle;

        // Always sample from offset 0, wherever the handle currently points
        fseek($handle, 0, SEEK_SET);
        $sample = fread($handle, self::TEST_SAMPLE_SIZE);

        return $sample;
    }
161
162 1
    /**
     * Read up to TEST_SAMPLE_SIZE bytes from the end of the currently opened file.
     *
     * Files shorter than the sample size are read in full; an empty file (or
     * one whose size cannot be determined) yields an empty string.
     *
     * @return string|false the sampled bytes, or the result of fread() on failure
     */
    private function readEnding()
    {
        // The handle only exposes its path through the stream metadata
        $meta = stream_get_meta_data($this->fileHandle);
        $filename = $meta['uri'];

        $size = filesize($filename);
        if ($size === false || $size === 0) {
            // Nothing to sample; also avoids calling fread() with a zero length
            return '';
        }

        // Never seek to a negative offset: clamp the sample to the file size
        $blockSize = min(self::TEST_SAMPLE_SIZE, $size);

        fseek($this->fileHandle, $size - $blockSize);

        return fread($this->fileHandle, $blockSize);
    }
174
175 1
    /**
     * Tell whether the data, ignoring surrounding whitespace, begins with "<".
     *
     * @param string $data
     *
     * @return bool
     */
    private static function startsWithTag($data)
    {
        $trimmed = trim($data);
        if ($trimmed === '') {
            return false;
        }

        return $trimmed[0] === '<';
    }
179
180 1
    /**
     * Tell whether the data, ignoring surrounding whitespace, finishes with ">".
     *
     * @param string $data
     *
     * @return bool
     */
    private static function endsWithTag($data)
    {
        $trimmed = trim($data);
        if ($trimmed === '') {
            return false;
        }

        return $trimmed[strlen($trimmed) - 1] === '>';
    }
184
185 1
    /**
     * Tell whether strip_tags() removes anything from the data.
     *
     * @param string $data
     *
     * @return bool
     */
    private static function containsTags($data)
    {
        $rawLength = strlen($data);
        $strippedLength = strlen(strip_tags($data));

        // Any difference in length means strip_tags() found something to remove
        return $rawLength !== $strippedLength;
    }
189
190
    /**
     * Read the given file into a freshly created Spreadsheet.
     *
     * @param string $pFilename
     *
     * @throws Exception
     *
     * @return Spreadsheet
     */
    public function load($pFilename)
    {
        // All the work is delegated to loadIntoExisting() with a brand-new workbook as the target
        return $this->loadIntoExisting($pFilename, new Spreadsheet());
    }
207
208
    /**
     * Store the input encoding for this reader.
     *
     * @param string $pValue Input encoding
     *
     * @return HTML this reader, to allow call chaining
     */
    public function setInputEncoding($pValue = 'ANSI')
    {
        $this->inputEncoding = $pValue;

        return $this;
    }
219
220
    /**
     * Return the input encoding currently stored on this reader.
     *
     * @return string
     */
    public function getInputEncoding()
    {
        return $this->inputEncoding;
    }
229
230
    //    Data Array used for testing only, should write to Spreadsheet object on completion of tests
231
    protected $dataArray = [];
232
    protected $tableLevel = 0;
233
    protected $nestedColumn = ['A'];
234
235
    /**
     * Enter a table: increase the nesting level and record its starting column.
     *
     * @param string $column Column the table would start in; replaced by 'A' when no table is open yet
     *
     * @return string the starting column recorded for the new nesting level
     */
    protected function setTableStartColumn($column)
    {
        // A table opened while no other table is open always starts in column A
        $startColumn = ($this->tableLevel == 0) ? 'A' : $column;

        $depth = ++$this->tableLevel;
        $this->nestedColumn[$depth] = $startColumn;

        return $this->nestedColumn[$depth];
    }
245
246
    /**
     * Return the starting column recorded for the current table nesting level.
     *
     * @return string
     */
    protected function getTableStartColumn()
    {
        $depth = $this->tableLevel;

        return $this->nestedColumn[$depth];
    }
250
251
    /**
     * Leave a table: decrease the nesting level and drop the last recorded starting column.
     *
     * @return string the starting column that was removed from the stack
     */
    protected function releaseTableStartColumn()
    {
        $this->tableLevel -= 1;
        $released = array_pop($this->nestedColumn);

        return $released;
    }
257
258
    /**
     * Write the pending cell content to the sheet and reset it to an empty string.
     *
     * @param mixed $sheet Object receiving the value through setCellValue()
     * @param string $column Column of the target cell
     * @param int $row Row of the target cell
     * @param mixed $cellContent Pending content (by reference); always '' on return
     */
    protected function flushCell($sheet, $column, $row, &$cellContent)
    {
        if (!is_string($cellContent)) {
            //    Rich Text run: not written to the sheet yet (TODO), only recorded in the data array
            $this->dataArray[$row][$column] = 'RICH TEXT: ' . $cellContent;
        } elseif (trim($cellContent) !== '') {
            //    Plain string: only written when something other than whitespace is present
            $sheet->setCellValue($column . $row, $cellContent, true);
            $this->dataArray[$row][$column] = $cellContent;
        }

        $cellContent = '';
    }
276
277
    /**
     * Walk the children of a DOM node and write their content to the worksheet.
     *
     * Text nodes are appended to the pending cell content; element nodes are
     * dispatched on their tag name and, for most tags, processed recursively.
     * The row, column and pending content are shared with the caller by
     * reference and are advanced as cells are written.
     *
     * @param DOMNode $element Node whose children are processed
     * @param \PhpOffice\PhpSpreadsheet\Worksheet $sheet Worksheet receiving the content
     * @param int $row Current row (by reference)
     * @param string $column Current column (by reference)
     * @param string $cellContent Pending content of the current cell (by reference)
     */
    protected function processDomElement(DOMNode $element, \PhpOffice\PhpSpreadsheet\Worksheet $sheet, &$row, &$column, &$cellContent)
    {
        foreach ($element->childNodes as $child) {
            if ($child instanceof DOMText) {
                //    Collapse every run of whitespace into a single space
                $domText = preg_replace('/\s+/u', ' ', trim($child->nodeValue));
                if (is_string($cellContent)) {
                    //    simply append the text if the cell content is a plain text string
                    $cellContent .= $domText;
                }
                //    but if we have a rich text run instead, we need to append it correctly
                //    TODO
            } elseif ($child instanceof DOMElement) {
                $attributeArray = [];
                foreach ($child->attributes as $attribute) {
                    $attributeArray[$attribute->name] = $attribute->value;
                }

                switch ($child->nodeName) {
                    case 'meta':
                        //    TODO
                        //    Extract character set from the 'content' attribute, so we can convert to UTF-8 if required
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        break;
                    case 'title':
                        //    The document title becomes the worksheet title and is not written to a cell
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        $sheet->setTitle($cellContent);
                        $cellContent = '';
                        break;
                    case 'span':
                    case 'div':
                    case 'font':
                    case 'i':
                    case 'em':
                    case 'strong':
                    case 'b':
                        //    Inline containers: keep their text apart from the surrounding text with a space
                        if ($cellContent > '') {
                            $cellContent .= ' ';
                        }
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        if ($cellContent > '') {
                            $cellContent .= ' ';
                        }
                        break;
                    case 'hr':
                        $this->flushCell($sheet, $column, $row, $cellContent);
                        ++$row;
                        if (isset($this->formats[$child->nodeName])) {
                            $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
                        } else {
                            //    No style registered for the rule: fall back to a textual separator
                            $cellContent = '----------';
                            $this->flushCell($sheet, $column, $row, $cellContent);
                        }
                        ++$row;
                        // Add a break after a horizontal rule, simply by allowing the code to dropthru
                    case 'br':
                        if ($this->tableLevel > 0) {
                            //    If we're inside a table, replace with a \n
                            $cellContent .= "\n";
                        } else {
                            //    Otherwise flush our existing content and move the row cursor on
                            $this->flushCell($sheet, $column, $row, $cellContent);
                            ++$row;
                        }
                        break;
                    case 'a':
                        if (isset($attributeArray['href'])) {
                            $sheet->getCell($column . $row)->getHyperlink()->setUrl($attributeArray['href']);
                            if (isset($this->formats[$child->nodeName])) {
                                $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
                            }
                        }
                        $cellContent .= ' ';
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        break;
                    case 'h1':
                    case 'h2':
                    case 'h3':
                    case 'h4':
                    case 'h5':
                    case 'h6':
                    case 'ol':
                    case 'ul':
                    case 'p':
                        if ($this->tableLevel > 0) {
                            //    If we're inside a table, replace with a \n
                            $cellContent .= "\n";
                            $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        } else {
                            //    Outside a table each block element gets a row of its own
                            if ($cellContent > '') {
                                $this->flushCell($sheet, $column, $row, $cellContent);
                                ++$row;
                            }
                            $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                            $this->flushCell($sheet, $column, $row, $cellContent);

                            if (isset($this->formats[$child->nodeName])) {
                                $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
                            }

                            ++$row;
                            $column = 'A';
                        }
                        break;
                    case 'li':
                        if ($this->tableLevel > 0) {
                            //    If we're inside a table, replace with a \n
                            $cellContent .= "\n";
                            $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        } else {
                            if ($cellContent > '') {
                                $this->flushCell($sheet, $column, $row, $cellContent);
                            }
                            ++$row;
                            $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                            $this->flushCell($sheet, $column, $row, $cellContent);
                            $column = 'A';
                        }
                        break;
                    case 'table':
                        $this->flushCell($sheet, $column, $row, $cellContent);
                        $column = $this->setTableStartColumn($column);
                        if ($this->tableLevel > 1) {
                            //    Nested table: step back one row before processing its rows
                            --$row;
                        }
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        $column = $this->releaseTableStartColumn();
                        if ($this->tableLevel > 1) {
                            ++$column;
                        } else {
                            ++$row;
                        }
                        break;
                    case 'thead':
                    case 'tbody':
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        break;
                    case 'tr':
                        //    Every table row restarts at the starting column of the enclosing table
                        $column = $this->getTableStartColumn();
                        $cellContent = '';
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        ++$row;
                        break;
                    case 'th':
                    case 'td':
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);

                        //    Skip over cells already claimed by a rowspan recorded earlier
                        while (isset($this->rowspan[$column . $row])) {
                            ++$column;
                        }

                        $this->flushCell($sheet, $column, $row, $cellContent);

                        //    NOTE(review): bgcolor is only honoured when the cell has neither rowspan nor colspan - confirm this is intended
                        if (isset($attributeArray['rowspan']) && isset($attributeArray['colspan'])) {
                            //create merging rowspan and colspan
                            $columnTo = $column;
                            for ($i = 0; $i < $attributeArray['colspan'] - 1; ++$i) {
                                ++$columnTo;
                            }
                            $range = $column . $row . ':' . $columnTo . ($row + $attributeArray['rowspan'] - 1);
                            foreach (\PhpOffice\PhpSpreadsheet\Cell::extractAllCellReferencesInRange($range) as $value) {
                                $this->rowspan[$value] = true;
                            }
                            $sheet->mergeCells($range);
                            $column = $columnTo;
                        } elseif (isset($attributeArray['rowspan'])) {
                            //create merging rowspan
                            $range = $column . $row . ':' . $column . ($row + $attributeArray['rowspan'] - 1);
                            foreach (\PhpOffice\PhpSpreadsheet\Cell::extractAllCellReferencesInRange($range) as $value) {
                                $this->rowspan[$value] = true;
                            }
                            $sheet->mergeCells($range);
                        } elseif (isset($attributeArray['colspan'])) {
                            //create merging colspan
                            $columnTo = $column;
                            for ($i = 0; $i < $attributeArray['colspan'] - 1; ++$i) {
                                ++$columnTo;
                            }
                            $sheet->mergeCells($column . $row . ':' . $columnTo . $row);
                            $column = $columnTo;
                        } elseif (isset($attributeArray['bgcolor'])) {
                            //    Fully qualified: an unqualified legacy class name would be resolved inside this namespace
                            $sheet->getStyle($column . $row)->applyFromArray(
                                [
                                    'fill' => [
                                        'type' => \PhpOffice\PhpSpreadsheet\Style\Fill::FILL_SOLID,
                                        'color' => ['rgb' => $attributeArray['bgcolor']],
                                    ],
                                ]
                            );
                        }
                        ++$column;
                        break;
                    case 'body':
                        //    The body restarts the cursor at A1 with no table open
                        $row = 1;
                        $column = 'A';
                        $this->tableLevel = 0;
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        break;
                    default:
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                }
            }
        }
    }
498
499
    /**
     * Read the given HTML file into the sheet of an existing Spreadsheet selected by the sheet index.
     *
     * Sheets are created until the configured sheet index exists; that sheet
     * is then made active and filled from the parsed document.
     *
     * @param string $pFilename
     * @param Spreadsheet $spreadsheet
     *
     * @throws Exception
     *
     * @return Spreadsheet
     */
    public function loadIntoExisting($pFilename, Spreadsheet $spreadsheet)
    {
        // Refuse anything canRead() does not recognise
        if (!$this->canRead($pFilename)) {
            throw new Exception($pFilename . ' is an Invalid HTML file.');
        }

        // Make sure the target sheet exists, then select it
        while ($spreadsheet->getSheetCount() <= $this->sheetIndex) {
            $spreadsheet->createSheet();
        }
        $spreadsheet->setActiveSheetIndex($this->sheetIndex);

        //    Parse the scanned file contents into a DOM object
        $document = new DOMDocument();
        $markup = mb_convert_encoding($this->securityScanFile($pFilename), 'HTML-ENTITIES', 'UTF-8');
        if ($document->loadHTML($markup) === false) {
            throw new Exception('Failed to load ' . $pFilename . ' as a DOM Document');
        }

        //    Discard white space
        $document->preserveWhiteSpace = false;

        // Cursor state shared by reference with the recursive DOM walk
        $rowCursor = 0;
        $columnCursor = 'A';
        $pendingContent = '';
        $worksheet = $spreadsheet->getActiveSheet();
        $this->processDomElement($document, $worksheet, $rowCursor, $columnCursor, $pendingContent);

        return $spreadsheet;
    }
541
542
    /**
     * Return the index of the sheet this reader loads into.
     *
     * @return int
     */
    public function getSheetIndex()
    {
        return $this->sheetIndex;
    }
551
552
    /**
     * Choose the index of the sheet this reader loads into.
     *
     * @param int $pValue Sheet index
     *
     * @return HTML this reader, to allow call chaining
     */
    public function setSheetIndex($pValue = 0)
    {
        $this->sheetIndex = $pValue;

        return $this;
    }
565
566
    /**
     * Scan the XML for use of <!ENTITY to prevent XXE/XEE attacks.
     *
     * The search pattern allows an optional NUL byte before, between and after
     * the characters of "<!ENTITY".
     *
     * @param string $xml
     *
     * @throws Exception when the marker is found
     *
     * @return string the unchanged input
     */
    public function securityScan($xml)
    {
        $markerCharacters = str_split('<!ENTITY');
        $pattern = '/\\0?' . implode('\\0?', $markerCharacters) . '\\0?/';

        // preg_match() yields 1 only on a hit; 0 (no match) and false (error) both fall through
        if (preg_match($pattern, $xml) === 1) {
            throw new Exception('Detected use of ENTITY in XML, spreadsheet file load() aborted to prevent XXE/XEE attacks');
        }

        return $xml;
    }
582
}
583