Completed
Push — develop ( ab7aa6...720fb3 )
by Adrien
87:24 queued 79:26
created

HTML::getTableStartColumn()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 2

Importance

Changes 0
Metric Value
cc 1
eloc 2
nc 1
nop 0
dl 0
loc 4
ccs 0
cts 2
cp 0
crap 2
rs 10
c 0
b 0
f 0
1
<?php
2
3
namespace PhpOffice\PhpSpreadsheet\Reader;
4
5
use DOMDocument;
6
use DOMElement;
7
use DOMNode;
8
use DOMText;
9
use PhpOffice\PhpSpreadsheet\Spreadsheet;
10
11
/**
12
 * Copyright (c) 2006 - 2016 PhpSpreadsheet
13
 *
14
 * This library is free software; you can redistribute it and/or
15
 * modify it under the terms of the GNU Lesser General Public
16
 * License as published by the Free Software Foundation; either
17
 * version 2.1 of the License, or (at your option) any later version.
18
 *
19
 * This library is distributed in the hope that it will be useful,
20
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
22
 * Lesser General Public License for more details.
23
 *
24
 * You should have received a copy of the GNU Lesser General Public
25
 * License along with this library; if not, write to the Free Software
26
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
27
 *
28
 * @category   PhpSpreadsheet
29
 * @copyright  Copyright (c) 2006 - 2016 PhpSpreadsheet (https://github.com/PHPOffice/PhpSpreadsheet)
30
 * @license    http://www.gnu.org/licenses/old-licenses/lgpl-2.1.txt    LGPL
31
 */
32
/** Reader that loads the contents of an HTML file into a Spreadsheet object */
33
class HTML extends BaseReader implements IReader
34
{
35
    /**
     * Number of bytes sampled from the beginning and from the end of a file
     * when checking whether it looks like HTML
     */
    const TEST_SAMPLE_SIZE = 2048;

    /**
     * Input encoding
     *
     * @var string
     */
    protected $inputEncoding = 'ANSI';

    /**
     * Sheet index to read
     *
     * @var int
     */
    protected $sheetIndex = 0;

    /**
     * Style arrays applied to cells, keyed by the HTML tag name that triggers them
     *
     * @var array
     */
    protected $formats = [
        'h1' => [
            'font' => [
                'bold' => true,
                'size' => 24,
            ],
        ], //    Bold, 24pt
        'h2' => [
            'font' => [
                'bold' => true,
                'size' => 18,
            ],
        ], //    Bold, 18pt
        'h3' => [
            'font' => [
                'bold' => true,
                'size' => 13.5,
            ],
        ], //    Bold, 13.5pt
        'h4' => [
            'font' => [
                'bold' => true,
                'size' => 12,
            ],
        ], //    Bold, 12pt
        'h5' => [
            'font' => [
                'bold' => true,
                'size' => 10,
            ],
        ], //    Bold, 10pt
        'h6' => [
            'font' => [
                'bold' => true,
                'size' => 7.5,
            ],
        ], //    Bold, 7.5pt
        'a' => [
            'font' => [
                'underline' => true,
                'color' => [
                    'argb' => \PhpOffice\PhpSpreadsheet\Style\Color::COLOR_BLUE,
                ],
            ],
        ], //    Blue underlined
        'hr' => [
            'borders' => [
                'bottom' => [
                    'style' => \PhpOffice\PhpSpreadsheet\Style\Border::BORDER_THIN,
                    'color' => [
                        //    Keyed with 'argb' like the link colour above; the value was
                        //    previously stored without a key
                        'argb' => \PhpOffice\PhpSpreadsheet\Style\Color::COLOR_BLACK,
                    ],
                ],
            ],
        ], //    Bottom border
    ];

    /**
     * Cell references (column letter followed by row number) covered by a merge
     * that was created for a cell carrying a rowspan attribute; each entry is set to true
     *
     * @var array
     */
    protected $rowspan = [];
117
118
    /**
     * Create a new HTML Reader instance
     *
     * The reader starts out with a DefaultReadFilter as its read filter.
     */
    public function __construct()
    {
        $defaultFilter = new DefaultReadFilter();
        $this->readFilter = $defaultFilter;
    }
125
126
    /**
     * Validate that the current file is an HTML file
     *
     * The file counts as HTML when its leading sample starts with a tag and
     * contains at least one tag, and its trailing sample ends with a tag.
     *
     * @param     string         $pFilename
     * @throws Exception
     * @return bool
     */
    public function canRead($pFilename)
    {
        // A file that cannot be opened is simply reported as unreadable
        try {
            $this->openFile($pFilename);
        } catch (Exception $e) {
            return false;
        }

        // Take both samples first, then release the handle before evaluating them
        $head = $this->readBeginning();
        $tail = $this->readEnding();

        fclose($this->fileHandle);

        return self::startsWithTag($head)
            && self::containsTags($head)
            && self::endsWithTag($tail);
    }
151
152 1
    /**
     * Read the leading sample (up to TEST_SAMPLE_SIZE bytes) of the opened file
     *
     * @return string
     */
    private function readBeginning()
    {
        // Always sample from the very first byte, wherever the pointer currently is
        rewind($this->fileHandle);

        $sample = fread($this->fileHandle, self::TEST_SAMPLE_SIZE);

        return $sample;
    }
158
159 1
    /**
     * Read the trailing sample (up to TEST_SAMPLE_SIZE bytes) of the opened file
     *
     * The sample is clamped to the file size. Without the clamp, a file shorter
     * than TEST_SAMPLE_SIZE produced a negative seek offset, the seek failed and
     * nothing was read, so small HTML files were rejected by canRead().
     *
     * @return string Trailing bytes of the file, or an empty string when the file
     *                is empty or its size cannot be determined
     */
    private function readEnding()
    {
        $meta = stream_get_meta_data($this->fileHandle);
        $size = filesize($meta['uri']);

        // Nothing to sample; also avoids calling fread() with a zero length
        if ($size === false || $size <= 0) {
            return '';
        }

        $blockSize = ($size < self::TEST_SAMPLE_SIZE) ? $size : self::TEST_SAMPLE_SIZE;

        fseek($this->fileHandle, $size - $blockSize);

        return fread($this->fileHandle, $blockSize);
    }
171
172 1
    /**
     * Tell whether the first non-whitespace character of the data is '<'
     *
     * @param string $data
     * @return bool
     */
    private static function startsWithTag($data)
    {
        $trimmed = trim($data);
        $firstChar = substr($trimmed, 0, 1);

        return $firstChar === '<';
    }
176
177 1
    /**
     * Tell whether the last non-whitespace character of the data is '>'
     *
     * @param string $data
     * @return bool
     */
    private static function endsWithTag($data)
    {
        $trimmed = trim($data);
        $lastChar = substr($trimmed, -1, 1);

        return $lastChar === '>';
    }
181
182 1
    /**
     * Tell whether the data contains markup, i.e. whether strip_tags() shortens it
     *
     * @param string $data
     * @return bool
     */
    private static function containsTags($data)
    {
        $originalLength = strlen($data);
        $strippedLength = strlen(strip_tags($data));

        return $originalLength !== $strippedLength;
    }
186
187
    /**
     * Loads Spreadsheet from file
     *
     * NOTE(review): static analysis reported that the Spreadsheet returned here is
     * incompatible with the return type declared by IReader::load
     * (PhpOffice\PhpSpreadsheet\Reader\PhpSpreadsheet) - the interface declaration
     * should be checked.
     *
     * @param  string                    $pFilename
     * @throws Exception
     * @return Spreadsheet
     */
    public function load($pFilename)
    {
        // Parse into a brand new workbook; the actual work happens in loadIntoExisting()
        return $this->loadIntoExisting($pFilename, new Spreadsheet());
    }
202
203
    /**
     * Set input encoding
     *
     * @param string $pValue Input encoding
     * @return HTML This reader, to allow call chaining
     */
    public function setInputEncoding($pValue = 'ANSI')
    {
        $reader = $this;
        $reader->inputEncoding = $pValue;

        return $reader;
    }
214
215
    /**
     * Get input encoding
     *
     * @return string The encoding name currently stored on this reader
     */
    public function getInputEncoding()
    {
        $encoding = $this->inputEncoding;

        return $encoding;
    }
224
225
    //    Data Array used for testing only, should write to Spreadsheet object on completion of tests
    //    Filled by flushCell(), indexed as [row][column]
226
    protected $dataArray = [];
227
    //    Current table nesting depth; compared against 0 to tell whether the cursor is inside a table
    protected $tableLevel = 0;
228
    //    Start column remembered for each table nesting depth, indexed by $tableLevel
    protected $nestedColumn = ['A'];
229
230
    /**
     * Enter a table: increase the nesting depth and remember its start column
     *
     * When no table is open yet, the start column is forced to 'A' regardless
     * of the column passed in.
     *
     * @param string $column Column at which the table was encountered
     * @return string The start column recorded for the new nesting depth
     */
    protected function setTableStartColumn($column)
    {
        $startColumn = ($this->tableLevel == 0) ? 'A' : $column;

        $depth = ++$this->tableLevel;
        $this->nestedColumn[$depth] = $startColumn;

        return $this->nestedColumn[$depth];
    }
240
241
    /**
     * Get the start column recorded for the current table nesting depth
     *
     * @return string
     */
    protected function getTableStartColumn()
    {
        $depth = $this->tableLevel;

        return $this->nestedColumn[$depth];
    }
245
246
    /**
     * Leave a table: decrease the nesting depth and drop the last recorded start column
     *
     * @return string The start column that was removed from the stack
     */
    protected function releaseTableStartColumn()
    {
        $released = array_pop($this->nestedColumn);
        --$this->tableLevel;

        return $released;
    }
252
253
    /**
     * Write the accumulated cell content to the worksheet and reset the accumulator
     *
     * @param \PhpOffice\PhpSpreadsheet\Worksheet $sheet Worksheet receiving the value
     * @param string $column Column letter of the target cell
     * @param int $row Row number of the target cell
     * @param string $cellContent Accumulated content; always reset to an empty string on return
     */
    protected function flushCell($sheet, $column, $row, &$cellContent)
    {
        if (!is_string($cellContent)) {
            //    We have a Rich Text run
            //    TODO
            $this->dataArray[$row][$column] = 'RICH TEXT: ' . $cellContent;
        } elseif (trim($cellContent) !== '') {
            //    Simple String content: only write it when something other than whitespace is left
            //    The third argument asks for the cell to be returned so styles can be applied more easily
            $sheet->setCellValue($column . $row, $cellContent, true);
            $this->dataArray[$row][$column] = $cellContent;
        }

        $cellContent = '';
    }
271
272
    /**
     * Recursively walk the child nodes of a DOM node and transfer them to the worksheet
     *
     * Text nodes are appended to the cell content accumulator; element nodes are
     * dispatched on their tag name, which decides when the accumulator is flushed
     * to a cell and how the row/column cursor moves.
     *
     * @param DOMNode $element Node whose child nodes are processed
     * @param \PhpOffice\PhpSpreadsheet\Worksheet $sheet Worksheet receiving values, styles and merges
     * @param int $row Row cursor, updated in place
     * @param string $column Column cursor, updated in place
     * @param string $cellContent Text accumulated for the current cell, updated in place
     */
    protected function processDomElement(DOMNode $element, \PhpOffice\PhpSpreadsheet\Worksheet $sheet, &$row, &$column, &$cellContent)
    {
        foreach ($element->childNodes as $child) {
            if ($child instanceof DOMText) {
                //    Collapse every run of whitespace in the text into a single space
                $domText = preg_replace('/\s+/u', ' ', trim($child->nodeValue));
                if (is_string($cellContent)) {
                    //    simply append the text if the cell content is a plain text string
                    $cellContent .= $domText;
                }
                //    TODO: if we have a rich text run instead, we need to append it correctly
            } elseif ($child instanceof DOMElement) {
                $attributeArray = [];
                foreach ($child->attributes as $attribute) {
                    $attributeArray[$attribute->name] = $attribute->value;
                }

                switch ($child->nodeName) {
                    case 'meta':
                        //    TODO: extract the character set from the 'content' attribute,
                        //    so we can convert to UTF-8 if required
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        break;
                    case 'title':
                        //    The text collected below the title element becomes the sheet title
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        $sheet->setTitle($cellContent);
                        $cellContent = '';
                        break;
                    case 'span':
                    case 'div':
                    case 'font':
                    case 'i':
                    case 'em':
                    case 'strong':
                    case 'b':
                        //    Inline content stays in the current cell, separated by spaces
                        if ($cellContent > '') {
                            $cellContent .= ' ';
                        }
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        if ($cellContent > '') {
                            $cellContent .= ' ';
                        }
                        break;
                    case 'hr':
                        $this->flushCell($sheet, $column, $row, $cellContent);
                        ++$row;
                        if (isset($this->formats[$child->nodeName])) {
                            $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
                        } else {
                            $cellContent = '----------';
                            $this->flushCell($sheet, $column, $row, $cellContent);
                        }
                        ++$row;
                        // Add a break after a horizontal rule, simply by allowing the code to dropthru
                        // no break
                    case 'br':
                        if ($this->tableLevel > 0) {
                            //    If we're inside a table, replace with a \n
                            $cellContent .= "\n";
                        } else {
                            //    Otherwise flush our existing content and move the row cursor on
                            $this->flushCell($sheet, $column, $row, $cellContent);
                            ++$row;
                        }
                        break;
                    case 'a':
                        if (isset($attributeArray['href'])) {
                            //    Attach the link target to the current cell and style it as a link
                            $sheet->getCell($column . $row)->getHyperlink()->setUrl($attributeArray['href']);
                            if (isset($this->formats[$child->nodeName])) {
                                $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
                            }
                        }
                        $cellContent .= ' ';
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        break;
                    case 'h1':
                    case 'h2':
                    case 'h3':
                    case 'h4':
                    case 'h5':
                    case 'h6':
                    case 'ol':
                    case 'ul':
                    case 'p':
                        if ($this->tableLevel > 0) {
                            //    If we're inside a table, replace with a \n
                            $cellContent .= "\n";
                            $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        } else {
                            //    Outside a table the block gets a row of its own
                            if ($cellContent > '') {
                                $this->flushCell($sheet, $column, $row, $cellContent);
                                ++$row;
                            }
                            $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                            $this->flushCell($sheet, $column, $row, $cellContent);

                            if (isset($this->formats[$child->nodeName])) {
                                $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
                            }

                            ++$row;
                            $column = 'A';
                        }
                        break;
                    case 'li':
                        if ($this->tableLevel > 0) {
                            //    If we're inside a table, replace with a \n
                            $cellContent .= "\n";
                            $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        } else {
                            if ($cellContent > '') {
                                $this->flushCell($sheet, $column, $row, $cellContent);
                            }
                            ++$row;
                            $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                            $this->flushCell($sheet, $column, $row, $cellContent);
                            $column = 'A';
                        }
                        break;
                    case 'table':
                        $this->flushCell($sheet, $column, $row, $cellContent);
                        $column = $this->setTableStartColumn($column);
                        if ($this->tableLevel > 1) {
                            --$row;
                        }
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        $column = $this->releaseTableStartColumn();
                        if ($this->tableLevel > 1) {
                            ++$column;
                        } else {
                            ++$row;
                        }
                        break;
                    case 'thead':
                    case 'tbody':
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        break;
                    case 'tr':
                        //    Every table row restarts at the start column of the enclosing table
                        $column = $this->getTableStartColumn();
                        $cellContent = '';
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        ++$row;
                        break;
                    case 'th':
                    case 'td':
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);

                        //    Skip cells already claimed by a rowspan merge
                        while (isset($this->rowspan[$column . $row])) {
                            ++$column;
                        }

                        $this->flushCell($sheet, $column, $row, $cellContent);

                        if (isset($attributeArray['rowspan']) && isset($attributeArray['colspan'])) {
                            //create merging rowspan and colspan
                            $columnTo = $column;
                            for ($i = 0; $i < $attributeArray['colspan'] - 1; ++$i) {
                                ++$columnTo;
                            }
                            $range = $column . $row . ':' . $columnTo . ($row + $attributeArray['rowspan'] - 1);
                            foreach (\PhpOffice\PhpSpreadsheet\Cell::extractAllCellReferencesInRange($range) as $value) {
                                $this->rowspan[$value] = true;
                            }
                            $sheet->mergeCells($range);
                            $column = $columnTo;
                        } elseif (isset($attributeArray['rowspan'])) {
                            //create merging rowspan
                            $range = $column . $row . ':' . $column . ($row + $attributeArray['rowspan'] - 1);
                            foreach (\PhpOffice\PhpSpreadsheet\Cell::extractAllCellReferencesInRange($range) as $value) {
                                $this->rowspan[$value] = true;
                            }
                            $sheet->mergeCells($range);
                        } elseif (isset($attributeArray['colspan'])) {
                            //create merging colspan
                            $columnTo = $column;
                            for ($i = 0; $i < $attributeArray['colspan'] - 1; ++$i) {
                                ++$columnTo;
                            }
                            $sheet->mergeCells($column . $row . ':' . $columnTo . $row);
                            $column = $columnTo;
                        } elseif (isset($attributeArray['bgcolor'])) {
                            //    Fully qualified: the legacy PHPExcel_Style_Fill name used here before
                            //    is not declared or imported in this file
                            $sheet->getStyle($column . $row)->applyFromArray(
                                [
                                    'fill' => [
                                        'type' => \PhpOffice\PhpSpreadsheet\Style\Fill::FILL_SOLID,
                                        'color' => ['rgb' => $attributeArray['bgcolor']],
                                    ],
                                ]
                            );
                        }
                        ++$column;
                        break;
                    case 'body':
                        //    Reset the cursor and the table nesting before walking the document body
                        $row = 1;
                        $column = 'A';
                        $this->tableLevel = 0;
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        break;
                    default:
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                }
            }
        }
    }
495
    /**
     * Loads PhpSpreadsheet from file into PhpSpreadsheet instance
     *
     * The sheet at the configured sheet index is created if necessary, made
     * active, and filled from the parsed HTML document.
     *
     * @param  string                    $pFilename
     * @param  Spreadsheet                  $spreadsheet
     * @throws Exception
     * @return Spreadsheet
     */
    public function loadIntoExisting($pFilename, Spreadsheet $spreadsheet)
    {
        // Validate
        if (!$this->canRead($pFilename)) {
            throw new Exception($pFilename . ' is an Invalid HTML file.');
        }

        // Make sure the target sheet exists, then select it
        $targetIndex = $this->sheetIndex;
        while ($spreadsheet->getSheetCount() <= $targetIndex) {
            $spreadsheet->createSheet();
        }
        $spreadsheet->setActiveSheetIndex($targetIndex);

        //    Create a new DOM object and load the scanned file contents into it
        $document = new DOMDocument();
        $html = mb_convert_encoding($this->securityScanFile($pFilename), 'HTML-ENTITIES', 'UTF-8');
        if ($document->loadHTML($html) === false) {
            throw new Exception('Failed to load ' . $pFilename . ' as a DOM Document');
        }

        //    Discard white space
        $document->preserveWhiteSpace = false;

        // Cursor and content accumulator, handed down by reference while walking the DOM
        $currentRow = 0;
        $currentColumn = 'A';
        $pendingContent = '';
        $this->processDomElement($document, $spreadsheet->getActiveSheet(), $currentRow, $currentColumn, $pendingContent);

        return $spreadsheet;
    }
535
536
    /**
     * Get sheet index
     *
     * @return int Index of the sheet that loadIntoExisting() fills
     */
    public function getSheetIndex()
    {
        $index = $this->sheetIndex;

        return $index;
    }
545
546
    /**
     * Set sheet index
     *
     * @param  int                  $pValue Sheet index
     * @return HTML This reader, to allow call chaining
     */
    public function setSheetIndex($pValue = 0)
    {
        $reader = $this;
        $reader->sheetIndex = $pValue;

        return $reader;
    }
558
559
    /**
     * Scan the XML for use of <!ENTITY to prevent XXE/XEE attacks
     *
     * NOTE(review): static analysis reported this method as duplicated elsewhere
     * in the project.
     *
     * @param     string         $xml
     * @throws Exception
     * @return string The unmodified input when no entity declaration was found
     */
    public function securityScan($xml)
    {
        // Allow an optional NUL byte before, between and after the characters of the marker
        // (presumably to catch multi-byte encodings such as UTF-16 - confirm)
        $markerChars = str_split('<!ENTITY');
        $pattern = '/\\0?' . implode('\\0?', $markerChars) . '\\0?/';

        if (preg_match($pattern, $xml) === 1) {
            throw new Exception('Detected use of ENTITY in XML, spreadsheet file load() aborted to prevent XXE/XEE attacks');
        }

        return $xml;
    }
574
}
575