Parser::parseObject() - Code Metrics - Inspection of "Merge branch 'master' into skyfms/fix_encoding" - smalot/pdfparser - Measure and Improve Code Quality continuously with Scrutinizer

Test Failed
Push — pr/257 ( 57d61f )

by Konrad
created 2020-06-08 14:05 UTC
Parser::parseObject() C

↳ Parent: Parser
Complexity

Conditions	14
Paths	30
Size

Total Lines	83
Code Lines	50
Duplication

Lines	0
Ratio	0 %
Importance

Changes	5
Bugs	2	Features	0
Metric	Value
cc	14
eloc	50
c	5
b	2
f	0
nc	30
nop	3
dl	0
loc	83
rs	6.2666
How to fix Long Method Complexity

<?php

/**
 * @file
 *          This file is part of the PdfParser library.
 *
 * @author  Sébastien MALOT <[email protected]>
 * @date    2017-01-03
 *
 * @license LGPLv3
 * @url     <https://github.com/smalot/pdfparser>
 *
 *  PdfParser is a pdf library written in PHP, extraction oriented.
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
 */

namespace Smalot\PdfParser;

use Smalot\PdfParser\Element\ElementArray;
use Smalot\PdfParser\Element\ElementBoolean;
use Smalot\PdfParser\Element\ElementDate;
use Smalot\PdfParser\Element\ElementHexa;
use Smalot\PdfParser\Element\ElementName;
use Smalot\PdfParser\Element\ElementNull;
use Smalot\PdfParser\Element\ElementNumeric;
use Smalot\PdfParser\Element\ElementString;
use Smalot\PdfParser\Element\ElementXRef;
use Smalot\PdfParser\RawData\RawDataParser;

/**
 * Class Parser
 */
class Parser
{
    /**
     * @var PDFObject[]
     */
    protected $objects = [];

    protected $rawDataParser;

    public function __construct($cfg = [])
    {
        $this->rawDataParser = new RawDataParser($cfg);
    }

    /**
     * @param string $filename
     *
     * @return Document
     *
     * @throws \Exception
     */
    public function parseFile($filename)
    {
        $content = file_get_contents($filename);
        /*
         * 2018/06/20 @doganoo as multiple times a
         * users have complained that the parseFile()
         * method dies silently, it is an better option
         * to remove the error control operator (@) and
         * let the users know that the method throws an exception
         * by adding @throws tag to PHPDoc.
         *
         * See here for an example: https://github.com/smalot/pdfparser/issues/204
         */
        return $this->parseContent($content);
    }

    /**
     * @param string $content PDF content to parse
     *
     * @return Document
     *
     * @throws \Exception if secured PDF file was detected
     * @throws \Exception if no object list was found
     */
    public function parseContent($content)
    {
        // Create structure from raw data.
        list($xref, $data) = $this->rawDataParser->parseData($content);

        if (isset($xref['trailer']['encrypt'])) {
            throw new \Exception('Secured pdf file are currently not supported.');
        }

        if (empty($data)) {
            throw new \Exception('Object list not found. Possible secured file.');
        }

        // Create destination object.
        $document = new Document();
        $this->objects = [];

        foreach ($data as $id => $structure) {
            $this->parseObject($id, $structure, $document);
            unset($data[$id]);
        }

        $document->setTrailer($this->parseTrailer($xref['trailer'], $document));
        $document->setObjects($this->objects);

        return $document;
    }

    protected function parseTrailer($structure, $document)
    {
        $trailer = [];

        foreach ($structure as $name => $values) {
            $name = ucfirst($name);

            if (is_numeric($values)) {
                $trailer[$name] = new ElementNumeric($values);
            } elseif (\is_array($values)) {
                $value = $this->parseTrailer($values, null);
                $trailer[$name] = new ElementArray($value, null);
                $trailer[$name] = new ElementArray(/** @scrutinizer ignore-type */ $value, null);
            } elseif (false !== strpos($values, '_')) {
                $trailer[$name] = new ElementXRef($values, $document);
            } else {
                $trailer[$name] = $this->parseHeaderElement('(', $values, $document);
            }
        }

        return new Header($trailer, $document);
    }

    /**
     * @param string   $id
     * @param array    $structure
     * @param Document $document
     */
    protected function parseObject($id, $structure, $document)
    {
        $header = new Header([], $document);
        $content = '';

        foreach ($structure as $position => $part) {
            switch ($part[0]) {
                case '[':
                    $elements = [];

                    foreach ($part[1] as $sub_element) {
                        $sub_type = $sub_element[0];
                        $sub_value = $sub_element[1];
                        $elements[] = $this->parseHeaderElement($sub_type, $sub_value, $document);
                    }

                    $header = new Header($elements, $document);
                    break;

                case '<<':
                    $header = $this->parseHeader($part[1], $document);
                    break;

                case 'stream':
                    $content = isset($part[3][0]) ? $part[3][0] : $part[1];

                    if ($header->get('Type')->equals('ObjStm')) {
                        $match = [];

                        // Split xrefs and contents.
                        preg_match('/^((\d+\s+\d+\s*)*)(.*)$/s', $content, $match);
                        $content = $match[3];

                        // Extract xrefs.
                        $xrefs = preg_split(
                            '/(\d+\s+\d+\s*)/s',
                            $match[1],
                            -1,
                          PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE
                        );
                        $table = [];

                        foreach ($xrefs as $xref) {
                            list($id, $position) = explode(' ', trim($xref));
                            $table[$position] = $id;
                        }

                        ksort($table);

                        $ids = array_values($table);
                        $positions = array_keys($table);

                        foreach ($positions as $index => $position) {

                            $id = $ids[$index].'_0';
                            $next_position = isset($positions[$index + 1]) ? $positions[$index + 1] : \strlen($content);
                            $sub_content = substr($content, $position, (int) $next_position - (int) $position);

                            $sub_header = Header::parse($sub_content, $document);
                            $object = PDFObject::factory($document, $sub_header, '');
                            $this->objects[$id] = $object;
                        }

                        // It is not necessary to store this content.
                        $content = '';


                        return;
                    }
                    break;

                default:
                    if ('null' != $part) {
                        $element = $this->parseHeaderElement($part[0], $part[1], $document);

                        if ($element) {
                            $header = new Header([$element], $document);
                        }
                    }
                    break;
            }
        }

        if (!isset($this->objects[$id])) {
            $this->objects[$id] = PDFObject::factory($document, $header, $content);
        }
    }

    /**
     * @param array    $structure
     * @param Document $document
     *
     * @return Header
     *
     * @throws \Exception
     */
    protected function parseHeader($structure, $document)
    {
        $elements = [];
        $count = \count($structure);

        for ($position = 0; $position < $count; $position += 2) {
            $name = $structure[$position][1];
            $type = $structure[$position + 1][0];
            $value = $structure[$position + 1][1];

            $elements[$name] = $this->parseHeaderElement($type, $value, $document);
        }

        return new Header($elements, $document);
    }

    /**
     * @param string       $type
     * @param string|array $value
     * @param Document     $document
     *
     * @return Element|Header|null
     *
     * @throws \Exception
     */
    protected function parseHeaderElement($type, $value, $document)
    {
        switch ($type) {
            case '<<':
                $header = $this->parseHeader($value, $document);
                $header = $this->parseHeader(/** @scrutinizer ignore-type */ $value, $document);
                return PDFObject::factory($document, $header, null);


            case 'numeric':
                return new ElementNumeric($value);
                return new ElementNumeric(/** @scrutinizer ignore-type */ $value);

            case 'boolean':
                return new ElementBoolean($value);
                return new ElementBoolean(/** @scrutinizer ignore-type */ $value);

            case 'null':
                return new ElementNull();

            case '(':
                if ($date = ElementDate::parse('('.$value.')', $document)) {
                if ($date = ElementDate::parse('('./** @scrutinizer ignore-type */ $value.')', $document)) {
                    return $date;
                }

                return ElementString::parse('('.$value.')', $document);


            case '<':
                return $this->parseHeaderElement('(', ElementHexa::decode($value, $document), $document);
                return $this->parseHeaderElement('(', ElementHexa::decode(/** @scrutinizer ignore-type */ $value, $document), $document);

            case '/':
                return ElementName::parse('/'.$value, $document);


            case 'ojbref': // old mistake in tcpdf parser
            case 'objref':
                return new ElementXRef($value, $document);

            case '[':
                $values = [];

                if (\is_array($value)) {
                    foreach ($value as $sub_element) {
                        $sub_type = $sub_element[0];
                        $sub_value = $sub_element[1];
                        $values[] = $this->parseHeaderElement($sub_type, $sub_value, $document);
                    }
                }

                return new ElementArray($values, $document);

            case 'endstream':
            case 'obj': //I don't know what it means but got my project fixed.
            case '':
                // Nothing to do with.
                return null;

            default:
                throw new \Exception('Invalid type: "'.$type.'".');
        }
    }
}

smalot / pdfparser

Push — pr/257 ( 57d61f )

Parser::parseObject() C

Complexity

Size

Duplication

Importance

How to fix Long Method Complexity

Long Method

Duplication Side-by-Side

Filter issues like