Completed
Pull Request — master (#118)
by Filip
24:11 queued 22:16
created

HtmlConverter::getEnvironment()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 2

Importance

Changes 0
Metric Value
dl 0
loc 4
ccs 0
cts 2
cp 0
rs 10
c 0
b 0
f 0
cc 1
eloc 2
nc 1
nop 0
crap 2
1
<?php
2
3
namespace League\HTMLToMarkdown;
4
5
/**
6
 * Class HtmlConverter
7
 *
8
 * A helper class to convert HTML to Markdown.
9
 *
10
 * @author Colin O'Dell <[email protected]>
11
 * @author Nick Cernis <[email protected]>
12
 *
13
 * @link https://github.com/thephpleague/html-to-markdown/ Latest version on GitHub.
14
 *
15
 * @license http://www.opensource.org/licenses/mit-license.php MIT
16
 */
17
class HtmlConverter
18
{
19
    /**
20
     * @var Environment
21
     */
22
    protected $environment;
23
24
    /**
     * Builds a converter, optionally on top of a caller-supplied environment.
     *
     * If no environment is passed, one is obtained from
     * Environment::createDefaultEnvironment() using the built-in option
     * defaults listed in the body. In either case, $options is merged into the
     * environment's configuration afterwards.
     *
     * @param array            $options     Configuration options merged into the environment's config
     * @param Environment|null $environment Environment to use; a default one is created when omitted
     */
    public function __construct(array $options = array(), Environment $environment = null)
    {
        if ($environment instanceof Environment) {
            $this->environment = $environment;
        } else {
            // The built-in defaults are only handed to the default environment factory.
            $this->environment = Environment::createDefaultEnvironment(array(
                'header_style'    => 'setext', // Use 'atx' to render H1 and H2 headers as # Header1 and ## Header2
                'suppress_errors' => true, // Use false to show warnings when loading malformed HTML
                'strip_tags'      => false, // Use true to strip tags without markdown equivalents (tags only, not their content); handy for MS Word HTML output
                'bold_style'      => '**', // Use '__' for the underlined style
                'italic_style'    => '_', // Use '*' for the asterisk style
                'remove_nodes'    => '', // Space-separated list of DOM nodes to remove, e.g. 'meta style script'
                'hard_break'      => false, // Use true to turn <br> into `\n` instead of `  \n`
            ));
        }

        $this->environment->getConfig()->merge($options);
    }
47
48
    /**
     * Returns the environment held by this converter.
     *
     * @return Environment
     */
    public function getEnvironment()
    {
        return $this->environment;
    }
55
56
    /**
     * Returns the configuration object exposed by this converter's environment.
     *
     * @return Configuration
     */
    public function getConfig()
    {
        $environment = $this->environment;

        return $environment->getConfig();
    }
63
64
    /**
     * Allows the converter to be called like a function.
     *
     * Simply forwards to convert().
     *
     * @see HtmlConverter::convert
     *
     * @param string $html
     *
     * @return string The Markdown version of the html
     */
    public function __invoke($html)
    {
        $markdown = $this->convert($html);

        return $markdown;
    }
77
78
    /**
     * Convert
     *
     * Parses the HTML into a DOM document, converts the tree rooted at the
     * <html> element in place, then serialises the document and sanitizes the
     * resulting string.
     *
     * @param string $html
     *
     * @throws \InvalidArgumentException When the parsed document has no <html> element
     *
     * @return string The Markdown version of the html
     */
    public function convert($html)
    {
        // Whitespace-only input short-circuits to an empty result
        $isBlank = trim($html) === '';
        if ($isBlank) {
            return '';
        }

        $dom = $this->createDOMDocument($html);

        // Work on the entire DOM tree (including head and body)
        $htmlNode = $dom->getElementsByTagName('html')->item(0);
        if (!$htmlNode) {
            throw new \InvalidArgumentException('Invalid HTML was provided');
        }

        $this->convertChildren(new Element($htmlNode));

        // The document was modified in place; serialise it and clean up the leftovers
        return $this->sanitize($dom->saveHTML());
    }
110
111
    /**
     * Parses an HTML string into a DOMDocument, forcing UTF-8 interpretation.
     *
     * When the 'suppress_errors' option is truthy, libxml's internal error
     * handling is enabled for the duration of the parse. Afterwards the
     * collected errors are cleared and the previous libxml setting is restored,
     * so the process-wide flag does not stay switched on for unrelated code.
     *
     * @param string $html
     *
     * @return \DOMDocument
     */
    private function createDOMDocument($html)
    {
        $document = new \DOMDocument();

        // Read the option once so enabling and restoring are always paired
        $suppressErrors = (bool) $this->getConfig()->getOption('suppress_errors');
        $previousSetting = false;

        if ($suppressErrors) {
            // Suppress conversion errors (from http://bit.ly/pCCRSX).
            // libxml_use_internal_errors() returns the previous value; keep it for restoring below.
            $previousSetting = libxml_use_internal_errors(true);
        }

        // Hack to load utf-8 HTML (from http://bit.ly/pVDyCt)
        $document->loadHTML('<?xml encoding="UTF-8">' . $html);
        $document->encoding = 'UTF-8';

        if ($suppressErrors) {
            libxml_clear_errors();
            // Put the global libxml flag back to what the caller had before this call
            libxml_use_internal_errors($previousSetting);
        }

        return $document;
    }
135
136
    /**
137
     * Convert Children
138
     *
139
     * Recursive function to drill into the DOM and convert each node into Markdown from the inside out.
140
     *
141
     * Finds children of each node and convert those to #text nodes containing their Markdown equivalent,
142
     * starting with the innermost element and working up to the outermost element.
143
     *
144
     * @param ElementInterface $element
145
     */
146 78
    private function convertChildren(ElementInterface $element)
147
    {
148
        // Don't convert HTML code inside <code> and <pre> blocks to Markdown - that should stay as HTML
149
        // except if the current node is a code tag, which needs to be converted by the CodeConverter.
150 78
        if ($element->isDescendantOf(array('pre', 'code')) && $element->getTagName() !== 'code') {
151 15
            return;
152
        }
153
154
        // If the node has children, convert those to Markdown first
155 78
        if ($element->hasChildren()) {
156 78
            foreach ($element->getChildren() as $child) {
157 78
                $this->convertChildren($child);
158 52
            }
159 52
        }
160
161
        // Now that child nodes have been converted, convert the original node
162 78
        $markdown = $this->convertToMarkdown($element);
163
164
        // Create a DOM text node containing the Markdown equivalent of the original node
165
166
        // Replace the old $node e.g. '<h3>Title</h3>' with the new $markdown_node e.g. '### Title'
167 78
        $element->setFinalMarkdown($markdown);
0 ignored issues
show
Security Bug introduced by
It seems like $markdown defined by $this->convertToMarkdown($element) on line 162 can also be of type false; however, League\HTMLToMarkdown\El...ace::setFinalMarkdown() does only seem to accept string, did you maybe forget to handle an error condition?

This check looks for type mismatches where the missing type is false. This is usually indicative of an error condition.

Consider the following example:

<?php

function getDate($date)
{
    if ($date !== null) {
        return new DateTime($date);
    }

    return false;
}

This function either returns a new DateTime object or false, if there was an error. This is a typical pattern in PHP programming to show that an error has occurred without raising an exception. The calling code should check for this returned false before passing on the value to another function or method that may not be able to handle a false.

Loading history...
168 78
    }
169
170
    /**
     * Convert to Markdown
     *
     * Converts an individual node into a string of its Markdown equivalent.
     *
     * Example: An <h3> node with text content of 'Title' becomes '### Title'
     *
     * Nodes whose tag name is listed in the 'remove_nodes' option yield an
     * empty string instead of being passed to a converter.
     *
     * @param ElementInterface $element
     *
     * @return string The converted HTML as Markdown
     */
    protected function convertToMarkdown(ElementInterface $element)
    {
        $tag = $element->getTagName();

        // Strip nodes named in remove_nodes
        $tags_to_remove = explode(' ', $this->getConfig()->getOption('remove_nodes'));
        if (in_array($tag, $tags_to_remove, true)) {
            // Return an empty string rather than false so the documented string
            // return type holds for callers such as setFinalMarkdown()
            return '';
        }

        $converter = $this->environment->getConverterByTag($tag);

        return $converter->convert($element);
    }
195
196
    /**
     * Cleans up the serialised document so only the Markdown text remains.
     *
     * Decodes HTML entities, removes the doctype declaration, strips the
     * wrapper tags added during parsing/serialisation from the start and end
     * of the string, and finally trims newlines, carriage returns, NUL bytes
     * and vertical tabs from both ends.
     *
     * @param string $markdown
     *
     * @return string
     */
    protected function sanitize($markdown)
    {
        $markdown = html_entity_decode($markdown, ENT_QUOTES, 'UTF-8');
        $markdown = preg_replace('/<!DOCTYPE [^>]+>/', '', $markdown); // Strip doctype declaration
        $markdown = trim($markdown); // Remove blank spaces at the beginning of the html

        /*
         * Removing unwanted tags. Tags should be added to the array in the order they are expected.
         * XML, html and body opening tags should be in that order. Same case with closing tags
         */
        $unwanted = array('<?xml encoding="UTF-8">', '<html>', '</html>', '<body>', '</body>', '<head>', '</head>', '&#xD;');

        foreach ($unwanted as $tag) {
            $tagLength = strlen($tag);

            if (strpos($tag, '/') === false) {
                // Opening tags: only stripped when they are the very first thing in the string
                if (strncmp($markdown, $tag, $tagLength) === 0) {
                    $markdown = (string) substr($markdown, $tagLength);
                }
            } else {
                // Closing tags: compare against the actual suffix. Looking up the first
                // occurrence would miss the trailing tag when the same text also
                // appears earlier in the content.
                if (substr($markdown, -$tagLength) === $tag) {
                    $markdown = (string) substr($markdown, 0, -$tagLength);
                }
            }
        }

        $markdown = trim($markdown, "\n\r\0\x0B");

        return $markdown;
    }
231
}
232