Completed
Push — master ( 032337...4a3f84 )
by Colin
03:00
created

HtmlConverter::setOptions()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 10
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 6

Importance

Changes 0
Metric Value
dl 0
loc 10
ccs 0
cts 6
cp 0
rs 9.4285
c 0
b 0
f 0
cc 2
eloc 5
nc 2
nop 1
crap 6
1
<?php
2
3
namespace League\HTMLToMarkdown;
4
5
/**
6
 * Class HtmlConverter
7
 *
8
 * A helper class to convert HTML to Markdown.
9
 *
10
 * @author Colin O'Dell <[email protected]>
11
 * @author Nick Cernis <[email protected]>
12
 *
13
 * @link https://github.com/thephpleague/html-to-markdown/ Latest version on GitHub.
14
 *
15
 * @license http://www.opensource.org/licenses/mit-license.php MIT
16
 */
17
class HtmlConverter
18
{
19
    /**
20
     * @var Environment
21
     */
22
    protected $environment;
23
24
    /**
     * Build a converter from a ready-made environment or from option overrides.
     *
     * When an array is supplied, a default environment is created from the
     * built-in option values below and the supplied values are then merged
     * into its configuration. An argument that is neither an Environment nor
     * an array leaves the environment unset.
     *
     * @param Environment|array $options Environment object or configuration options
     */
    public function __construct($options = array())
    {
        // A pre-built environment is adopted unchanged.
        if ($options instanceof Environment) {
            $this->environment = $options;

            return;
        }

        if (!is_array($options)) {
            return;
        }

        $builtInOptions = array(
            // Use 'atx' to output H1 and H2 headers as "# Header1" and "## Header2"
            'header_style' => 'setext',
            // Use false to show warnings when loading malformed HTML
            'suppress_errors' => true,
            // Use true to strip tags without a Markdown equivalent (tags only, not their content);
            // handy for cleaning MS Word HTML output
            'strip_tags' => false,
            // Use '__' for the underlined style
            'bold_style' => '**',
            // Use '*' for the asterisk style
            'italic_style' => '_',
            // Space-separated list of DOM nodes to remove, e.g. 'meta style script'
            'remove_nodes' => '',
            // Use true to turn <br> into `\n` instead of `  \n`
            'hard_break' => false,
            // Bullet character for each <li> in a <ul>: '-', '*' or '+'
            'list_item_style' => '-',
        );

        $this->environment = Environment::createDefaultEnvironment($builtInOptions);

        // Caller-supplied options are merged over the built-in values.
        $this->environment->getConfig()->merge($options);
    }
50
51
    /**
     * Expose the environment this converter operates with.
     *
     * @return Environment
     */
    public function getEnvironment()
    {
        $environment = $this->environment;

        return $environment;
    }
58
59
    /**
     * Shortcut to the configuration object held by the converter's environment.
     *
     * @return Configuration
     */
    public function getConfig()
    {
        $environment = $this->environment;

        return $environment->getConfig();
    }
66
67
    /**
     * Allow the converter object to be called like a function.
     *
     * Delegates directly to convert().
     *
     * @see HtmlConverter::convert
     *
     * @param string $html
     *
     * @return string The Markdown version of the html
     */
    public function __invoke($html)
    {
        $markdown = $this->convert($html);

        return $markdown;
    }
80
81
    /**
     * Convert an HTML string to Markdown.
     *
     * Blank input yields an empty string. Otherwise the HTML is loaded into a
     * DOM document, every node under the <html> element is converted in place,
     * and the serialised document is cleaned up by sanitize().
     *
     * @param string $html
     *
     * @throws \InvalidArgumentException When the loaded document has no <html> element
     *
     * @return string The Markdown version of the html
     */
    public function convert($html)
    {
        $isBlank = trim($html) === '';
        if ($isBlank) {
            return '';
        }

        $document = $this->createDOMDocument($html);

        // The whole tree is processed, head and body alike.
        $root = $document->getElementsByTagName('html')->item(0);
        if (!$root) {
            throw new \InvalidArgumentException('Invalid HTML was provided');
        }

        $this->convertChildren(new Element($root));

        // The document has been modified in place; serialise it and clean up the result.
        return $this->sanitize($document->saveHTML());
    }
113
114
    /**
     * Load an HTML string into a new UTF-8 DOMDocument.
     *
     * When the 'suppress_errors' option is enabled, libxml's internal error
     * handling is switched on for the duration of the load so malformed HTML
     * does not emit warnings. The collected errors are then cleared and the
     * previous libxml setting is restored, so the global libxml state is not
     * left altered for the rest of the process.
     *
     * @param string $html
     *
     * @return \DOMDocument
     */
    private function createDOMDocument($html)
    {
        $document = new \DOMDocument();

        // Read the option once so enabling and restoring always stay paired.
        $suppressErrors = (bool) $this->getConfig()->getOption('suppress_errors');
        $previousSetting = false;

        if ($suppressErrors) {
            // Suppress conversion errors; the call returns the prior setting
            $previousSetting = libxml_use_internal_errors(true);
        }

        // Hack to load utf-8 HTML: the XML encoding declaration makes the parser treat the input as UTF-8
        $document->loadHTML('<?xml encoding="UTF-8">' . $html);
        $document->encoding = 'UTF-8';

        if ($suppressErrors) {
            libxml_clear_errors();
            // Put the global libxml error mode back the way the caller had it
            libxml_use_internal_errors($previousSetting);
        }

        return $document;
    }
138
139
    /**
140
     * Convert Children
141
     *
142
     * Recursive function to drill into the DOM and convert each node into Markdown from the inside out.
143
     *
144
     * Finds children of each node and convert those to #text nodes containing their Markdown equivalent,
145
     * starting with the innermost element and working up to the outermost element.
146
     *
147
     * @param ElementInterface $element
148
     */
149 81
    private function convertChildren(ElementInterface $element)
150
    {
151
        // Don't convert HTML code inside <code> and <pre> blocks to Markdown - that should stay as HTML
152
        // except if the current node is a code tag, which needs to be converted by the CodeConverter.
153 81
        if ($element->isDescendantOf(array('pre', 'code')) && $element->getTagName() !== 'code') {
154 15
            return;
155
        }
156
157
        // If the node has children, convert those to Markdown first
158 81
        if ($element->hasChildren()) {
159 81
            foreach ($element->getChildren() as $child) {
160 81
                $this->convertChildren($child);
161 54
            }
162 54
        }
163
164
        // Now that child nodes have been converted, convert the original node
165 81
        $markdown = $this->convertToMarkdown($element);
166
167
        // Create a DOM text node containing the Markdown equivalent of the original node
168
169
        // Replace the old $node e.g. '<h3>Title</h3>' with the new $markdown_node e.g. '### Title'
170 81
        $element->setFinalMarkdown($markdown);
0 ignored issues
show
Security Bug introduced by Colin O'Dell
It seems like $markdown, defined by $this->convertToMarkdown($element) on line 165, can also be of type false; however, League\HTMLToMarkdown\ElementInterface::setFinalMarkdown() seems to accept only string. Did you maybe forget to handle an error condition?

This check looks for type mismatches where the missing type is false. This is usually indicative of an error condition.

Consider the following example:

<?php

function getDate($date)
{
    if ($date !== null) {
        return new DateTime($date);
    }

    return false;
}

This function either returns a new DateTime object or false, if there was an error. This is a typical pattern in PHP programming to show that an error has occurred without raising an exception. The calling code should check for this returned false before passing on the value to another function or method that may not be able to handle a false.

Loading history...
171 81
    }
172
173
    /**
     * Convert to Markdown
     *
     * Converts an individual node into a string of its Markdown equivalent.
     *
     * Example: An <h3> node with text content of 'Title' becomes '### Title'
     *
     * Nodes whose tag name is listed in the 'remove_nodes' option produce an
     * empty string. Every other node is handed to the converter that the
     * environment returns for its tag name.
     *
     * @param ElementInterface $element
     *
     * @return string The converted HTML as Markdown
     */
    protected function convertToMarkdown(ElementInterface $element)
    {
        $tag = $element->getTagName();

        // Strip nodes named in remove_nodes (a space-separated list of tag names)
        $tags_to_remove = explode(' ', $this->getConfig()->getOption('remove_nodes'));
        if (in_array($tag, $tags_to_remove, true)) {
            // Return an empty string rather than false so the documented
            // string return type holds for callers.
            return '';
        }

        $converter = $this->environment->getConverterByTag($tag);

        return $converter->convert($element);
    }
198
199
    /**
     * Clean up the serialised document so that only the Markdown remains.
     *
     * Decodes HTML entities, removes the doctype declaration, and strips the
     * document wrapper tags from the start and end of the string.
     *
     * @param string $markdown
     *
     * @return string
     */
    protected function sanitize($markdown)
    {
        $markdown = html_entity_decode($markdown, ENT_QUOTES, 'UTF-8');
        $markdown = preg_replace('/<!DOCTYPE [^>]+>/', '', $markdown); // Strip doctype declaration
        $markdown = trim($markdown); // Remove surrounding whitespace left around the html

        /*
         * Removing unwanted tags. Tags should be added to the array in the order they are expected.
         * XML, html and body opening tags should be in that order. Same case with closing tags
         */
        $unwanted = array('<?xml encoding="UTF-8">', '<html>', '</html>', '<body>', '</body>', '<head>', '</head>', '&#xD;');

        foreach ($unwanted as $tag) {
            $tagLength = strlen($tag);

            if (strpos($tag, '/') === false) {
                // Opening tags: only stripped when the text starts with them
                if (strpos($markdown, $tag) === 0) {
                    $markdown = substr($markdown, $tagLength);
                }
            } elseif (substr($markdown, -$tagLength) === $tag) {
                // Closing tags: only stripped when the text ends with them.
                // The tail is compared directly because strpos() reports the
                // first occurrence, which would miss the trailing tag whenever
                // the same text also appears earlier in the content.
                $markdown = substr($markdown, 0, -$tagLength);
            }
        }

        // Only newlines and control characters are trimmed here; spaces and tabs are kept.
        return trim($markdown, "\n\r\0\x0B");
    }
232
    
233
    /**
     * Set several configuration options at once.
     *
     * Each key-value pair in the array is applied to the converter's
     * configuration. The converter itself is returned so calls can be chained,
     * which also suits use behind a static-style proxy (e.g. in Laravel).
     *
     * Example:
     *
     * $converter->setOptions(array('strip_tags' => true))->convert('<h1>test</h1>');
     *
     * @param array $options Option names mapped to the values to set
     *
     * @return $this
     */
    public function setOptions(array $options)
    {
        $configuration = $this->getConfig();

        foreach ($options as $name => $value) {
            $configuration->setOption($name, $value);
        }

        return $this;
    }
251
}
252