Completed
Push — master ( 63adb9...1faad8 )
by Colin
02:08
created

HtmlConverter::setOptions()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 10

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 6

Importance

Changes 0
Metric Value
dl 0
loc 10
ccs 0
cts 5
cp 0
rs 9.9332
c 0
b 0
f 0
cc 2
nc 2
nop 1
crap 6
1
<?php
2
3
namespace League\HTMLToMarkdown;
4
5
/**
6
 * Class HtmlConverter
7
 *
8
 * A helper class to convert HTML to Markdown.
9
 *
10
 * @author Colin O'Dell <[email protected]>
11
 * @author Nick Cernis <[email protected]>
12
 *
13
 * @link https://github.com/thephpleague/html-to-markdown/ Latest version on GitHub.
14
 *
15
 * @license http://www.opensource.org/licenses/mit-license.php MIT
16
 */
17
class HtmlConverter implements HtmlConverterInterface
18
{
19
    /**
20
     * @var Environment
21
     */
22
    protected $environment;
23
24
    /**
     * Set up the converter from a ready-made environment or from an options array.
     *
     * An Environment instance is used as-is. An array causes a default
     * environment to be created from the built-in defaults listed below, after
     * which the supplied options are handed to the configuration's merge().
     * Any other argument type leaves $this->environment unset.
     *
     * @param Environment|array $options Environment object or configuration options
     */
    public function __construct($options = array())
    {
        // A caller-supplied environment takes precedence over any defaults.
        if ($options instanceof Environment) {
            $this->environment = $options;

            return;
        }

        // Neither an environment nor an options array: nothing to configure.
        if (!is_array($options)) {
            return;
        }

        $environment = Environment::createDefaultEnvironment(array(
            'header_style' => 'setext', // Set to 'atx' to output H1 and H2 headers as # Header1 and ## Header2
            'suppress_errors' => true, // Set to false to show warnings when loading malformed HTML
            'strip_tags' => false, // Set to true to strip tags that don't have markdown equivalents. N.B. Strips tags, not their content. Useful to clean MS Word HTML output.
            'bold_style' => '**', // DEPRECATED: Set to '__' if you prefer the underlined style
            'italic_style' => '*', // DEPRECATED: Set to '_' if you prefer the underlined style
            'remove_nodes' => '', // space-separated list of dom nodes that should be removed. example: 'meta style script'
            'hard_break' => false, // Set to true to turn <br> into `\n` instead of `  \n`
            'list_item_style' => '-', // Set the default character for each <li> in a <ul>. Can be '-', '*', or '+'
            'preserve_comments' => false, // Set to true to preserve comments, or set to an array of strings to preserve specific comments
        ));

        $this->environment = $environment;

        // Apply the caller's options to the freshly created configuration.
        $environment->getConfig()->merge($options);
    }
51
52
    /**
     * Expose the environment this converter was constructed with.
     *
     * @return Environment
     */
    public function getEnvironment()
    {
        return $this->environment;
    }
59
60
    /**
     * Shortcut to the configuration object held by the current environment.
     *
     * @return Configuration
     */
    public function getConfig()
    {
        $environment = $this->environment;

        return $environment->getConfig();
    }
67
68
    /**
     * Allow the converter object to be called like a function.
     *
     * Delegates directly to convert().
     *
     * @see HtmlConverter::convert
     *
     * @param string $html
     *
     * @return string The Markdown version of the html
     */
    public function __invoke($html)
    {
        $markdown = $this->convert($html);

        return $markdown;
    }
81
82
    /**
     * Convert an HTML string to Markdown.
     *
     * Parses the HTML into a DOM document, converts the tree below the <html>
     * element in place, then serialises the document and sanitizes the result.
     * Input that is empty after trimming yields an empty string.
     *
     * @param string $html
     *
     * @throws \InvalidArgumentException When the parsed document has no <html> element
     *
     * @return string The Markdown version of the html
     */
    public function convert($html)
    {
        // Nothing to convert for blank input.
        if (trim($html) === '') {
            return '';
        }

        $document = $this->createDOMDocument($html);

        // Work on the entire DOM tree (including head and body)
        $root = $document->getElementsByTagName('html')->item(0);
        if (!$root) {
            throw new \InvalidArgumentException('Invalid HTML was provided');
        }

        $this->convertChildren(new Element($root));

        // The DOMDocument has been modified in place; serialise it and clean up the wrapper markup
        return $this->sanitize($document->saveHTML());
    }
114
115
    /**
     * Parse an HTML string into a DOMDocument, treating the input as UTF-8.
     *
     * When the 'suppress_errors' option is enabled, libxml's internal error
     * handling is switched on for the duration of the parse so that malformed
     * HTML does not emit warnings. The setting that was in effect beforehand is
     * restored afterwards, so this process-wide libxml flag is not left changed
     * for the rest of the application.
     *
     * @param string $html
     *
     * @return \DOMDocument
     */
    private function createDOMDocument($html)
    {
        $document = new \DOMDocument();
        $suppressErrors = $this->getConfig()->getOption('suppress_errors');

        if ($suppressErrors) {
            // Suppress conversion errors, remembering the previous setting so it can be restored
            $previousState = libxml_use_internal_errors(true);
        }

        // Hack to load utf-8 HTML: the XML encoding declaration makes libxml read the input as UTF-8
        $document->loadHTML('<?xml encoding="UTF-8">' . $html);
        $document->encoding = 'UTF-8';

        if ($suppressErrors) {
            // Discard the errors buffered during parsing, then restore the caller's libxml setting
            libxml_clear_errors();
            libxml_use_internal_errors($previousState);
        }

        return $document;
    }
139
140
    /**
141
     * Convert Children
142
     *
143
     * Recursive function to drill into the DOM and convert each node into Markdown from the inside out.
144
     *
145
     * Finds children of each node and convert those to #text nodes containing their Markdown equivalent,
146
     * starting with the innermost element and working up to the outermost element.
147
     *
148
     * @param ElementInterface $element
149
     */
150 87
    private function convertChildren(ElementInterface $element)
151
    {
152
        // Don't convert HTML code inside <code> and <pre> blocks to Markdown - that should stay as HTML
153
        // except if the current node is a code tag, which needs to be converted by the CodeConverter.
154 87
        if ($element->isDescendantOf(array('pre', 'code')) && $element->getTagName() !== 'code') {
155 18
            return;
156
        }
157
158
        // If the node has children, convert those to Markdown first
159 87
        if ($element->hasChildren()) {
160 87
            foreach ($element->getChildren() as $child) {
161 87
                $this->convertChildren($child);
162
            }
163
        }
164
165
        // Now that child nodes have been converted, convert the original node
166 87
        $markdown = $this->convertToMarkdown($element);
167
168
        // Create a DOM text node containing the Markdown equivalent of the original node
169
170
        // Replace the old $node e.g. '<h3>Title</h3>' with the new $markdown_node e.g. '### Title'
171 87
        $element->setFinalMarkdown($markdown);
0 ignored issues
show
Security Bug introduced by
It seems like $markdown defined by $this->convertToMarkdown($element) on line 166 can also be of type false; however, League\HTMLToMarkdown\El...ace::setFinalMarkdown() does only seem to accept string, did you maybe forget to handle an error condition?

This check looks for type mismatches where the missing type is false. This is usually indicative of an error condition.

Consider the following example:

<?php

function getDate($date)
{
    if ($date !== null) {
        return new DateTime($date);
    }

    return false;
}

This function either returns a new DateTime object or false, if there was an error. This is a typical pattern in PHP programming to show that an error has occurred without raising an exception. The calling code should check for this returned false before passing on the value to another function or method that may not be able to handle a false.

Loading history...
172 87
    }
173
174
    /**
     * Convert to Markdown
     *
     * Converts an individual node into a string of its Markdown equivalent,
     * using the converter the environment has registered for the node's tag.
     *
     * Example: An <h3> node with text content of 'Title' becomes '### Title'
     *
     * @param ElementInterface $element
     *
     * @return string The converted HTML as Markdown; an empty string when the
     *                element's tag is listed in the 'remove_nodes' option
     */
    protected function convertToMarkdown(ElementInterface $element)
    {
        $tag = $element->getTagName();

        // Strip nodes named in remove_nodes (a space-separated list of tag names)
        $tags_to_remove = explode(' ', $this->getConfig()->getOption('remove_nodes'));
        if (in_array($tag, $tags_to_remove, true)) {
            // Return an empty string rather than false so the documented
            // string return type holds for every code path.
            return '';
        }

        $converter = $this->environment->getConverterByTag($tag);

        return $converter->convert($element);
    }
199
200
    /**
     * Clean up the serialised document so that only the Markdown remains.
     *
     * Decodes HTML entities, removes the doctype declaration and strips the
     * wrapper markup (XML declaration, html/head/body tags) that surrounds the
     * converted content.
     *
     * @param string $markdown Serialised document as produced by DOMDocument::saveHTML()
     *
     * @return string
     */
    protected function sanitize($markdown)
    {
        $markdown = html_entity_decode($markdown, ENT_QUOTES, 'UTF-8');
        $markdown = preg_replace('/<!DOCTYPE [^>]+>/', '', $markdown); // Strip doctype declaration
        $markdown = trim($markdown); // Remove blank spaces at the beginning of the html

        /*
         * Removing unwanted tags. Tags should be added to the array in the order they are expected.
         * XML, html and body opening tags should be in that order. Same case with closing tags
         */
        $unwanted = array('<?xml encoding="UTF-8">', '<html>', '</html>', '<body>', '</body>', '<head>', '</head>', '&#xD;');

        foreach ($unwanted as $tag) {
            $tagLength = strlen($tag);

            if (strpos($tag, '/') === false) {
                // Opening tags (entries without a slash): strip only when the text starts with the tag
                if (strncmp($markdown, $tag, $tagLength) === 0) {
                    $markdown = (string) substr($markdown, $tagLength);
                }
            } elseif (substr($markdown, -$tagLength) === $tag) {
                // Closing tags: compare the actual suffix. Searching for the first
                // occurrence would miss the trailing tag whenever the same text
                // also appears earlier in the converted content.
                $markdown = (string) substr($markdown, 0, -$tagLength);
            }
        }

        // Trim newlines and control characters but keep leading/trailing spaces and tabs
        return trim($markdown, "\n\r\0\x0B");
    }
233
234
    /**
     * Set several configuration options in one call.
     *
     * Each key-value pair in the array is passed to the configuration's
     * setOption(). The converter itself is returned so that calls can be
     * chained, for example:
     *
     * $converter->setOptions(array('strip_tags' => true))->convert('<h1>test</h1>');
     *
     * @param array $options Option values keyed by option name
     *
     * @return $this
     */
    public function setOptions(array $options)
    {
        $configuration = $this->getConfig();

        foreach (array_keys($options) as $name) {
            $configuration->setOption($name, $options[$name]);
        }

        return $this;
    }
252
}
253