Completed
Push — master ( a1ddbf...2742fe )
by Colin
05:07
created

HtmlConverter::convert()   A

Complexity

Conditions 3
Paths 3

Size

Total Lines 21

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 9
CRAP Score 3.009

Importance

Changes 0
Metric Value
dl 0
loc 21
ccs 9
cts 10
cp 0.9
rs 9.584
c 0
b 0
f 0
cc 3
nc 3
nop 1
crap 3.009
1
<?php
2
3
namespace League\HTMLToMarkdown;
4
5
/**
6
 * Class HtmlConverter
7
 *
8
 * A helper class to convert HTML to Markdown.
9
 *
10
 * @author Colin O'Dell <[email protected]>
11
 * @author Nick Cernis <[email protected]>
12
 *
13
 * @link https://github.com/thephpleague/html-to-markdown/ Latest version on GitHub.
14
 *
15
 * @license http://www.opensource.org/licenses/mit-license.php MIT
16
 */
17
class HtmlConverter implements HtmlConverterInterface
18
{
19
    /**
20
     * @var Environment
21
     */
22
    protected $environment;
23
24
    /**
     * Builds the converter around an Environment.
     *
     * An Environment instance is stored as-is. An array is treated as option overrides: a default
     * Environment is created from the built-in defaults below and the given options are then
     * merged into its configuration. Any other value leaves the environment unset.
     *
     * @param Environment|array $options Environment object or configuration options
     */
    public function __construct($options = array())
    {
        // A ready-made Environment needs no further setup
        if ($options instanceof Environment) {
            $this->environment = $options;

            return;
        }

        if (!is_array($options)) {
            return;
        }

        $defaults = array(
            // 'setext' by default; use 'atx' to output H1 and H2 headers as # Header1 and ## Header2
            'header_style' => 'setext',
            // Set to false to show warnings when loading malformed HTML
            'suppress_errors' => true,
            // Set to true to strip tags that have no markdown equivalent (tags only, not their content).
            // Useful to clean MS Word HTML output.
            'strip_tags' => false,
            // DEPRECATED: set to '__' if you prefer the underlined style
            'bold_style' => '**',
            // DEPRECATED: set to '_' if you prefer the underlined style
            'italic_style' => '*',
            // Space-separated list of DOM nodes that should be removed, e.g. 'meta style script'
            'remove_nodes' => '',
            // Set to true to turn <br> into `\n` instead of `  \n`
            'hard_break' => false,
            // Default character for each <li> in a <ul>. Can be '-', '*', or '+'
            'list_item_style' => '-',
            // Set to true to preserve comments, or to an array of strings to preserve specific comments
            'preserve_comments' => false,
            // Set to true to use simple link syntax if possible. Will always use []() if set to false
            'use_autolinks' => true,
        );

        $environment = Environment::createDefaultEnvironment($defaults);
        $this->environment = $environment;

        // Caller-supplied options are merged in after the defaults
        $environment->getConfig()->merge($options);
    }
52
53
    /**
     * Returns the Environment held by this converter.
     *
     * @return Environment
     */
    public function getEnvironment()
    {
        return $this->environment;
    }
60
61
    /**
     * Returns the configuration object of the converter's Environment.
     *
     * @return Configuration
     */
    public function getConfig()
    {
        $environment = $this->environment;

        return $environment->getConfig();
    }
68
69
    /**
     * Lets the converter be called like a function; delegates to convert().
     *
     * @see HtmlConverter::convert
     *
     * @param string $html
     *
     * @return string The Markdown version of the html
     */
    public function __invoke($html)
    {
        $markdown = $this->convert($html);

        return $markdown;
    }
82
83
    /**
     * Convert
     *
     * Loads the HTML into a DOM document, converts the tree in place starting at the <html>
     * element, then serializes the document and cleans the result up with sanitize().
     *
     * @param string $html
     *
     * @throws \InvalidArgumentException When the loaded document has no <html> element
     *
     * @return string The Markdown version of the html
     */
    public function convert($html)
    {
        // Whitespace-only input yields an empty result without touching the DOM
        if (trim($html) === '') {
            return '';
        }

        $document = $this->createDOMDocument($html);

        // Work on the entire DOM tree (including head and body)
        $root = $document->getElementsByTagName('html')->item(0);
        if (!$root) {
            throw new \InvalidArgumentException('Invalid HTML was provided');
        }

        $this->convertChildren(new Element($root));

        // The document has been modified in place; serialize it and tidy the output
        return $this->sanitize($document->saveHTML());
    }
115
116
    /**
     * Parses an HTML string into a UTF-8 DOMDocument.
     *
     * When the 'suppress_errors' option is enabled, libxml's internal error handling is switched on
     * for the duration of the load so malformed HTML does not raise warnings. The collected errors
     * are cleared afterwards and the previous libxml setting is restored, so the process-wide flag
     * does not leak out of this method.
     *
     * @param string $html
     *
     * @return \DOMDocument
     */
    private function createDOMDocument($html)
    {
        $document = new \DOMDocument();

        $suppressErrors = (bool) $this->getConfig()->getOption('suppress_errors');
        $previousSetting = false;

        if ($suppressErrors) {
            // libxml_use_internal_errors() returns the previous value; keep it so it can be restored
            $previousSetting = libxml_use_internal_errors(true);
        }

        // Hack to load utf-8 HTML (from http://bit.ly/pVDyCt)
        $document->loadHTML('<?xml encoding="UTF-8">' . $html);
        $document->encoding = 'UTF-8';

        if ($suppressErrors) {
            libxml_clear_errors();
            // Restore the caller's setting instead of leaving error suppression enabled globally
            libxml_use_internal_errors($previousSetting);
        }

        return $document;
    }
140
141
    /**
     * Convert Children
     *
     * Recursive function to drill into the DOM and convert each node into Markdown from the inside out.
     *
     * Children are converted first, so the innermost elements are replaced by their Markdown before
     * the elements that contain them are converted.
     *
     * @param ElementInterface $element
     */
    private function convertChildren(ElementInterface $element)
    {
        // Don't convert HTML code inside <code> and <pre> blocks to Markdown - that should stay as HTML
        // except if the current node is a code tag, which needs to be converted by the CodeConverter.
        if ($element->isDescendantOf(array('pre', 'code')) && $element->getTagName() !== 'code') {
            return;
        }

        // If the node has children, convert those to Markdown first
        if ($element->hasChildren()) {
            foreach ($element->getChildren() as $child) {
                $this->convertChildren($child);
            }
        }

        // Now that child nodes have been converted, convert the original node
        $markdown = $this->convertToMarkdown($element);

        // convertToMarkdown() (or an override of it) may signal a removed node with false, while
        // setFinalMarkdown() is only declared to accept a string; normalize to an empty string.
        if ($markdown === false) {
            $markdown = '';
        }

        // Replace the old node e.g. '<h3>Title</h3>' with its Markdown e.g. '### Title'
        $element->setFinalMarkdown($markdown);
    }
174
175
    /**
     * Convert to Markdown
     *
     * Produces the Markdown for a single element by looking up the converter registered for its
     * tag in the environment. Elements whose tag is listed in the 'remove_nodes' option yield an
     * empty string.
     *
     * Example: An <h3> node with text content of 'Title' becomes '### Title'
     *
     * @param ElementInterface $element
     *
     * @return string The converted HTML as Markdown
     */
    protected function convertToMarkdown(ElementInterface $element)
    {
        $tag = $element->getTagName();

        // Strip nodes named in remove_nodes (a space-separated list of tag names)
        $tagsToRemove = explode(' ', $this->getConfig()->getOption('remove_nodes'));
        if (in_array($tag, $tagsToRemove, true)) {
            // Return an empty string, not false, to honour the documented string return type
            return '';
        }

        return $this->environment->getConverterByTag($tag)->convert($element);
    }
200
201
    /**
     * Cleans up the serialized document so only the Markdown remains.
     *
     * Decodes HTML entities, removes the doctype declaration, and peels the wrapper markup
     * (XML encoding hint, <html>, <body>, <head> and their closing tags) off the start and end of
     * the string. Opening entries are only removed from the very beginning and closing entries only
     * from the very end, so identical text inside the content is left alone.
     *
     * @param string $markdown
     *
     * @return string
     */
    protected function sanitize($markdown)
    {
        $markdown = html_entity_decode($markdown, ENT_QUOTES, 'UTF-8');
        $markdown = preg_replace('/<!DOCTYPE [^>]+>/', '', $markdown); // Strip doctype declaration
        $markdown = trim($markdown); // Remove blank spaces at the beginning and end of the html

        /*
         * Removing unwanted tags. Tags should be added to the array in the order they are expected.
         * XML, html and body opening tags should be in that order. Same case with closing tags
         */
        $unwanted = array('<?xml encoding="UTF-8">', '<html>', '</html>', '<body>', '</body>', '<head>', '</head>', '&#xD;');

        foreach ($unwanted as $tag) {
            $tagLength = strlen($tag);

            if (strpos($tag, '/') === false) {
                // Opening tags: strip only when the string starts with the tag
                if (strncmp($markdown, $tag, $tagLength) === 0) {
                    $markdown = substr($markdown, $tagLength);
                }
            } elseif (substr($markdown, -$tagLength) === $tag) {
                // Closing tags: compare the actual suffix, so an earlier occurrence of the same
                // text inside the content does not prevent the trailing tag from being stripped
                $markdown = substr($markdown, 0, -$tagLength);
            }
        }

        // Spaces and tabs are deliberately not in the trim list; only line breaks, NUL and vertical tab are removed
        return trim($markdown, "\n\r\0\x0B");
    }
234
235
    /**
     * Applies several configuration options in one call.
     *
     * Each key/value pair in $options is forwarded to the configuration's setOption(). The
     * converter itself is returned so calls can be chained, which is convenient for fluent or
     * facade-style use (e.g. in Laravel). An example being:
     *
     * $converter->setOptions(array('strip_tags' => true))->convert('<h1>test</h1>');
     *
     * @param array $options Option names mapped to their new values
     *
     * @return $this
     */
    public function setOptions(array $options)
    {
        $config = $this->getConfig();

        foreach (array_keys($options) as $name) {
            $config->setOption($name, $options[$name]);
        }

        return $this;
    }
253
}
254