Completed
Push — master ( 0868ae...a5d47a )
by Colin
03:35
created

HtmlConverter::convertChildren()   B

Complexity

Conditions 6
Paths 5

Size

Total Lines 29

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 12
CRAP Score 6

Importance

Changes 0
Metric Value
dl 0
loc 29
c 0
b 0
f 0
ccs 12
cts 12
cp 1
rs 8.8337
cc 6
nc 5
nop 1
crap 6
1
<?php
2
3
namespace League\HTMLToMarkdown;
4
5
use League\HTMLToMarkdown\Converter\PreConverterInterface;
6
7
/**
8
 * Class HtmlConverter
9
 *
10
 * A helper class to convert HTML to Markdown.
11
 *
12
 * @author Colin O'Dell <[email protected]>
13
 * @author Nick Cernis <[email protected]>
14
 *
15
 * @link https://github.com/thephpleague/html-to-markdown/ Latest version on GitHub.
16
 *
17
 * @license http://www.opensource.org/licenses/mit-license.php MIT
18
 */
19
class HtmlConverter implements HtmlConverterInterface
20
{
21
    /**
22
     * @var Environment
23
     */
24
    protected $environment;
25
26
    /**
     * Build a converter around an existing Environment or a set of options.
     *
     * An Environment instance is stored and used as-is. An array causes a
     * default Environment to be created from the built-in defaults listed
     * below, after which the supplied options are merged into its
     * configuration. Any other argument type leaves the environment unset.
     *
     * @param Environment|array $options Environment object or configuration options
     */
    public function __construct($options = array())
    {
        // A ready-made environment needs no further setup
        if ($options instanceof Environment) {
            $this->environment = $options;

            return;
        }

        if (!is_array($options)) {
            return;
        }

        $this->environment = Environment::createDefaultEnvironment(array(
            // Set to 'atx' to output H1 and H2 headers as # Header1 and ## Header2
            'header_style' => 'setext',
            // Set to false to show warnings when loading malformed HTML
            'suppress_errors' => true,
            // Set to true to strip tags that don't have markdown equivalents.
            // N.B. Strips tags, not their content. Useful to clean MS Word HTML output.
            'strip_tags' => false,
            // DEPRECATED: Set to '__' if you prefer the underlined style
            'bold_style' => '**',
            // DEPRECATED: Set to '_' if you prefer the underlined style
            'italic_style' => '*',
            // Space-separated list of dom nodes that should be removed. Example: 'meta style script'
            'remove_nodes' => '',
            // Set to true to turn <br> into `\n` instead of `  \n`
            'hard_break' => false,
            // Set the default character for each <li> in a <ul>. Can be '-', '*', or '+'
            'list_item_style' => '-',
            // Set to true to preserve comments, or set to an array of strings to preserve specific comments
            'preserve_comments' => false,
            // Set to true to use simple link syntax if possible. Will always use []() if set to false
            'use_autolinks' => true,
            // Replacement string for pipe characters inside markdown table cells
            'table_pipe_escape' => '\|',
            // Set to 'top' or 'bottom' to show <caption> content before or after table, null to suppress
            'table_caption_side' => 'top',
        ));

        // Layer the caller's options on top of the defaults
        $this->environment->getConfig()->merge($options);
    }
56
57
    /**
     * Expose the environment this converter operates with.
     *
     * @return Environment
     */
    public function getEnvironment()
    {
        return $this->environment;
    }
64
65
    /**
     * Shortcut to the configuration held by the current environment.
     *
     * @return Configuration
     */
    public function getConfig()
    {
        $environment = $this->environment;

        return $environment->getConfig();
    }
72
73
    /**
     * Let the converter be used as a callable.
     *
     * Forwards the given HTML to convert() and returns its result unchanged.
     *
     * @see HtmlConverter::convert
     *
     * @param string $html
     *
     * @return string The Markdown version of the html
     */
    public function __invoke($html)
    {
        $markdown = $this->convert($html);

        return $markdown;
    }
86
87
    /**
     * Convert an HTML string to Markdown.
     *
     * Input that is empty after trimming yields an empty string. Otherwise the
     * HTML is loaded into a DOMDocument, the tree below the <html> element is
     * converted in place by convertChildren(), and the serialised document is
     * cleaned up by sanitize().
     *
     * @param string $html
     *
     * @throws \InvalidArgumentException When the loaded document has no <html> element
     *
     * @return string The Markdown version of the html
     */
    public function convert($html)
    {
        // Whitespace-only input: nothing to convert
        if ('' === trim($html)) {
            return '';
        }

        $document = $this->createDOMDocument($html);

        // Work on the entire DOM tree (including head and body)
        $root = $document->getElementsByTagName('html')->item(0);
        if (!$root) {
            throw new \InvalidArgumentException('Invalid HTML was provided');
        }

        $this->convertChildren(new Element($root));

        // The now-modified DOMDocument, serialised, is the raw Markdown
        return $this->sanitize($document->saveHTML());
    }
119
120
    /**
     * Load an HTML string into a new DOMDocument as UTF-8.
     *
     * When the 'suppress_errors' option is enabled, libxml's internal error
     * handling is switched on for the duration of the load so that malformed
     * HTML does not emit warnings. The collected errors are then cleared and
     * the previous libxml setting is restored, so the process-wide libxml
     * state is not left altered after this call.
     *
     * @param string $html
     *
     * @return \DOMDocument
     */
    private function createDOMDocument($html)
    {
        $document = new \DOMDocument();

        // Read the option once so enabling and restoring always stay paired
        $suppressErrors = (bool) $this->getConfig()->getOption('suppress_errors');

        $previousSetting = false;
        if ($suppressErrors) {
            // libxml_use_internal_errors() returns the previous value; keep it for restoring
            $previousSetting = libxml_use_internal_errors(true);
        }

        // Hack to load utf-8 HTML: the XML encoding prolog makes libxml read the input as UTF-8
        $document->loadHTML('<?xml encoding="UTF-8">' . $html);
        $document->encoding = 'UTF-8';

        if ($suppressErrors) {
            libxml_clear_errors();
            libxml_use_internal_errors($previousSetting);
        }

        return $document;
    }
144
145
    /**
     * Convert Children
     *
     * Recursive function to drill into the DOM and convert each node into Markdown from the inside out.
     *
     * Children of each node are converted first, so conversion starts with the innermost element
     * and works up to the outermost one. Each element is finally replaced by its Markdown equivalent
     * through ElementInterface::setFinalMarkdown().
     *
     * @param ElementInterface $element
     */
    private function convertChildren(ElementInterface $element)
    {
        // Don't convert HTML code inside <code> and <pre> blocks to Markdown - that should stay as HTML
        // except if the current node is a code tag, which needs to be converted by the CodeConverter.
        if ($element->isDescendantOf(array('pre', 'code')) && $element->getTagName() !== 'code') {
            return;
        }

        // Give converter a chance to inspect/modify the DOM before children are converted
        $converter = $this->environment->getConverterByTag($element->getTagName());
        if ($converter instanceof PreConverterInterface) {
            $converter->preConvert($element);
        }

        // If the node has children, convert those to Markdown first
        if ($element->hasChildren()) {
            foreach ($element->getChildren() as $child) {
                $this->convertChildren($child);
            }
        }

        // Now that child nodes have been converted, convert the original node
        $markdown = $this->convertToMarkdown($element);

        // Static analysis flagged that convertToMarkdown() can return false while
        // setFinalMarkdown() only accepts a string. Normalise that case
        // explicitly instead of relying on implicit coercion.
        if ($markdown === false) {
            $markdown = '';
        }

        // Replace the old node e.g. '<h3>Title</h3>' with its Markdown e.g. '### Title'
        $element->setFinalMarkdown($markdown);
    }
184
185
    /**
     * Convert to Markdown
     *
     * Produces the Markdown equivalent of a single element by delegating to the converter
     * registered for its tag name.
     *
     * Example: An <h3> node with text content of 'Title' becomes '### Title'
     *
     * Elements whose tag is listed in the space-separated 'remove_nodes' option are dropped:
     * an empty string is returned for them, so the method always honours its string return type.
     *
     * @param ElementInterface $element
     *
     * @return string The converted HTML as Markdown
     */
    protected function convertToMarkdown(ElementInterface $element)
    {
        $tag = $element->getTagName();

        // Strip nodes named in remove_nodes
        $tags_to_remove = explode(' ', $this->getConfig()->getOption('remove_nodes'));
        if (in_array($tag, $tags_to_remove, true)) {
            // Empty string rather than false: callers pass the result on as a string
            return '';
        }

        $converter = $this->environment->getConverterByTag($tag);

        return $converter->convert($element);
    }
210
211
    /**
     * Clean up the serialised document so that only the Markdown remains.
     *
     * Decodes HTML entities, removes the doctype declaration, then strips the
     * document wrapper strings that surround the converted content: entries
     * without a slash are removed only from the very start of the text,
     * entries with a slash only from the very end.
     *
     * @param string $markdown
     *
     * @return string
     */
    protected function sanitize($markdown)
    {
        $markdown = html_entity_decode($markdown, ENT_QUOTES, 'UTF-8');
        $markdown = preg_replace('/<!DOCTYPE [^>]+>/', '', $markdown); // Strip doctype declaration
        $markdown = trim($markdown); // Remove blank spaces at the beginning of the html

        /*
         * Removing unwanted tags. Tags should be added to the array in the order they are expected.
         * XML, html and body opening tags should be in that order. Same case with closing tags
         */
        $unwanted = array('<?xml encoding="UTF-8">', '<html>', '</html>', '<body>', '</body>', '<head>', '</head>', '&#xD;');

        foreach ($unwanted as $tag) {
            $tagLength = strlen($tag);

            if (strpos($tag, '/') === false) {
                // Opening tags: strip only when the text starts with the tag
                if (strncmp($markdown, $tag, $tagLength) === 0) {
                    $markdown = substr($markdown, $tagLength);
                }
            } elseif (substr($markdown, -$tagLength) === $tag) {
                // Closing tags: compare the actual suffix, so an earlier occurrence of
                // the same literal inside the content cannot hide the trailing tag
                $markdown = substr($markdown, 0, -$tagLength);
            }
        }

        return trim($markdown, "\n\r\0\x0B");
    }
244
245
    /**
     * Set several configuration options at once.
     *
     * Every key-value pair of the given array is written to the converter's
     * configuration. The converter itself is returned, so calls can be chained:
     *
     * $converter->setOptions(array('strip_tags' => true))->convert('<h1>test</h1>');
     *
     * NOTE(review): the original documentation mentions static-style use (e.g. in
     * Laravel); the method itself is an instance method - presumably this refers to
     * facade-style access, confirm against callers.
     *
     * @param array $options Option names mapped to their new values
     *
     * @return $this
     */
    public function setOptions(array $options)
    {
        $config = $this->getConfig();

        foreach ($options as $name => $value) {
            $config->setOption($name, $value);
        }

        return $this;
    }
263
}
264