Completed
Push — master ( 84e485...c5777c )
by Colin
46:01 queued 44:53
created

HtmlConverter::convertChildren()   B

Complexity

Conditions 6
Paths 5

Size

Total Lines 29

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 15
CRAP Score 6

Importance

Changes 0
Metric Value
dl 0
loc 29
ccs 15
cts 15
cp 1
rs 8.8337
c 0
b 0
f 0
cc 6
nc 5
nop 1
crap 6
1
<?php
2
3
namespace League\HTMLToMarkdown;
4
5
use League\HTMLToMarkdown\Converter\PreConverterInterface;
6
7
/**
8
 * Class HtmlConverter
9
 *
10
 * A helper class to convert HTML to Markdown.
11
 *
12
 * @author Colin O'Dell <[email protected]>
13
 * @author Nick Cernis <[email protected]>
14
 *
15
 * @link https://github.com/thephpleague/html-to-markdown/ Latest version on GitHub.
16
 *
17
 * @license http://www.opensource.org/licenses/mit-license.php MIT
18
 */
19
class HtmlConverter implements HtmlConverterInterface
20
{
21
    /**
22
     * @var Environment
23
     */
24
    protected $environment;
25
26
    /**
     * Build a converter around an existing Environment, or around a default
     * Environment created from the built-in option defaults with the given
     * options merged into its configuration.
     *
     * @param Environment|array $options Environment object or configuration options
     */
    public function __construct($options = array())
    {
        if ($options instanceof Environment) {
            // A ready-made environment is used as-is.
            $this->environment = $options;

            return;
        }

        if (!is_array($options)) {
            // Neither an Environment nor an option array: the environment is left unset.
            return;
        }

        $baseline = array(
            // Set to 'atx' to output H1 and H2 headers as # Header1 and ## Header2
            'header_style' => 'setext',
            // Set to false to show warnings when loading malformed HTML
            'suppress_errors' => true,
            // Set to true to strip tags that don't have markdown equivalents.
            // N.B. Strips tags, not their content. Useful to clean MS Word HTML output.
            'strip_tags' => false,
            // Set to true to remove <a> that doesn't have href.
            'strip_placeholder_links' => false,
            // DEPRECATED: Set to '__' if you prefer the underlined style
            'bold_style' => '**',
            // DEPRECATED: Set to '_' if you prefer the underlined style
            'italic_style' => '*',
            // Space-separated list of dom nodes that should be removed. Example: 'meta style script'
            'remove_nodes' => '',
            // Set to true to turn <br> into `\n` instead of `  \n`
            'hard_break' => false,
            // Set the default character for each <li> in a <ul>. Can be '-', '*', or '+'
            'list_item_style' => '-',
            // Set to true to preserve comments, or set to an array of strings to preserve specific comments
            'preserve_comments' => false,
            // Set to true to use simple link syntax if possible. Will always use []() if set to false
            'use_autolinks' => true,
            // Replacement string for pipe characters inside markdown table cells
            'table_pipe_escape' => '\|',
            // Set to 'top' or 'bottom' to show <caption> content before or after table, null to suppress
            'table_caption_side' => 'top',
        );

        $this->environment = Environment::createDefaultEnvironment($baseline);

        // Fold the caller-supplied options into the freshly created configuration.
        $this->environment->getConfig()->merge($options);
    }
57
58
    /**
     * Expose the Environment held by this converter.
     *
     * @return Environment
     */
    public function getEnvironment()
    {
        return $this->environment;
    }
65
66
    /**
     * Shortcut to the configuration object owned by the environment.
     *
     * @return Configuration
     */
    public function getConfig()
    {
        return $this->environment->getConfig();
    }
73
74
    /**
     * Allow the converter instance to be called like a function.
     *
     * Delegates directly to convert().
     *
     * @see HtmlConverter::convert
     *
     * @param string $html
     *
     * @return string The Markdown version of the html
     */
    public function __invoke($html)
    {
        $markdown = $this->convert($html);

        return $markdown;
    }
87
88
    /**
     * Convert
     *
     * Loads the HTML into a DOM document, converts the tree rooted at the
     * <html> element in place via convertChildren(), then serialises the
     * document and cleans the result up with sanitize().
     *
     * @param string $html
     *
     * @throws \InvalidArgumentException When the loaded document has no <html> element
     *
     * @return string The Markdown version of the html
     */
    public function convert($html)
    {
        // Blank or whitespace-only input short-circuits to an empty result
        if (trim($html) === '') {
            return '';
        }

        $document = $this->createDOMDocument($html);

        // Work on the entire DOM tree (including head and body)
        $root = $document->getElementsByTagName('html')->item(0);
        if (!$root) {
            throw new \InvalidArgumentException('Invalid HTML was provided');
        }

        $this->convertChildren(new Element($root));

        // The DOMDocument has been modified in place; serialise it and tidy up the string
        return $this->sanitize($document->saveHTML());
    }
120
121
    /**
     * Load the given HTML into a new UTF-8 DOMDocument.
     *
     * When the 'suppress_errors' option is truthy, libxml's internal error
     * handling is switched on for the duration of the load, the collected
     * errors are cleared, and the previous libxml setting is restored so the
     * change does not leak into the rest of the process.
     *
     * @param string $html
     *
     * @return \DOMDocument
     */
    private function createDOMDocument($html)
    {
        $document = new \DOMDocument();

        // Read the option once so enabling and restoring always happen as a pair
        $suppressErrors = (bool) $this->getConfig()->getOption('suppress_errors');
        $previousSetting = false;

        if ($suppressErrors) {
            // Suppress conversion errors (from http://bit.ly/pCCRSX);
            // libxml_use_internal_errors() returns the previous setting
            $previousSetting = libxml_use_internal_errors(true);
        }

        // Hack to load utf-8 HTML (from http://bit.ly/pVDyCt)
        $document->loadHTML('<?xml encoding="UTF-8">' . $html);
        $document->encoding = 'UTF-8';

        if ($suppressErrors) {
            libxml_clear_errors();
            // Put the global libxml error mode back the way the caller had it
            libxml_use_internal_errors($previousSetting);
        }

        return $document;
    }
145
146
    /**
     * Convert Children
     *
     * Recursive function to drill into the DOM and convert each node into Markdown from the inside out.
     *
     * Finds children of each node and converts those to their Markdown equivalent,
     * starting with the innermost element and working up to the outermost element.
     *
     * @param ElementInterface $element
     */
    private function convertChildren(ElementInterface $element)
    {
        // Don't convert HTML code inside <code> and <pre> blocks to Markdown - that should stay as HTML
        // except if the current node is a code tag, which needs to be converted by the CodeConverter.
        if ($element->isDescendantOf(array('pre', 'code')) && $element->getTagName() !== 'code') {
            return;
        }

        // Give converter a chance to inspect/modify the DOM before children are converted
        $converter = $this->environment->getConverterByTag($element->getTagName());
        if ($converter instanceof PreConverterInterface) {
            $converter->preConvert($element);
        }

        // If the node has children, convert those to Markdown first
        if ($element->hasChildren()) {
            foreach ($element->getChildren() as $child) {
                $this->convertChildren($child);
            }
        }

        // Now that child nodes have been converted, convert the original node
        $markdown = $this->convertToMarkdown($element);

        // convertToMarkdown() is protected and may yield false rather than a string
        // (static analysis flagged this call); setFinalMarkdown() expects a string,
        // so normalise false to an empty string before handing it over.
        if ($markdown === false) {
            $markdown = '';
        }

        // Replace the old node e.g. '<h3>Title</h3>' with its Markdown e.g. '### Title'
        $element->setFinalMarkdown($markdown);
    }
185
186
    /**
     * Convert to Markdown
     *
     * Converts an individual node into a string of its Markdown equivalent.
     *
     * Example: An <h3> node with text content of 'Title' becomes '### Title'
     *
     * Nodes whose tag is listed in the 'remove_nodes' option produce an empty
     * string, so the return value is always a string as documented.
     *
     * @param ElementInterface $element
     *
     * @return string The converted HTML as Markdown
     */
    protected function convertToMarkdown(ElementInterface $element)
    {
        $tag = $element->getTagName();

        // Strip nodes named in remove_nodes (a space-separated list of tag names)
        $tags_to_remove = explode(' ', $this->getConfig()->getOption('remove_nodes'));
        if (in_array($tag, $tags_to_remove)) {
            // Return an empty string (not false) to honour the declared string return type
            return '';
        }

        $converter = $this->environment->getConverterByTag($tag);

        return $converter->convert($element);
    }
211
212
    /**
     * Clean up the serialised document so only the Markdown remains.
     *
     * Decodes HTML entities, removes the doctype declaration, strips the
     * wrapper tags the DOM serialiser adds at the very start and very end of
     * the string, and trims surrounding newlines.
     *
     * @param string $markdown
     *
     * @return string
     */
    protected function sanitize($markdown)
    {
        $markdown = html_entity_decode($markdown, ENT_QUOTES, 'UTF-8');
        $markdown = preg_replace('/<!DOCTYPE [^>]+>/', '', $markdown); // Strip doctype declaration
        $markdown = trim($markdown); // Remove blank spaces at the beginning and end of the html

        /*
         * Removing unwanted tags. Tags should be added to the array in the order they are expected.
         * XML, html and body opening tags should be in that order. Same case with closing tags
         */
        $unwanted = array('<?xml encoding="UTF-8">', '<html>', '</html>', '<body>', '</body>', '<head>', '</head>', '&#xD;');

        foreach ($unwanted as $tag) {
            $length = strlen($tag);

            if (strpos($tag, '/') === false) {
                // Opening tags: only strip when the string starts with the tag
                if (strpos($markdown, $tag) === 0) {
                    $markdown = substr($markdown, $length);
                }
            } elseif (substr($markdown, -$length) === $tag) {
                // Closing tags: compare the actual suffix. Searching with strpos() finds the
                // first occurrence, so a trailing tag was missed whenever the same text also
                // appeared earlier in the output.
                $markdown = substr($markdown, 0, -$length);
            }
        }

        return trim($markdown, "\n\r\0\x0B");
    }
245
246
    /**
     * Pass a series of key-value pairs in an array; each pair is set on the
     * converter's configuration.
     *
     * Returning the converter allows chained use, for example:
     *
     * HtmlConverter::setOptions(['strip_tags' => true])->convert('<h1>test</h1>');
     *
     * (the static-looking call above is the form used through e.g. a Laravel facade).
     *
     * @param array $options Option names mapped to the values to set
     *
     * @return $this
     */
    public function setOptions(array $options)
    {
        $config = $this->getConfig();

        // Apply the options one by one, in the order they were given
        foreach (array_keys($options) as $name) {
            $config->setOption($name, $options[$name]);
        }

        return $this;
    }
264
}
265