Completed
Push — master ( b2b071...32626c )
by Colin
02:36
created

HtmlConverter::getEnvironment()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 2

Importance

Changes 1
Bugs 1 Features 1
Metric Value
dl 0
loc 4
ccs 0
cts 2
cp 0
rs 10
c 1
b 1
f 1
cc 1
eloc 2
nc 1
nop 0
crap 2
1
<?php
2
3
namespace League\HTMLToMarkdown;
4
5
/**
6
 * Class HtmlConverter
7
 *
8
 * A helper class to convert HTML to Markdown.
9
 *
10
 * @author Colin O'Dell <[email protected]>
11
 * @author Nick Cernis <[email protected]>
12
 *
13
 * @link https://github.com/thephpleague/html-to-markdown/ Latest version on GitHub.
14
 *
15
 * @license http://www.opensource.org/licenses/mit-license.php MIT
16
 */
17
class HtmlConverter
18
{
19
    /**
20
     * Conversion environment used by this converter.
     *
     * Assigned in the constructor from Environment::createDefaultEnvironment();
     * the rest of the class reads the Configuration from it (getConfig()) and
     * looks up per-tag converters on it (getConverterByTag()).
     *
     * @var Environment
21
     */
22
    protected $environment;
23
24
    /**
     * Build a converter backed by a default Environment.
     *
     * The built-in option defaults are handed to Environment::createDefaultEnvironment(),
     * after which the caller-supplied $options are merged into that environment's configuration.
     *
     * @param array $options Configuration options merged into the defaults
     */
    public function __construct(array $options = array())
    {
        $this->environment = Environment::createDefaultEnvironment(array(
            // Set to 'atx' to output H1 and H2 headers as # Header1 and ## Header2
            'header_style'    => 'setext',
            // Set to false to show warnings when loading malformed HTML
            'suppress_errors' => true,
            // Set to true to strip tags that don't have markdown equivalents.
            // N.B. Strips tags, not their content. Useful to clean MS Word HTML output.
            'strip_tags'      => false,
            // Set to '__' if you prefer the underlined style
            'bold_style'      => '**',
            // Set to '*' if you prefer the asterisk style
            'italic_style'    => '_',
            // Space-separated list of DOM nodes that should be removed, e.g. 'meta style script'
            'remove_nodes'    => '',
        ));

        $configuration = $this->environment->getConfig();
        $configuration->merge($options);
    }
44
45
    /**
     * Expose the Environment this converter was constructed with.
     *
     * @return Environment The same instance held in $this->environment
     */
    public function getEnvironment()
    {
        return $this->environment;
    }
52
53
    /**
     * Shortcut to the configuration held by the environment.
     *
     * @return Configuration Whatever the environment's getConfig() returns
     */
    public function getConfig()
    {
        $environment = $this->environment;

        return $environment->getConfig();
    }
60
61
    /**
     * Allow the converter object to be called like a function.
     *
     * Simply forwards to convert() and returns its result.
     *
     * @see HtmlConverter::convert
     *
     * @param string $html
     *
     * @return string The Markdown version of the html
     */
    public function __invoke($html)
    {
        $markdown = $this->convert($html);

        return $markdown;
    }
74
75
    /**
     * Convert an HTML string to Markdown.
     *
     * Parses the HTML into a DOMDocument, converts the tree rooted at the <html>
     * element in place, then serialises the document and cleans up the result.
     * Input that is empty after trimming yields an empty string without any parsing.
     *
     * @param string $html
     *
     * @throws \InvalidArgumentException When the parsed document has no <html> element
     *
     * @return string The Markdown version of the html
     */
    public function convert($html)
    {
        // Whitespace-only input: nothing to convert
        if ('' === trim($html)) {
            return '';
        }

        $document = $this->createDOMDocument($html);

        // Work on the entire DOM tree (including head and body)
        $root = $document->getElementsByTagName('html')->item(0);
        if (!$root) {
            throw new \InvalidArgumentException('Invalid HTML was provided');
        }

        $this->convertChildren(new Element($root));

        // Serialise the now-modified DOMDocument and strip the leftover wrapper markup
        return $this->sanitize($document->saveHTML());
    }
107
108
    /**
     * Parse an HTML string into a DOMDocument flagged as UTF-8.
     *
     * When the 'suppress_errors' option is truthy, libxml's internal error
     * handling is enabled for the duration of the load so malformed HTML does
     * not raise warnings; the collected errors are then cleared and the
     * previous libxml setting is restored, so the change does not leak into
     * the rest of the process.
     *
     * @param string $html
     *
     * @return \DOMDocument
     */
    private function createDOMDocument($html)
    {
        $document = new \DOMDocument();

        // Read the option once so enabling and restoring always agree
        $suppressErrors = (bool) $this->getConfig()->getOption('suppress_errors');

        $previousSetting = false;
        if ($suppressErrors) {
            // Suppress conversion errors (from http://bit.ly/pCCRSX);
            // libxml_use_internal_errors() returns the previous value, kept for restoring below
            $previousSetting = libxml_use_internal_errors(true);
        }

        // Hack to load utf-8 HTML (from http://bit.ly/pVDyCt)
        $document->loadHTML('<?xml encoding="UTF-8">' . $html);
        $document->encoding = 'UTF-8';

        if ($suppressErrors) {
            libxml_clear_errors();
            // Put the global libxml error mode back the way the caller had it
            libxml_use_internal_errors($previousSetting);
        }

        return $document;
    }
132
133
    /**
     * Convert Children
     *
     * Recursive function to drill into the DOM and convert each node into Markdown from the inside out.
     *
     * Finds children of each node and converts those to their Markdown equivalent first,
     * starting with the innermost element and working up to the outermost element.
     * Elements that are descendants of <pre> or <code> are left untouched.
     *
     * @param ElementInterface $element
     */
    private function convertChildren(ElementInterface $element)
    {
        // Don't convert HTML code inside <code> and <pre> blocks to Markdown - that should stay as HTML
        if ($element->isDescendantOf(array('pre', 'code'))) {
            return;
        }

        // If the node has children, convert those to Markdown first
        if ($element->hasChildren()) {
            foreach ($element->getChildren() as $child) {
                $this->convertChildren($child);
            }
        }

        // Now that child nodes have been converted, convert the original node
        $markdown = $this->convertToMarkdown($element);

        // Replace the old node e.g. '<h3>Title</h3>' with its Markdown e.g. '### Title'.
        // Static analysis flagged that the value passed here may be false rather than a
        // string; cast explicitly so setFinalMarkdown() always receives a string
        // (false becomes '').
        $element->setFinalMarkdown((string) $markdown);
    }
165
166
    /**
     * Convert to Markdown
     *
     * Converts an individual node into a string of its Markdown equivalent.
     *
     * Example: An <h3> node with text content of 'Title' becomes '### Title'
     *
     * Tags listed in the space-separated 'remove_nodes' option are dropped: for
     * those an empty string is returned and no converter is consulted.
     *
     * @param ElementInterface $element
     *
     * @return string The converted HTML as Markdown; '' for tags named in remove_nodes
     */
    protected function convertToMarkdown(ElementInterface $element)
    {
        $tag = $element->getTagName();

        // Strip nodes named in remove_nodes
        $tags_to_remove = explode(' ', $this->getConfig()->getOption('remove_nodes'));
        if (in_array($tag, $tags_to_remove, true)) {
            // Return an empty string (not false) so the documented string return type holds
            return '';
        }

        $converter = $this->environment->getConverterByTag($tag);

        return $converter->convert($element);
    }
191
192
    /**
     * Clean up the serialised document so only the Markdown text remains.
     *
     * Decodes HTML entities (twice), removes the doctype declaration, removes the
     * html/head/body wrapper tags, the XML encoding marker and '&#xD;', and finally
     * trims newlines, carriage returns, NUL bytes and vertical tabs from both ends
     * (spaces and tabs are left in place).
     *
     * @param string $markdown
     *
     * @return string
     */
    protected function sanitize($markdown)
    {
        // Two passes cover doubly-encoded entities like &amp;nbsp;
        // (http://www.php.net/manual/en/function.htmlentities.php#99984)
        for ($pass = 0; $pass < 2; $pass++) {
            $markdown = html_entity_decode($markdown, ENT_QUOTES, 'UTF-8');
        }

        // Strip doctype declaration
        $withoutDoctype = preg_replace('/<!DOCTYPE [^>]+>/', '', $markdown);

        // Strip unwanted tags
        $unwanted = array('<html>', '</html>', '<body>', '</body>', '<head>', '</head>', '<?xml encoding="UTF-8">', '&#xD;');
        $stripped = str_replace($unwanted, '', $withoutDoctype);

        return trim($stripped, "\n\r\0\x0B");
    }
208
}
209