Completed
Push — master ( fce42c...ce3e45 )
by Colin
10s
created

src/HtmlConverter.php (1 issue)

Upgrade to new PHP Analysis Engine

These results are based on our legacy PHP analysis; consider migrating to our new PHP analysis engine instead. Learn more

1
<?php
2
3
namespace League\HTMLToMarkdown;
4
5
/**
6
 * Class HtmlConverter
7
 *
8
 * A helper class to convert HTML to Markdown.
9
 *
10
 * @author Colin O'Dell <[email protected]>
11
 * @author Nick Cernis <[email protected]>
12
 *
13
 * @link https://github.com/thephpleague/html-to-markdown/ Latest version on GitHub.
14
 *
15
 * @license http://www.opensource.org/licenses/mit-license.php MIT
16
 */
17
class HtmlConverter
18
{
19
    /**
20
     * @var Environment
21
     */
22
    protected $environment;
23
24
    /**
     * Constructor
     *
     * Creates the default environment from the built-in option defaults and then
     * merges the caller-supplied options into that environment's configuration.
     *
     * @param array $options Configuration options
     */
    public function __construct(array $options = array())
    {
        $this->environment = Environment::createDefaultEnvironment(array(
            // Set to 'atx' to output H1 and H2 headers as # Header1 and ## Header2
            'header_style'    => 'setext',
            // Set to false to show warnings when loading malformed HTML
            'suppress_errors' => true,
            // Set to true to strip tags that don't have markdown equivalents.
            // N.B. Strips tags, not their content. Useful to clean MS Word HTML output.
            'strip_tags'      => false,
            // Set to '__' if you prefer the underlined style
            'bold_style'      => '**',
            // Set to '*' if you prefer the asterisk style
            'italic_style'    => '_',
            // Space-separated list of dom nodes that should be removed. Example: 'meta style script'
            'remove_nodes'    => '',
        ));

        $this->environment->getConfig()->merge($options);
    }
44
45
    /**
     * Exposes the environment this converter was constructed with.
     *
     * @return Environment
     */
    public function getEnvironment()
    {
        return $this->environment;
    }
52
53
    /**
     * Shortcut to the configuration object held by the environment.
     *
     * @return Configuration
     */
    public function getConfig()
    {
        $config = $this->environment->getConfig();

        return $config;
    }
60
61
    /**
     * Convert
     *
     * Allows the converter instance to be called like a function; the call is
     * forwarded unchanged to convert().
     *
     * @see HtmlConverter::convert
     *
     * @param string $html
     *
     * @return string The Markdown version of the html
     */
    public function __invoke($html)
    {
        $markdown = $this->convert($html);

        return $markdown;
    }
74
75
    /**
     * Convert
     *
     * Loads the HTML into a DOM document, converts the tree in place starting at
     * the <html> element, then serialises the document and sanitizes the result.
     *
     * @param string $html
     *
     * @throws \InvalidArgumentException When the loaded document contains no <html> element
     *
     * @return string The Markdown version of the html ('' when the input is blank)
     */
    public function convert($html)
    {
        $isBlank = trim($html) === '';
        if ($isBlank) {
            return '';
        }

        $document = $this->createDOMDocument($html);

        // Work on the entire DOM tree (including head and body)
        $root = $document->getElementsByTagName('html')->item(0);
        if (!$root) {
            throw new \InvalidArgumentException('Invalid HTML was provided');
        }

        $this->convertChildren(new Element($root));

        // The DOMDocument has been modified in place: serialise it, then clean up the string
        return $this->sanitize($document->saveHTML());
    }
107
108
    /**
     * Loads the given HTML into a new UTF-8 DOMDocument.
     *
     * When the 'suppress_errors' option is truthy, libxml's internal error
     * handling is enabled for the duration of the load, the collected errors are
     * discarded, and the setting that was active before the call is restored so
     * the process-wide libxml state is not left altered for the caller.
     *
     * @param string $html
     *
     * @return \DOMDocument
     */
    private function createDOMDocument($html)
    {
        $document = new \DOMDocument();

        // Read the option once so enabling and restoring always stay paired
        $suppressErrors = (bool) $this->getConfig()->getOption('suppress_errors');
        $previousSetting = false;

        if ($suppressErrors) {
            // Suppress conversion errors (from http://bit.ly/pCCRSX);
            // libxml_use_internal_errors() returns the previous setting
            $previousSetting = libxml_use_internal_errors(true);
        }

        // Hack to load utf-8 HTML (from http://bit.ly/pVDyCt)
        $document->loadHTML('<?xml encoding="UTF-8">' . $html);
        $document->encoding = 'UTF-8';

        if ($suppressErrors) {
            libxml_clear_errors();
            // Put the global libxml error mode back the way we found it
            libxml_use_internal_errors($previousSetting);
        }

        return $document;
    }
132
133
    /**
134
     * Convert Children
135
     *
136
     * Recursive function to drill into the DOM and convert each node into Markdown from the inside out.
137
     *
138
     * Finds children of each node and convert those to #text nodes containing their Markdown equivalent,
139
     * starting with the innermost element and working up to the outermost element.
140
     *
141
     * @param ElementInterface $element
142
     */
143 75
    private function convertChildren(ElementInterface $element)
144
    {
145
        // Don't convert HTML code inside <code> and <pre> blocks to Markdown - that should stay as HTML
146
        // except if the current node is a code tag, which needs to be converted by the CodeConverter.
147 75
        if ($element->isDescendantOf(array('pre', 'code')) && $element->getTagName() !== 'code') {
148 15
            return;
149
        }
150
151
        // If the node has children, convert those to Markdown first
152 75
        if ($element->hasChildren()) {
153 75
            foreach ($element->getChildren() as $child) {
154 75
                $this->convertChildren($child);
155 75
            }
156 75
        }
157
158
        // Now that child nodes have been converted, convert the original node
159 75
        $markdown = $this->convertToMarkdown($element);
160
161
        // Create a DOM text node containing the Markdown equivalent of the original node
162
163
        // Replace the old $node e.g. '<h3>Title</h3>' with the new $markdown_node e.g. '### Title'
164 75
        $element->setFinalMarkdown($markdown);
0 ignored issues
show
It seems like $markdown, defined by $this->convertToMarkdown($element) on line 159, can also be of type false; however, League\HTMLToMarkdown\ElementInterface::setFinalMarkdown() only seems to accept string. Did you maybe forget to handle an error condition?

This check looks for type mismatches where the missing type is false. This is usually indicative of an error condition.

Consider the following example:

<?php

/**
 * Illustrative example of the "return false on error" pattern.
 *
 * @param mixed $date Value handed to the DateTime constructor; null is the error case
 *
 * @return DateTime|false A new DateTime for a non-null $date, otherwise false
 */
function getDate($date)
{
    // Any non-null value is passed straight to DateTime
    if ($date !== null) {
        return new DateTime($date);
    }

    // No exception is raised here: the caller must check for false itself
    return false;
}

This function either returns a new DateTime object or false, if there was an error. This is a typical pattern in PHP programming to show that an error has occurred without raising an exception. The calling code should check for this returned false before passing on the value to another function or method that may not be able to handle a false.

Loading history...
165 75
    }
166
167
    /**
     * Convert to Markdown
     *
     * Converts an individual node into a string of its Markdown equivalent.
     *
     * Example: An <h3> node with text content of 'Title' becomes '### Title'
     *
     * Nodes whose tag name is listed in the 'remove_nodes' option produce an
     * empty string, so the return value is always a string as documented.
     *
     * @param ElementInterface $element
     *
     * @return string The converted HTML as Markdown ('' for removed nodes)
     */
    protected function convertToMarkdown(ElementInterface $element)
    {
        $tag = $element->getTagName();

        // Strip nodes named in remove_nodes (space-separated list of tag names)
        $tagsToRemove = explode(' ', $this->getConfig()->getOption('remove_nodes'));
        if (in_array($tag, $tagsToRemove, true)) {
            // Return an empty string rather than false: callers pass this value
            // on to setFinalMarkdown(), which expects a string
            return '';
        }

        $converter = $this->environment->getConverterByTag($tag);

        return $converter->convert($element);
    }
192
193
    /**
     * Cleans up the serialised document so that only the Markdown remains.
     *
     * Decodes HTML entities, strips the doctype declaration, removes the wrapper
     * markup (xml declaration, html/body/head tags) when it leads or trails the
     * string, and trims surrounding line breaks.
     *
     * @param string $markdown
     *
     * @return string
     */
    protected function sanitize($markdown)
    {
        $markdown = html_entity_decode($markdown, ENT_QUOTES, 'UTF-8');
        $markdown = preg_replace('/<!DOCTYPE [^>]+>/', '', $markdown); // Strip doctype declaration
        $markdown = trim($markdown); // Remove blank spaces at the beginning of the html

        /*
         * Removing unwanted tags. Tags should be added to the array in the order they are expected.
         * XML, html and body opening tags should be in that order. Same case with closing tags
         */
        $unwanted = array('<?xml encoding="UTF-8">', '<html>', '</html>', '<body>', '</body>', '<head>', '</head>', '&#xD;');

        foreach ($unwanted as $tag) {
            $tagLength = strlen($tag);

            if (strpos($tag, '/') === false) {
                // Opening tags: only removed when they lead the string
                if (strncmp($markdown, $tag, $tagLength) === 0) {
                    // Cast guards against substr() returning false on older PHP versions
                    $markdown = (string) substr($markdown, $tagLength);
                }
            } elseif (substr($markdown, -$tagLength) === $tag) {
                // Closing tags: compare the actual tail of the string, so an earlier
                // occurrence of the same text in the content cannot hide the trailing tag
                $markdown = (string) substr($markdown, 0, -$tagLength);
            }
        }

        return trim($markdown, "\n\r\0\x0B");
    }
228
}
229