Completed
Push — master ( ca9126...817989 )
by Colin
05:00 queued 15s
created

src/HtmlConverter.php (1 issue)

Upgrade to new PHP Analysis Engine

These results are based on our legacy PHP analysis, consider migrating to our new PHP analysis engine instead. Learn more

1
<?php
2
3
namespace League\HTMLToMarkdown;
4
5
/**
6
 * Class HtmlConverter
7
 *
8
 * A helper class to convert HTML to Markdown.
9
 *
10
 * @author Colin O'Dell <[email protected]>
11
 * @author Nick Cernis <[email protected]>
12
 *
13
 * @link https://github.com/thephpleague/html-to-markdown/ Latest version on GitHub.
14
 *
15
 * @license http://www.opensource.org/licenses/mit-license.php MIT
16
 */
17
class HtmlConverter
18
{
19
    /**
20
     * @var Environment
21
     */
22
    protected $environment;
23
24
    /**
25
     * Constructor
26
     *
27
     * @param array $options Configuration options
28
     */
29 78
    public function __construct(array $options = array())
    {
        // Build the default environment from the library defaults first;
        // caller-supplied options are merged into its configuration afterwards.
        $environment = Environment::createDefaultEnvironment(array(
            // 'setext' by default; use 'atx' to output H1 and H2 headers as # Header1 and ## Header2
            'header_style'    => 'setext',
            // Set to false to show warnings when loading malformed HTML
            'suppress_errors' => true,
            // Set to true to strip tags that have no Markdown equivalent (tags only, not their
            // content). Useful for cleaning MS Word HTML output.
            'strip_tags'      => false,
            // Use '__' for the underlined bold style
            'bold_style'      => '**',
            // Use '*' for the asterisk italic style
            'italic_style'    => '_',
            // Space-separated list of DOM nodes to remove, e.g. 'meta style script'
            'remove_nodes'    => '',
            // Set to true to turn <br> into `\n` instead of `  \n`
            'hard_break'      => false,
        ));

        $environment->getConfig()->merge($options);

        $this->environment = $environment;
    }
45
46
    /**
47
     * @return Environment
48
     */
49
    public function getEnvironment()
    {
        // Expose the environment created in the constructor.
        $environment = $this->environment;

        return $environment;
    }
53
54
    /**
55
     * @return Configuration
56
     */
57 75
    public function getConfig()
    {
        // The configuration is owned by the environment; this is a convenience accessor.
        $config = $this->environment->getConfig();

        return $config;
    }
61
62
    /**
63
     * Convert
64
     *
65
     * @see HtmlConverter::convert
66
     *
67
     * @param string $html
68
     *
69
     * @return string The Markdown version of the html
70
     */
71 3
    public function __invoke($html)
    {
        // Allows the converter object to be used as a callable; simply forwards to convert().
        $markdown = $this->convert($html);

        return $markdown;
    }
75
76
    /**
77
     * Convert
78
     *
79
     * Loads HTML and passes to getMarkdown()
80
     *
81
     * @param $html
82
     *
83
     * @return string The Markdown version of the html
84
     */
85 78
    public function convert($html)
    {
        // Whitespace-only input has nothing to convert.
        if (trim($html) === '') {
            return '';
        }

        $document = $this->createDOMDocument($html);

        // Work on the entire DOM tree (including head and body), starting at <html>.
        $root = $document->getElementsByTagName('html')->item(0);
        if (!$root) {
            throw new \InvalidArgumentException('Invalid HTML was provided');
        }

        // Converts the tree in place: nodes are replaced by their Markdown text.
        $this->convertChildren(new Element($root));

        // Serialise the now-modified document and clean up the leftover wrapper markup.
        return $this->sanitize($document->saveHTML());
    }
108
109
    /**
110
     * @param string $html
111
     *
112
     * @return \DOMDocument
113
     */
114 75
    private function createDOMDocument($html)
    {
        $document = new \DOMDocument();

        // Read the option once so the setup and teardown below cannot disagree.
        $suppressErrors = (bool) $this->getConfig()->getOption('suppress_errors');
        $previousSetting = false;

        if ($suppressErrors) {
            // Suppress conversion errors (from http://bit.ly/pCCRSX).
            // libxml_use_internal_errors() returns the previous value; keep it so the
            // process-wide setting can be restored once the document has been loaded.
            $previousSetting = libxml_use_internal_errors(true);
        }

        // Hack to load utf-8 HTML (from http://bit.ly/pVDyCt)
        $document->loadHTML('<?xml encoding="UTF-8">' . $html);
        $document->encoding = 'UTF-8';

        if ($suppressErrors) {
            // Discard the errors collected while parsing, then hand libxml's error
            // handling back in the state the caller had it.
            libxml_clear_errors();
            libxml_use_internal_errors($previousSetting);
        }

        return $document;
    }
133
134
    /**
135
     * Convert Children
136
     *
137
     * Recursive function to drill into the DOM and convert each node into Markdown from the inside out.
138
     *
139
     * Finds children of each node and convert those to #text nodes containing their Markdown equivalent,
140
     * starting with the innermost element and working up to the outermost element.
141
     *
142
     * @param ElementInterface $element
143
     */
144 75
    private function convertChildren(ElementInterface $element)
145
    {
146
        // Don't convert HTML code inside <code> and <pre> blocks to Markdown - that should stay as HTML
147
        // except if the current node is a code tag, which needs to be converted by the CodeConverter.
148 75
        if ($element->isDescendantOf(array('pre', 'code')) && $element->getTagName() !== 'code') {
149 15
            return;
150
        }
151
152
        // If the node has children, convert those to Markdown first
153 75
        if ($element->hasChildren()) {
154 75
            foreach ($element->getChildren() as $child) {
155 75
                $this->convertChildren($child);
156 75
            }
157 75
        }
158
159
        // Now that child nodes have been converted, convert the original node
160 75
        $markdown = $this->convertToMarkdown($element);
161
162
        // Create a DOM text node containing the Markdown equivalent of the original node
163
164
        // Replace the old $node e.g. '<h3>Title</h3>' with the new $markdown_node e.g. '### Title'
165 75
        $element->setFinalMarkdown($markdown);
0 ignored issues
show
It seems like $markdown defined by $this->convertToMarkdown($element) on line 160 can also be of type false; however, League\HTMLToMarkdown\El...ace::setFinalMarkdown() does only seem to accept string, did you maybe forget to handle an error condition?

This check looks for type mismatches where the missing type is false. This is usually indicative of an error condition.

Consider the following example:

<?php

function getDate($date)
{
    if ($date !== null) {
        return new DateTime($date);
    }

    return false;
}

This function either returns a new DateTime object or false, if there was an error. This is a typical pattern in PHP programming to show that an error has occurred without raising an exception. The calling code should check for this returned false before passing on the value to another function or method that may not be able to handle a false.

Loading history...
166 75
    }
167
168
    /**
169
     * Convert to Markdown
170
     *
171
     * Converts an individual node into a #text node containing a string of its Markdown equivalent.
172
     *
173
     * Example: An <h3> node with text content of 'Title' becomes a text node with content of '### Title'
174
     *
175
     * @param ElementInterface $element
176
     *
177
     * @return string The converted HTML as Markdown
178
     */
179 75
    protected function convertToMarkdown(ElementInterface $element)
    {
        $tag = $element->getTagName();

        // Strip nodes named in remove_nodes (a space-separated list of tag names)
        $tags_to_remove = explode(' ', $this->getConfig()->getOption('remove_nodes'));
        if (in_array($tag, $tags_to_remove, true)) {
            // A removed node contributes no Markdown. Return an empty string rather than
            // false so the documented string return type holds for every code path.
            return '';
        }

        // Delegate to whichever converter the environment has registered for this tag.
        $converter = $this->environment->getConverterByTag($tag);

        return $converter->convert($element);
    }
193
194
    /**
195
     * @param string $markdown
196
     *
197
     * @return string
198
     */
199 75
    protected function sanitize($markdown)
    {
        $markdown = html_entity_decode($markdown, ENT_QUOTES, 'UTF-8');
        $markdown = preg_replace('/<!DOCTYPE [^>]+>/', '', $markdown); // Strip doctype declaration
        $markdown = trim($markdown); // Remove blank spaces at the beginning of the html

        /*
         * Removing unwanted tags. Tags should be added to the array in the order they are expected.
         * XML, html and body opening tags should be in that order. Same case with closing tags
         */
        $unwanted = array('<?xml encoding="UTF-8">', '<html>', '</html>', '<body>', '</body>', '<head>', '</head>', '&#xD;');

        foreach ($unwanted as $tag) {
            $tagLength = strlen($tag);

            if (strpos($tag, '/') === false) {
                // Opening tags: only strip when the tag is the very first thing in the string
                if (strpos($markdown, $tag) === 0) {
                    $markdown = substr($markdown, $tagLength);
                }
            } else {
                // Closing tags: only strip when the string ends with the tag. Compare the
                // tail directly instead of using strpos(), which reports the FIRST occurrence
                // and therefore misses the trailing tag when the same text also appears
                // earlier in the converted content.
                if (substr($markdown, -$tagLength) === $tag) {
                    $markdown = substr($markdown, 0, -$tagLength);
                }
            }
        }

        // Trim newlines and control characters only; spaces and tabs are kept.
        $markdown = trim($markdown, "\n\r\0\x0B");

        return $markdown;
    }
229
}
230