Completed
Push — master ( 51616b...1810d3 )
by Colin
09:28 queued 06:40
created

HtmlConverter   A

Complexity

Total Complexity 24

Size/Duplication

Total Lines 215
Duplicated Lines 0 %

Coupling/Cohesion

Components 1
Dependencies 5

Test Coverage

Coverage 96.1%

Importance

Changes 2
Bugs 0 Features 0
Metric Value
wmc 24
c 2
b 0
f 0
lcom 1
cbo 5
dl 0
loc 215
ccs 74
cts 77
cp 0.961
rs 10

9 Methods

Rating   Name   Duplication   Size   Complexity  
A __construct() 0 20 3
A getEnvironment() 0 4 1
A getConfig() 0 4 1
A __invoke() 0 4 1
A createDOMDocument() 0 19 3
B convertChildren() 0 23 5
A convertToMarkdown() 0 14 2
A convert() 0 21 3
B sanitize() 0 28 5
1
<?php
2
3
namespace League\HTMLToMarkdown;
4
5
/**
6
 * Class HtmlConverter
7
 *
8
 * A helper class to convert HTML to Markdown.
9
 *
10
 * @author Colin O'Dell <[email protected]>
11
 * @author Nick Cernis <[email protected]>
12
 *
13
 * @link https://github.com/thephpleague/html-to-markdown/ Latest version on GitHub.
14
 *
15
 * @license http://www.opensource.org/licenses/mit-license.php MIT
16
 */
17
class HtmlConverter
18
{
19
    /**
20
     * @var Environment
21
     */
22
    protected $environment;
23
24
    /**
     * Build a converter around an existing environment or a set of options.
     *
     * An Environment instance is used as-is. An array is merged over the built-in
     * defaults listed below and applied to a freshly created default environment.
     * Any other value leaves the environment unset.
     *
     * @param Environment|array $options Environment object or configuration options
     */
    public function __construct($options = array())
    {
        if ($options instanceof Environment) {
            $this->environment = $options;

            return;
        }

        if (!is_array($options)) {
            return;
        }

        $defaults = array(
            'header_style' => 'setext', // Set to 'atx' to output H1 and H2 headers as # Header1 and ## Header2
            'suppress_errors' => true, // Set to false to show warnings when loading malformed HTML
            'strip_tags' => false, // Set to true to strip tags that don't have markdown equivalents. N.B. Strips tags, not their content. Useful to clean MS Word HTML output.
            'bold_style' => '**', // Set to '__' if you prefer the underlined style
            'italic_style' => '_', // Set to '*' if you prefer the asterisk style
            'remove_nodes' => '', // Space-separated list of DOM nodes that should be removed. Example: 'meta style script'
            'hard_break' => false, // Set to true to turn <br> into `\n` instead of `  \n`
        );

        // Start from the defaults, then let the caller's options override them.
        $this->environment = Environment::createDefaultEnvironment($defaults);
        $this->environment->getConfig()->merge($options);
    }
49
50
    /**
     * Expose the environment this converter works with.
     *
     * @return Environment The environment stored by the constructor
     */
    public function getEnvironment()
    {
        return $this->environment;
    }
57
58
    /**
     * Shortcut to the configuration held by the environment.
     *
     * @return Configuration
     */
    public function getConfig()
    {
        return $this->environment->getConfig();
    }
65
66
    /**
     * Allow the converter to be called like a function.
     *
     * Simply forwards to convert().
     *
     * @see HtmlConverter::convert
     *
     * @param string $html
     *
     * @return string The Markdown version of the html
     */
    public function __invoke($html)
    {
        return $this->convert($html);
    }
79
80
    /**
     * Convert
     *
     * Loads the HTML into a DOM document, converts the tree under the <html>
     * element to Markdown in place, then serialises and sanitizes the result.
     * Input that is empty after trimming yields an empty string.
     *
     * @param string $html
     *
     * @throws \InvalidArgumentException When the loaded document has no <html> element
     *
     * @return string The Markdown version of the html
     */
    public function convert($html)
    {
        if (trim($html) === '') {
            return '';
        }

        $document = $this->createDOMDocument($html);

        // Work on the entire DOM tree (including head and body)
        $root = $document->getElementsByTagName('html')->item(0);
        if (!$root) {
            throw new \InvalidArgumentException('Invalid HTML was provided');
        }

        $this->convertChildren(new Element($root));

        // The DOMDocument has been modified in place; serialise it and clean up the leftovers
        return $this->sanitize($document->saveHTML());
    }
112
113
    /**
     * Load an HTML string into a new UTF-8 DOMDocument.
     *
     * When the 'suppress_errors' option is truthy, libxml's internal error handling
     * is enabled for the duration of the load, the collected errors are discarded,
     * and the previous libxml setting is restored afterwards so the converter does
     * not change process-wide error handling.
     *
     * @param string $html
     *
     * @return \DOMDocument
     */
    private function createDOMDocument($html)
    {
        $document = new \DOMDocument();
        $suppressErrors = (bool) $this->getConfig()->getOption('suppress_errors');
        $previousSetting = false;

        if ($suppressErrors) {
            // Suppress conversion errors (from http://bit.ly/pCCRSX); the call returns the prior setting
            $previousSetting = libxml_use_internal_errors(true);
        }

        // Hack to load utf-8 HTML (from http://bit.ly/pVDyCt)
        $document->loadHTML('<?xml encoding="UTF-8">' . $html);
        $document->encoding = 'UTF-8';

        if ($suppressErrors) {
            libxml_clear_errors();
            // Hand libxml back in the state the caller left it in
            libxml_use_internal_errors($previousSetting);
        }

        return $document;
    }
137
138
    /**
     * Convert Children
     *
     * Recursive function to drill into the DOM and convert each node into Markdown from the inside out.
     *
     * Finds the children of each node and converts those to #text nodes containing their Markdown
     * equivalent, starting with the innermost element and working up to the outermost element.
     *
     * @param ElementInterface $element
     */
    private function convertChildren(ElementInterface $element)
    {
        // Don't convert HTML code inside <code> and <pre> blocks to Markdown - that should stay as HTML,
        // except if the current node is a code tag, which needs to be converted by the CodeConverter.
        if ($element->isDescendantOf(array('pre', 'code')) && $element->getTagName() !== 'code') {
            return;
        }

        // If the node has children, convert those to Markdown first
        if ($element->hasChildren()) {
            foreach ($element->getChildren() as $child) {
                $this->convertChildren($child);
            }
        }

        // Now that child nodes have been converted, convert the original node
        $markdown = $this->convertToMarkdown($element);

        // NOTE(review): static analysis flagged that convertToMarkdown() can yield false (for tags
        // listed in 'remove_nodes') while setFinalMarkdown() only seems to accept a string.
        // Normalise to an empty string so a removed node is replaced by empty text.
        // Presumably setFinalMarkdown() treated false as '' already - confirm against Element.
        if ($markdown === false) {
            $markdown = '';
        }

        // Replace the old node, e.g. '<h3>Title</h3>', with its Markdown equivalent, e.g. '### Title'
        $element->setFinalMarkdown($markdown);
    }
171
172
    /**
     * Convert to Markdown
     *
     * Produces the Markdown string for a single node by delegating to the converter
     * the environment has registered for the node's tag name.
     *
     * Tags listed in the space-separated 'remove_nodes' option are dropped: an empty
     * string is returned for them, in line with the documented string return type.
     *
     * Example: An <h3> node with text content of 'Title' yields '### Title'
     *
     * @param ElementInterface $element
     *
     * @return string The converted HTML as Markdown
     */
    protected function convertToMarkdown(ElementInterface $element)
    {
        $tag = $element->getTagName();

        // Strip nodes named in remove_nodes
        $tags_to_remove = explode(' ', $this->getConfig()->getOption('remove_nodes'));
        if (in_array($tag, $tags_to_remove, true)) {
            return '';
        }

        $converter = $this->environment->getConverterByTag($tag);

        return $converter->convert($element);
    }
197
198
    /**
     * Clean up the serialised document so only the Markdown text remains.
     *
     * Decodes HTML entities, strips the doctype declaration, then removes the XML/html/body/head
     * wrappers: opening tags only when they start the string, closing tags only when they end it.
     * Finally trims newlines, carriage returns, NUL and vertical-tab characters (not spaces or tabs).
     *
     * @param string $markdown
     *
     * @return string
     */
    protected function sanitize($markdown)
    {
        $markdown = html_entity_decode($markdown, ENT_QUOTES, 'UTF-8');
        $markdown = preg_replace('/<!DOCTYPE [^>]+>/', '', $markdown); // Strip doctype declaration
        $markdown = trim($markdown); // Remove blank spaces at the beginning and end of the html

        /*
         * Removing unwanted tags. Tags should be added to the array in the order they are expected.
         * XML, html and body opening tags should be in that order. Same case with closing tags
         */
        $unwanted = array('<?xml encoding="UTF-8">', '<html>', '</html>', '<body>', '</body>', '<head>', '</head>', '&#xD;');

        foreach ($unwanted as $tag) {
            $tagLength = strlen($tag);

            if (strpos($tag, '/') === false) {
                // Opening tags: strip only when the text starts with the tag
                if (strpos($markdown, $tag) === 0) {
                    $markdown = (string) substr($markdown, $tagLength);
                }
            } elseif (substr($markdown, -$tagLength) === $tag) {
                // Closing tags: strip only when the text ends with the tag. Comparing the suffix
                // (rather than the position of the first occurrence) still works when the same
                // tag also appears earlier in the text.
                $markdown = (string) substr($markdown, 0, -$tagLength);
            }
        }

        return trim($markdown, "\n\r\0\x0B");
    }
231
}
232