Completed
Pull Request — master (#100)
by
unknown
02:20
created

HtmlConverter::convertToMarkdown()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 14
Code Lines 7

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 6

Importance

Changes 0
Metric Value
dl 0
loc 14
ccs 0
cts 0
cp 0
rs 9.4285
c 0
b 0
f 0
cc 2
eloc 7
nc 2
nop 1
crap 6
1
<?php
2
3
namespace League\HTMLToMarkdown;
4
5
/**
6
 * Class HtmlConverter
7
 *
8
 * A helper class to convert HTML to Markdown.
9
 *
10
 * @author Colin O'Dell <[email protected]>
11
 * @author Nick Cernis <[email protected]>
12
 *
13
 * @link https://github.com/thephpleague/html-to-markdown/ Latest version on GitHub.
14
 *
15
 * @license http://www.opensource.org/licenses/mit-license.php MIT
16
 */
17
class HtmlConverter
18
{
19
    /**
20
     * @var Environment
21
     */
22
    protected $environment;
23
24
    /**
25
     * @var array
26
     */
27
    protected $whiteTags = array();
28
29 75
    /**
30
     * @var string
31
     */
32 75
    protected $wildCard = '';
33 75
34 75
    /**
     * Build a converter: create the default environment from the built-in option
     * defaults, then merge the caller's options into the resulting configuration.
     *
     * @param array $options Option overrides, merged over the defaults listed below
     */
    public function __construct(array $options = array())
    {
        $this->environment = Environment::createDefaultEnvironment(array(
            // Set to 'atx' to output H1 and H2 headers as # Header1 and ## Header2
            'header_style'    => 'setext',
            // Set to false to show warnings when loading malformed HTML
            'suppress_errors' => true,
            // Set to true to strip tags that don't have markdown equivalents.
            // N.B. Strips tags, not their content. Useful to clean MS Word HTML output.
            'strip_tags'      => false,
            // Set to '__' if you prefer the underlined style
            'bold_style'      => '**',
            // Set to '*' if you prefer the asterisk style
            'italic_style'    => '_',
            // Space-separated list of DOM nodes that should be removed, e.g. 'meta style script'
            'remove_nodes'    => '',
            // Array with allowed HTML tags
            'white_tags' => array(),
            // Marker placed around escaped white tags; use an uncommon character
            'white_tag_wildcard' => '|',
        ));

        $configuration = $this->environment->getConfig();
        $configuration->merge($options);
    }
56 72
57
    /**
     * Expose the environment held by this converter (assigned in the constructor).
     *
     * @return Environment
     */
    public function getEnvironment()
    {
        $environment = $this->environment;

        return $environment;
    }
64
65
    /**
     * Shortcut to the configuration object owned by the environment.
     *
     * @return Configuration
     */
    public function getConfig()
    {
        $environment = $this->environment;

        return $environment->getConfig();
    }
72 3
73
    /**
     * Allow the converter object to be called like a function.
     *
     * Simply forwards to convert().
     *
     * @see HtmlConverter::convert
     *
     * @param string $html
     *
     * @return string The Markdown version of the html
     */
    public function __invoke($html)
    {
        $markdown = $this->convert($html);

        return $markdown;
    }
86 75
87 3
    /**
     * Convert
     *
     * Escapes the configured white tags, loads the HTML into a DOMDocument, converts the
     * tree starting at the <html> element, then sanitizes the serialized result and
     * removes the white-tag escaping again.
     *
     * @param string $html
     *
     * @throws \InvalidArgumentException When the loaded document has no <html> element
     *
     * @return string The Markdown version of the html ('' when the input is empty or whitespace only)
     */
    public function convert($html)
    {
        // Nothing but whitespace: nothing to convert
        if ('' === trim($html)) {
            return '';
        }

        // Refresh the cached white-tag settings before they are used below
        $this->setWhiteTagVariables();

        $document = $this->createDOMDocument($this->escapeWhiteTags($html));

        // Work on the entire DOM tree (including head and body)
        $htmlNode = $document->getElementsByTagName('html')->item(0);
        if (!$htmlNode) {
            throw new \InvalidArgumentException('Invalid HTML was provided');
        }

        $this->convertChildren(new Element($htmlNode));

        // Serialize the now-modified DOMDocument, clean it up and undo the white-tag escaping
        $serialized = $document->saveHTML();
        $cleaned = $this->sanitize($serialized);

        return $this->removeEscapedWhiteTags($cleaned);
    }
125
126 72
    /**
     * Copy the white-tag options from the configuration onto this instance.
     *
     * Reads 'white_tags' and 'white_tag_wildcard' and stores them in $whiteTags and
     * $wildCard for the escape/unescape steps. Option values are not type-checked by
     * this class, so they are normalized here to the declared property types: anything
     * that is not an array becomes an empty tag list, and a non-scalar wildcard becomes
     * an empty string.
     */
    protected function setWhiteTagVariables()
    {
        $config = $this->getConfig();

        // $whiteTags is later passed to count() and foreach, so it must really be an array
        $whiteTags = $config->getOption('white_tags');
        $this->whiteTags = is_array($whiteTags) ? $whiteTags : array();

        // $wildCard is only ever used as a string fragment
        $wildCard = $config->getOption('white_tag_wildcard');
        $this->wildCard = is_scalar($wildCard) ? (string) $wildCard : '';
    }
134
135
    /**
     * Wrap every white tag in <code> and mark the wrapper with the wildcard, so the tag
     * is not converted to Markdown and can be identified again afterwards.
     *
     * For each configured tag the opening tag text gets "wildCard<code>" put in front of
     * it and the closing tag gets "</code>wildCard" appended.
     *
     * Matching is boundary aware:
     *  - an opening tag only matches when the tag name is followed by whitespace, "/" or
     *    ">", so the white tag "b" no longer touches "<body" or "<br";
     *  - the closing marker is only appended to a real closing tag ("</name>"), so an
     *    attribute-less opening tag such as "<span>" is no longer closed immediately;
     *  - void elements have no closing tag, so for those a bare "<name>" also receives
     *    the closing marker (this keeps e.g. "<br>" wrapped as a whole).
     *
     * Entries that are not non-empty strings are skipped.
     *
     * @param string $html
     *
     * @return string
     */
    protected function escapeWhiteTags($html)
    {
        if (!is_array($this->whiteTags) || count($this->whiteTags) === 0) {
            return $html;
        }

        // HTML elements that never have a closing tag
        $voidTags = array(
            'area', 'base', 'br', 'col', 'embed', 'hr', 'img',
            'input', 'link', 'meta', 'param', 'source', 'track', 'wbr',
        );

        // Callbacks keep the wildcard literal: it is never interpreted as a regex replacement string
        $wildCard = $this->wildCard;
        $prependMarker = function (array $match) use ($wildCard) {
            return $wildCard . '<code>' . $match[0];
        };
        $appendMarker = function (array $match) use ($wildCard) {
            return $match[0] . '</code>' . $wildCard;
        };

        foreach ($this->whiteTags as $whiteTag) {
            if (!is_string($whiteTag) || $whiteTag === '') {
                continue;
            }

            // "<name" followed by a character that ends the tag name
            $openPattern = '/' . preg_quote($this->getOpenTag($whiteTag), '/') . '(?=[\s\/>])/';

            // "</name>", plus the bare "<name>" form for void elements
            $slash = in_array(strtolower($whiteTag), $voidTags, true) ? '\/?' : '\/';
            $closePattern = '/<' . $slash . preg_quote($this->getCloseTag($whiteTag), '/') . '/';

            // preg_replace_callback() returns null on failure; keep the current HTML in that case
            $escaped = preg_replace_callback($openPattern, $prependMarker, $html);
            if ($escaped !== null) {
                $html = $escaped;
            }

            $escaped = preg_replace_callback($closePattern, $appendMarker, $html);
            if ($escaped !== null) {
                $html = $escaped;
            }
        }

        return $html;
    }
161
162
    /**
     * Build the text that starts an opening tag: "<" followed by the tag name.
     *
     * The result deliberately has no closing ">", so it also matches tags that carry attributes.
     *
     * @param string $tag
     *
     * @return string
     */
    protected function getOpenTag($tag)
    {
        return '<' . $tag;
    }
171
172
    /**
     * Build the text that ends a tag: the tag name followed by ">".
     *
     * Note that the result carries no leading "</"; it is the trailing fragment "name>".
     *
     * @param string $tag
     *
     * @return string
     */
    protected function getCloseTag($tag)
    {
        return $tag . '>';
    }
181
182 72
    /**
     * Load the given HTML into a new UTF-8 DOMDocument.
     *
     * When the 'suppress_errors' option is truthy, libxml's internal error handling is
     * switched on for the duration of the load so malformed HTML does not emit warnings.
     * The collected errors are cleared afterwards and the previous libxml setting is
     * restored, so the process-wide libxml state is left as it was found.
     *
     * @param string $html
     *
     * @return \DOMDocument
     */
    private function createDOMDocument($html)
    {
        $document = new \DOMDocument();

        $suppressErrors = (bool) $this->getConfig()->getOption('suppress_errors');
        $previousSetting = false;

        if ($suppressErrors) {
            // Suppress conversion errors (from http://bit.ly/pCCRSX); remember the old value to restore it
            $previousSetting = libxml_use_internal_errors(true);
        }

        // Hack to load utf-8 HTML (from http://bit.ly/pVDyCt)
        $document->loadHTML('<?xml encoding="UTF-8">' . $html);
        $document->encoding = 'UTF-8';

        if ($suppressErrors) {
            libxml_clear_errors();
            libxml_use_internal_errors($previousSetting);
        }

        return $document;
    }
206
207
    /**
     * Convert Children
     *
     * Recursive function to drill into the DOM and convert each node into Markdown from the inside out.
     *
     * Children are converted first, then the element itself, so the innermost elements are
     * replaced by their Markdown before the elements that contain them.
     *
     * @param ElementInterface $element
     */
    private function convertChildren(ElementInterface $element)
    {
        // Don't convert HTML code inside <code> and <pre> blocks to Markdown - that should stay as HTML
        if ($element->isDescendantOf(array('pre', 'code'))) {
            return;
        }

        // If the node has children, convert those to Markdown first
        if ($element->hasChildren()) {
            foreach ($element->getChildren() as $child) {
                $this->convertChildren($child);
            }
        }

        // Now that child nodes have been converted, convert the original node
        $markdown = $this->convertToMarkdown($element);

        // Replace the old node e.g. '<h3>Title</h3>' with its Markdown e.g. '### Title'.
        // convertToMarkdown() (or an override of it) may hand back a non-string such as false,
        // while setFinalMarkdown() is expected to receive a string, so normalize explicitly.
        $element->setFinalMarkdown((string) $markdown);
    }
239
240
    /**
     * Convert to Markdown
     *
     * Produces the Markdown string for a single element by delegating to the converter
     * the environment has registered for the element's tag name.
     *
     * Example: An <h3> node with text content of 'Title' becomes '### Title'
     *
     * Elements whose tag name is listed in the space-separated 'remove_nodes' option are
     * dropped: an empty string is returned for them and no converter is invoked.
     *
     * @param ElementInterface $element
     *
     * @return string The converted HTML as Markdown ('' for removed nodes)
     */
    protected function convertToMarkdown(ElementInterface $element)
    {
        $tag = $element->getTagName();

        // Strip nodes named in remove_nodes; the cast guards explode() against a null option value
        $tagsToRemove = explode(' ', (string) $this->getConfig()->getOption('remove_nodes'));
        if (in_array($tag, $tagsToRemove, true)) {
            return '';
        }

        $converter = $this->environment->getConverterByTag($tag);

        return $converter->convert($element);
    }
265
266
    /**
     * Clean up the serialized document so only the Markdown text remains.
     *
     * Decodes HTML entities, removes the doctype declaration and the document wrapper
     * markup, then trims line breaks from both ends.
     *
     * @param string $markdown
     *
     * @return string
     */
    protected function sanitize($markdown)
    {
        $decoded = html_entity_decode($markdown, ENT_QUOTES, 'UTF-8');

        // Strip doctype declaration
        $withoutDoctype = preg_replace('/<!DOCTYPE [^>]+>/', '', $decoded);

        // Strip unwanted tags
        $wrappers = array(
            '<html>',
            '</html>',
            '<body>',
            '</body>',
            '<head>',
            '</head>',
            '<?xml encoding="UTF-8">',
            '&#xD;',
        );
        $withoutWrappers = str_replace($wrappers, '', $withoutDoctype);

        // The character list leaves out space and tab, so those are kept at both ends
        return trim($withoutWrappers, "\n\r\0\x0B");
    }
281
282
    /**
     * Undo the white-tag escaping in the generated Markdown.
     *
     * For every white tag, the sequence wildCard + "`" in front of the opening tag text and
     * the sequence "`" + wildCard behind the closing tag text are removed, so the tag
     * appears in the output as the user typed it.
     *
     * @param string $markdown
     *
     * @return string
     */
    protected function removeEscapedWhiteTags($markdown)
    {
        if (count($this->whiteTags) === 0) {
            return $markdown;
        }

        // Collect the pairs in the order they have to be applied: open, close, per tag
        $escaped = array();
        $plain = array();

        foreach ($this->whiteTags as $whiteTag) {
            // "wildCard`<openTag" becomes "<openTag"
            $openTag = $this->getOpenTag($whiteTag);
            $escaped[] = sprintf('%s`%s', $this->wildCard, $openTag);
            $plain[] = $openTag;

            // "closeTag>`wildCard" becomes "closeTag>"
            $closeTag = $this->getCloseTag($whiteTag);
            $escaped[] = sprintf('%s`%s', $closeTag, $this->wildCard);
            $plain[] = $closeTag;
        }

        // With array arguments str_replace() applies the pairs one after another, in order
        return str_replace($escaped, $plain, $markdown);
    }
308
}
309