Completed
Pull Request — master (#100)
by
unknown
21:51 queued 19:53
created

HtmlConverter::setWhiteTagVariables()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 5
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 4
CRAP Score 1

Importance

Changes 1
Bugs 0 Features 1
Metric Value
dl 0
loc 5
ccs 4
cts 4
cp 1
rs 9.4285
c 1
b 0
f 1
cc 1
eloc 3
nc 1
nop 0
crap 1
1
<?php
2
3
namespace League\HTMLToMarkdown;
4
5
/**
6
 * Class HtmlConverter
7
 *
8
 * A helper class to convert HTML to Markdown.
9
 *
10
 * @author Colin O'Dell <[email protected]>
11
 * @author Nick Cernis <[email protected]>
12
 *
13
 * @link https://github.com/thephpleague/html-to-markdown/ Latest version on GitHub.
14
 *
15
 * @license http://www.opensource.org/licenses/mit-license.php MIT
16
 */
17
class HtmlConverter
18
{
19
    /**
20
     * @var Environment
21
     */
22
    protected $environment;
23
24
    /**
25
     * @var array
26
     */
27
    protected $whiteTags = array();
28
29
    /**
30
     * @var string
31
     */
32
    protected $wildCard = '';
33
34
    /**
     * Build a converter backed by its own default environment.
     *
     * The environment is created from the built-in defaults listed below, and the
     * caller-supplied options are then merged into that environment's configuration.
     *
     * @param array $options Configuration options
     */
    public function __construct(array $options = array())
    {
        $this->environment = Environment::createDefaultEnvironment(array(
            // Set to 'atx' to output H1 and H2 headers as # Header1 and ## Header2
            'header_style'       => 'setext',
            // Set to false to show warnings when loading malformed HTML
            'suppress_errors'    => true,
            // Set to true to strip tags that don't have markdown equivalents.
            // N.B. Strips tags, not their content. Useful to clean MS Word HTML output.
            'strip_tags'         => false,
            // Set to '__' if you prefer the underlined style
            'bold_style'         => '**',
            // Set to '*' if you prefer the asterisk style
            'italic_style'       => '_',
            // Space-separated list of DOM nodes that should be removed, e.g. 'meta style script'
            'remove_nodes'       => '',
            // Array with allowed HTML tags
            'white_tags'         => array(),
            // Marker placed around white tags; use an uncommon character
            'white_tag_wildcard' => '|',
        ));

        $config = $this->environment->getConfig();
        $config->merge($options);
    }
56
57
    /**
     * Accessor for the environment that was created in the constructor.
     *
     * @return Environment
     */
    public function getEnvironment()
    {
        return $this->environment;
    }
64
65
    /**
     * Shortcut to the configuration object held by the environment.
     *
     * @return Configuration
     */
    public function getConfig()
    {
        $config = $this->environment->getConfig();

        return $config;
    }
72
73
    /**
     * Allow the converter object to be called like a function.
     *
     * Simply forwards to convert().
     *
     * @see HtmlConverter::convert
     *
     * @param string $html
     *
     * @return string The Markdown version of the html
     */
    public function __invoke($html)
    {
        $markdown = $this->convert($html);

        return $markdown;
    }
86
87
    /**
     * Convert an HTML string into Markdown.
     *
     * Input that is empty after trimming yields an empty string. Otherwise the
     * white-tag settings are refreshed, white tags are escaped, the HTML is loaded
     * into a DOM document whose nodes are converted in place, and the serialised
     * result is sanitized and has its white tags restored.
     *
     * @param string $html
     *
     * @throws \InvalidArgumentException When the loaded document has no <html> element
     *
     * @return string The Markdown version of the html
     */
    public function convert($html)
    {
        // Nothing but whitespace: nothing to convert
        if ('' === trim($html)) {
            return '';
        }

        $this->setWhiteTagVariables();

        $document = $this->createDOMDocument($this->escapeWhiteTags($html));

        // Work on the entire DOM tree (including head and body)
        $root = $document->getElementsByTagName('html')->item(0);
        if (!$root) {
            throw new \InvalidArgumentException('Invalid HTML was provided');
        }

        $this->convertChildren(new Element($root));

        // Serialise the now-modified DOMDocument, clean it up and restore the white tags
        $serialised = $document->saveHTML();

        return $this->removeEscapedWhiteTags($this->sanitize($serialised));
    }
125
126
    /**
     * Cache the white-tag options on the instance for the current conversion.
     *
     * Reads 'white_tags' and 'white_tag_wildcard' from the configuration and stores
     * them in $whiteTags and $wildCard, normalised to the declared property types.
     *
     * @return void
     */
    protected function setWhiteTagVariables()
    {
        $config = $this->getConfig();

        // The option value is not guaranteed to be an array. The cast keeps the
        // property safe for count()/foreach: null becomes an empty list and a
        // single tag name given as a string becomes a one-element list.
        $this->whiteTags = (array) $config->getOption('white_tags');

        // The wildcard is only ever used for string building, so store it as a string
        $this->wildCard = (string) $config->getOption('white_tag_wildcard');
    }
134
135
    /**
     * Wrap every white tag in <code>...</code> and flank it with the wildcard.
     *
     * For each configured white tag, "<tag" becomes "wildCard<code><tag" and
     * "tag>" becomes "tag></code>wildCard", so the tag is not converted to Markdown
     * and can be identified again afterwards.
     *
     * Matching is plain substring replacement: a white tag "b" also matches the
     * start of "<br", and the "tag>" pattern also matches an opening tag that has
     * no attributes (e.g. "<b>").
     *
     * @param string $html
     *
     * @return string
     */
    protected function escapeWhiteTags($html)
    {
        if (count($this->whiteTags) === 0) {
            return $html;
        }

        foreach ($this->whiteTags as $whiteTag) {
            $opening = $this->getOpenTag($whiteTag);
            $closing = $this->getCloseTag($whiteTag);

            // The pairs are applied in order: first "<openTag", then "closeTag>"
            $html = str_replace(
                array($opening, $closing),
                array(
                    $this->wildCard . '<code>' . $opening,
                    $closing . '</code>' . $this->wildCard,
                ),
                $html
            );
        }

        return $html;
    }
161
162
    /**
     * Build the leading fragment of a tag, e.g. "span" gives "<span".
     *
     * @param string $tag
     *
     * @return string
     */
    protected function getOpenTag($tag)
    {
        return '<' . $tag;
    }
171
172
    /**
     * Build the trailing fragment of a tag, e.g. "span" gives "span>".
     *
     * @param string $tag
     *
     * @return string
     */
    protected function getCloseTag($tag)
    {
        return $tag . '>';
    }
181
182
    /**
     * Load an HTML string into a new UTF-8 DOMDocument.
     *
     * When the 'suppress_errors' option is truthy, libxml's internal error handling
     * is switched on for the duration of the load, the collected errors are cleared,
     * and the previous libxml setting is restored afterwards so this method does not
     * leak a changed process-wide setting to the caller.
     *
     * @param string $html
     *
     * @return \DOMDocument
     */
    private function createDOMDocument($html)
    {
        $document = new \DOMDocument();
        $suppressErrors = (bool) $this->getConfig()->getOption('suppress_errors');
        $previousState = false;

        if ($suppressErrors) {
            // Suppress conversion errors (from http://bit.ly/pCCRSX);
            // the call returns the setting that was active before
            $previousState = libxml_use_internal_errors(true);
        }

        // Hack to load utf-8 HTML (from http://bit.ly/pVDyCt)
        $document->loadHTML('<?xml encoding="UTF-8">' . $html);
        $document->encoding = 'UTF-8';

        if ($suppressErrors) {
            libxml_clear_errors();
            // Hand libxml back in the state the caller had it in
            libxml_use_internal_errors($previousState);
        }

        return $document;
    }
206
207
    /**
     * Convert Children
     *
     * Recursive function to drill into the DOM and convert each node into Markdown from the inside out.
     *
     * Children are converted first, starting with the innermost element and working
     * up to the outermost one; the element itself is then converted and its final
     * Markdown is stored on it via setFinalMarkdown().
     *
     * @param ElementInterface $element
     *
     * @return void
     */
    private function convertChildren(ElementInterface $element)
    {
        // Don't convert HTML code inside <code> and <pre> blocks to Markdown - that should stay as HTML
        if ($element->isDescendantOf(array('pre', 'code'))) {
            return;
        }

        // If the node has children, convert those to Markdown first
        if ($element->hasChildren()) {
            foreach ($element->getChildren() as $child) {
                $this->convertChildren($child);
            }
        }

        // Now that child nodes have been converted, convert the original node
        $markdown = $this->convertToMarkdown($element);

        // convertToMarkdown() may yield false instead of a string; normalise it so
        // setFinalMarkdown() always receives a string
        if ($markdown === false) {
            $markdown = '';
        }

        // Replace the old node e.g. '<h3>Title</h3>' with its Markdown e.g. '### Title'
        $element->setFinalMarkdown($markdown);
    }
239
240
    /**
     * Convert to Markdown
     *
     * Converts an individual element into a string of its Markdown equivalent by
     * delegating to the converter the environment provides for the element's tag.
     *
     * Example: An <h3> node with text content of 'Title' becomes '### Title'
     *
     * Elements whose tag is listed in the space-separated 'remove_nodes' option are
     * dropped: an empty string is returned for them, keeping the return type a string.
     *
     * @param ElementInterface $element
     *
     * @return string The converted HTML as Markdown
     */
    protected function convertToMarkdown(ElementInterface $element)
    {
        $tag = $element->getTagName();

        // Strip nodes named in remove_nodes
        $tags_to_remove = explode(' ', $this->getConfig()->getOption('remove_nodes'));
        if (in_array($tag, $tags_to_remove)) {
            // An empty string (rather than false) honours the documented string return type
            return '';
        }

        $converter = $this->environment->getConverterByTag($tag);

        return $converter->convert($element);
    }
265
266
    /**
     * Clean up the serialised document so that only the Markdown remains.
     *
     * Decodes HTML entities, strips the doctype declaration and the html/head/body
     * wrapper tags (plus the XML encoding hint and "&#xD;"), then trims newlines,
     * carriage returns, NUL bytes and vertical tabs from both ends. Spaces and tabs
     * are not trimmed.
     *
     * @param string $markdown
     *
     * @return string
     */
    protected function sanitize($markdown)
    {
        $decoded = html_entity_decode($markdown, ENT_QUOTES, 'UTF-8');

        // Strip doctype declaration
        $withoutDoctype = preg_replace('/<!DOCTYPE [^>]+>/', '', $decoded);

        // Strip unwanted tags
        $stripped = str_replace(
            array('<html>', '</html>', '<body>', '</body>', '<head>', '</head>', '<?xml encoding="UTF-8">', '&#xD;'),
            '',
            $withoutDoctype
        );

        return trim($stripped, "\n\r\0\x0B");
    }
281
282
    /**
     * Undo the white-tag escaping so the tags come back as the user typed them.
     *
     * For each configured white tag, the sequence wildCard + backtick + "<tag" is
     * reduced to "<tag", and "tag>" + backtick + wildCard is reduced to "tag>".
     * The backtick is presumably what the temporary <code> wrapper was turned into
     * during conversion.
     *
     * @param string $markdown
     *
     * @return string
     */
    protected function removeEscapedWhiteTags($markdown)
    {
        if (count($this->whiteTags) === 0) {
            return $markdown;
        }

        foreach ($this->whiteTags as $whiteTag) {
            $opening = $this->getOpenTag($whiteTag);
            $closing = $this->getCloseTag($whiteTag);

            // The pairs are applied in order: first the escaped opening, then the escaped closing
            $markdown = str_replace(
                array(
                    $this->wildCard . '`' . $opening,
                    $closing . '`' . $this->wildCard,
                ),
                array($opening, $closing),
                $markdown
            );
        }

        return $markdown;
    }
308
}
309