HtmlConverter::convertToMarkdown()   A
last analyzed

Complexity

Conditions 2
Paths 2

Size

Total Lines 13
Code Lines 6

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 4
CRAP Score 2

Importance

Changes 0
Metric Value
cc 2
eloc 6
nc 2
nop 1
dl 0
loc 13
ccs 4
cts 4
cp 1
crap 2
rs 10
c 0
b 0
f 0
1
<?php
2
3
declare(strict_types=1);
4
5
namespace League\HTMLToMarkdown;
6
7
/**
8
 * A helper class to convert HTML to Markdown.
9
 *
10
 * @author Colin O'Dell <[email protected]>
11
 * @author Nick Cernis <[email protected]>
12
 *
13
 * @link https://github.com/thephpleague/html-to-markdown/ Latest version on GitHub.
14
 *
15
 * @license http://www.opensource.org/licenses/mit-license.php MIT
16
 */
17
class HtmlConverter implements HtmlConverterInterface
18
{
19
    /** @var Environment */
20
    protected $environment;
21
22
    /**
     * Constructor
     *
     * Accepts either a ready-made Environment, which is used as-is, or an array of
     * configuration options. For an array, a default environment is created from the
     * built-in defaults below and the supplied options are merged over them.
     *
     * @param Environment|array<string, mixed> $options Environment object or configuration options
     *
     * @throws \InvalidArgumentException if $options is neither an Environment nor an array
     */
    public function __construct($options = [])
    {
        if ($options instanceof Environment) {
            $this->environment = $options;

            return;
        }

        if (! \is_array($options)) {
            // An unsupported value used to fall through silently and leave $this->environment
            // unset, which only surfaced later as an error on first use. Fail fast instead.
            throw new \InvalidArgumentException(\sprintf(
                'Expected an Environment instance or an array of options, got %s',
                \is_object($options) ? \get_class($options) : \gettype($options)
            ));
        }

        $defaults = [
            'header_style' => 'setext', // Set to 'atx' to output H1 and H2 headers as # Header1 and ## Header2
            'suppress_errors' => true, // Set to false to show warnings when loading malformed HTML
            'strip_tags' => false, // Set to true to strip tags that don't have markdown equivalents. N.B. Strips tags, not their content. Useful to clean MS Word HTML output.
            'strip_placeholder_links' => false, // Set to true to remove <a> that doesn't have href.
            'bold_style' => '**', // DEPRECATED: Set to '__' if you prefer the underlined style
            'italic_style' => '*', // DEPRECATED: Set to '_' if you prefer the underlined style
            'remove_nodes' => '', // space-separated list of dom nodes that should be removed. example: 'meta style script'
            'hard_break' => false, // Set to true to turn <br> into `\n` instead of `  \n`
            'list_item_style' => '-', // Set the default character for each <li> in a <ul>. Can be '-', '*', or '+'
            'preserve_comments' => false, // Set to true to preserve comments, or set to an array of strings to preserve specific comments
            'use_autolinks' => true, // Set to true to use simple link syntax if possible. Will always use []() if set to false
            'table_pipe_escape' => '\|', // Replacement string for pipe characters inside markdown table cells
            'table_caption_side' => 'top', // Set to 'top' or 'bottom' to show <caption> content before or after table, null to suppress
        ];

        $this->environment = Environment::createDefaultEnvironment($defaults);

        // Caller-supplied options take effect on top of the defaults
        $this->environment->getConfig()->merge($options);
    }
53
54 96
    /**
     * Returns the Environment instance held by this converter.
     */
    public function getEnvironment(): Environment
    {
        $environment = $this->environment;

        return $environment;
    }
58
59
    /**
     * Returns the configuration object exposed by the converter's environment.
     */
    public function getConfig(): Configuration
    {
        $environment = $this->environment;

        return $environment->getConfig();
    }
63 3
64
    /**
     * Convert
     *
     * Lets the converter object be called like a function; simply delegates to convert().
     *
     * @see HtmlConverter::convert
     *
     * @return string The Markdown version of the html
     */
    public function __invoke(string $html): string
    {
        $markdown = $this->convert($html);

        return $markdown;
    }
75
76
    /**
     * Convert
     *
     * Parses the HTML into a DOM document, converts the tree below <html> in place,
     * then serialises the document and sanitises the result. Input that is empty
     * after trimming yields an empty string without any parsing.
     *
     * @return string The Markdown version of the html
     *
     * @throws \InvalidArgumentException|\RuntimeException
     */
    public function convert(string $html): string
    {
        $isBlank = \trim($html) === '';
        if ($isBlank) {
            return '';
        }

        $document = $this->createDOMDocument($html);

        // Work on the entire DOM tree (including head and body)
        $root = $document->getElementsByTagName('html')->item(0);
        if ($root === null) {
            throw new \InvalidArgumentException('Invalid HTML was provided');
        }

        $this->convertChildren(new Element($root));

        // Serialise the now-modified DOMDocument back to a string
        $serialized = $document->saveHTML();
        if ($serialized === false) {
            throw new \RuntimeException('Unknown error occurred during HTML to Markdown conversion');
        }

        return $this->sanitize($serialized);
    }
110
111
    /**
     * Parses an HTML string into a DOMDocument whose encoding is set to UTF-8.
     *
     * When the 'suppress_errors' option is enabled, libxml's internal error handling is
     * switched on for the duration of the parse so malformed HTML does not emit warnings.
     * Afterwards the collected errors are cleared and libxml's previous setting is restored,
     * so the converter no longer leaves global libxml state changed for the rest of the process.
     */
    private function createDOMDocument(string $html): \DOMDocument
    {
        $document = new \DOMDocument();

        // Read the option once so enabling and restoring always stay paired
        $suppressErrors = (bool) $this->getConfig()->getOption('suppress_errors');

        $previousState = false;
        if ($suppressErrors) {
            // Suppress conversion errors (from http://bit.ly/pCCRSX).
            // libxml_use_internal_errors() returns the setting that was active before the call.
            $previousState = \libxml_use_internal_errors(true);
        }

        try {
            // Hack to load utf-8 HTML (from http://bit.ly/pVDyCt)
            $document->loadHTML('<?xml encoding="UTF-8">' . $html);
            $document->encoding = 'UTF-8';

            $this->replaceMisplacedComments($document);
        } finally {
            if ($suppressErrors) {
                \libxml_clear_errors();
                \libxml_use_internal_errors($previousState);
            }
        }

        return $document;
    }
132 93
133 62
    /**
     * Finds any comment nodes outside <html> element and moves them into <body>.
     *
     * The comments are inserted at the start of <body> and keep their original relative
     * order. Nothing happens when the XPath query fails or the document has no <body>.
     *
     * @see https://github.com/thephpleague/html-to-markdown/issues/212
     * @see https://3v4l.org/7bC33
     */
    private function replaceMisplacedComments(\DOMDocument $document): void
    {
        // Find any comment nodes at the root of the document.
        $misplacedComments = (new \DOMXPath($document))->query('/comment()');
        if ($misplacedComments === false) {
            return;
        }

        $body = $document->getElementsByTagName('body')->item(0);
        if ($body === null) {
            return;
        }

        // Loop over comment nodes in reverse so we put them inside <body> in
        // their original order.
        for ($index = $misplacedComments->length - 1; $index >= 0; $index--) {
            $comment = $misplacedComments->item($index);
            if ($comment === null) {
                continue;
            }

            // With an empty <body>, firstChild is null and insertBefore() appends the node,
            // so no separate branch is needed for that case.
            $body->insertBefore($comment, $body->firstChild);
        }
    }
162
163
    /**
     * Convert Children
     *
     * Walks the DOM depth-first: every child of the given element is converted before the
     * element itself, so Markdown is produced from the innermost element outwards. The
     * Markdown computed for each element is stored on it via setFinalMarkdown().
     */
    private function convertChildren(ElementInterface $element): void
    {
        // Markup nested inside <pre>/<code> must stay as HTML. A <code> element itself is
        // still processed so that its converter gets to handle it.
        $insideCodeBlock = $element->isDescendantOf(['pre', 'code']);
        if ($insideCodeBlock && $element->getTagName() !== 'code') {
            return;
        }

        // Let a pre-converter inspect or adjust the DOM before the children are handled
        $tagConverter = $this->environment->getConverterByTag($element->getTagName());
        if ($tagConverter instanceof PreConverterInterface) {
            $tagConverter->preConvert($element);
        }

        // Children first
        if ($element->hasChildren()) {
            foreach ($element->getChildren() as $childElement) {
                $this->convertChildren($childElement);
            }
        }

        // With the children done, convert this element and record the result on it
        $element->setFinalMarkdown($this->convertToMarkdown($element));
    }
200
201
    /**
     * Convert to Markdown
     *
     * Produces the Markdown for a single element by handing it to the converter that the
     * environment returns for its tag name. Elements whose tag is listed in the
     * space-separated 'remove_nodes' option yield an empty string instead.
     *
     * Example: An <h3> node with text content of 'Title' becomes '### Title'
     *
     * @return string The converted HTML as Markdown
     */
    protected function convertToMarkdown(ElementInterface $element): string
    {
        $tagName = $element->getTagName();

        // Strip nodes named in remove_nodes
        $removeNodes = Coerce::toString($this->getConfig()->getOption('remove_nodes') ?? '');
        $removedTags = \explode(' ', $removeNodes);
        $isRemoved   = \in_array($tagName, $removedTags, true);

        if ($isRemoved) {
            return '';
        }

        return $this->environment->getConverterByTag($tagName)->convert($element);
    }
224
225
    /**
     * Cleans the serialised document so that only the Markdown text remains.
     *
     * Decodes HTML entities, removes the doctype declaration, and strips the wrapper
     * markup surrounding the content: opening wrappers are removed only from the very
     * start of the string and closing wrappers only from the very end, so identical text
     * inside the content is left alone.
     *
     * @return string The Markdown with wrapper markup and surrounding line breaks removed
     */
    protected function sanitize(string $markdown): string
    {
        $markdown = \html_entity_decode($markdown, ENT_QUOTES, 'UTF-8');
        $markdown = \preg_replace('/<!DOCTYPE [^>]+>/', '', $markdown); // Strip doctype declaration
        \assert($markdown !== null);
        $markdown = \trim($markdown); // Remove blank spaces at the beginning of the html

        /*
         * Removing unwanted tags. Tags should be added to the array in the order they are expected.
         * XML, html and body opening tags should be in that order. Same case with closing tags
         */
        $unwanted = ['<?xml encoding="UTF-8">', '<html>', '</html>', '<body>', '</body>', '<head>', '</head>', '&#xD;'];

        foreach ($unwanted as $tag) {
            $tagLength = \strlen($tag);

            if (\strpos($tag, '/') === false) {
                // Opening tags: strip only when the string starts with the tag
                if (\strncmp($markdown, $tag, $tagLength) === 0) {
                    $markdown = \substr($markdown, $tagLength);
                }
            } elseif (\substr($markdown, -$tagLength) === $tag) {
                // Closing tags: compare the actual suffix. Searching for the first occurrence
                // missed the trailing tag whenever the same text also appeared in the content.
                $markdown = \substr($markdown, 0, -$tagLength);
            }
        }

        return \trim($markdown, "\n\r\0\x0B");
    }
254
255
    /**
     * Sets several configuration options at once.
     *
     * Each key-value pair in the array is passed to the configuration's setOption().
     * The converter itself is returned so calls can be chained; the original note says
     * this can allow for static use (IE in Laravel). An example being:
     *
     * HtmlConverter::setOptions(['strip_tags' => true])->convert('<h1>test</h1>');
     *
     * @param array<string, mixed> $options
     *
     * @return $this
     */
    public function setOptions(array $options)
    {
        $configuration = $this->getConfig();

        foreach ($options as $name => $value) {
            $configuration->setOption($name, $value);
        }

        return $this;
    }
277
}
278