These results are based on our legacy PHP analysis; consider migrating to our new PHP analysis engine instead. Learn more
1 | <?php |
||
2 | |||
3 | namespace League\HTMLToMarkdown; |
||
4 | |||
/**
 * Class HtmlConverter
 *
 * A helper class to convert HTML to Markdown.
 *
 * @author Colin O'Dell <[email protected]>
 * @author Nick Cernis <[email protected]>
 *
 * @link https://github.com/thephpleague/html-to-markdown/ Latest version on GitHub.
 *
 * @license http://www.opensource.org/licenses/mit-license.php MIT
 */
class HtmlConverter
{
    /**
     * @var Environment
     */
    protected $environment;

    /**
     * Constructor
     *
     * An Environment instance is used as-is. An array is treated as configuration
     * options: a default environment is created from the built-in defaults and the
     * given options are merged over them. Any other value leaves the environment unset.
     *
     * @param Environment|array $options Environment object or configuration options
     */
    public function __construct($options = array())
    {
        if ($options instanceof Environment) {
            $this->environment = $options;
        } elseif (is_array($options)) {
            $defaults = array(
                'header_style'    => 'setext', // Set to 'atx' to output H1 and H2 headers as # Header1 and ## Header2
                'suppress_errors' => true, // Set to false to show warnings when loading malformed HTML
                'strip_tags'      => false, // Set to true to strip tags that don't have markdown equivalents. N.B. Strips tags, not their content. Useful to clean MS Word HTML output.
                'bold_style'      => '**', // Set to '__' if you prefer the underlined style
                'italic_style'    => '_', // Set to '*' if you prefer the asterisk style
                'remove_nodes'    => '', // space-separated list of dom nodes that should be removed. example: 'meta style script'
                'hard_break'      => false, // Set to true to turn <br> into `\n` instead of `  \n`
                'list_item_style' => '-', // Set the default character for each <li> in a <ul>. Can be '-', '*', or '+'
            );

            $this->environment = Environment::createDefaultEnvironment($defaults);

            $this->environment->getConfig()->merge($options);
        }
    }

    /**
     * @return Environment
     */
    public function getEnvironment()
    {
        return $this->environment;
    }

    /**
     * @return Configuration
     */
    public function getConfig()
    {
        return $this->environment->getConfig();
    }

    /**
     * Convert
     *
     * Allows the converter object to be called like a function.
     *
     * @see HtmlConverter::convert
     *
     * @param string $html
     *
     * @return string The Markdown version of the html
     */
    public function __invoke($html)
    {
        return $this->convert($html);
    }

    /**
     * Convert
     *
     * Loads the HTML into a DOM document, converts every node in place and
     * returns the sanitized result.
     *
     * @param string $html
     *
     * @throws \InvalidArgumentException If the parsed document has no <html> element
     *
     * @return string The Markdown version of the html
     */
    public function convert($html)
    {
        // Nothing to do for empty / whitespace-only input
        if (trim($html) === '') {
            return '';
        }

        $document = $this->createDOMDocument($html);

        // Work on the entire DOM tree (including head and body)
        if (!($root = $document->getElementsByTagName('html')->item(0))) {
            throw new \InvalidArgumentException('Invalid HTML was provided');
        }

        $rootElement = new Element($root);
        $this->convertChildren($rootElement);

        // Store the now-modified DOMDocument as a string
        $markdown = $document->saveHTML();

        return $this->sanitize($markdown);
    }

    /**
     * Parses the given HTML into a UTF-8 DOMDocument.
     *
     * When the 'suppress_errors' option is enabled, libxml errors are collected
     * internally while parsing and then discarded. The previous libxml error
     * handling setting is restored afterwards so that this process-wide flag
     * does not leak out of the converter.
     *
     * @param string $html
     *
     * @return \DOMDocument
     */
    private function createDOMDocument($html)
    {
        $document = new \DOMDocument();
        $suppressErrors = $this->getConfig()->getOption('suppress_errors');

        if ($suppressErrors) {
            // Suppress conversion errors (from http://bit.ly/pCCRSX) and remember
            // the setting that was active before so it can be put back below
            $previousSetting = libxml_use_internal_errors(true);
        }

        // Hack to load utf-8 HTML (from http://bit.ly/pVDyCt)
        $document->loadHTML('<?xml encoding="UTF-8">' . $html);
        $document->encoding = 'UTF-8';

        if ($suppressErrors) {
            libxml_clear_errors();
            libxml_use_internal_errors($previousSetting);
        }

        return $document;
    }

    /**
     * Convert Children
     *
     * Recursive function to drill into the DOM and convert each node into Markdown from the inside out.
     *
     * Finds children of each node and convert those to #text nodes containing their Markdown equivalent,
     * starting with the innermost element and working up to the outermost element.
     *
     * @param ElementInterface $element
     */
    private function convertChildren(ElementInterface $element)
    {
        // Don't convert HTML code inside <code> and <pre> blocks to Markdown - that should stay as HTML
        // except if the current node is a code tag, which needs to be converted by the CodeConverter.
        if ($element->isDescendantOf(array('pre', 'code')) && $element->getTagName() !== 'code') {
            return;
        }

        // If the node has children, convert those to Markdown first
        if ($element->hasChildren()) {
            foreach ($element->getChildren() as $child) {
                $this->convertChildren($child);
            }
        }

        // Now that child nodes have been converted, convert the original node
        $markdown = $this->convertToMarkdown($element);

        // Replace the old node e.g. '<h3>Title</h3>' with its Markdown equivalent e.g. '### Title'
        $element->setFinalMarkdown($markdown);
    }

    /**
     * Convert to Markdown
     *
     * Converts an individual node into a string of its Markdown equivalent.
     *
     * Example: An <h3> node with text content of 'Title' becomes '### Title'
     *
     * Nodes whose tag name is listed in the 'remove_nodes' option are dropped:
     * an empty string is returned for them, so the documented string return
     * type always holds.
     *
     * @param ElementInterface $element
     *
     * @return string The converted HTML as Markdown
     */
    protected function convertToMarkdown(ElementInterface $element)
    {
        $tag = $element->getTagName();

        // Strip nodes named in remove_nodes
        $tags_to_remove = explode(' ', $this->getConfig()->getOption('remove_nodes'));
        if (in_array($tag, $tags_to_remove, true)) {
            return '';
        }

        $converter = $this->environment->getConverterByTag($tag);

        return $converter->convert($element);
    }

    /**
     * Cleans up the serialized document: decodes HTML entities, drops the doctype
     * and peels the wrapper tags added by DOMDocument off both ends of the string.
     *
     * @param string $markdown
     *
     * @return string
     */
    protected function sanitize($markdown)
    {
        $markdown = html_entity_decode($markdown, ENT_QUOTES, 'UTF-8');
        $markdown = preg_replace('/<!DOCTYPE [^>]+>/', '', $markdown); // Strip doctype declaration
        $markdown = trim($markdown); // Remove blank spaces at the beginning of the html

        /*
         * Removing unwanted tags. Tags should be added to the array in the order they are expected.
         * XML, html and body opening tags should be in that order. Same case with closing tags
         */
        $unwanted = array(
            '<?xml encoding="UTF-8">',
            '<html>',
            '</html>',
            '<body>',
            '</body>',
            '<head>',
            '</head>',
            "\n",
        );

        foreach ($unwanted as $tag) {
            $length = strlen($tag);

            if (strpos($tag, '/') === false) {
                // Opening tags: only strip when the string starts with the tag
                if (strpos($markdown, $tag) === 0) {
                    $markdown = substr($markdown, $length);
                }
            } else {
                // Closing tags: only strip when the string ends with the tag.
                // Compare the suffix directly; searching for the first occurrence
                // would miss the trailing tag whenever the same text also appears
                // earlier in the content (e.g. inside a code sample).
                if (substr($markdown, -$length) === $tag) {
                    $markdown = substr($markdown, 0, -$length);
                }
            }
        }

        return trim($markdown, "\n\r\0\x0B");
    }

    /**
     * Pass a series of key-value pairs in an array; these will be passed
     * through the config and set.
     * The advantage of this is that it can allow for static use (IE in Laravel).
     * An example being:
     *
     * HtmlConverter::setOptions(['strip_tags' => true])->convert('<h1>test</h1>');
     *
     * Note that this method itself is not static; the static-looking call above
     * presumably relies on a facade-style wrapper around an instance.
     *
     * @param array $options Option names mapped to their new values
     *
     * @return $this
     */
    public function setOptions(array $options)
    {
        $config = $this->getConfig();

        foreach ($options as $key => $option) {
            $config->setOption($key, $option);
        }

        return $this;
    }
}
||
252 |
This check looks for type mismatches where the missing type is `false`. This is usually indicative of an error condition.
Consider the following example. This function either returns a new `DateTime` object or `false`, if there was an error. This is a typical pattern in PHP programming to show that an error has occurred without raising an exception. The calling code should check for this returned `false` before passing on the value to another function or method that may not be able to handle a `false`.