1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
/** |
4
|
|
|
* Converts HTML to Markdown text |
5
|
|
|
* |
6
|
|
|
* @name ElkArte Forum |
7
|
|
|
* @copyright ElkArte Forum contributors |
8
|
|
|
* @license BSD http://opensource.org/licenses/BSD-3-Clause |
9
|
|
|
* |
10
|
|
|
* @version 1.1.7 |
11
|
|
|
* |
12
|
|
|
*/ |
13
|
|
|
|
14
|
|
|
/** |
15
|
|
|
* Converts HTML to Markdown text |
16
|
|
|
*/ |
17
|
|
|
class Html_2_Md |
18
|
|
|
{ |
19
|
|
|
/** |
20
|
|
|
* The value that will hold our dom object |
21
|
|
|
* @var object |
22
|
|
|
*/ |
23
|
|
|
public $doc; |
24
|
|
|
|
25
|
|
|
/** |
26
|
|
|
* The value that will hold if we are using the internal or external parser |
27
|
|
|
* @var boolean |
28
|
|
|
*/ |
29
|
|
|
private $_parser; |
30
|
|
|
|
31
|
|
|
/** |
32
|
|
|
* Line end character |
33
|
|
|
* @var string |
34
|
|
|
*/ |
35
|
|
|
public $line_end = "\n"; |
36
|
|
|
|
37
|
|
|
/** |
38
|
|
|
* Line break character |
39
|
|
|
* @var string |
40
|
|
|
*/ |
41
|
|
|
public $line_break = "\n\n"; |
42
|
|
|
|
43
|
|
|
/** |
44
|
|
|
* Wordwrap output, set to 0 to skip wrapping |
45
|
|
|
* @var int |
46
|
|
|
*/ |
47
|
|
|
public $body_width = 76; |
48
|
|
|
|
49
|
|
|
/** |
50
|
|
|
* Strip remaining tags, set to false to leave them in |
51
|
|
|
* @var boolean |
52
|
|
|
*/ |
53
|
|
|
public $strip_tags = true; |
54
|
|
|
|
55
|
|
|
/** |
56
|
|
|
* Regex to run on plain text to prevent markdown from erroneously converting |
57
|
|
|
* @var string[] |
58
|
|
|
*/ |
59
|
|
|
private $_textEscapeRegex = array(); |
60
|
|
|
|
61
|
|
|
/** |
62
|
|
|
* The passed html string to convert |
63
|
|
|
* @var string |
64
|
|
|
*/ |
65
|
|
|
public $html; |
66
|
|
|
|
67
|
|
|
/** |
68
|
|
|
* The markdown equivalent to the html string |
69
|
|
|
* @var string |
70
|
|
|
*/ |
71
|
|
|
public $markdown; |
72
|
|
|
|
73
|
|
|
/**
 * Tidies the supplied HTML, loads it in to a DOM parser and prepares the
 * patterns used to stop plain text from being read as markdown syntax
 *
 * @param string $html string of html to convert to MD text
 */
public function __construct($html)
{
	// Whitespace that sits only between two tags is dropped up front
	$compact = preg_replace('/(?:(?<=\>)|(?<=\/\>))(\s+)(?=\<\/?)/', '', $html);

	// A ? directly beside an angle bracket is not handled gracefully by the
	// XML parser, so mask those pairs; _clean_markdown() restores them
	$masked_pairs = array(
		'?<' => '|?|<',
		'?>' => '|?|>',
		'>?' => '>|?|',
		'<?' => '<|?|'
	);
	$this->html = strtr($compact, $masked_pairs);

	// Choose the dom parser and load the HTML in to it
	$this->_set_parser();

	// pattern => replacement pairs, run against plain text areas so markdown
	// does not interpret that text as markdown syntax
	// NOTE(review): "(?!<_| )" in the two underscore patterns reads like a
	// mistyped lookbehind "(?<!_| )" - confirm before changing
	$this->_textEscapeRegex = array(
		// Runs that may convert to an hr, --- or - - - etc
		'([-*_])([ ]{0,2}\1){2,}' => '\\\\$0|',
		// **stuff** => \*\*stuff\*\*
		'\*\*([^*\s]+)\*\*' => '\*\*$1\*\*',
		// The flavours of *italic* __italic__ _italic_
		'\*([^*\s]+)\*' => '\*$1\*',
		'__(?! |_)(.+)(?!<_| )__' => '\_\_$1\_\_',
		'_(?! |_)(.+)(?!<_| )_' => '\_$1\_',
		// `code`
		'`(.+)`' => '\`$1\`',
		// Inline and reference style links
		'\[(.+)\](\s*\()' => '\[$1\]$2',
		'\[(.+)\](\s*)\[(.*)\]' => '\[$1\]$2\[$3\]',
	);
}
112
|
|
|
|
113
|
|
|
/**
 * Selects the DOM parser for the class and loads the supplied HTML in to it
 *
 * - Uses DOMDocument when that class exists
 * - Otherwise requires simple_html_dom.php from EXTDIR and uses str_get_html
 */
private function _set_parser()
{
	// true => PHP built in functions, false => external simple html parser
	$this->_parser = class_exists('DOMDocument');

	if (!$this->_parser)
	{
		require_once(EXTDIR . '/simple_html_dom.php');
		$this->doc = str_get_html($this->html, true, true, 'UTF-8', false);

		return;
	}

	// Keep libxml errors internal while the document is loaded
	$previous = libxml_use_internal_errors(true);

	// Set up basic parameters for DomDocument and load the html
	$this->_setupDOMDocument();

	// Put the error handling back to what it was, then flush what was collected
	libxml_use_internal_errors($previous);
	libxml_clear_errors();
}
139
|
|
|
|
140
|
|
|
/**
 * Loads the html body and sends it to the parsing loop to convert all
 * DOM nodes to markup, then cleans, links and (optionally) wraps the result
 *
 * @return string the markdown text followed by a newline and a null character
 */
public function get_markdown()
{
	// Convert every child element found under the body node
	$this->_convert_childNodes($this->_getBody());

	// All elements replaced, pull the converted DOM tree back out as a string
	if ($this->_parser)
	{
		$this->markdown = $this->doc->saveHTML();

		// The internal DOM methods return encoded text, so decode it again
		$this->markdown = html_entity_decode(htmlspecialchars_decode($this->markdown, ENT_QUOTES), ENT_QUOTES, 'UTF-8');
	}
	else
	{
		$this->markdown = $this->doc->save();
	}

	// Clean up any excess spacing etc
	$this->_clean_markdown();

	// Convert any clear text links to MD
	$this->_convert_plaintxt_links();

	// Wordwrap, skipped when body_width is set to 0
	if (!empty($this->body_width))
	{
		$this->_check_line_lenght($this->markdown);
		$this->markdown = $this->_utf8_wordwrap($this->markdown, $this->body_width, $this->line_end);
	}

	// The null character will trigger a base64 version in outbound email
	return $this->markdown . "\n\x00";
}
175
|
|
|
|
176
|
|
|
/**
 * Returns just the body of the HTML, as best possible, so we are not dealing
 * with head and above head markup
 *
 * - Any head node is removed from the document first
 * - The external parser falls back to the html node and then the document root
 *
 * @return object
 */
private function _getBody()
{
	// If there is a head node, then off with his head!
	$this->_clipHead();

	// The built in parser simply hands back the first body element
	if ($this->_parser)
	{
		return $this->doc->getElementsByTagName('body')->item(0);
	}

	// External parser, take body if it is there, html as second choice
	foreach (array('body', 'html') as $selector)
	{
		$found = $this->doc->find($selector, 0);
		if ($found !== null)
		{
			return $found;
		}
	}

	// Neither wrapper exists, work from the root
	return $this->doc->root;
}
210
|
|
|
|
211
|
|
|
/**
 * Removes the first <head> node, if there is one, from the DOM
 */
private function _clipHead()
{
	$head = $this->_parser
		? $this->doc->getElementsByTagName('head')->item(0)
		: $this->doc->find('head', 0);

	// No head, nothing to clip
	if ($head === null)
	{
		return;
	}

	if ($this->_parser)
	{
		$head->parentNode->removeChild($head);
	}
	else
	{
		// The external parser removes a node by blanking its outer text
		$head->outertext = '';
	}
}
229
|
|
|
|
230
|
|
|
/**
 * Creates the DOMDocument and loads the html in to it, wrapped so that the
 * text is processed as UTF-8
 */
private function _setupDOMDocument()
{
	// If the html is already wrapped in body/html tags, keep only the inner text
	$this->html = $this->_returnBodyText($this->html);

	// Our own wrapper, declaring the encoding in both the prolog and a meta tag
	$opening = '<?xml encoding="UTF-8"><html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/></head><body>';
	$closing = '</body></html>';

	// Set up processing details
	$dom = new DOMDocument();
	$dom->preserveWhiteSpace = false;
	$dom->encoding = 'UTF-8';
	$this->doc = $dom;

	// Do what we can to ensure this is processed as UTF-8
	$this->doc->loadHTML($opening . $this->html . $closing);
}
246
|
|
|
|
247
|
|
|
/**
 * Normalize any spacing and excess blank lines that may have been generated,
 * strip leftover tags and restore the content masked in the constructor
 */
private function _clean_markdown()
{
	// We only want the content, no wrappers
	$text = $this->_returnBodyText($this->markdown);

	// Non breakable spaces that may be hiding in here, the nbsp + space pair
	// is replaced first so it collapses to a single space
	$text = str_replace(array("\xC2\xA0\x20", "\xC2\xA0"), ' ', $text);

	// Remove any "bonus" tags
	if ($this->strip_tags)
	{
		$text = strip_tags($text);
	}

	// Put back the content that was "hidden" from the XML parsers
	$restore = array(
		'|?|>' => '?>',
		'|?|<' => '?<',
		'<|?|' => '<?',
		'>|?|' => '>?'
	);
	$text = trim(strtr($text, $restore));

	// Three or more newlines (each optionally followed by a whitespace char) become two
	$text = preg_replace("~(\n(\s)?){3,}~", "\n\n", $text);

	// Runs of three or more whitespace only lines are reduced to two
	// NOTE(review): the patterns match two whitespace characters per line while
	// the replacement shows a single space, confirm the intended spacing
	$blank_runs = array(
		"~(^\s\s\n){3,}~m",
		"~(^\s\s\r?\n){3,}~m",
		"~(^\s\s(?:\r?\n){2}){3,}~m",
	);
	$this->markdown = preg_replace($blank_runs, " \n \n", $text);
}
280
|
|
|
|
281
|
|
|
/**
 * Looks for the text inside of <body> and then <html>, returning just the inner
 * text of the first wrapper that matches, or the text unchanged when neither does
 *
 * @param string $text
 *
 * @return string
 */
private function _returnBodyText($text)
{
	// Checked in order, body wins over html
	$wrappers = array(
		'~<body>(.*)</body>~su',
		'~<html>(.*)</html>~su',
	);

	foreach ($wrappers as $pattern)
	{
		if (preg_match($pattern, $text, $inner))
		{
			return $inner[1];
		}
	}

	return $text;
}
301
|
|
|
|
302
|
|
|
/**
 * For a given node, checks if it is anywhere nested inside of a code block
 * - Prevents converting anything that's inside a code block
 *
 * @param object $node
 * @param boolean $parser flag for internal or external parser
 *
 * @return boolean true when any ancestor of the node is a code tag
 */
private static function _has_parent_code($node, $parser)
{
	// The internal parser exposes parentNode/nodeName as properties, the
	// external parser as methods
	$parent = $parser ? $node->parentNode : $node->parentNode();

	// Walk up the tree, the loop ends once there is no further parent
	while ($parent)
	{
		// Anywhere nested inside a code block we don't render tags
		$tag = $parser ? $parent->nodeName : $parent->nodeName();
		if ($tag === 'code')
		{
			return true;
		}

		// Back out another level, until we are done
		$parent = $parser ? $parent->parentNode : $parent->parentNode();
	}

	return false;
}
334
|
|
|
|
335
|
|
|
/**
 * Get the nesting level when inside a list
 *
 * - Counts every ul / ol found among the ancestors of the node
 *
 * @param object $node
 * @param boolean $parser flag for internal or external parser
 *
 * @return int
 */
private static function _has_parent_list($node, $parser)
{
	$levels = 0;

	// Properties for the internal parser, methods for the external one
	$ancestor = $parser ? $node->parentNode : $node->parentNode();
	while ($ancestor)
	{
		// Each list tag passed on the way up is one more level of nesting
		$name = $parser ? $ancestor->nodeName : $ancestor->nodeName();
		if ($name === 'ul' || $name === 'ol')
		{
			$levels++;
		}

		// Back out another level
		$ancestor = $parser ? $ancestor->parentNode : $ancestor->parentNode();
	}

	return $levels;
}
364
|
|
|
|
365
|
|
|
/**
 * Traverse each node to its base, then convert tags to markup on the way back out
 *
 * - Nodes nested inside a code block are left untouched
 *
 * @param object $node
 */
private function _convert_childNodes($node)
{
	// Nothing inside a code block gets converted
	if (self::_has_parent_code($node, $this->_parser))
	{
		return;
	}

	// Children first, so the deepest nodes are converted before their parents
	if ($node->hasChildNodes())
	{
		// The count is taken once, before any child is converted
		$total = $this->_parser ? $node->childNodes->length : count($node->childNodes());

		$index = 0;
		while ($index < $total)
		{
			$child = $this->_parser ? $node->childNodes->item($index) : $node->childNodes($index);
			$this->_convert_childNodes($child);
			$index++;
		}
	}

	// All children are done, now convert this node to markdown
	$this->_convert_to_markdown($node);
}
391
|
|
|
|
392
|
|
|
/**
 * Convert the supplied node into its markdown equivalent and swap the result
 * in to the document in place of the node
 *
 * - Supports *some* markdown extra tags, namely: table, abbr & dl in a limited fashion
 * - Table parts and lightbox anchors are skipped, they are left in the document as is
 *
 * @param object $node
 */
private function _convert_to_markdown($node)
{
	// Marker meaning "leave this node alone"
	$skip = '~`skip`~';

	// HTML tag we are dealing with
	$tag = $this->_get_name($node);

	// Based on the tag, determine how to convert
	switch ($tag)
	{
		case 'a':
			$is_lightbox = $node->getAttribute('data-lightboximage') || $node->getAttribute('data-lightboxmessage');
			$markdown = $is_lightbox ? $skip : $this->line_end . $this->_convert_anchor($node) . $this->line_end;
			break;
		case 'abbr':
			$markdown = $this->_convert_abbr($node);
			break;
		case 'b':
		case 'strong':
			$markdown = '**' . $this->_get_value($node) . '**';
			break;
		case 'blockquote':
			$markdown = $this->_convert_blockquote($node);
			break;
		case 'br':
			// DomDocument strips empty lines, the two nbsp characters prevent that
			$markdown = "\xC2\xA0\xC2\xA0" . $this->line_break;
			break;
		case 'center':
			$markdown = $this->line_end . $this->_get_value($node) . $this->line_end;
			break;
		case 'code':
			$markdown = $this->_convert_code($node);
			break;
		case 'dt':
			$markdown = str_replace(array("\n", "\r", "\n\r"), '', $this->_get_value($node)) . $this->line_end;
			break;
		case 'dd':
			$markdown = ': ' . $this->_get_value($node) . $this->line_break;
			break;
		case 'dl':
			$markdown = trim($this->_get_value($node)) . $this->line_break;
			break;
		case 'em':
		case 'i':
			$markdown = '_' . $this->_get_value($node) . '_';
			break;
		case 'hr':
			$markdown = $this->line_end . '---' . $this->line_end;
			break;
		case 'h1':
		case 'h2':
		case 'h3':
		case 'h4':
		case 'h5':
		case 'h6':
			$markdown = $this->_convert_header($tag, $this->_get_value($node));
			break;
		case 'img':
			$markdown = $this->_convert_image($node) . $this->line_end;
			break;
		case 'ol':
		case 'ul':
			$markdown = rtrim($this->_get_value($node)) . $this->line_break;
			break;
		case 'li':
			$markdown = $this->_convert_list($node);
			break;
		case 'p':
			if ($node->hasChildNodes())
			{
				$markdown = rtrim($this->_get_value($node)) . $this->line_break;
			}
			else
			{
				// A childless paragraph is put on one line and has its text escaped
				$markdown = $this->_escape_text(str_replace("\n", ' ', $this->_get_value($node)) . $this->line_break);
			}
			break;
		case 'pre':
			$markdown = $this->_get_value($node) . $this->line_break;
			break;
		case 'div':
			$markdown = $this->line_end . $this->_get_value($node) . $this->line_end;

			// Only a childless div has its text escaped
			if (!$node->hasChildNodes())
			{
				$markdown = $this->_escape_text($markdown);
			}
			break;
		//case '#text':
		//	$markdown = $this->_escape_text($this->_get_value($node));
		//	break;
		case 'title':
			$markdown = '# ' . $this->_get_value($node) . $this->line_break;
			break;
		case 'table':
			$markdown = $this->_convert_table($node) . $this->line_break;
			break;
		case 'th':
		case 'tr':
		case 'td':
		case 'tbody':
		case 'tfoot':
		case 'thead':
			// Just skip over these as we handle them in the table tag itself
			$markdown = $skip;
			break;
		case 'root':
		case 'span':
		case 'body':
			// Remove these tags and simply replace with the text inside the tags
			$markdown = $this->_get_innerHTML($node);
			break;
		default:
			// Don't know you or text, so just preserve whats there
			$markdown = $this->_get_outerHTML($node);
	}

	// Skipped nodes stay in the document exactly as they are
	if ($markdown === $skip)
	{
		return;
	}

	if ($this->_parser)
	{
		// A new text node holding our markdown takes the place of the original node
		$node->parentNode->replaceChild($this->doc->createTextNode($markdown), $node);
	}
	else
	{
		$node->outertext = $markdown;
	}
}
531
|
|
|
|
532
|
|
|
/**
 * Converts <abbr> tags to markdown (extra)
 *
 * html: <abbr title="Hyper Text Markup Language">HTML</abbr>
 * md:   *[HTML]: Hyper Text Markup Language
 *
 * @param object $node
 * @return string the abbreviation definition, or an empty string when there is no title
 */
private function _convert_abbr($node)
{
	$title = $node->getAttribute('title');
	$value = $this->_get_value($node);

	// Without a title there is nothing to define
	if (empty($title))
	{
		return '';
	}

	return '*[' . $value . ']: ' . $title . $this->line_break;
}
557
|
|
|
|
558
|
|
|
/**
 * Converts <a> tags to markdown
 *
 * html: <a href='http://somesite.com' title='Title'>Awesome Site</a>
 * md: [Awesome Site](http://somesite.com "Title")
 *
 * @param object $node
 * @return string
 */
private function _convert_anchor($node)
{
	global $txt;

	$raw_href = $node->getAttribute('href');

	// Encode the characters that would collide with the markdown link syntax
	// NOTE(review): '&' => '%26a' leaves an "a" after the encoded ampersand,
	// confirm this is intended
	$href = strtr(htmlspecialchars_decode($raw_href), array('(' => '%28', ')' => '%29', '[' => '%5B', ']' => '%5D', '&' => '%26a'));

	$title = $node->getAttribute('title');
	$class = $node->getAttribute('class');
	$value = $this->_get_value($node);

	// Provide a more compact [name] if none is given, or it only repeats the href
	if ($value == $raw_href || empty($value))
	{
		$value = empty($title) ? $txt['link'] : $title;
	}

	// Special processing just for our own footnotes, they stay plain text
	if ($class === 'target' || $class === 'footnote_return')
	{
		return $value;
	}

	// A titled link carries the title in quotes after the href
	if (!empty($title))
	{
		return '[' . $value . '](' . $href . ' "' . $title . '")';
	}

	return '[' . $value . ']( ' . $href . ' )';
}
600
|
|
|
|
601
|
|
|
/**
 * Converts blockquotes to markdown > quote style
 *
 * html: <blockquote>quote</blockquote>
 * md: > quote
 *
 * @param object $node
 * @return string the quoted lines followed by an empty line
 */
private function _convert_blockquote($node)
{
	// All the contents of this block quote, split line by line
	$lines = preg_split('~\r\n|\r|\n~', trim($this->_get_value($node)));

	// Each line loses its leading tabs and gets a '> ' in front of it,
	// just like email quotes really
	foreach ($lines as $index => $line)
	{
		$lines[$index] = '> ' . ltrim($line, "\t");
	}

	// One line end closes the last quoted line, the second adds the empty line
	return implode($this->line_end, $lines) . $this->line_end . $this->line_end;
}
631
|
|
|
|
632
|
|
|
/**
 * Converts code tags to markdown span `code` or block code
 * Converts single line code to inline tick mark
 * Converts multi line to 4 space indented code
 *
 * html: <code>code</code>
 * md: `code`
 *
 * Side effect: strip_tags is switched off for the rest of the conversion
 * when the code itself contains html tags
 *
 * @param object $node
 * @return string
 */
private function _convert_code($node)
{
	$value = $this->_get_innerHTML($node);

	// Non breaking spaces become plain spaces, whether they arrive as the
	// entity or as the raw UTF-8 character
	// NOTE(review): the source as received showed a space-for-space replace
	// here, presumably a decoded &nbsp; entity - confirm
	$value = str_replace(array('&nbsp;', "\xC2\xA0"), ' ', $value);

	// If we have a multi line code block, we are working outside to in, and need to convert the br's ourselves
	$value = preg_replace('~<br( /)?' . '>~', "\n", $value);

	// If there are html tags in this code block, we need to disable strip tags
	// This is NOT the ideal way to handle this, needs something along the lines of preparse and unpreparse.
	if ($this->strip_tags && preg_match('~<[^<]+>~', $value))
	{
		$this->strip_tags = false;
	}

	// Get the number of lines of code that we have
	$lines = preg_split('~\r\n|\r|\n~', $value);
	$total = count($lines);

	// If there's more than one line of code, use leading four space syntax
	if ($total > 1)
	{
		// Compared against '' rather than empty() so a line holding just "0"
		// is kept as code and not mistaken for a blank line
		$first_blank = trim($lines[0]) === '';
		$last_blank = trim($lines[$total - 1]) === '';

		// Remove any leading and trailing blank lines
		if ($first_blank)
		{
			array_shift($lines);
		}
		if ($last_blank)
		{
			array_pop($lines);
		}

		// Convert what remains
		$markdown = '';
		foreach ($lines as $line)
		{
			// Adjust the word wrapping since this has code tags, leave it up to
			// the email client to mess these up ;)
			$this->_check_line_lenght($markdown, 5);

			$markdown .= str_repeat(' ', 4) . $line . $this->line_end;
		}

		// The parser will encode, but we don't want that for our code block
		if ($this->_parser)
		{
			$markdown = html_entity_decode($markdown, ENT_QUOTES, 'UTF-8');
		}

		return $markdown;
	}

	// Single line, back tick and move on
	$code = $lines[0];
	$fence = '`';

	// Account for backticks in the single line code itself
	$ticks = $this->_has_ticks($node, $value);
	if (!empty($ticks))
	{
		$fence = $ticks;

		// If the ticks were at the start/end of the word space it off
		if ($code !== '' && ($code[0] == '`' || substr($code, -1) == '`'))
		{
			$code = ' ' . $code . ' ';
		}
	}

	// The parser will encode, but we don't want that inside the ticks
	if ($this->_parser)
	{
		$code = html_entity_decode($code, ENT_QUOTES, 'UTF-8');
	}

	return $fence . $code . $fence;
}
717
|
|
|
|
718
|
|
|
/**
 * Converts <h1> and <h2> headers to markdown-style headers in setex style,
 * all other headers are returned as atx style ### h3
 *
 * html: <h1>header</h1>
 * md: header
 *     ======
 *
 * html: <h3>header</h3>
 * md: ### header
 *
 * @param string $level the header tag name, h1 through h6
 * @param string $content
 * @return string
 */
private function _convert_header($level, $content)
{
	// h3 => 3, etc
	$depth = (int) ltrim($level, 'h');

	// atx style, one # for each level
	if ($depth >= 3)
	{
		return str_repeat('#', $depth) . ' ' . $content . $this->line_break;
	}

	// setex style, underline the content to its full length, = for h1 and - for h2
	$rule = str_repeat(($depth === 1) ? '=' : '-', Util::strlen($content));

	return $content . $this->line_end . $rule . $this->line_break;
}
750
|
|
|
|
751
|
|
|
/**
 * Converts <img> tags to markdown
 *
 * html: <img src='source' alt='alt' title='title' />
 * md: ![alt](source "title")
 *
 * @param object $node
 * @return string
 */
private function _convert_image($node)
{
	$src = $node->getAttribute('src');
	$alt = $node->getAttribute('alt');
	$title = $node->getAttribute('title');

	// The title, when there is one, follows the source in quotes, the same
	// form _convert_anchor uses for titled links
	if (!empty($title))
	{
		return '![' . $alt . '](' . $src . ' "' . $title . '")';
	}

	return '![' . $alt . '](' . $src . ')';
}
777
|
|
|
|
778
|
|
|
/**
 * Converts the items of ordered <ol> and unordered <ul> lists to markdown syntax
 *
 * html: <ul><li>one</li></ul>
 * md: * one
 *
 * - Nested items are indented with one tab for each level below the first
 *
 * @param object $node the li node
 * @return string
 */
private function _convert_list($node)
{
	$list_type = $this->_parser ? $node->parentNode->nodeName : $node->parentNode()->nodeName();
	$value = $this->_get_value($node);

	// A value that already ends in whitespace is "loose" and gets no extra line end
	$loose = rtrim($value) !== $value;

	// The outermost list is depth 0
	$indent = str_repeat("\t", max(0, $this->_has_parent_list($node, $this->_parser) - 1));

	// Unordered lists get a simple bullet, everything else needs its number
	$marker = ($list_type === 'ul') ? '* ' : $this->_get_list_position($node) . '. ';

	return $indent . $marker . $value . ($loose ? '' : $this->line_end);
}
809
|
|
|
|
810
|
|
|
/**
 * Converts tables tags to markdown extra table syntax
 *
 * - Have to build top down vs normal inside out due to needing col numbers and widths
 * - Only well formed tables are converted: there must be th cells and they must
 * sit directly inside a tr, anything else returns an empty string
 *
 * @param object $node
 * @return string
 */
private function _convert_table($node)
{
	$table_heading = $node->getElementsByTagName('th');
	$first_th = $this->_get_item($table_heading, 0);

	// No heading cells, no markdown table
	if ($first_th === null)
	{
		return '';
	}

	// We only markdown well formed tables ...
	$th_parent = $this->_parser ? $first_th->parentNode->nodeName : $first_th->parentNode()->nodeName();
	if ($th_parent !== 'tr')
	{
		return '';
	}

	// Set up for a markdown table, then storm the castle
	$align = array();
	$value = array();
	$width = array();
	$max = array();
	$header = array();
	$rows = array();

	// Find out how many columns we are dealing with
	$th_num = $this->_get_length($table_heading);

	for ($col = 0; $col < $th_num; $col++)
	{
		// Get the align and text for each th (html5 this is no longer valid)
		$th = $this->_get_item($table_heading, $col);
		$text = $this->_get_value($th);

		// strtolower always hands back a string, so a missing attribute shows
		// up as '' and has to be checked with empty() to get the left default
		$align_value = ($th !== null) ? strtolower($th->getAttribute('align')) : '';
		$align[0][$col] = empty($align_value) ? 'left' : $align_value;
		$value[0][$col] = $text;
		$width[0][$col] = Util::strlen($text);

		// Seed the max col width
		$max[$col] = $width[0][$col];
	}

	// Get all of the rows
	$table_rows = $node->getElementsByTagName('tr');
	$num_rows = $this->_get_length($table_rows);
	for ($row = 1; $row < $num_rows; $row++)
	{
		// Start at row 1 and get all of the td's in this row
		$row_data = $this->_get_item($table_rows, $row)->getElementsByTagName('td');

		// Simply use the th count as the number of columns, if its not right its not markdown-able anyway
		for ($col = 0; $col < $th_num; $col++)
		{
			// Get the align and text for each td in this row, a missing td
			// is treated as an empty, left aligned cell
			$td = $this->_get_item($row_data, $col);
			$text = $this->_get_value($td);
			$align_value = ($td !== null) ? strtolower($td->getAttribute('align')) : '';
			$align[$row][$col] = empty($align_value) ? 'left' : $align_value;
			$value[$row][$col] = $text;
			$width[$row][$col] = Util::strlen($text);

			// Keep track of the longest col cell as we go
			if ($width[$row][$col] > $max[$col])
			{
				$max[$col] = $width[$row][$col];
			}
		}
	}

	// Done collecting data, we can rebuild it, we can make it better than it was. Better...stronger...faster
	for ($row = 0; $row < $num_rows; $row++)
	{
		$temp = array();
		for ($col = 0; $col < $th_num; $col++)
		{
			// Build the header row once, the max widths are final by now
			if ($row === 0)
			{
				$header[] = str_repeat('-', $max[$col]);
			}

			// Build the data for each col, align/pad as needed
			$temp[] = $this->_align_row_content($align[$row][$col], $width[$row][$col], $value[$row][$col], $max[$col]);
		}

		// Join it all up so we have a nice looking row
		$rows[] = '| ' . implode(' | ', $temp) . ' |';

		// Stuff in the header after the th row
		if ($row === 0)
		{
			$rows[] = '| ' . implode(' | ', $header) . ' | ';
		}
	}

	// Adjust the word wrapping since this has a table, will get mussed by email anyway
	$this->_check_line_lenght($rows[1], 2);

	// Return what we did so it can be swapped in
	return implode($this->line_end, $rows);
}
914
|
|
|
|
915
|
|
|
/**
 * Helper function for getting a node object
 *
 * - Internal parser: the list is queried through its item() method
 * - External parser: the list is treated as an array and indexed directly
 *
 * @param object|array $node list of nodes to pick from
 * @param int $item zero based index of the wanted node
 * @return object|null the node found at that index, null when the array has no such index
 */
private function _get_item($node, $item)
{
	if ($this->_parser)
	{
		return $node->item($item);
	}

	// Callers compare the result against null (short table rows etc), so a missing
	// index has to come back as a quiet null and not as an undefined offset notice
	return isset($node[$item]) ? $node[$item] : null;
}
933
|
|
|
|
934
|
|
|
/**
 * Helper function for getting a node length
 *
 * - Internal parser: reads the length property of the node list
 * - External parser: counts the entries of the node array
 *
 * @param object|array $node
 * @return int
 */
private function _get_length($node)
{
	return $this->_parser ? $node->length : count($node);
}
951
|
|
|
|
952
|
|
|
/**
 * Helper function for getting a node value
 *
 * - A null node yields an empty string
 * - Internal parser: returns the nodeValue as is
 * - External parser: returns the innertext with html entities decoded
 *
 * @param object|null $node
 * @return string
 */
private function _get_value($node)
{
	// No node, no value
	if ($node === null)
	{
		return '';
	}

	if (!$this->_parser)
	{
		// Two passes, special chars first and then the remaining named entities
		$decoded = htmlspecialchars_decode($node->innertext, ENT_QUOTES);

		return html_entity_decode($decoded, ENT_QUOTES, 'UTF-8');
	}

	return $node->nodeValue;
}
974
|
|
|
|
975
|
|
|
/**
 * Helper function for getting a node name
 *
 * - A null node yields an empty string
 * - Internal parser: reads the nodeName property
 * - External parser: calls the nodeName() method
 *
 * @param object|null $node
 * @return string
 */
private function _get_name($node)
{
	// Nothing to name
	if ($node === null)
	{
		return '';
	}

	return $this->_parser ? $node->nodeName : $node->nodeName();
}
997
|
|
|
|
998
|
|
|
/**
 * Helper function for creating ol's
 *
 * - Returns the absolute number of an <li> inside an <ol>
 * - Only element children are counted, so text (whitespace) or comment nodes
 * sitting between the items do not shift the numbering
 *
 * @param object $node the list item to locate
 * @return int 1 based position of the node in its parent, 1 if it was not found
 */
private function _get_list_position($node)
{
	// Get all of the nodes inside this parent
	$list_node = $this->_parser ? $node->parentNode : $node->parentNode();
	$siblings = $this->_parser ? $list_node->childNodes : $list_node->childNodes();

	// Walk the children in order and find where we are in this list
	$position = 1;
	foreach ($siblings as $sibling)
	{
		// The DOM child list also holds text/comment nodes, those are not list items
		if ($this->_parser && $sibling->nodeType !== XML_ELEMENT_NODE)
		{
			continue;
		}

		// Found ourselves, no need to look any further
		if ($sibling === $node)
		{
			return $position;
		}

		$position++;
	}

	// Not found under its own parent, fall back to the first position
	return 1;
}
1026
|
|
|
|
1027
|
|
|
/**
 * Helper function for table creation
 *
 * - Builds td's to a given width, aligned as needed
 * - Padding is done with str_repeat on the supplied width, not with str_pad,
 * so the caller decides how the content length is measured
 * - Content already wider than $max is returned without padding
 *
 * @param string $align one of left, right or center, anything else is treated as left
 * @param int $width length of $content as measured by the caller
 * @param string $content the cell text
 * @param int $max the width the cell should be padded out to
 * @return string the padded cell text
 */
private function _align_row_content($align, $width, $content, $max)
{
	// Never hand str_repeat a negative count should a cell be wider than its column
	$padding = max(0, (int) $max - (int) $width);

	switch ($align)
	{
		case 'right':
			return str_repeat(' ', $padding) . $content;
		case 'center':
			// Any odd space goes to the right hand side
			$left = (int) floor($padding / 2);

			return str_repeat(' ', $left) . $content . str_repeat(' ', $padding - $left);
		case 'left':
		default:
			return $content . str_repeat(' ', $padding);
	}
}
1059
|
|
|
|
1060
|
|
|
/**
 * Gets the inner html of a node
 *
 * - Internal parser: the node is copied in to a scratch document, saved as html
 * and its own opening / closing tag is then stripped from the result
 * - External parser: the innertext of the node is returned
 *
 * @param DOMNode|object $node
 * @return string
 */
private function _get_innerHTML($node)
{
	if (!$this->_parser)
	{
		return $node->innertext;
	}

	$doc = new DOMDocument();
	$doc->appendChild($doc->importNode($node, true));
	$html = trim($doc->saveHTML());

	// The node name is placed in the pattern, quote it so it is only ever read literally
	$tag = preg_quote($node->nodeName, '@');

	return preg_replace('@^<' . $tag . '[^>]*>|</' . $tag . '>$@', '', $html);
}
1082
|
|
|
|
1083
|
|
|
/**
 * Gets the outer html of a node
 *
 * - External parser: returns the outertext of the node
 * - Internal parser on PHP 5.3.6 or later: saves the node straight from the document
 * - Internal parser on anything older: copies the node to a scratch document,
 * saves that and reduces the result to the body text
 *
 * @param DOMNode|object $node
 * @return string
 */
private function _get_outerHTML($node)
{
	// The external parser hands over the markup directly
	if (!$this->_parser)
	{
		return $node->outertext;
	}

	// New enough to save a single node from our document
	if (version_compare(PHP_VERSION, '5.3.6', '>='))
	{
		return htmlspecialchars_decode($this->doc->saveHTML($node));
	}

	// @todo remove when 5.3.6 min
	$fragment = new DOMDocument();
	$fragment->appendChild($fragment->importNode($node, true));

	// We just want the html of the inserted node, it *may* be wrapped
	$markup = $this->_returnBodyText($fragment->saveHTML());

	// Clean it up
	$markup = rtrim($markup, "\n");

	return html_entity_decode(htmlspecialchars_decode($markup, ENT_QUOTES), ENT_QUOTES, 'UTF-8');
}
1118
|
|
|
|
1119
|
|
|
/**
 * Escapes markup looking text in html to prevent accidental assignment
 *
 * <p>*stuff*</p> should not convert to *stuff* but \*stuff\* since its not to
 * be converted by md to html as <strong>stuff</strong>
 *
 * - Runs every pattern => replacement pair of the escape table, in order
 *
 * @param string $value
 * @return string
 */
private function _escape_text($value)
{
	$escaped = $value;

	// Each pass works on the result of the one before it
	foreach (array_keys($this->_textEscapeRegex) as $pattern)
	{
		$escaped = preg_replace('~' . $pattern . '~', $this->_textEscapeRegex[$pattern], $escaped);
	}

	return $escaped;
}
1138
|
|
|
|
1139
|
|
|
/**
 * If inline code contains backticks ` as part of its content, we need to wrap them so
 * when markdown is run we don't interpret the ` as additional code blocks
 *
 * - Directly inside a pre the passed value is handed back untouched
 * - Otherwise returns the shortest run of backticks that does not appear in the
 * value, or an empty string when the value has no backticks at all
 *
 * @param object $node
 * @param string $value
 * @return string
 */
private function _has_ticks($node, $value)
{
	$parent_name = $this->_parser ? $node->parentNode->nodeName : $node->parentNode()->nodeName();

	// Inside of a pre, we don't do anything
	if ($parent_name === 'pre')
	{
		return $value;
	}

	// No backticks in the value, no fence needed
	if (!preg_match_all('~`+~', $value, $runs))
	{
		return '';
	}

	// e.g. <code>`bla`</code> will become `` `bla` `` so markdown will deal with it properly
	// Grow the fence until it differs from every run of ticks found in the value
	$fence = '`';
	while (in_array($fence, $runs[0], true))
	{
		$fence .= '`';
	}

	return $fence;
}
1180
|
|
|
|
1181
|
|
|
/**
 * Helper function to adjust wrapping width for long-ish links
 *
 * - Widens body_width to the longest line found (plus the buffer) so that
 * line is not broken when the output is wrapped
 * - Does nothing when body_width is 0, as that means wrapping is switched off
 *
 * @param string $markdown the text whose lines are measured
 * @param bool|int $buffer extra characters to allow for on each line
 */
private function _check_line_lenght($markdown, $buffer = false)
{
	// A width of 0 means no wrapping, raising it here would switch wrapping back on
	if (empty($this->body_width))
	{
		return;
	}

	$padding = !empty($buffer) ? (int) $buffer : 0;

	// Some Lines can be very long and if we wrap them they break
	foreach (explode($this->line_end, $markdown) as $line)
	{
		$line_strlen = Util::strlen($line) + $padding;
		if ($line_strlen > $this->body_width)
		{
			$this->body_width = $line_strlen;
		}
	}
}
1200
|
|
|
|
1201
|
|
|
/**
 * Helper function to find and wrap plain text links in MD format
 *
 * - Looks for http(s):// and www links that are not already inside a ]( ) markdown link
 * - Each one found is rewritten by _plaintxt_callback
 * - Should the regex engine fail, the markdown is left as it was
 */
private function _convert_plaintxt_links()
{
	$converted = preg_replace_callback('/((?<!\]\( |\]\()https?:\/\/|(?<!\]\( |\]\(|:\/\/)www)[-\p{L}0-9+&@#\/%?=~_|!:,.;]*[\p{L}0-9+&@#\/%=~_|]/iu', array($this, '_plaintxt_callback'), $this->markdown);

	// null is how a PCRE error is reported (malformed UTF-8 with the u modifier for one),
	// better to keep the unlinked text than to lose the whole message
	if ($converted !== null)
	{
		$this->markdown = $converted;
	}
}
1208
|
|
|
|
1209
|
|
|
/**
 * Callback function used by _convert_plaintxt_links for plain link to MD
 *
 * - Places the link on a new line as [link text]( url ), the link text
 * being the 'link' language string
 *
 * @param string[] $matches the regex matches, index 0 is the full url
 * @return string
 */
private function _plaintxt_callback($matches)
{
	global $txt;

	$url = trim($matches[0]);

	return $this->line_end . '[' . $txt['link'] . ']( ' . $url . ' )';
}
1223
|
|
|
|
1224
|
|
|
/**
 * Breaks a string up so its no more than width characters long
 *
 * - Will break at word boundaries
 * - If no natural space is found will break mid-word
 * - Blank lines are kept, they are what separates markdown paragraphs
 * - Continuation lines of a quote (a line starting with >) get a "> " prefix
 * - A width below 1 returns the string as it was passed
 *
 * @param string $string
 * @param int $width
 * @param string $break
 * @return string
 */
private function _utf8_wordwrap($string, $width = 75, $break = "\n")
{
	$width = (int) $width;

	// Nothing sensible to wrap to, and a zero width would never use up the string
	if ($width < 1)
	{
		return $string;
	}

	$lines = array();

	foreach (explode($break, $string) as $line)
	{
		$line = (string) $line;

		// Keep the empty lines where they are
		if ($line === '')
		{
			$lines[] = '';
			continue;
		}

		$in_quote = $line[0] === '>';

		// Checked against '' since empty() would also throw away a remaining "0"
		while ($line !== '')
		{
			// Get the next #width characters before a break (space, punctuation tab etc)
			if (preg_match('~^(.{1,' . $width . '})(?:\s|$|,|\.)~u', $line, $matches))
			{
				// Add the #width to the output and set up for the next pass
				$lines[] = ($in_quote && $matches[1][0] !== '>' ? '> ' : '') . ltrim($matches[1], ' ');
				$line = (string) Util::substr($line, Util::strlen($matches[1]));
			}
			// Humm just a long word with no place to break so we simply cut it after width characters
			else
			{
				$lines[] = ($in_quote && $line[0] !== '>' ? '> ' : '') . Util::substr($line, 0, $width);
				$line = (string) Util::substr($line, $width);
			}
		}
	}

	// Join all the shortened sections up on our break characters
	return implode($break, $lines);
}
1264
|
|
|
} |
1265
|
|
|
|
Our type inference engine has found a suspicious assignment of a value to a property. This check raises an issue when a value that can be of a mixed type is assigned to a property that is type hinted more strictly.
For example, imagine you have a variable $accountId that can either hold an Id object or false (if there is no account id yet). Your code now assigns that value to the id property of an instance of the Account class. This class holds a proper account, so the id value must no longer be false. Either this assignment is in error or a type check should be added for that assignment.