Completed
Push — patch_1-0-10 ( 7a6c4f...3de995 )
by Stephen
07:05
created

Html_2_Md::_clean_markdown()   B

Complexity

Conditions 2
Paths 2

Size

Total Lines 28
Code Lines 16

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 2
eloc 16
nc 2
nop 0
dl 0
loc 28
rs 8.8571
c 0
b 0
f 0
1
<?php
2
3
/**
4
 * Converts HTML to Markdown text
5
 *
6
 * @name      ElkArte Forum
7
 * @copyright ElkArte Forum contributors
8
 * @license   BSD http://opensource.org/licenses/BSD-3-Clause
9
 *
10
 * @version 1.0.7
11
 *
12
 */
13
14
if (!defined('ELK'))
15
	die('No access...');
16
17
/**
18
 * Converts HTML to Markdown text
19
 */
20
class Html_2_Md
21
{
22
	/**
23
	 * The value that will hold our dom object
24
	 * @var object
25
	 */
26
	public $doc;
27
28
	/**
29
	 * The value that will hold if we are using the internal or external parser
30
	 * @var boolean
31
	 */
32
	private $_parser;
33
34
	/**
35
	 * Line end character
36
	 * @var string
37
	 */
38
	public $line_end = "\n";
39
40
	/**
41
	 * Line break character
42
	 * @var string
43
	 */
44
	public $line_break = "\n\n";
45
46
	/**
47
	 * Wordwrap output, set to 0 to skip wrapping
48
	 * @var int
49
	 */
50
	public $body_width = 76;
51
52
	/**
53
	 * Strip remaining tags, set to false to leave them in
54
	 * @var boolean
55
	 */
56
	public $strip_tags = true;
57
58
	/**
	 * Regex patterns (array keys) and their replacement strings (array values)
	 * that are run on plain text to prevent markdown from erroneously converting it
	 * @var array<string,string>
	 */
	private $_textEscapeRegex = array();
63
64
	/**
65
	 * The passed html string to convert
66
	 * @var string
67
	 */
68
	public $html;
69
70
	/**
71
	 * The markdown equivalent to the  html string
72
	 * @var string
73
	 */
74
	public $markdown;
75
76
	/**
	 * Gets everything started using the built in or external parser
	 *
	 * - Removes whitespace found between html tags
	 * - Masks question mark / angle bracket pairs that the XML parser trips over,
	 *   they are restored in _clean_markdown()
	 * - Loads the html in to the available DOM parser
	 * - Defines the pattern => replacement map used by _escape_text()
	 *
	 * @param string $html string of html to convert to MD text
	 */
	public function __construct($html)
	{
		// Up front, remove whitespace between html tags
		$this->html = preg_replace('/(?:(?<=\>)|(?<=\/\>))(\s+)(?=\<\/?)/', '', $html);

		// The XML parser will not deal gracefully with these
		$this->html = strtr($this->html, array(
			'?<' => '|?|&lt',
			'?>' => '|?|&gt',
			'>?' => '&gt|?|',
			'<?' => '&lt|?|'
		));

		// Set the dom parser to use and load the HTML to the parser
		$this->_set_parser();

		// Initialize the regex array to escape text areas so markdown does
		// not interpret plain text as markdown syntax.  The array is keyed by the
		// regex (without delimiters) and the value is the preg replacement string.
		$this->_textEscapeRegex = array(
			// Things that may convert to an hr --- or - - - etc
			'([-*_])([ ]{0,2}\1){2,}' => '\\\\$0|',
			// or **stuff** => \*\*stuff\*\*
			'\*\*([^*\s]+)\*\*' => '\*\*$1\*\*',
			// or versions of *italic* __italic__ _italic_
			'\*([^*\s]+)\*' => '\*$1\*',
			// NOTE(review): (?!<_| ) is a negative lookahead for "<_" or a space, it
			// looks like a negative lookbehind (?<!_| ) may have been intended - confirm
			'__(?! |_)(.+)(?!<_| )__' => '\_\_$1\_\_',
			'_(?! |_)(.+)(?!<_| )_' => '\_$1\_',
			// nor `code`
			'`(.+)`' => '\`$1\`',
			// or links
			'\[(.+)\](\s*\()' => '\[$1\]$2',
			'\[(.+)\](\s*)\[(.*)\]' => '\[$1\]$2\[$3\]',
		);
	}
115
116
	/**
	 * Set the DOM parser for class, loads the supplied HTML
	 *
	 * - Uses DOMDocument when the class is available, $this->_parser is then true
	 * - Otherwise loads simple_html_dom.php from EXTDIR and $this->_parser is false
	 */
	private function _set_parser()
	{
		// Using PHP built in functions ...
		if (class_exists('DOMDocument'))
		{
			$this->_parser = true;

			// Collect libxml errors internally while loading, remembering the prior setting
			$previous = libxml_use_internal_errors(true);

			// Set up basic parameters for DomDocument, including silencing structural errors
			$this->_setupDOMDocument();

			// Set the error handle back to what it was, and flush
			libxml_use_internal_errors($previous);
			libxml_clear_errors();
		}
		// Or using the external simple html parser
		else
		{
			$this->_parser = false;
			require_once(EXTDIR . '/simple_html_dom.php');
			$this->doc = str_get_html($this->html, true, true, 'UTF-8', false);
		}
	}
142
143
	/**
	 * Loads the html body and sends it to the parsing loop to convert all
	 * DOM nodes to markup
	 *
	 * - The result is also stored in $this->markdown
	 * - Output is word wrapped to $this->body_width unless that is empty
	 *
	 * @return string the markdown text
	 */
	public function get_markdown()
	{
		// For this html node, find all child elements and convert
		$body = $this->_getBody();
		$this->_convert_childNodes($body);

		// Done replacing HTML elements, now get the converted DOM tree back into a string
		$this->markdown = ($this->_parser) ? $this->doc->saveHTML() : $this->doc->save();

		// Using the internal DOM methods requires we need to do a little extra work
		if ($this->_parser)
		{
			$this->markdown = html_entity_decode(htmlspecialchars_decode($this->markdown, ENT_QUOTES), ENT_QUOTES, 'UTF-8');
		}

		// Clean up any excess spacing etc
		$this->_clean_markdown();

		// Wordwrap?
		if (!empty($this->body_width))
			$this->markdown = $this->_utf8_wordwrap($this->markdown, $this->body_width, $this->line_end);

		return $this->markdown;
	}
171
172
	/**
	 * Returns just the body of the HTML, as best possible, so we are not dealing with head
	 * and above head markup
	 *
	 * - Any head node is removed from the document first
	 * - With the external parser, falls back to the html node and then the document root
	 *
	 * @return object the node to start the conversion from
	 */
	private function  _getBody()
	{
		// If there is a head node, then off with his head!
		$this->_clipHead();

		// The body of the HTML is where its at.
		if ($this->_parser)
		{
			$body = $this->doc->getElementsByTagName('body')->item(0);
		}
		else
		{
			if ($this->doc->find('body', 0) !== null)
			{
				$body = $this->doc->find('body', 0);
			}
			elseif ($this->doc->find('html', 0) !== null)
			{
				$body = $this->doc->find('html', 0);
			}
			else
			{
				$body = $this->doc->root;
			}
		}

		return $body;
	}
206
207
	/**
	 * Remove any <head node from the DOM
	 *
	 * - Does nothing when the document has no head node
	 */
	private function _clipHead()
	{
		if ($this->_parser)
		{
			$head = $this->doc->getElementsByTagName('head')->item(0);
			if ($head !== null)
				$head->parentNode->removeChild($head);
		}
		else
		{
			// The lookup result is compared to null before use (as _getBody() does),
			// so outertext is never accessed on a missing head node
			$head = $this->doc->find('head', 0);
			if ($head !== null)
				$head->outertext = '';
		}
	}
225
226
	/**
	 * Sets up processing parameters for DOMDocument to ensure that text is processed as UTF-8
	 *
	 * - Any existing body / html wrapper is stripped from $this->html, which is then
	 *   re-wrapped in a document that declares a UTF-8 charset
	 * - The loaded document is stored in $this->doc
	 */
	private function _setupDOMDocument()
	{
		// If the html is already wrapped, remove it
		$this->html = $this->_returnBodyText($this->html);

		// Set up processing details
		$this->doc = new DOMDocument();
		$this->doc->preserveWhiteSpace = false;
		$this->doc->encoding = 'UTF-8';

		// Do what we can to ensure this is processed as UTF-8
		$this->doc->loadHTML('<?xml encoding="UTF-8"><html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/></head><body>' . $this->html . '</body></html>');
	}
242
243
	/**
	 * Normalize any spacing and excess blank lines that may have been generated
	 *
	 * - Works on, and updates, $this->markdown
	 * - Restores the character pairs that the constructor masked for the XML parser
	 * - Strips leftover tags when $this->strip_tags is set
	 */
	private function _clean_markdown()
	{
		// We only want the content, no wrappers
		$this->markdown = $this->_returnBodyText($this->markdown);

		// Remove non breakable spaces that may be hiding in here, a nbsp followed by
		// a space becomes one space, any remaining nbsp becomes a space
		$this->markdown = str_replace("\xC2\xA0\x20", ' ', $this->markdown);
		$this->markdown = str_replace("\xC2\xA0", ' ', $this->markdown);

		// Remove any "bonus" tags
		if ($this->strip_tags)
			$this->markdown = strip_tags($this->markdown);

		// Replace content that we "hide" from the XML parsers
		$this->markdown = strtr($this->markdown, array(
			'|?|&gt' => '?>',
			'|?|&lt' => '?<',
			'&lt|?|' => '<?',
			'&gt|?|' => '>?'
		));

		// Strip the chaff and any excess blank lines we may have produced
		$this->markdown = trim($this->markdown);

		// Three or more newlines (each optionally followed by a whitespace character) become two
		$this->markdown = preg_replace("~(\n(\s)?){3,}~", "\n\n", $this->markdown);

		// Three or more consecutive lines of just two whitespace characters become two such
		// lines, checked for \n endings, optional \r\n endings and doubled line endings
		$this->markdown = preg_replace("~(^\s\s\n){3,}~m", "  \n  \n", $this->markdown);
		$this->markdown = preg_replace("~(^\s\s\r?\n){3,}~m", "  \n  \n", $this->markdown);
		$this->markdown = preg_replace("~(^\s\s(?:\r?\n){2}){3,}~m", "  \n  \n", $this->markdown);
	}
274
275
	/**
	 * Looks for the text inside of <body> and then <html>, returning just the inner
	 *
	 * - Only matches attribute free <body> / <html> tags
	 * - When neither wrapper is found the text is returned unchanged
	 *
	 * @param string $text
	 *
	 * @return string
	 */
	private function _returnBodyText($text)
	{
		if (preg_match('~<body>(.*)</body>~su', $text, $body))
			return $body[1];
		elseif (preg_match('~<html>(.*)</html>~su', $text, $body))
			return $body[1];

		return $text;
	}
291
292
	/**
	 * For a given node, checks if it is anywhere nested inside of a code block
	 *  - Prevents converting anything that's inside a code block
	 *
	 * @param object $node
	 * @param boolean $parser flag for internal or external parser
	 *
	 * @return boolean true if any ancestor of the node is a code tag
	 */
	private static function _has_parent_code($node, $parser)
	{
		$parent = $parser ? $node->parentNode : $node->parentNode();

		// Walk up the tree, the loop ends once there are no more parents
		while ($parent)
		{
			// Anywhere nested inside a code block we don't render tags
			$tag = $parser ? $parent->nodeName : $parent->nodeName();
			if ($tag === 'code')
				return true;

			// Back out another level, until we are done
			$parent = $parser ? $parent->parentNode : $parent->parentNode();
		}

		return false;
	}
318
319
	/**
	 * Get the nesting level when inside a list
	 *
	 * @param object $node
	 * @param boolean $parser flag for internal or external parser
	 *
	 * @return int the number of ul / ol ancestors of the node
	 */
	private static function _has_parent_list($node, $parser)
	{
		$inlist = array('ul', 'ol');
		$depth = 0;

		$parent = $parser ? $node->parentNode : $node->parentNode();
		while ($parent)
		{
			// Anywhere nested inside a list we need to get the depth
			$tag = $parser ? $parent->nodeName : $parent->nodeName();
			if (in_array($tag, $inlist, true))
				$depth++;

			// Back out another level
			$parent = $parser ? $parent->parentNode : $parent->parentNode();
		}

		return $depth;
	}
344
345
	/**
	 * Traverse each node to its base, then convert tags to markup on the way back out
	 *
	 * - Nodes that are nested inside a code tag are left untouched
	 * - Children are converted before the node itself
	 *
	 * @param object $node
	 */
	private function _convert_childNodes($node)
	{
		if (self::_has_parent_code($node, $this->_parser))
			return;

		// Keep traversing till we are at the base of this node
		if ($node->hasChildNodes())
		{
			$num = $this->_parser ? $node->childNodes->length : count($node->childNodes());
			for ($i = 0; $i < $num; $i++)
			{
				$child = $this->_parser ? $node->childNodes->item($i) : $node->childNodes($i);
				$this->_convert_childNodes($child);
			}
		}

		// At the root of this node, convert it to markdown
		$this->_convert_to_markdown($node);
	}
369
370
	/**
	 * Convert the supplied node into its markdown equivalent
	 *  - Supports *some* markdown extra tags, namely: table, abbr & dl in a limited fashion
	 *  - The node is replaced in the document with the generated markdown text
	 *  - Table sub tags (th, tr, td, tbody, tfoot, thead) are left in place, they are
	 *    processed when the table tag itself is converted
	 *
	 * @param object $node
	 */
	private function _convert_to_markdown($node)
	{
		// HTML tag we are dealing with
		$tag = $this->_get_name($node);

		// Based on the tag, determine how to convert
		switch ($tag)
		{
			case 'a':
				$markdown = $this->_convert_anchor($node);
				break;
			case 'abbr':
				$markdown = $this->_convert_abbr($node);
				break;
			case 'b':
			case 'strong':
				$markdown = '**' . $this->_get_value($node) . '**';
				break;
			case 'blockquote':
				$markdown = $this->_convert_blockquote($node);
				break;
			case 'br':
				// DomDocument strips empty lines, this prevents that
				$markdown = "\xC2\xA0\xC2\xA0" . $this->line_break;
				break;
			case 'center':
				$markdown = $this->line_end . $this->_get_value($node) . $this->line_end;
				break;
			case 'code':
				$markdown = $this->_convert_code($node);
				break;
			case 'dt':
				$markdown = str_replace(array("\n", "\r", "\n\r"), '', $this->_get_value($node)) . $this->line_end;
				break;
			case 'dd':
				$markdown = ':   ' . $this->_get_value($node) . $this->line_break;
				break;
			case 'dl':
				$markdown = trim($this->_get_value($node)) . $this->line_break;
				break;
			case 'em':
			case 'i':
				$markdown = '_' . $this->_get_value($node) . '_';
				break;
			case 'hr':
				$markdown = $this->line_end . str_repeat('-', 3) . $this->line_end;
				break;
			case 'h1':
			case 'h2':
			case 'h3':
			case 'h4':
			case 'h5':
			case 'h6':
				$markdown = $this->_convert_header($tag, $this->_get_value($node));
				break;
			case 'img':
				$markdown = $this->_convert_image($node);
				break;
			case 'ol':
			case 'ul':
				$markdown = rtrim($this->_get_value($node)) . $this->line_break;
				break;
			case 'li':
				$markdown = $this->_convert_list($node);
				break;
			case 'p':
				// Only a childless paragraph has its text escaped
				if (!$node->hasChildNodes())
				{
					$markdown = str_replace("\n", ' ', $this->_get_value($node)) . $this->line_break;
					$markdown = $this->_escape_text($markdown);
				}
				else
					$markdown = rtrim($this->_get_value($node)) . $this->line_break;
				break;
			case 'pre':
				$markdown = $this->_get_value($node) . $this->line_break;
				break;
			case 'div':
				$markdown = $this->line_end . $this->_get_value($node) . $this->line_end;
				if (!$node->hasChildNodes())
					$markdown = $this->_escape_text($markdown);
				break;
			//case '#text':
			//  $markdown = $this->_escape_text($this->_get_value($node));
			//  break;
			case 'title':
				$markdown = '# ' . $this->_get_value($node) . $this->line_break;
				break;
			case 'table':
				$markdown = $this->_convert_table($node) . $this->line_break;
				break;
			case 'th':
			case 'tr':
			case 'td':
			case 'tbody':
			case 'tfoot':
			case 'thead':
				// Just skip over these as we handle them in the table tag itself
				$markdown = '~`skip`~';
				break;
			case 'root':
			case 'span':
			case 'body':
				// Remove these tags and simply replace with the text inside the tags
				$markdown = $this->_get_innerHTML($node);
				break;
			default:
				// Don't know you or text, so just preserve whats there
				$markdown = $this->_get_outerHTML($node);
		}

		// Replace the node with our markdown replacement, or with the node itself if none was found
		if ($markdown !== '~`skip`~')
		{
			if ($this->_parser)
			{
				// Create a new text node with our markdown tag and replace the original node
				$markdown_node = $this->doc->createTextNode($markdown);
				$node->parentNode->replaceChild($markdown_node, $node);
			}
			else
				$node->outertext = $markdown;
		}
	}
500
501
	/**
	 * Converts <abbr> tags to markdown (extra)
	 *
	 * html: <abbr title="Hyper Text Markup Language">HTML</abbr>
	 * md:   *[HTML]: Hyper Text Markup Language
	 *
	 * @param object $node
	 *
	 * @return string the abbreviation definition, or an empty string when there is no title
	 */
	private function _convert_abbr($node)
	{
		$title = $node->getAttribute('title');
		$value = $this->_get_value($node);

		if (!empty($title))
			$markdown = '*[' . $value . ']: ' . $title . $this->line_break;
		else
			$markdown = '';

		return $markdown;
	}
521
522
	/**
	 * Converts <a> tags to markdown
	 *
	 * html: <a href='http://somesite.com' title='Title'>Awesome Site</a>
	 * md: [Awesome Site](http://somesite.com "Title")
	 *
	 * - Anchors with a class of target or footnote_return are reduced to their text
	 * - Will increase $this->body_width when the link is longer than the current
	 *   width so the link is not broken by word wrapping
	 *
	 * @param object $node
	 * @return string
	 */
	private function _convert_anchor($node)
	{
		global $txt;

		$href = htmlentities($node->getAttribute('href'), ENT_COMPAT, 'UTF-8', false);
		$title = $node->getAttribute('title');
		$class = $node->getAttribute('class');
		$value = $this->_get_value($node);

		// Provide a more compact [name] if none is given
		if ($value == $node->getAttribute('href') || empty($value))
			$value = empty($title) ? $txt['link'] : $title;

		// Special processing just for our own footnotes
		if ($class === 'target' || $class === 'footnote_return')
			$markdown = $value;
		elseif (!empty($title))
			$markdown = '[' . $value . '](' . $href . ' "' . $title . '")';
		else
			$markdown = '[' . $value . '](' . $href . ')';

		// Some links can be very long and if we wrap them they break
		$line_strlen = Util::strlen($markdown);
		if ($line_strlen > $this->body_width)
			$this->body_width = $line_strlen;

		return $markdown;
	}
559
560
	/**
	 * Converts blockquotes to markdown > quote style
	 *
	 * html: <blockquote>quote</blockquote>
	 * md: > quote
	 *
	 * @param object $node
	 *
	 * @return string each line of the quote prefixed with '> ', followed by an extra line end
	 */
	private function _convert_blockquote($node)
	{
		$markdown = '';

		// All the contents of this block quote
		$value = $this->_get_value($node);
		$value = trim($value);

		// Go line by line
		$lines = preg_split('~\r\n|\r|\n~', $value);

		// Each line gets a '> ' in front of it, just like email quotes really
		foreach ($lines as $line)
			$markdown .= '> ' . ltrim($line, "\t") . $this->line_end;

		$markdown .= $this->line_end;
		return $markdown;
	}
586
587
	/**
	 * Converts code tags to markdown span `code` or block code
	 * Converts single line code to inline tick mark
	 * Converts multi line to 4 space indented code
	 *
	 * html: <code>code</code>
	 * md: `code`
	 *
	 * - Turns off $this->strip_tags when the code contains html tags
	 * - May increase $this->body_width so multi line code is not word wrapped
	 *
	 * @param object $node
	 *
	 * @return string
	 */
	private function _convert_code($node)
	{
		$value = $this->_get_innerHTML($node);

		// If we have a multi line code block, we are working outside to in, and need to convert the br's ourselfs
		$value = preg_replace('~<br( /)?' . '>~', "\n", str_replace('&nbsp;', ' ', $value));

		// If there are html tags in this code block, we need to disable strip tags
		// This is NOT the ideal way to handle this, needs something along the lines of preparse and unpreparse.
		if ($this->strip_tags && preg_match('~<[^<]+>~', $value))
			$this->strip_tags = false;

		// Get the number of lines of code that we have
		$lines = preg_split('~\r\n|\r|\n~', $value);
		$total = count($lines);

		// If there's more than one line of code, use leading four space syntax
		if ($total > 1)
		{
			$first_line = trim($lines[0]);
			$last_line = trim($lines[$total - 1]);

			// Remove any leading and trailing blank lines.  Compared against '' since
			// empty() would also discard a line of code that is just "0"
			if ($first_line === '')
				array_shift($lines);
			if ($last_line === '')
				array_pop($lines);

			// Convert what remains
			$markdown = '';
			foreach ($lines as $line)
			{
				// Adjust the word wrapping since this has code tags, leave it up to
				// the email client to mess these up ;)
				$line_strlen = strlen($line) + 5;
				if ($line_strlen > $this->body_width)
					$this->body_width = $line_strlen;

				$markdown .= str_repeat(' ', 4) . $line . $this->line_end;
			}

			// The parser will encode, but we don't want that for our code block
			if ($this->_parser)
				$markdown = html_entity_decode($markdown, ENT_QUOTES, 'UTF-8');
		}
		// Single line, back tick and move on
		else
		{
			// Account for backticks in the single line code itself
			$ticks = $this->_has_ticks($node, $value);
			if (!empty($ticks))
			{
				// If the ticks were at the start/end of the word space it off
				if ($lines[0][0] == '`' || substr($lines[0], -1) == '`')
					$lines[0] = ' ' . $lines[0] . ' ';

				$markdown = $ticks . ($this->_parser ? html_entity_decode($lines[0], ENT_QUOTES, 'UTF-8') : $lines[0]) . $ticks;
			}
			else
				$markdown = '`' . ($this->_parser ? html_entity_decode($lines[0], ENT_QUOTES, 'UTF-8') : $lines[0]) . '`';
		}

		return $markdown;
	}
661
662
	/**
	 * Converts <h1> and <h2> headers to markdown-style headers in setex style,
	 * all other headers are returned as atx style ### h3
	 *
	 * html: <h1>header</h1>
	 * md: header
	 *     ======
	 *
	 * html: <h3>header</h3>
	 * md: ### header
	 *
	 * @param string $level the header tag name, h1 through h6
	 * @param string $content
	 *
	 * @return string
	 */
	private function _convert_header($level, $content)
	{
		// h3 => 3
		$level = (int) ltrim($level, 'h');

		if ($level < 3)
		{
			// Underline the full length of the header text
			$length = Util::strlen($content);
			$underline = ($level === 1) ? '=' : '-';
			$markdown = $content . $this->line_end . str_repeat($underline, $length) . $this->line_break;
		}
		else
			$markdown = str_repeat('#', $level) . ' ' . $content . $this->line_break;

		return $markdown;
	}
691
692
	/**
	 * Converts <img> tags to markdown
	 *
	 * html: <img src='source' alt='alt' title='title' />
	 * md: ![alt](source "title")
	 *
	 * @param object $node
	 *
	 * @return string
	 */
	private function _convert_image($node)
	{
		$src = $node->getAttribute('src');
		$alt = $node->getAttribute('alt');
		$title = $node->getAttribute('title');

		// The title is optional in the markdown syntax
		if (!empty($title))
			$markdown = '![' . $alt . '](' . $src . ' "' . $title . '")';
		else
			$markdown = '![' . $alt . '](' . $src . ')';

		return $markdown;
	}
713
714
	/**
	 * Converts ordered <ol> and unordered <ul> lists to markdown syntax
	 *
	 * html: <ul><li>one</li></ul>
	 * md * one
	 *
	 * - Nested list items are indented one tab per level
	 * - A line end is appended unless the item text already ends in whitespace
	 *
	 * @param object $node the li node
	 *
	 * @return string
	 */
	private function _convert_list($node)
	{
		$list_type = $this->_parser ? $node->parentNode->nodeName : $node->parentNode()->nodeName();
		$value = $this->_get_value($node);

		// Trailing whitespace on the value marks it as "loose"
		$loose = rtrim($value) !== $value;
		$depth = max(0, $this->_has_parent_list($node, $this->_parser) - 1);

		// Unordered lists get a simple bullet
		if ($list_type === 'ul')
			$markdown = str_repeat("\t", $depth) . '* ' . $value;
		// Ordered lists need a number
		else
		{
			$number = $this->_get_list_position($node);
			$markdown = str_repeat("\t", $depth) . $number . '. ' . $value;
		}

		return $markdown . (!$loose ? $this->line_end : '');
	}
742
743
	/**
	 * Converts tables tags to markdown extra table syntax
	 *
	 * - Have to build top down vs normal inside out due to needing col numbers and widths
	 * - Only tables that have th cells whose parent is a tr are converted
	 * - May increase $this->body_width so the table rows are not word wrapped
	 *
	 * @param object $node
	 *
	 * @return string the markdown table, or an empty string if the table was not converted
	 */
	private function _convert_table($node)
	{
		$table_heading = $node->getElementsByTagName('th');
		if ($this->_get_item($table_heading, 0) === null)
			return '';

		$th_parent = ($table_heading) ? ($this->_parser ? $this->_get_item($table_heading, 0)->parentNode->nodeName : $this->_get_item($table_heading, 0)->parentNode()->nodeName()) : false;

		// Set up for a markdown table, then storm the castle
		$align = array();
		$value = array();
		$width = array();
		$max = array();
		$header = array();
		$rows = array();

		// We only markdown well formed tables ...
		if ($table_heading && $th_parent === 'tr')
		{
			// Find out how many columns we are dealing with
			$th_num = $this->_get_length($table_heading);

			for ($col = 0; $col < $th_num; $col++)
			{
				// Get the align and text for each th (html5 this is no longer valid)
				$th = $this->_get_item($table_heading, $col);
				$align_value = ($th !== null) ? strtolower($th->getAttribute('align')) : false;
				$align[0][$col] = $align_value === false ? 'left' : $align_value;
				$value[0][$col] = $this->_get_value($th);
				$width[0][$col] = Util::strlen($this->_get_value($th));

				// Seed the max col width
				$max[$col] = $width[0][$col];
			}

			// Get all of the rows
			$table_rows = $node->getElementsByTagName('tr');
			$num_rows = $this->_get_length($table_rows);
			for ($row = 1; $row < $num_rows; $row++)
			{
				// Start at row 1 and get all of the td's in this row
				$row_data = $this->_get_item($table_rows, $row)->getElementsByTagName('td');

				// Simply use the th count as the number of columns, if its not right its not markdown-able anyway
				for ($col = 0; $col < $th_num; $col++)
				{
					// Get the align and text for each td in this row
					$td = $this->_get_item($row_data, $col);
					$align_value = ($td !== null) ? strtolower($td->getAttribute('align')) : false;
					$align[$row][$col] = $align_value === false ? 'left' : $align_value;
					$value[$row][$col] = $this->_get_value($td);
					$width[$row][$col] = Util::strlen($this->_get_value($td));

					// Keep track of the longest col cell as we go
					if ($width[$row][$col] > $max[$col])
						$max[$col] = $width[$row][$col];
				}
			}

			// Done collecting data, we can rebuild it, we can make it better than it was. Better...stronger...faster
			for ($row = 0; $row < $num_rows; $row++)
			{
				$temp = array();
				for ($col = 0; $col < $th_num; $col++)
				{
					// Build the header row once
					if ($row === 0)
						$header[] = str_repeat('-', $max[$col]);

					// Build the data for each col, align/pad as needed
					$temp[] = $this->_align_row_content($align[$row][$col], $width[$row][$col], $value[$row][$col], $max[$col]);
				}

				// Join it all up so we have a nice looking row
				$rows[] = '| ' . implode(' | ', $temp) . ' |';

				// Stuff in the header after the th row
				if ($row === 0)
					$rows[] = '| ' . implode(' | ', $header) . ' | ';
			}

			// Adjust the word wrapping since this has a table, will get mussed by email anyway
			$line_strlen = strlen($rows[1]) + 2;
			if ($line_strlen > $this->body_width)
				$this->body_width = $line_strlen;

			// Return what we did so it can be swapped in
			return implode($this->line_end, $rows);
		}

		// Not a table we can represent in markdown
		return '';
	}
840
841
	/**
	 * Helper function for getting a node object
	 *
	 * @param object|array $node a node list (internal parser) or an array of nodes (external parser)
	 * @param int $item the position of the node to fetch
	 *
	 * @return object|null the requested node, or null if there is nothing at that position
	 */
	private function _get_item($node, $item)
	{
		if ($this->_parser)
			return $node->item($item);

		// Callers compare the result to null, so a missing offset must not raise a notice
		return isset($node[$item]) ? $node[$item] : null;
	}
854
855
	/**
	 * Helper function for getting a node length
	 *
	 * @param object|array $node a node list (internal parser) or a countable set of nodes (external parser)
	 *
	 * @return int the number of nodes
	 */
	private function _get_length($node)
	{
		if ($this->_parser)
			return $node->length;
		else
			return count($node);
	}
867
868
	/**
	 * Helper function for getting a node value
	 *
	 * - The external parser's innertext is entity decoded before it is returned
	 *
	 * @param object|null $node
	 *
	 * @return string the node text, an empty string when the node is null
	 */
	private function _get_value($node)
	{
		if ($node === null)
			return '';

		if ($this->_parser)
			return $node->nodeValue;
		else
			return html_entity_decode(htmlspecialchars_decode($node->innertext, ENT_QUOTES), ENT_QUOTES, 'UTF-8');
	}
883
884
	/**
	 * Helper function for getting a node name
	 *
	 * @param object|null $node
	 *
	 * @return string the node name, an empty string when the node is null
	 */
	private function _get_name($node)
	{
		if ($node === null)
			return '';

		if ($this->_parser)
			return $node->nodeName;
		else
			return $node->nodeName();
	}
899
900
	/**
	 * Helper function for creating ol's
	 *
	 * - Returns the absolute number of an <li> inside an <ol>
	 * - The position is the 1 based index of the node in its parents child nodes,
	 *   defaulting to 1 if the node is not found
	 *
	 * @param object $node
	 *
	 * @return int
	 */
	private function _get_list_position($node)
	{
		$position = 1;

		// Get all of the list nodes inside this parent
		$list_node = $this->_parser ? $node->parentNode : $node->parentNode();
		$total_nodes = $this->_parser ? $node->parentNode->childNodes->length : count($list_node->childNodes());

		// Loop through all li nodes and find where we are in this list
		for ($i = 0; $i < $total_nodes; $i++)
		{
			$current_node = $this->_parser ? $list_node->childNodes->item($i) : $list_node->childNodes($i);
			if ($current_node === $node)
			{
				// Found ourselves, no need to look any further
				$position = $i + 1;
				break;
			}
		}

		return $position;
	}
925
926
	/**
	 * Helper function for table creation
	 *
	 * - Builds td's to a give width, aligned as needed
	 * - Any alignment other than right or center is treated as left
	 *
	 * @param string $align left, right or center
	 * @param int $width the length of $content
	 * @param string $content the cell text
	 * @param int $max the width to pad the cell out to
	 *
	 * @return string the space padded content
	 */
	private function _align_row_content($align, $width, $content, $max)
	{
		switch ($align)
		{
			default:
			case 'left':
				$content .= str_repeat(' ', $max - $width);
				break;
			case 'right':
				$content = str_repeat(' ', $max - $width) . $content;
				break;
			case 'center':
				// Any odd space of padding goes on the right
				$paddingNeeded = $max - $width;
				$left = (int) floor($paddingNeeded / 2);
				$right = $paddingNeeded - $left;
				$content = str_repeat(' ', $left) . $content . str_repeat(' ', $right);
				break;
		}

		return $content;
	}
957
958
	/**
	 * Gets the inner html of a node
	 *
	 * - For the internal parser the node is copied in to a scratch document, saved
	 *   as html and then has its own opening and closing tag removed
	 *
	 * @param object $node
	 *
	 * @return string
	 */
	private function _get_innerHTML($node)
	{
		if ($this->_parser)
		{
			$doc = new DOMDocument();
			$doc->appendChild($doc->importNode($node, true));
			$html = trim($doc->saveHTML());
			$tag = $node->nodeName;

			// Strip the leading <tag ...> and the trailing </tag>
			return preg_replace('@^<' . $tag . '[^>]*>|</' . $tag . '>$@', '', $html);
		}
		else
			return $node->innertext;
	}
977
978
	/**
	 * Gets the outer html of a node
	 *
	 * - On PHP 5.3.6 and above the node is passed directly to saveHTML(), older
	 *   versions copy the node in to a scratch document first
	 *
	 * @param object $node
	 *
	 * @return string
	 */
	private function _get_outerHTML($node)
	{
		if ($this->_parser)
		{
			if (version_compare(PHP_VERSION, '5.3.6') >= 0)
				return htmlspecialchars_decode($this->doc->saveHTML($node));
			else
			{
				// @todo remove when 5.3.6 min
				$doc = new DOMDocument();
				$doc->appendChild($doc->importNode($node, true));
				$html = $doc->saveHTML();

				// We just want the html of the inserted node, it *may* be wrapped
				if (preg_match('~<body>(.*)</body>~su', $html, $body))
					$html = $body[1];
				elseif (preg_match('~<html>(.*)</html>~su', $html, $body))
					$html = $body[1];

				// Clean it up
				$html = rtrim($html, "\n");
				return html_entity_decode(htmlspecialchars_decode($html, ENT_QUOTES), ENT_QUOTES, 'UTF-8');
			}
		}
		else
			return $node->outertext;
	}
1010
1011
	/**
	 * Escapes markup looking text in html to prevent accidental assignment
	 *
	 * <p>*stuff*</p> should not convert to *stuff* but \*stuff\* since its not to
	 * be converted by md to html as <strong>stuff</strong>
	 *
	 * - Applies each pattern => replacement pair of $this->_textEscapeRegex in order
	 *
	 * @param string $value
	 *
	 * @return string
	 */
	private function _escape_text($value)
	{
		// Search and replace ...
		foreach ($this->_textEscapeRegex as $regex => $replacement)
			$value = preg_replace('~' . $regex . '~', $replacement, $value);

		return $value;
	}
1027
1028
	/**
	 * If inline code contains backticks ` as part of its content, we need to wrap them so
	 * when markdown is run we don't interpret the ` as additional code blocks
	 *
	 * - Returns the run of backticks to wrap the code in, one that does not
	 *   appear as a complete run inside the code itself
	 * - Returns an empty string when the code contains no backticks
	 *
	 * @param object $node the code node
	 * @param string $value the text of the code node
	 *
	 * @return string
	 */
	private function _has_ticks($node, $value)
	{
		$ticks = '';
		$code_parent = $this->_parser ? $node->parentNode->nodeName : $node->parentNode()->nodeName();

		// Inside of a pre, we don't do anything
		// NOTE(review): this hands back $value, which _convert_code() then uses as
		// the wrapping delimiter - confirm an empty string was not intended here
		if ($code_parent === 'pre')
			return $value;

		// If we have backticks in code, then we back tick the ticks
		// e.g. <code>`bla`</code> will become `` `bla` `` so markdown will deal with it properly
		preg_match_all('~`+~', $value, $matches);
		if (!empty($matches[0]))
		{
			// Yup ticks in the hair
			$ticks = '`';

			// Backtick as many as needed so markdown will work
			while (in_array($ticks, $matches[0]))
				$ticks .= '`';
		}

		return $ticks;
	}
1064
1065
	/**
	 * Breaks a string up so its no more than width characters long
	 *
	 * - Will break at word boundaries
	 * - If no natural space is found will break mid-word
	 * - Continuation lines of a line that starts with > are prefixed with '> '
	 * - Empty lines are not carried over to the output
	 *
	 * @param string $string
	 * @param int $width
	 * @param string $break
	 *
	 * @return string
	 */
	private function _utf8_wordwrap($string, $width = 75, $break = "\n")
	{
		$strings = explode($break, $string);
		$lines = array();

		foreach ($strings as $string)
		{
			$in_quote = isset($string[0]) && $string[0] === '>';

			// Keep going while there is text left.  An explicit check is used since
			// empty() would also discard text that is just "0"
			while (is_string($string) && $string !== '')
			{
				// Get the next #width characters before a break (space, punctuation tab etc)
				if (preg_match('~^(.{1,' . $width . '})(?:\s|$|,|\.)~u', $string, $matches))
				{
					// Add the #width to the output and set up for the next pass
					$lines[] = ($in_quote && $matches[1][0] !== '>' ? '> ' : '') . $matches[1];
					$string = Util::substr($string, Util::strlen($matches[1]));
				}
				// Humm just a long word with no place to break so we simply cut it after width characters
				else
				{
					$lines[] = ($in_quote && $string[0] !== '>' ? '> ' : '') . Util::substr($string, 0, $width);
					$string = Util::substr($string, $width);
				}
			}
		}

		// Join it all the shortened sections up on our break characters
		return implode($break, $lines);
	}
1104
}
1105