Completed
Pull Request — patch_1-0-10 (#2859)
by Stephen
06:31
created

Html_2_Md::_set_parser()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 23
Code Lines 11

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 2
eloc 11
nc 2
nop 0
dl 0
loc 23
rs 9.0856
c 0
b 0
f 0
1
<?php
2
3
/**
4
 * Converts HTML to Markdown text
5
 *
6
 * @name      ElkArte Forum
7
 * @copyright ElkArte Forum contributors
8
 * @license   BSD http://opensource.org/licenses/BSD-3-Clause
9
 *
10
 * @version 1.0.7
11
 *
12
 */
13
14
if (!defined('ELK'))
15
	die('No access...');
16
17
/**
18
 * Converts HTML to Markdown text
19
 */
20
class Html_2_Md
21
{
22
	/**
23
	 * The value that will hold our dom object
24
	 * @var object
25
	 */
26
	public $doc;
27
28
	/**
29
	 * The value that will hold if we are using the internal or external parser
30
	 * @var boolean
31
	 */
32
	private $_parser;
33
34
	/**
35
	 * Line end character
36
	 * @var string
37
	 */
38
	public $line_end = "\n";
39
40
	/**
41
	 * Line break character
42
	 * @var string
43
	 */
44
	public $line_break = "\n\n";
45
46
	/**
47
	 * Wordwrap output, set to 0 to skip wrapping
48
	 * @var int
49
	 */
50
	public $body_width = 76;
51
52
	/**
53
	 * Strip remaining tags, set to false to leave them in
54
	 * @var boolean
55
	 */
56
	public $strip_tags = true;
57
58
	/**
59
	 * Regex to run on plain text to prevent markdown from erroneously converting
60
	 * @var string[]
61
	 */
62
	private $_textEscapeRegex = array();
63
64
	/**
65
	 * The passed html string to convert
66
	 * @var string
67
	 */
68
	public $html;
69
70
	/**
71
	 * The markdown equivalent to the  html string
72
	 * @var string
73
	 */
74
	public $markdown;
75
76
	/**
77
	 * Gets everything started using the built in or external parser
78
	 *
79
	 * @param string $html string of html to convert to MD text
80
	 */
81
	public function __construct($html)
82
	{
83
		// Up front, remove whitespace between html tags
84
		$this->html = preg_replace('/(?:(?<=\>)|(?<=\/\>))(\s+)(?=\<\/?)/', '', $html);
85
86
		// The XML parser will not deal gracefully with these
87
		$this->html = strtr($this->html, array(
88
			'?<' => "|?|&lt",
0 ignored issues
show
Coding Style Comprehensibility introduced by
The string literal |?|&lt does not require double quotes, as per coding-style, please use single quotes.

PHP provides two ways to mark string literals: either with single quotes 'literal' or with double quotes "literal". The difference between these is that string literals in double quotes may contain variables, which are evaluated at run-time, as well as escape sequences.

String literals in single quotes, on the other hand, are evaluated very literally, and the only two characters that need escaping in the literal are the single quote itself (\') and the backslash (\\). Every other character is displayed as is.

Double quoted string literals may contain other variables or more complex escape sequences.

<?php

$singleQuoted = 'Value';
$doubleQuoted = "\tSingle is $singleQuoted";

print $doubleQuoted;

will print an indented: Single is Value

If your string literal does not contain variables or escape sequences, it should be defined using single quotes to make that fact clear.

For more information on PHP string literals and available escape sequences see the PHP core documentation.

Loading history...
89
			'?>' => "|?|&gt",
0 ignored issues
show
Coding Style Comprehensibility introduced by
The string literal |?|&gt does not require double quotes, as per coding-style, please use single quotes.

PHP provides two ways to mark string literals: either with single quotes 'literal' or with double quotes "literal". The difference between these is that string literals in double quotes may contain variables, which are evaluated at run-time, as well as escape sequences.

String literals in single quotes, on the other hand, are evaluated very literally, and the only two characters that need escaping in the literal are the single quote itself (\') and the backslash (\\). Every other character is displayed as is.

Double quoted string literals may contain other variables or more complex escape sequences.

<?php

$singleQuoted = 'Value';
$doubleQuoted = "\tSingle is $singleQuoted";

print $doubleQuoted;

will print an indented: Single is Value

If your string literal does not contain variables or escape sequences, it should be defined using single quotes to make that fact clear.

For more information on PHP string literals and available escape sequences see the PHP core documentation.

Loading history...
90
			'>?' => "&gt|?|",
0 ignored issues
show
Coding Style Comprehensibility introduced by
The string literal &gt|?| does not require double quotes, as per coding-style, please use single quotes.

PHP provides two ways to mark string literals: either with single quotes 'literal' or with double quotes "literal". The difference between these is that string literals in double quotes may contain variables, which are evaluated at run-time, as well as escape sequences.

String literals in single quotes, on the other hand, are evaluated very literally, and the only two characters that need escaping in the literal are the single quote itself (\') and the backslash (\\). Every other character is displayed as is.

Double quoted string literals may contain other variables or more complex escape sequences.

<?php

$singleQuoted = 'Value';
$doubleQuoted = "\tSingle is $singleQuoted";

print $doubleQuoted;

will print an indented: Single is Value

If your string literal does not contain variables or escape sequences, it should be defined using single quotes to make that fact clear.

For more information on PHP string literals and available escape sequences see the PHP core documentation.

Loading history...
91
			'<?' => "&lt|?|"
0 ignored issues
show
Coding Style Comprehensibility introduced by
The string literal &lt|?| does not require double quotes, as per coding-style, please use single quotes.

PHP provides two ways to mark string literals: either with single quotes 'literal' or with double quotes "literal". The difference between these is that string literals in double quotes may contain variables, which are evaluated at run-time, as well as escape sequences.

String literals in single quotes, on the other hand, are evaluated very literally, and the only two characters that need escaping in the literal are the single quote itself (\') and the backslash (\\). Every other character is displayed as is.

Double quoted string literals may contain other variables or more complex escape sequences.

<?php

$singleQuoted = 'Value';
$doubleQuoted = "\tSingle is $singleQuoted";

print $doubleQuoted;

will print an indented: Single is Value

If your string literal does not contain variables or escape sequences, it should be defined using single quotes to make that fact clear.

For more information on PHP string literals and available escape sequences see the PHP core documentation.

Loading history...
92
		));
93
94
		// Set the dom parser to use and load the HTML to the parser
95
		$this->_set_parser();
96
97
		// Initialize the regex array to escape text areas so markdown does
98
		// not interpret plain text as markdown syntax
99
		$this->_textEscapeRegex = array(
0 ignored issues
show
Documentation Bug introduced by
It seems like the array of regex pattern => replacement pairs assigned here (for example '([-*_])([ ]{0,2}\1){2,}' => '\\\\$0|'), which is of type array<string,string>, is incompatible with the declared type array<integer,string> of property $_textEscapeRegex.

Our type inference engine has found an assignment to a property that is incompatible with the declared type of that property.

Either this assignment is in error or the assigned type should be added to the documentation/type hint for that property.

Loading history...
100
			// Things that may convert to an hr --- or - - - etc
101
			'([-*_])([ ]{0,2}\1){2,}' => '\\\\$0|',
102
			// or **stuff** => \*\*stuff\*\*
103
			'\*\*([^*\s]+)\*\*' => '\*\*$1\*\*',
104
			// or versions of *italic* __italic__ _italic_
105
			'\*([^*\s]+)\*' => '\*$1\*',
106
			'__(?! |_)(.+)(?!<_| )__' => '\_\_$1\_\_',
107
			'_(?! |_)(.+)(?!<_| )_' => '\_$1\_',
108
			// nor `code`
109
			'`(.+)`' => '\`$1\`',
110
			// or links
111
			'\[(.+)\](\s*\()' => '\[$1\]$2',
112
			'\[(.+)\](\s*)\[(.*)\]' => '\[$1\]$2\[$3\]',
113
		);
114
	}
115
116
	/**
117
	 * Set the DOM parser for class, loads the supplied HTML
118
	 */
119
	private function _set_parser()
120
	{
121
		// Using PHP built in functions ...
122
		if (class_exists('DOMDocument'))
123
		{
124
			$this->_parser = true;
125
			$previous = libxml_use_internal_errors(true);
126
127
			// Set up basic parameters for DomDocument, including silencing structural errors
128
			$this->_setupDOMDocument();
129
130
			// Set the error handle back to what it was, and flush
131
			libxml_use_internal_errors($previous);
132
			libxml_clear_errors();
133
		}
134
		// Or using the external simple html parser
135
		else
136
		{
137
			$this->_parser = false;
138
			require_once(EXTDIR . '/simple_html_dom.php');
139
			$this->doc = str_get_html($this->html, true, true, 'UTF-8', false);
140
		}
141
	}
142
143
	/**
144
	 * Loads the html body and sends it to the parsing loop to convert all
145
	 * DOM nodes to markup
146
	 */
147
	public function get_markdown()
148
	{
149
		// For this html node, find all child elements and convert
150
		$body = $this->_getBody();
151
		$this->_convert_childNodes($body);
152
153
		// Done replacing HTML elements, now get the converted DOM tree back into a string
154
		$this->markdown = ($this->_parser) ? $this->doc->saveHTML() : $this->doc->save();
155
156
		// Using the internal DOM methods requires we need to do a little extra work
157
		if ($this->_parser)
158
		{
159
			$this->markdown = html_entity_decode(htmlspecialchars_decode($this->markdown, ENT_QUOTES), ENT_QUOTES, 'UTF-8');
160
		}
161
162
		// Clean up any excess spacing etc
163
		$this->_clean_markdown();
164
165
		// Wordwrap?
166
		if (!empty($this->body_width))
167
			$this->markdown = $this->_utf8_wordwrap($this->markdown, $this->body_width, $this->line_end);
168
169
		return $this->markdown;
170
	}
171
172
	/**
173
	 * Returns just the body of the HTML, as best possible, so we are not dealing with head
174
	 * and above head markup
175
	 *
176
	 * @return object
177
	 */
178
	private function  _getBody()
179
	{
180
		// If there is a head node, then off with his head!
181
		$this->_clipHead();
182
183
		// The body of the HTML is where its at.
184
		if ($this->_parser)
185
		{
186
			$body = $this->doc->getElementsByTagName('body')->item(0);
187
		}
188
		else
189
		{
190
			if ($this->doc->find('body', 0) !== null)
191
			{
192
				$body = $this->doc->find('body', 0);
193
			}
194
			elseif ($this->doc->find('html', 0) !== null)
195
			{
196
				$body = $this->doc->find('html', 0);
197
			}
198
			else
199
			{
200
				$body = $this->doc->root;
201
			}
202
		}
203
204
		return $body;
205
	}
206
207
	/**
208
	 * Remove any <head> node from the DOM
209
	 */
210
	private function _clipHead()
211
	{
212
		$head = ($this->_parser) ? $this->doc->getElementsByTagName('head')->item(0) : $this->doc->find('head', 0)->outertext;
213
		if ($head !== null)
214
		{
215
			if ($this->_parser)
216
			{
217
				$head->parentNode->removeChild($head);
218
			}
219
			else
220
			{
221
				$this->doc->find('head', 0)->outertext = '';
222
			}
223
		}
224
	}
225
226
	/**
227
	 * Sets up processing parameters for DOMDocument to ensure that text is processed as UTF-8
228
	 */
229
	private function _setupDOMDocument()
230
	{
231
		// If the html is already wrapped, remove it
232
		$this->html = $this->_returnBodyText($this->html);
233
234
		// Set up processing details
235
		$this->doc = new DOMDocument();
236
		$this->doc->preserveWhiteSpace = false;
237
		$this->doc->encoding = 'UTF-8';
238
239
		// Do what we can to ensure this is processed as UTF-8
240
		$this->doc->loadHTML('<?xml encoding="UTF-8"><html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/></head><body>' . $this->html . '</body></html>');
241
	}
242
243
	/**
244
	 * Normalize any spacing and excess blank lines that may have been generated
245
	 */
246
	private function _clean_markdown()
247
	{
248
		// We only want the content, no wrappers
249
		$this->markdown = $this->_returnBodyText($this->markdown);
250
251
		// Remove non breakable spaces that may be hiding in here
252
		$this->markdown = str_replace("\xC2\xA0\x20", ' ', $this->markdown);
253
		$this->markdown = str_replace("\xC2\xA0", ' ', $this->markdown);
254
255
		// Remove any "bonus" tags
256
		if ($this->strip_tags)
257
			$this->markdown = strip_tags($this->markdown);
258
259
		// Replace content that we "hide" from the XML parsers
260
		$this->markdown = strtr($this->markdown, array(
261
			'|?|&gt' => '?>',
262
			'|?|&lt' => '?<',
263
			"&lt|?|" => '<?',
0 ignored issues
show
Coding Style Comprehensibility introduced by
The string literal &lt|?| does not require double quotes, as per coding-style, please use single quotes.

PHP provides two ways to mark string literals: either with single quotes 'literal' or with double quotes "literal". The difference between these is that string literals in double quotes may contain variables, which are evaluated at run-time, as well as escape sequences.

String literals in single quotes, on the other hand, are evaluated very literally, and the only two characters that need escaping in the literal are the single quote itself (\') and the backslash (\\). Every other character is displayed as is.

Double quoted string literals may contain other variables or more complex escape sequences.

<?php

$singleQuoted = 'Value';
$doubleQuoted = "\tSingle is $singleQuoted";

print $doubleQuoted;

will print an indented: Single is Value

If your string literal does not contain variables or escape sequences, it should be defined using single quotes to make that fact clear.

For more information on PHP string literals and available escape sequences see the PHP core documentation.

Loading history...
264
			"&gt|?|" => '>?'
0 ignored issues
show
Coding Style Comprehensibility introduced by
The string literal &gt|?| does not require double quotes, as per coding-style, please use single quotes.

PHP provides two ways to mark string literals: either with single quotes 'literal' or with double quotes "literal". The difference between these is that string literals in double quotes may contain variables, which are evaluated at run-time, as well as escape sequences.

String literals in single quotes, on the other hand, are evaluated very literally, and the only two characters that need escaping in the literal are the single quote itself (\') and the backslash (\\). Every other character is displayed as is.

Double quoted string literals may contain other variables or more complex escape sequences.

<?php

$singleQuoted = 'Value';
$doubleQuoted = "\tSingle is $singleQuoted";

print $doubleQuoted;

will print an indented: Single is Value

If your string literal does not contain variables or escape sequences, it should be defined using single quotes to make that fact clear.

For more information on PHP string literals and available escape sequences see the PHP core documentation.

Loading history...
265
		));
266
267
		// Strip the chaff and any excess blank lines we may have produced
268
		$this->markdown = trim($this->markdown);
269
		$this->markdown = preg_replace("~(\n(\s)?){3,}~", "\n\n", $this->markdown);
270
		$this->markdown = preg_replace("~(^\s\s\n){3,}~m", "  \n  \n", $this->markdown);
271
		$this->markdown = preg_replace("~(^\s\s\r?\n){3,}~m", "  \n  \n", $this->markdown);
272
		$this->markdown = preg_replace("~(^\s\s(?:\r?\n){2}){3,}~m", "  \n  \n", $this->markdown);
273
	}
274
275
	/**
276
	 * Looks for the text inside of <body> and then <html>, returning just the inner
277
	 *
278
	 * @param $text
279
	 *
280
	 * @return string
281
	 */
282
	private function _returnBodyText($text)
283
	{
284
		if (preg_match('~<body>(.*)</body>~su', $text, $body))
285
			return $body[1];
286
		elseif (preg_match('~<html>(.*)</html>~su', $text, $body))
287
			return $body[1];
288
289
		return $text;
290
	}
291
292
	/**
293
	 * For a given node, checks if it is anywhere nested inside of a code block
294
	 *  - Prevents converting anything that's inside a code block
295
	 *
296
	 * @param object $node
297
	 * @param boolean $parser flag for internal or external parser
298
	 */
299
	private static function _has_parent_code($node, $parser)
300
	{
301
		$parent = $parser ? $node->parentNode : $node->parentNode();
302
		while ($parent)
303
		{
304
			if ($parent === null)
305
				return false;
306
307
			// Anywhere nested inside a code block we don't render tags
308
			$tag = $parser ? $parent->nodeName : $parent->nodeName();
309
			if ($tag === 'code')
310
				return true;
311
312
			// Back out another level, until we are done
313
			$parent = $parser ? $parent->parentNode : $parent->parentNode();
314
		}
315
316
		return false;
317
	}
318
319
	/**
320
	 * Get the nesting level when inside a list
321
	 *
322
	 * @param object $node
323
	 * @param boolean $parser flag for internal or external parser
324
	 */
325
	private static function _has_parent_list($node, $parser)
326
	{
327
		$inlist = array('ul', 'ol');
328
		$depth = 0;
329
330
		$parent = $parser ? $node->parentNode : $node->parentNode();
331
		while ($parent)
332
		{
333
			// Anywhere nested inside a list we need to get the depth
334
			$tag = $parser ? $parent->nodeName : $parent->nodeName();
335
			if (in_array($tag, $inlist))
336
				$depth++;
337
338
			// Back out another level
339
			$parent = $parser ? $parent->parentNode : $parent->parentNode();
340
		}
341
342
		return $depth;
343
	}
344
345
	/**
346
	 * Traverse each node to its base, then convert tags to markup on the way back out
347
	 *
348
	 * @param object $node
349
	 */
350
	private function _convert_childNodes($node)
351
	{
352
		if (self::_has_parent_code($node, $this->_parser))
353
			return;
354
355
		// Keep traversing till we are at the base of this node
356
		if ($node->hasChildNodes())
357
		{
358
			$num = $this->_parser ? $node->childNodes->length : count($node->childNodes());
359
			for ($i = 0; $i < $num; $i++)
360
			{
361
				$child = $this->_parser ? $node->childNodes->item($i) : $node->childNodes($i);
362
				$this->_convert_childNodes($child);
363
			}
364
		}
365
366
		// At the root of this node, convert it to markdown
367
		$this->_convert_to_markdown($node);
368
	}
369
370
	/**
371
	 * Convert the supplied node into its markdown equivalent
372
	 *  - Supports *some* markdown extra tags, namely: table, abbr & dl in a limited fashion
373
	 *
374
	 * @param object $node
375
	 */
376
	private function _convert_to_markdown($node)
377
	{
378
		// HTML tag we are dealing with
379
		$tag = $this->_get_name($node);
380
381
		// Based on the tag, determine how to convert
382
		switch ($tag)
383
		{
384
			case 'a':
385
				$markdown = $this->_convert_anchor($node);
386
				break;
387
			case 'abbr':
388
				$markdown = $this->_convert_abbr($node);
389
				break;
390
			case 'b':
391
			case 'strong':
392
				$markdown = '**' . $this->_get_value($node) . '**';
393
				break;
394
			case 'blockquote':
395
				$markdown = $this->_convert_blockquote($node);
396
				break;
397
			case 'br':
398
				// DomDocument strips empty lines, this prevents that
399
				$markdown = "\xC2\xA0\xC2\xA0" . $this->line_break;
400
				break;
401
			case 'center':
402
				$markdown = $this->line_end . $this->_get_value($node) . $this->line_end;
403
				break;
404
			case 'code':
405
				$markdown = $this->_convert_code($node);
406
				break;
407
			case 'dt':
408
				$markdown = str_replace(array("\n", "\r", "\n\r"), '', $this->_get_value($node)) . $this->line_end;
409
				break;
410
			case 'dd':
411
				$markdown = ':   ' . $this->_get_value($node) . $this->line_break;
412
				break;
413
			case 'dl':
414
				$markdown = trim($this->_get_value($node)) . $this->line_break;
415
				break;
416
			case 'em':
417
			case 'i':
418
				$markdown = '_' . $this->_get_value($node) . '_';
419
				break;
420
			case 'hr':
421
				$markdown = $this->line_end . str_repeat('-', 3) . $this->line_end;
422
				break;
423
			case 'h1':
424
			case 'h2':
425
			case 'h3':
426
			case 'h4':
427
			case 'h5':
428
			case 'h6':
429
				$markdown = $this->_convert_header($tag, $this->_get_value($node));
430
				break;
431
			case 'img':
432
				$markdown = $this->_convert_image($node);
433
				break;
434
			case 'ol':
435
			case 'ul':
436
				$markdown = rtrim($this->_get_value($node)) . $this->line_break;
437
				break;
438
			case 'li':
439
				$markdown = $this->_convert_list($node);
440
				break;
441
			case 'p':
442
				if (!$node->hasChildNodes())
443
				{
444
					$markdown = str_replace("\n", ' ', $this->_get_value($node)) . $this->line_break;
445
					$markdown = $this->_escape_text($markdown);
446
				}
447
				else
448
					$markdown = rtrim($this->_get_value($node)) . $this->line_break;
449
				break;
450
			case 'pre':
451
				$markdown = $this->_get_value($node) . $this->line_break;
452
				break;
453
			case 'div':
454
				$markdown = $this->line_end . $this->_get_value($node) . $this->line_end;
455
				if (!$node->hasChildNodes())
456
					$markdown = $this->_escape_text($markdown);
457
				break;
458
			//case '#text':
459
			//  $markdown = $this->_escape_text($this->_get_value($node));
460
			//  break;
461
			case 'title':
462
				$markdown = '# ' . $this->_get_value($node) . $this->line_break;
463
				break;
464
			case 'table':
465
				$markdown = $this->_convert_table($node) . $this->line_break;
466
				break;
467
			case 'th':
468
			case 'tr':
469
			case 'td':
470
			case 'tbody':
471
			case 'tfoot':
472
			case 'thead':
473
				// Just skip over these as we handle them in the table tag itself
474
				$markdown = '~`skip`~';
475
				break;
476
			case 'root':
477
			case 'span':
478
			case 'body':
479
				// Remove these tags and simply replace with the text inside the tags
480
				$markdown = $this->_get_innerHTML($node);
481
				break;
482
			default:
483
				// Don't know you or text, so just preserve whats there
484
				$markdown = $this->_get_outerHTML($node);
485
		}
486
487
		// Replace the node with our markdown replacement, or with the node itself if none was found
488
		if ($markdown !== '~`skip`~')
489
		{
490
			if ($this->_parser)
491
			{
492
				// Create a new text node with our markdown tag and replace the original node
493
				$markdown_node = $this->doc->createTextNode($markdown);
494
				$node->parentNode->replaceChild($markdown_node, $node);
495
			}
496
			else
497
				$node->outertext = $markdown;
498
		}
499
	}
500
501
	/**
502
	 * Converts <abbr> tags to markdown (extra)
503
	 *
504
	 * html: <abbr title="Hyper Text Markup Language">HTML</abbr>
505
	 * md:   *[HTML]: Hyper Text Markup Language
506
	 *
507
	 * @param object $node
508
	 */
509
	private function _convert_abbr($node)
510
	{
511
		$title = $node->getAttribute('title');
512
		$value = $this->_get_value($node);
513
514
		if (!empty($title))
515
			$markdown = '*[' . $value . ']: ' . $title . $this->line_break;
516
		else
517
			$markdown = '';
518
519
		return $markdown;
520
	}
521
522
	/**
523
	 * Converts <a> tags to markdown
524
	 *
525
	 * html: <a href='http://somesite.com' title='Title'>Awesome Site</a>
526
	 * md: [Awesome Site](http://somesite.com 'Title')
527
	 *
528
	 * @param object $node
529
	 * @return string
530
	 */
531
	private function _convert_anchor($node)
532
	{
533
		global $txt;
534
535
		$href = htmlentities($node->getAttribute('href'), ENT_COMPAT, 'UTF-8', false);
536
		$title = $node->getAttribute('title');
537
		$class = $node->getAttribute('class');
538
		$value = $this->_get_value($node);
539
540
		// Provide a more compact [name] if none is given
541
		if ($value == $node->getAttribute('href') || empty($value))
542
			$value = empty($title) ? $txt['link'] : $title;
543
544
		// Special processing just for our own footnotes
545
		if ($class === 'target' || $class === 'footnote_return')
546
			$markdown = $value;
547
		elseif (!empty($title))
548
			$markdown = '[' . $value . '](' . $href . ' "' . $title . '")';
549
		else
550
			$markdown = '[' . $value . '](' . $href . ')';
551
552
		// Some links can be very long and if we wrap them they break
553
		$line_strlen = Util::strlen($markdown);
554
		if ($line_strlen > $this->body_width)
555
			$this->body_width = $line_strlen;
556
557
		return $markdown;
558
	}
559
560
	/**
561
	 * Converts blockquotes to markdown > quote style
562
	 *
563
	 * html: <blockquote>quote</blockquote>
564
	 * md: > quote
565
	 *
566
	 * @param object $node
567
	 */
568
	private function _convert_blockquote($node)
569
	{
570
		$markdown = '';
571
572
		// All the contents of this block quote
573
		$value = $this->_get_value($node);
574
		$value = trim($value);
575
576
		// Go line by line
577
		$lines = preg_split('~\r\n|\r|\n~', $value);
578
579
		// Each line gets a '> ' in front of it, just like email quotes really
580
		foreach ($lines as $line)
581
			$markdown .= '> ' . ltrim($line, "\t") . $this->line_end;
582
583
		$markdown .= $this->line_end;
584
		return $markdown;
585
	}
586
587
	/**
588
	 * Converts code tags to markdown span `code` or block code
589
	 * Converts single line code to inline tick mark
590
	 * Converts multi line to 4 space indented code
591
	 *
592
	 * html: <code>code</code>
593
	 * md: `code`
594
	 *
595
	 * @param object $node
596
	 */
597
	private function _convert_code($node)
598
	{
599
		$value = $this->_get_innerHTML($node);
600
601
		// If we have a multi line code block, we are working outside to in, and need to convert the br's ourselves
602
		$value = preg_replace('~<br( /)?' . '>~', "\n", str_replace('&nbsp;', ' ', $value));
603
604
		// If there are html tags in this code block, we need to disable strip tags
605
		// This is NOT the ideal way to handle this, needs something along the lines of preparse and unpreparse.
606
		if ($this->strip_tags && preg_match('~<[^<]+>~', $value))
607
			$this->strip_tags = false;
608
609
		// Get the number of lines of code that we have
610
		$lines = preg_split('~\r\n|\r|\n~', $value);
611
		$total = count($lines);
612
613
		// If there's more than one line of code, use leading four space syntax
614
		if ($total > 1)
615
		{
616
			$first_line = trim($lines[0]);
617
			$last_line = trim($lines[$total - 1]);
618
619
			// Remove any leading and trailing blank lines
620
			if (empty($first_line))
621
				array_shift($lines);
622
			if (empty($last_line))
623
				array_pop($lines);
624
625
			// Convert what remains
626
			$markdown = '';
627
			foreach ($lines as $line)
628
			{
629
				// Adjust the word wrapping since this has code tags, leave it up to
630
				// the email client to mess these up ;)
631
				$line_strlen = strlen($line) + 5;
632
				if ($line_strlen > $this->body_width)
633
					$this->body_width = $line_strlen;
634
635
				$markdown .= str_repeat(' ', 4) . $line . $this->line_end;
636
			}
637
638
			// The parser will encode, but we don't want that for our code block
639
			if ($this->_parser)
640
				$markdown = html_entity_decode($markdown, ENT_QUOTES, 'UTF-8');
641
		}
642
		// Single line, back tick and move on
643
		else
644
		{
645
			// Account for backticks in the single line code itself
646
			$ticks = $this->_has_ticks($node, $value);
647
			if (!empty($ticks))
648
			{
649
				// If the ticks were at the start/end of the word space it off
650
				if ($lines[0][0] == '`' || substr($lines[0], -1) == '`')
651
					$lines[0] = ' ' . $lines[0] . ' ';
652
653
				$markdown = $ticks . ($this->_parser ? html_entity_decode($lines[0], ENT_QUOTES, 'UTF-8') : $lines[0]) . $ticks;
654
			}
655
			else
656
				$markdown = '`' . ($this->_parser ? html_entity_decode($lines[0], ENT_QUOTES, 'UTF-8') : $lines[0]) . '`';
657
		}
658
659
		return $markdown;
660
	}
661
662
	/**
663
	 * Converts <h1> and <h2> headers to markdown-style headers in setext style,
664
	 * all other headers are returned as atx style ### h3
665
	 *
666
	 * html: <h1>header</h1>
667
	 * md: header
668
	 *     ======
669
	 *
670
	 * html: <h3>header</h3>
671
	 * md: ### header
672
	 *
673
	 * @param int $level
674
	 * @param string $content
675
	 */
676
	private function _convert_header($level, $content)
677
	{
678
		$level = (int) ltrim($level, 'h');
679
680
		if ($level < 3)
681
		{
682
			$length = Util::strlen($content);
683
			$underline = ($level === 1) ? '=' : '-';
684
			$markdown = $content . $this->line_end . str_repeat($underline, $length) . $this->line_break;
685
		}
686
		else
687
			$markdown = str_repeat('#', $level) . ' ' . $content . $this->line_break;
688
689
		return $markdown;
690
	}
691
692
	/**
693
	 * Converts <img> tags to markdown
694
	 *
695
	 * html: <img src='source' alt='alt' title='title' />
696
	 * md: ![alt](source 'title')
697
	 *
698
	 * @param object $node
699
	 */
700
	private function _convert_image($node)
701
	{
702
		$src = $node->getAttribute('src');
703
		$alt = $node->getAttribute('alt');
704
		$title = $node->getAttribute('title');
705
706
		if (!empty($title))
707
			$markdown = '![' . $alt . '](' . $src . ' "' . $title . '")';
708
		else
709
			$markdown = '![' . $alt . '](' . $src . ')';
710
711
		return $markdown;
712
	}
713
714
	/**
715
	 * Converts ordered <ol> and unordered <ul> lists to markdown syntax
716
	 *
717
	 * html: <ul><li>one</li></ul>
718
	 * md: * one
719
	 *
720
	 * @param object $node
721
	 */
722
	private function _convert_list($node)
723
	{
724
		$list_type = $this->_parser ? $node->parentNode->nodeName : $node->parentNode()->nodeName();
725
		$value = $this->_get_value($node);
726
727
		$loose = rtrim($value) !== $value;
728
		$depth = max(0, $this->_has_parent_list($node, $this->_parser) - 1);
729
730
		// Unordered lists get a simple bullet
731
		if ($list_type === 'ul')
732
			$markdown = str_repeat("\t", $depth) . '* ' . $value;
733
		// Ordered lists need a number
734
		else
735
		{
736
			$number = $this->_get_list_position($node);
737
			$markdown = str_repeat("\t", $depth) . $number . '. ' . $value;
738
		}
739
740
		return $markdown . (!$loose ? $this->line_end : '');
741
	}
742
743
	/**
	 * Converts tables tags to markdown extra table syntax
	 *
	 * - Have to build top down vs normal inside out due to needing col numbers and widths
	 * - Only tables whose first th sits directly inside a tr are converted
	 * - The first tr is treated as the heading row, all following tr's as data rows
	 * - Raises $this->body_width when the built table is wider than the current value
	 *
	 * @param object $node the table node
	 *
	 * @return string|null the markdown table, nothing when the table has no th
	 * cells or is not well formed
	 */
	private function _convert_table($node)
	{
		$table_heading = $node->getElementsByTagName('th');

		// No heading cells, no markdown table. Asking for the length (instead of
		// fetching item 0) keeps the external parser from indexing an empty array
		$th_num = $this->_get_length($table_heading);
		if (empty($th_num))
			return;

		$first_th = $this->_get_item($table_heading, 0);
		$th_parent = $this->_parser ? $first_th->parentNode->nodeName : $first_th->parentNode()->nodeName();

		// We only markdown well formed tables ...
		if ($th_parent !== 'tr')
			return;

		// Set up for a markdown table, then storm the castle
		$align = array();
		$value = array();
		$width = array();
		$max = array();
		$header = array();
		$rows = array();

		// Get the align and text for each th (html5 this is no longer valid)
		for ($col = 0; $col < $th_num; $col++)
		{
			$cell = $this->_get_table_cell($this->_get_item($table_heading, $col));
			$align[0][$col] = $cell['align'];
			$value[0][$col] = $cell['value'];
			$width[0][$col] = $cell['width'];

			// Seed the max col width, a column needs at least one dash in the
			// separator row or the result is no longer a valid markdown table
			$max[$col] = max(1, $width[0][$col]);
		}

		// Get all of the rows
		$table_rows = $node->getElementsByTagName('tr');
		$num_rows = $this->_get_length($table_rows);
		for ($row = 1; $row < $num_rows; $row++)
		{
			// Start at row 1 and get all of the td's in this row
			$row_data = $this->_get_item($table_rows, $row)->getElementsByTagName('td');
			$num_cells = $this->_get_length($row_data);

			// Simply use the th count as the number of columns, if its not right its not markdown-able anyway
			for ($col = 0; $col < $th_num; $col++)
			{
				// Rows with fewer td's than th's are filled with empty cells, the
				// index guard avoids an undefined offset with the external parser
				$td = $col < $num_cells ? $this->_get_item($row_data, $col) : null;

				$cell = $this->_get_table_cell($td);
				$align[$row][$col] = $cell['align'];
				$value[$row][$col] = $cell['value'];
				$width[$row][$col] = $cell['width'];

				// Keep track of the longest col cell as we go
				if ($width[$row][$col] > $max[$col])
					$max[$col] = $width[$row][$col];
			}
		}

		// The separator row is a run of dashes as wide as each column
		foreach ($max as $col_width)
			$header[] = str_repeat('-', $col_width);

		// Done collecting data, we can rebuild it, we can make it better than it was. Better...stronger...faster
		for ($row = 0; $row < $num_rows; $row++)
		{
			// Build the data for each col, align/pad as needed
			$temp = array();
			for ($col = 0; $col < $th_num; $col++)
				$temp[] = $this->_align_row_content($align[$row][$col], $width[$row][$col], $value[$row][$col], $max[$col]);

			// Join it all up so we have a nice looking row
			$rows[] = '| ' . implode(' | ', $temp) . ' |';

			// Stuff in the header after the th row
			if ($row === 0)
				$rows[] = '| ' . implode(' | ', $header) . ' | ';
		}

		// Adjust the word wrapping since this has a table, will get mussed by email anyway
		$line_strlen = strlen($rows[1]) + 2;
		if ($line_strlen > $this->body_width)
			$this->body_width = $line_strlen;

		// Return what we did so it can be swapped in
		return implode($this->line_end, $rows);
	}

	/**
	 * Helper function for table creation
	 *
	 * - Reads the align attribute, text and text length of a single th / td
	 * - A missing cell (null) is reported as an empty, left aligned cell
	 *
	 * @param object|null $cell
	 *
	 * @return array with the keys align, value and width
	 */
	private function _get_table_cell($cell)
	{
		if ($cell === null)
			return array('align' => 'left', 'value' => '', 'width' => 0);

		// Fetch the text once and reuse it for the width
		$text = $this->_get_value($cell);

		return array(
			'align' => strtolower($cell->getAttribute('align')),
			'value' => $text,
			'width' => Util::strlen($text),
		);
	}
840
841
	/**
	 * Helper function for getting a node object
	 *
	 * - With the internal parser $node is asked for its item()
	 * - With the external parser $node is indexed like an array
	 * - Both return null when there is nothing at the requested position
	 *
	 * @param object $node the collection to read from
	 * @param int $item zero based position in the collection
	 *
	 * @return object|null
	 */
	private function _get_item($node, $item)
	{
		if ($this->_parser)
			return $node->item($item);

		// Check first so a missing index is a quiet null, not an undefined offset
		return isset($node[$item]) ? $node[$item] : null;
	}
854
855
	/**
	 * Helper function for getting a node length
	 *
	 * - Internal parser: reads the length property of $node
	 * - External parser: counts the entries of $node
	 *
	 * @param object $node
	 *
	 * @return int
	 */
	private function _get_length($node)
	{
		return $this->_parser ? $node->length : count($node);
	}
867
868
	/**
	 * Helper function for getting a node value
	 *
	 * - A null node has no value, an empty string is returned
	 * - Internal parser: returns the nodeValue
	 * - External parser: returns the innertext with special chars and entities decoded
	 *
	 * @param object|null $node
	 *
	 * @return string
	 */
	private function _get_value($node)
	{
		if ($node === null)
			return '';

		if (!$this->_parser)
		{
			$plain = htmlspecialchars_decode($node->innertext, ENT_QUOTES);

			return html_entity_decode($plain, ENT_QUOTES, 'UTF-8');
		}

		return $node->nodeValue;
	}
883
884
	/**
	 * Helper function for getting a node name
	 *
	 * - A null node has no name, an empty string is returned
	 * - Internal parser exposes the name as a property, the external one as a method
	 *
	 * @param object|null $node
	 *
	 * @return string
	 */
	private function _get_name($node)
	{
		if ($node === null)
			return '';

		return $this->_parser ? $node->nodeName : $node->nodeName();
	}
899
900
	/**
	 * Helper function for creating ol's
	 *
	 * - Returns the absolute number of an <li> inside an <ol>
	 * - Only li children are counted, so text, comments or other stray children
	 * of the list do not cause a number to be skipped
	 * - Returns 1 when the node is not found below its parent
	 *
	 * @param object $node
	 *
	 * @return int
	 */
	private function _get_list_position($node)
	{
		// Get all of the list nodes inside this parent
		$list_node = $this->_parser ? $node->parentNode : $node->parentNode();
		$total_nodes = $this->_parser ? $list_node->childNodes->length : count($list_node->childNodes());

		// Loop through the children and find where we are in this list
		$position = 0;
		for ($i = 0; $i < $total_nodes; $i++)
		{
			$current_node = $this->_parser ? $list_node->childNodes->item($i) : $list_node->childNodes($i);
			$is_target = $current_node === $node;

			// Anything that is not a list item (and not us) does not get a number
			if (!$is_target && strtolower($this->_get_name($current_node)) !== 'li')
				continue;

			$position++;

			// Found ourselves, no need to look at the rest of the list
			if ($is_target)
				return $position;
		}

		return 1;
	}
925
926
	/**
	 * Helper function for table creation
	 *
	 * - Builds td's to a give width, aligned as needed
	 * - Content that is already as wide as, or wider than, $max is returned as is
	 * - Any align value other than right or center is treated as left
	 *
	 * @param string $align left, right or center
	 * @param int $width current length of $content
	 * @param string $content the cell text
	 * @param int $max the width to pad the cell to
	 *
	 * @return string the padded cell text
	 */
	private function _align_row_content($align, $width, $content, $max)
	{
		// str_repeat can not deal with a negative count, so never go below zero
		$padding = max(0, (int) $max - (int) $width);

		switch ($align)
		{
			case 'right':
				return str_repeat(' ', $padding) . $content;
			case 'center':
				// An odd space goes to the right hand side
				$left = (int) floor($padding / 2);

				return str_repeat(' ', $left) . $content . str_repeat(' ', $padding - $left);
			case 'left':
			default:
				return $content . str_repeat(' ', $padding);
		}
	}
957
958
	/**
	 * Gets the inner html of a node
	 *
	 * - External parser: simply the innertext of the node
	 * - Internal parser: the node is copied in to a scratch document, saved as
	 * html and then has its own opening and closing tag removed
	 *
	 * @param object $node
	 *
	 * @return string
	 */
	private function _get_innerHTML($node)
	{
		if (!$this->_parser)
			return $node->innertext;

		// Render just this node by importing a deep copy in to an empty document
		$scratch = new DOMDocument();
		$copy = $scratch->importNode($node, true);
		$scratch->appendChild($copy);
		$rendered = trim($scratch->saveHTML());

		// Drop the wrapping tag so only the inner content remains
		$name = $node->nodeName;
		$wrapper = '@^<' . $name . '[^>]*>|</' . $name . '>$@';

		return preg_replace($wrapper, '', $rendered);
	}
977
978
	/**
	 * Gets the outer html of a node
	 *
	 * - External parser: the outertext of the node
	 * - Internal parser, PHP 5.3.6 or above: saves the node from $this->doc and
	 * decodes special chars
	 * - Internal parser, older PHP: saves the node from a scratch document, unwraps
	 * it from any body / html tag and decodes special chars and entities
	 *
	 * @param object $node
	 *
	 * @return string
	 */
	private function _get_outerHTML($node)
	{
		if (!$this->_parser)
			return $node->outertext;

		if (version_compare(PHP_VERSION, '5.3.6', '>='))
			return htmlspecialchars_decode($this->doc->saveHTML($node));

		// @todo remove when 5.3.6 min
		$scratch = new DOMDocument();
		$copy = $scratch->importNode($node, true);
		$scratch->appendChild($copy);
		$html = $scratch->saveHTML();

		// We just want the html of the inserted node, it *may* be wrapped
		$wrapped = array();
		if (preg_match('~<body>(.*)</body>~su', $html, $wrapped))
			$html = $wrapped[1];
		elseif (preg_match('~<html>(.*)</html>~su', $html, $wrapped))
			$html = $wrapped[1];

		// Clean it up
		$html = rtrim($html, "\n");
		$html = htmlspecialchars_decode($html, ENT_QUOTES);

		return html_entity_decode($html, ENT_QUOTES, 'UTF-8');
	}
1010
1011
	/**
	 * Escapes markup looking text in html to prevent accidental assignment
	 *
	 * <p>*stuff*</p> should not convert to *stuff* but \*stuff\* since its not to
	 * be converted by md to html as <strong>stuff</strong>
	 *
	 * - Runs each entry of $this->_textEscapeRegex (regex => replacement) over the
	 * text, one after the other, in the order they are defined
	 *
	 * @param string $value
	 *
	 * @return string
	 */
	private function _escape_text($value)
	{
		// Search and replace ...
		$patterns = array_keys($this->_textEscapeRegex);
		foreach ($patterns as $pattern)
		{
			$value = preg_replace('~' . $pattern . '~', $this->_textEscapeRegex[$pattern], $value);
		}

		return $value;
	}
1027
1028
	/**
	 * If inline code contains backticks ` as part of its content, we need to wrap them so
	 * when markdown is run we don't interpret the ` as additional code blocks
	 *
	 * - Inside of a pre the passed value is returned untouched
	 * - With no backticks in the value an empty string is returned
	 * - Otherwise returns the shortest run of backticks that does not appear
	 * in the value, e.g. <code>`bla`</code> needs `` as its fence
	 *
	 * @param object $node
	 * @param string $value
	 *
	 * @return string
	 */
	private function _has_ticks($node, $value)
	{
		$parent_name = $this->_parser ? $node->parentNode->nodeName : $node->parentNode()->nodeName();

		// Inside of a pre, we don't do anything
		if ($parent_name === 'pre')
			return $value;

		// Collect every run of backticks found in the code
		$found = array();
		preg_match_all('~`+~', $value, $found);

		// No ticks in the hair, nothing special is needed
		if (empty($found[0]))
			return '';

		// Backtick as many as needed so markdown will work
		$fence = '`';
		while (in_array($fence, $found[0]))
			$fence .= '`';

		return $fence;
	}
1064
1065
	/**
	 * Breaks a string up so its no more than width characters long
	 *
	 * - Will break at word boundaries
	 * - If no natural space is found will break mid-word
	 * - Empty lines are kept, they are what separates markdown blocks
	 * - Lines that start with > have > added to each wrapped section
	 * - A width below 1 returns the string unchanged
	 *
	 * @param string $string
	 * @param int $width
	 * @param string $break
	 *
	 * @return string
	 */
	private function _utf8_wordwrap($string, $width = 75, $break = "\n")
	{
		// Nothing sensible can be done with no width, and the loop below
		// would never finish since it could not shorten the string
		$width = (int) $width;
		if ($width < 1)
			return $string;

		$strings = explode($break, $string);
		$lines = array();

		foreach ($strings as $string)
		{
			// Keep blank lines or paragraphs, lists, etc would run together
			if ($string === '')
			{
				$lines[] = '';
				continue;
			}

			$in_quote = $string[0] === '>';

			// Compare against '' and not empty() so a line of just 0 is not lost
			while ($string !== '')
			{
				// Get the next #width characters before a break (space, punctuation tab etc)
				if (preg_match('~^(.{1,' . $width . '})(?:\s|$|,|\.)~u', $string, $matches))
				{
					// Add the #width to the output and set up for the next pass
					$lines[] = ($in_quote && $matches[1][0] !== '>' ? '> ' : '') . $matches[1];
					$string = (string) Util::substr($string, Util::strlen($matches[1]));
				}
				// Humm just a long word with no place to break so we simply cut it after width characters
				else
				{
					$lines[] = ($in_quote && $string[0] !== '>' ? '> ' : '') . Util::substr($string, 0, $width);
					$string = (string) Util::substr($string, $width);
				}
			}
		}

		// Join it all the shortened sections up on our break characters
		return implode($break, $lines);
	}
1104
}
1105