Completed
Push — patch-1.0.7 ( cb33f1...94fbf1 )
by Emanuele
08:46
created

Html_2_Md::__construct()   B

Complexity

Conditions 1
Paths 1

Size

Total Lines 34
Code Lines 17

Duplication

Lines 0
Ratio 0 %
Metric Value
dl 0
loc 34
rs 8.8571
cc 1
eloc 17
nc 1
nop 1
1
<?php
2
3
/**
4
 * Converts HTML to Markdown text
5
 *
6
 * @name      ElkArte Forum
7
 * @copyright ElkArte Forum contributors
8
 * @license   BSD http://opensource.org/licenses/BSD-3-Clause
9
 *
10
 * @version 1.0.4
11
 *
12
 */
13
14
if (!defined('ELK'))
15
	die('No access...');
16
17
/**
18
 * Converts HTML to Markdown text
19
 */
20
class Html_2_Md
21
{
22
	/**
23
	 * The value that will hold our dom object
24
	 * @var object
25
	 */
26
	public $doc;
27
28
	/**
29
	 * The value that will hold if we are using the internal or external parser
30
	 * @var boolean
31
	 */
32
	private $_parser;
33
34
	/**
35
	 * Line end character
36
	 * @var string
37
	 */
38
	public $line_end = "\n";
39
40
	/**
41
	 * Line break character
42
	 * @var string
43
	 */
44
	public $line_break = "\n\n";
45
46
	/**
47
	 * Wordwrap output, set to 0 to skip wrapping
48
	 * @var int
49
	 */
50
	public $body_width = 76;
51
52
	/**
53
	 * Strip remaining tags, set to false to leave them in
54
	 * @var boolean
55
	 */
56
	public $strip_tags = true;
57
58
	/**
59
	 * Regex to run on plain text to prevent markdown from erroneously converting
60
	 * @var array<string,string> regex pattern => replacement
61
	 */
62
	private $_textEscapeRegex = array();
63
64
	/**
65
	 * The passed html string to convert
66
	 * @var string
67
	 */
68
	public $html;
69
70
	/**
71
	 * The markdown equivalent of the html string
72
	 * @var string
73
	 */
74
	public $markdown;
75
76
	/**
	 * Gets everything started using the built in or external parser
	 *
	 * - Removes whitespace that sits between html tags
	 * - Masks "?" characters adjacent to angle brackets, _clean_markdown() restores them
	 * - Loads the html in to the selected parser via _set_parser()
	 * - Prepares the regex list used to escape plain text that looks like markdown
	 *
	 * @param string $html string of html to convert to MD text
	 */
	public function __construct($html)
	{
		// Up front, remove whitespace between html tags
		$stripped = preg_replace('/(?:(?<=\>)|(?<=\/\>))(\s+)(?=\<\/?)/', '', $html);

		// preg_replace returns null on a PCRE failure, keep the supplied html rather than losing it
		$this->html = $stripped === null ? $html : $stripped;

		// The XML parser will not deal gracefully with these
		$this->html = strtr($this->html, array(
			'?<' => '|?|<',
			'?>' => '|?|>',
			'>?' => '>|?|',
			'<?' => '&lt?'
		));

		// Set the dom parser to use and load the HTML to the parser
		$this->_set_parser();

		// Initialize the regex array (pattern => replacement) to escape text areas so
		// markdown does not interpret plain text as markdown syntax
		$this->_textEscapeRegex = array(
			// Things that may convert to an hr --- or - - - etc
			'([-*_])([ ]{0,2}\1){2,}' => '\\\\$0|',
			// or **stuff** => \*\*stuff\*\*
			'\*\*([^*\s]+)\*\*' => '\*\*$1\*\*',
			// or versions of *italic* __italic__ _italic_
			'\*([^*\s]+)\*' => '\*$1\*',
			// NOTE(review): (?!<_| ) in the next two patterns reads like a mistyped negative
			// lookbehind (?<!_| ); left unchanged here, confirm the intent before altering
			'__(?! |_)(.+)(?!<_| )__' => '\_\_$1\_\_',
			'_(?! |_)(.+)(?!<_| )_' => '\_$1\_',
			// nor `code`
			'`(.+)`' => '\`$1\`',
			// or links
			'\[(.+)\](\s*\()' => '\[$1\]$2',
			'\[(.+)\](\s*)\[(.*)\]' => '\[$1\]$2\[$3\]',
		);
	}
115
116
	/**
	 * Selects the DOM parser used by the class and feeds it the prepared HTML
	 *
	 * - Uses DOMDocument when the class exists, flagging $_parser as true
	 * - Otherwise loads the external simple_html_dom library, flagging $_parser as false
	 */
	private function _set_parser()
	{
		// No built in DOM support, fall back on the external simple html parser
		if (!class_exists('DOMDocument'))
		{
			$this->_parser = false;
			require_once(EXTDIR . '/simple_html_dom.php');
			$this->doc = str_get_html($this->html, true, true, 'UTF-8', false);

			return;
		}

		// Using PHP built in functions, keep structural errors quiet while loading
		$this->_parser = true;
		$libxml_state = libxml_use_internal_errors(true);

		// Basic parameters for DomDocument
		$dom = new DOMDocument();
		$this->doc = $dom;
		$dom->preserveWhiteSpace = false;
		$dom->encoding = 'UTF-8';
		$dom->loadHTML('<?xml encoding="UTF-8">' . $this->html);

		// Put the error handling back to what it was, and flush what was collected
		libxml_use_internal_errors($libxml_state);
		libxml_clear_errors();
	}
145
146
	/**
	 * Converts the loaded html to markdown text
	 *
	 * - Locates the body (built in parser) or root (external parser) node and runs
	 *   the conversion loop over all of its DOM nodes
	 * - Cleans the result and optionally word wraps it to $body_width
	 *
	 * @return string the markdown text, an empty string when there is nothing to parse
	 */
	public function get_markdown()
	{
		// Find the starting node, if there is nothing to parse, its quite easy
		if ($this->_parser)
		{
			$start = $this->doc->getElementsByTagName('body')->item(0);
			if ($start === null)
				return '';
		}
		else
		{
			if ($this->doc === false)
				return '';

			$start = $this->doc->root;
		}

		// For this html node, find all child elements and convert
		$this->_convert_childNodes($start);

		// Done replacing HTML elements, now get the converted DOM tree back into a string
		if ($this->_parser)
		{
			// The internal DOM methods need a little extra work, decode what the parser encoded
			$this->markdown = $this->doc->saveHTML();
			$this->markdown = html_entity_decode(htmlspecialchars_decode($this->markdown, ENT_QUOTES), ENT_QUOTES, 'UTF-8');

			// Keep only what sits inside the body wrapper, or failing that the html wrapper
			$found = array();
			if (preg_match('~<body>(.*)</body>~s', $this->markdown, $found))
				$this->markdown = $found[1];
			elseif (preg_match('~<html>(.*)</html>~s', $this->markdown, $found))
				$this->markdown = $found[1];
		}
		else
			$this->markdown = $this->doc->save();

		// Clean up any excess spacing etc
		$this->_clean_markdown();

		// Wordwrap?
		if (!empty($this->body_width))
			$this->markdown = $this->_utf8_wordwrap($this->markdown, $this->body_width, $this->line_end);

		return $this->markdown;
	}
183
184
	/**
	 * Normalize any spacing and excess blank lines that may have been generated
	 *
	 * - Works directly on $this->markdown
	 * - Swaps non breaking spaces for regular ones and strips leftover tags when $strip_tags is set
	 * - Restores the "?" sequences that the constructor masked from the XML parser
	 */
	private function _clean_markdown()
	{
		// Remove non breakable spaces that may be hiding in here, the nbsp + space pair
		// goes first so it collapses to a single space
		$this->markdown = str_replace(array("\xC2\xA0\x20", "\xC2\xA0"), ' ', $this->markdown);

		// Remove any "bonus" tags
		if ($this->strip_tags)
			$this->markdown = strip_tags($this->markdown);

		// Replace content that we "hide" from the XML parsers
		$this->markdown = strtr($this->markdown, array(
			'|?|' => '?',
			'&lt?' => '<?'
		));

		// Strip the chaff and any excess blank lines we may have produced
		$this->markdown = trim($this->markdown);
		$this->markdown = preg_replace("~(\n(\s)?){3,}~", "\n\n", $this->markdown);
		$this->markdown = preg_replace("~(^\s\s\n){3,}~m", "  \n  \n", $this->markdown);
		$this->markdown = preg_replace("~(^\s\s\r?\n){3,}~m", "  \n  \n", $this->markdown);
		$this->markdown = preg_replace("~(^\s\s(?:\r?\n){2}){3,}~m", "  \n  \n", $this->markdown);
	}
210
211
	/**
	 * For a given node, checks if it is anywhere nested inside of a code block
	 *  - Prevents converting anything that's inside a code block
	 *
	 * @param object $node
	 * @param boolean $parser flag for internal or external parser
	 * @return boolean true when any ancestor of the node is a code tag
	 */
	private static function _has_parent_code($node, $parser)
	{
		$parent = $parser ? $node->parentNode : $node->parentNode();

		// The loop ends by itself once we run out of ancestors
		while ($parent)
		{
			// Anywhere nested inside a code block we don't render tags
			$tag = $parser ? $parent->nodeName : $parent->nodeName();
			if ($tag === 'code')
				return true;

			// Back out another level, until we are done
			$parent = $parser ? $parent->parentNode : $parent->parentNode();
		}

		return false;
	}
237
238
	/**
	 * Get the nesting level when inside a list
	 *
	 * - Walks up the ancestors of the node, counting each ul / ol found
	 *
	 * @param object $node
	 * @param boolean $parser flag for internal or external parser
	 * @return int number of list tags that enclose the node
	 */
	private static function _has_parent_list($node, $parser)
	{
		$levels = 0;

		// Back out one level at a time until there are no more ancestors
		for ($ancestor = $parser ? $node->parentNode : $node->parentNode(); $ancestor; $ancestor = $parser ? $ancestor->parentNode : $ancestor->parentNode())
		{
			// Anywhere nested inside a list we need to get the depth
			$name = $parser ? $ancestor->nodeName : $ancestor->nodeName();
			if (in_array($name, array('ul', 'ol')))
				$levels++;
		}

		return $levels;
	}
263
264
	/**
	 * Traverse each node to its base, then convert tags to markup on the way back out
	 *
	 * - Nodes nested inside a code block are left alone
	 * - Children are converted first, then the node itself
	 *
	 * @param object $node
	 */
	private function _convert_childNodes($node)
	{
		// Nothing inside a code block gets converted
		$inside_code = self::_has_parent_code($node, $this->_parser);
		if ($inside_code)
			return;

		// Keep traversing till we are at the base of this node
		if ($node->hasChildNodes())
		{
			// The child count is taken once, before any child is converted
			$total = $this->_parser ? $node->childNodes->length : count($node->childNodes());
			$index = 0;
			while ($index < $total)
			{
				$this->_convert_childNodes($this->_parser ? $node->childNodes->item($index) : $node->childNodes($index));
				$index++;
			}
		}

		// At the root of this node, convert it to markdown
		$this->_convert_to_markdown($node);
	}
288
289
	/**
	 * Convert the supplied node into its markdown equivalent
	 *  - Supports *some* markdown extra tags, namely: table, abbr & dl in a limited fashion
	 *  - The node is replaced in the document by its markdown text, table parts
	 *    (th, tr, td, tbody, tfoot, thead) are left in place for the table handler
	 *
	 * @param object $node
	 */
	private function _convert_to_markdown($node)
	{
		// Marker used for nodes that must not be replaced
		$skip = '~`skip`~';

		// HTML tag we are dealing with
		$name = $this->_get_name($node);

		// Based on the tag, determine how to convert
		switch ($name)
		{
			// Tags with a dedicated conversion helper
			case 'a':
				$replacement = $this->_convert_anchor($node);
				break;
			case 'abbr':
				$replacement = $this->_convert_abbr($node);
				break;
			case 'blockquote':
				$replacement = $this->_convert_blockquote($node);
				break;
			case 'code':
				$replacement = $this->_convert_code($node);
				break;
			case 'img':
				$replacement = $this->_convert_image($node);
				break;
			case 'li':
				$replacement = $this->_convert_list($node);
				break;
			case 'table':
				$replacement = $this->_convert_table($node) . $this->line_break;
				break;
			case 'h1':
			case 'h2':
			case 'h3':
			case 'h4':
			case 'h5':
			case 'h6':
				$replacement = $this->_convert_header($name, $this->_get_value($node));
				break;

			// Inline emphasis
			case 'b':
			case 'strong':
				$replacement = '**' . $this->_get_value($node) . '**';
				break;
			case 'em':
			case 'i':
				$replacement = '_' . $this->_get_value($node) . '_';
				break;

			// Line and block level tags
			case 'br':
				// DomDocument strips empty lines, this prevents that
				$replacement = "\xC2\xA0\xC2\xA0" . $this->line_break;
				break;
			case 'hr':
				$replacement = $this->line_end . '---' . $this->line_end;
				break;
			case 'center':
				$replacement = $this->line_end . $this->_get_value($node) . $this->line_end;
				break;
			case 'div':
				$replacement = $this->line_end . $this->_get_value($node) . $this->line_end;
				if (!$node->hasChildNodes())
					$replacement = $this->_escape_text($replacement);
				break;
			case 'p':
				if ($node->hasChildNodes())
					$replacement = rtrim($this->_get_value($node)) . $this->line_break;
				else
					$replacement = $this->_escape_text(str_replace("\n", ' ', $this->_get_value($node)) . $this->line_break);
				break;
			case 'pre':
				$replacement = $this->_get_value($node) . $this->line_break;
				break;
			case 'title':
				$replacement = '# ' . $this->_get_value($node) . $this->line_break;
				break;

			// Lists and definition lists
			case 'ol':
			case 'ul':
				$replacement = rtrim($this->_get_value($node)) . $this->line_break;
				break;
			case 'dl':
				$replacement = trim($this->_get_value($node)) . $this->line_break;
				break;
			case 'dt':
				$replacement = str_replace(array("\n", "\r", "\n\r"), '', $this->_get_value($node)) . $this->line_end;
				break;
			case 'dd':
				$replacement = ':   ' . $this->_get_value($node) . $this->line_break;
				break;

			// Table parts, just skip over these as we handle them in the table tag itself
			case 'th':
			case 'tr':
			case 'td':
			case 'tbody':
			case 'tfoot':
			case 'thead':
				$replacement = $skip;
				break;

			// Remove these tags and simply replace with the text inside the tags
			case 'root':
			case 'span':
			case 'body':
				$replacement = $this->_get_innerHTML($node);
				break;

			// Don't know you or text (there is no dedicated #text case), so just preserve whats there
			default:
				$replacement = $this->_get_outerHTML($node);
		}

		// Skipped nodes stay in the document as they are
		if ($replacement === $skip)
			return;

		// The external parser simply takes the new outer text
		if (!$this->_parser)
		{
			$node->outertext = $replacement;

			return;
		}

		// Create a new text node with our markdown and replace the original node
		$node->parentNode->replaceChild($this->doc->createTextNode($replacement), $node);
	}
419
420
	/**
	 * Converts <abbr> tags to markdown (extra)
	 *
	 * html: <abbr title="Hyper Text Markup Language">HTML</abbr>
	 * md:   *[HTML]: Hyper Text Markup Language
	 *
	 * - When the tag carries no title there is nothing to define, the text of the
	 *   tag is returned as is so it is not lost from the output
	 *
	 * @param object $node
	 * @return string
	 */
	private function _convert_abbr($node)
	{
		$title = $node->getAttribute('title');
		$value = $this->_get_value($node);

		// No title, no definition: keep the abbreviation text itself
		if (empty($title))
			return $value;

		return '*[' . $value . ']: ' . $title . $this->line_break;
	}
440
441
	/**
	 * Converts <a> tags to markdown
	 *
	 * html: <a href='http://somesite.com' title='Title'>Awesome Site</a>
	 * md: [Awesome Site](http://somesite.com 'Title')
	 *
	 * - Links classed as target or footnote_return are reduced to their text
	 * - Widens $body_width, when wrapping is enabled, so the link is not broken by the wordwrap
	 *
	 * @param object $node
	 * @return string
	 */
	private function _convert_anchor($node)
	{
		global $txt;

		$raw_href = $node->getAttribute('href');
		$href = htmlentities($raw_href, ENT_COMPAT, 'UTF-8', false);
		$title = $node->getAttribute('title');
		$class = $node->getAttribute('class');
		$value = $this->_get_value($node);

		// Provide a more compact [name] if none is given, strict compare so that
		// numeric looking strings are not treated as equal
		if ($value === $raw_href || empty($value))
			$value = empty($title) ? $txt['link'] : $title;

		// Special processing just for our own footnotes
		if ($class === 'target' || $class === 'footnote_return')
			$markdown = $value;
		elseif (!empty($title))
			$markdown = '[' . $value . '](' . $href . ' "' . $title . '")';
		else
			$markdown = '[' . $value . '](' . $href . ')';

		// Some links can be very long and if we wrap them they break. A body_width of 0
		// means wrapping is off, and it must stay off
		$line_strlen = Util::strlen($markdown);
		if (!empty($this->body_width) && $line_strlen > $this->body_width)
			$this->body_width = $line_strlen;

		return $markdown;
	}
478
479
	/**
	 * Converts blockquotes to markdown > quote style
	 *
	 * html: <blockquote>quote</blockquote>
	 * md: > quote
	 *
	 * @param object $node
	 * @return string the quoted lines followed by an empty line
	 */
	private function _convert_blockquote($node)
	{
		// All the contents of this block quote, one entry per line
		$text = trim($this->_get_value($node));
		$quoted = array();

		// Each line gets a '> ' in front of it, just like email quotes really
		foreach (preg_split('~\r\n|\r|\n~', $text) as $row)
			$quoted[] = '> ' . ltrim($row, "\t");

		// Every line is terminated, plus one more line end to close the quote
		return implode($this->line_end, $quoted) . $this->line_end . $this->line_end;
	}
505
506
	/**
	 * Converts code tags to markdown span `code` or block code
	 * Converts single line code to inline tick mark
	 * Converts multi line to 4 space indented code
	 *
	 * html: <code>code</code>
	 * md: `code`
	 *
	 * - Turns $strip_tags off for the instance when the code itself contains tags
	 * - Widens $body_width, when wrapping is enabled, so code lines are not wrapped
	 *
	 * @param object $node
	 * @return string
	 */
	private function _convert_code($node)
	{
		$value = $this->_get_innerHTML($node);

		// If we have a multi line code block, we are working outside to in, and need to convert the br's ourselfs
		$value = preg_replace('~<br( /)?' . '>~', "\n", str_replace('&nbsp;', ' ', $value));

		// If there are html tags in this code block, we need to disable strip tags
		// This is NOT the ideal way to handle this, needs something along the lines of preparse and unpreparse.
		if ($this->strip_tags && preg_match('~<[^<]+>~', $value))
			$this->strip_tags = false;

		// Get the number of lines of code that we have
		$lines = preg_split('~\r\n|\r|\n~', $value);
		$total = count($lines);

		// If there's more than one line of code, use leading four space syntax
		if ($total > 1)
		{
			$first_line = trim($lines[0]);
			$last_line = trim($lines[$total - 1]);

			// Remove any leading and trailing blank lines. Compared against '' since
			// empty() would also throw away a line holding just "0"
			if ($first_line === '')
				array_shift($lines);
			if ($last_line === '')
				array_pop($lines);

			// Convert what remains
			$markdown = '';
			foreach ($lines as $line)
			{
				// Adjust the word wrapping since this has code tags, leave it up to
				// the email client to mess these up ;)  A body_width of 0 means
				// wrapping is off, and it must stay off
				$line_strlen = strlen($line) + 5;
				if (!empty($this->body_width) && $line_strlen > $this->body_width)
					$this->body_width = $line_strlen;

				$markdown .= str_repeat(' ', 4) . $line . $this->line_end;
			}

			// The parser will encode, but we don't want that for our code block
			if ($this->_parser)
				$markdown = html_entity_decode($markdown, ENT_QUOTES, 'UTF-8');

			return $markdown;
		}

		// Single line, back tick and move on, accounting for backticks in the code itself
		$ticks = $this->_has_ticks($node, $value);
		if (empty($ticks))
			$ticks = '`';
		// If the ticks were at the start/end of the word space it off, substr is used
		// so an empty line does not trigger a string offset error
		elseif (substr($lines[0], 0, 1) === '`' || substr($lines[0], -1) === '`')
			$lines[0] = ' ' . $lines[0] . ' ';

		return $ticks . ($this->_parser ? html_entity_decode($lines[0], ENT_QUOTES, 'UTF-8') : $lines[0]) . $ticks;
	}
580
581
	/**
	 * Converts <h1> and <h2> headers to markdown-style headers in setex style,
	 * all other headers are returned as atx style ### h3
	 *
	 * html: <h1>header</h1>
	 * md: header
	 *     ======
	 *
	 * html: <h3>header</h3>
	 * md: ### header
	 *
	 * @param string $level the header tag name, h1 to h6
	 * @param string $content
	 * @return string
	 */
	private function _convert_header($level, $content)
	{
		// Drop the leading h to get the numeric level
		$depth = (int) ltrim($level, 'h');

		// Level 3 and deeper use the atx form
		if ($depth >= 3)
			return str_repeat('#', $depth) . ' ' . $content . $this->line_break;

		// Levels 1 and 2 are underlined with = or - to the length of the text
		$rule = str_repeat($depth === 1 ? '=' : '-', Util::strlen($content));

		return $content . $this->line_end . $rule . $this->line_break;
	}
610
611
	/**
	 * Converts <img> tags to markdown
	 *
	 * html: <img src='source' alt='alt' title='title' />
	 * md: ![alt](source "title")
	 *
	 * @param object $node
	 * @return string
	 */
	private function _convert_image($node)
	{
		$src = $node->getAttribute('src');
		$alt = $node->getAttribute('alt');
		$title = $node->getAttribute('title');

		// The quoted title is only added when there is one
		$suffix = empty($title) ? '' : ' "' . $title . '"';

		return '![' . $alt . '](' . $src . $suffix . ')';
	}
632
633
	/**
	 * Converts ordered <ol> and unordered <ul> list items to markdown syntax
	 *
	 * html: <ul><li>one</li></ul>
	 * md * one
	 *
	 * - Nested items are indented with one tab per level
	 * - A line end is appended unless the item text already ends in whitespace
	 *
	 * @param object $node the li node
	 * @return string
	 */
	private function _convert_list($node)
	{
		$parent_tag = $this->_parser ? $node->parentNode->nodeName : $node->parentNode()->nodeName();
		$text = $this->_get_value($node);

		// Text that already ends in whitespace needs no further line end
		$needs_line_end = rtrim($text) === $text;
		$indent = str_repeat("\t", max(0, $this->_has_parent_list($node, $this->_parser) - 1));

		// Unordered lists get a simple bullet, ordered lists need a number
		if ($parent_tag === 'ul')
			$marker = '* ';
		else
			$marker = $this->_get_list_position($node) . '. ';

		$markdown = $indent . $marker . $text;
		if ($needs_line_end)
			$markdown .= $this->line_end;

		return $markdown;
	}
661
662
	/**
	 * Converts tables tags to markdown extra table syntax
	 *
	 * - Have to build top down vs normal inside out due to needing col numbers and widths
	 * - Only tables whose first th sits directly in a tr are converted
	 * - Widens $body_width, when wrapping is enabled, so the rows are not wrapped
	 *
	 * @param object $node
	 * @return string the markdown table, an empty string when the table can not be converted
	 */
	private function _convert_table($node)
	{
		// Without a heading there is nothing we can build
		$table_heading = $node->getElementsByTagName('th');
		$first_th = $this->_get_item($table_heading, 0);
		if ($first_th === null)
			return '';

		// We only markdown well formed tables ...
		$th_parent = $this->_parser ? $first_th->parentNode->nodeName : $first_th->parentNode()->nodeName();
		if ($th_parent !== 'tr')
			return '';

		// Set up for a markdown table, then storm the castle
		$align = array();
		$value = array();
		$width = array();
		$max = array();
		$header = array();
		$rows = array();

		// Find out how many columns we are dealing with
		$th_num = $this->_get_length($table_heading);

		for ($col = 0; $col < $th_num; $col++)
		{
			// Get the align and text for each th (html5 this is no longer valid)
			$th = $this->_get_item($table_heading, $col);
			$align_value = ($th !== null) ? strtolower($th->getAttribute('align')) : false;
			$align[0][$col] = $align_value === false ? 'left' : $align_value;
			$value[0][$col] = $this->_get_value($th);
			$width[0][$col] = Util::strlen($this->_get_value($th));

			// Seed the max col width
			$max[$col] = $width[0][$col];
		}

		// Get all of the rows
		$table_rows = $node->getElementsByTagName('tr');
		$num_rows = $this->_get_length($table_rows);
		for ($row = 1; $row < $num_rows; $row++)
		{
			// Start at row 1 and get all of the td's in this row
			$row_data = $this->_get_item($table_rows, $row)->getElementsByTagName('td');

			// Simply use the th count as the number of columns, if its not right its not markdown-able anyway
			for ($col = 0; $col < $th_num; $col++)
			{
				// Get the align and text for each td in this row
				$td = $this->_get_item($row_data, $col);
				$align_value = ($td !== null) ? strtolower($td->getAttribute('align')) : false;
				$align[$row][$col] = $align_value === false ? 'left' : $align_value;
				$value[$row][$col] = $this->_get_value($td);
				$width[$row][$col] = Util::strlen($this->_get_value($td));

				// Keep track of the longest col cell as we go
				if ($width[$row][$col] > $max[$col])
					$max[$col] = $width[$row][$col];
			}
		}

		// Done collecting data, we can rebuild it, we can make it better than it was. Better...stronger...faster
		for ($row = 0; $row < $num_rows; $row++)
		{
			$temp = array();
			for ($col = 0; $col < $th_num; $col++)
			{
				// Build the header row once
				if ($row === 0)
					$header[] = str_repeat('-', $max[$col]);

				// Build the data for each col, align/pad as needed
				$temp[] = $this->_align_row_content($align[$row][$col], $width[$row][$col], $value[$row][$col], $max[$col]);
			}

			// Join it all up so we have a nice looking row
			$rows[] = '| ' . implode(' | ', $temp) . ' |';

			// Stuff in the header after the th row
			if ($row === 0)
				$rows[] = '| ' . implode(' | ', $header) . ' | ';
		}

		// Adjust the word wrapping since this has a table, will get mussed by email anyway.
		// A body_width of 0 means wrapping is off, and it must stay off
		$line_strlen = strlen($rows[1]) + 2;
		if (!empty($this->body_width) && $line_strlen > $this->body_width)
			$this->body_width = $line_strlen;

		// Return what we did so it can be swapped in
		return implode($this->line_end, $rows);
	}
759
760
	/**
	 * Helper function for getting a single node out of a node collection
	 *
	 * - When $this->_parser is set the collection is queried through ->item()
	 * - Otherwise the collection is read with array access
	 * - A position that does not exist yields null, callers (e.g. the table
	 * builder) compare the result against null
	 *
	 * @param object|array $node the collection of nodes to pull from
	 * @param int $item zero based position of the wanted node
	 *
	 * @return object|null the node at that position, null when there is none
	 */
	private function _get_item($node, $item)
	{
		if ($this->_parser)
			return $node->item($item);

		// Reading a missing offset directly raises an undefined offset notice, so test for it first
		return isset($node[$item]) ? $node[$item] : null;
	}
773
774
	/**
	 * Helper function for getting the number of nodes in a collection
	 *
	 * - When $this->_parser is set the collection's length property is read
	 * - Otherwise the collection is run through count()
	 *
	 * @param object|array $node the collection of nodes to size up
	 *
	 * @return int how many nodes the collection holds
	 */
	private function _get_length($node)
	{
		return $this->_parser ? $node->length : count($node);
	}
786
787
	/**
	 * Helper function for getting the text value of a node
	 *
	 * - A null node gives an empty string
	 * - When $this->_parser is set the node's nodeValue is handed back as is
	 * - Otherwise the node's innertext is passed through htmlspecialchars_decode
	 * and then html_entity_decode (UTF-8, quotes included)
	 *
	 * @param object|null $node the node to read, may be null
	 *
	 * @return string the value found for the node
	 */
	private function _get_value($node)
	{
		$text = '';

		if ($node !== null)
		{
			if ($this->_parser)
				$text = $node->nodeValue;
			else
			{
				// Undo the special char encoding first, then any remaining entities
				$decoded = htmlspecialchars_decode($node->innertext, ENT_QUOTES);
				$text = html_entity_decode($decoded, ENT_QUOTES, 'UTF-8');
			}
		}

		return $text;
	}
802
803
	/**
	 * Helper function for getting the name of a node
	 *
	 * - A null node gives an empty string
	 * - When $this->_parser is set the nodeName property is read
	 * - Otherwise the node's nodeName() method is called
	 *
	 * @param object|null $node the node to name, may be null
	 *
	 * @return string the node name, empty when there is no node
	 */
	private function _get_name($node)
	{
		// Nothing to name
		if ($node === null)
			return '';

		return $this->_parser ? $node->nodeName : $node->nodeName();
	}
818
819
	/**
	 * Helper function for creating ol's
	 *
	 * - Returns the absolute number of an <li> inside its parent list
	 * - Only li siblings that come before the node are counted, so other
	 * children of the list (comments, stray text) do not inflate the number
	 * - If the node is not found among its parent's children, 1 is returned
	 *
	 * @param object $node the list item to locate
	 *
	 * @return int the one based position of the node in its list
	 */
	private function _get_list_position($node)
	{
		// Get the parent list and how many children it has, using whichever parser is active
		if ($this->_parser)
		{
			$list_node = $node->parentNode;
			$total_nodes = $list_node->childNodes->length;
		}
		else
		{
			$list_node = $node->parentNode();
			$total_nodes = count($list_node->childNodes());
		}

		// Walk the children, counting the li's we pass until we reach ourselves
		$li_before = 0;
		for ($i = 0; $i < $total_nodes; $i++)
		{
			$current_node = $this->_parser ? $list_node->childNodes->item($i) : $list_node->childNodes($i);

			// Found it, no need to look at the rest of the list
			if ($current_node === $node)
				return $li_before + 1;

			if (strtolower($this->_get_name($current_node)) === 'li')
				$li_before++;
		}

		// Not located, fall back to the first position
		return 1;
	}
844
845
	/**
	 * Helper function for table creation
	 *
	 * - Pads a cell with spaces so it fills the column width
	 * - 'right' pads in front, 'center' splits the padding with any odd space
	 * going to the right side, everything else pads behind (left aligned)
	 *
	 * @param string $align left, right or center, other values act as left
	 * @param int $width the current width of the content
	 * @param string $content the cell text
	 * @param int $max the width the cell should be padded out to
	 *
	 * @return string the padded cell text
	 */
	private function _align_row_content($align, $width, $content, $max)
	{
		// Total number of spaces this cell is short of the column width
		$gap = $max - $width;

		switch ($align)
		{
			case 'left':
				break;
			case 'right':
				return str_repeat(' ', $gap) . $content;
			case 'center':
				$before = floor($gap / 2);
				$after = $gap - $before;

				return str_repeat(' ', $before) . $content . str_repeat(' ', $after);
		}

		// Left aligned, also the fallback for anything we do not recognize
		return $content . str_repeat(' ', $gap);
	}
876
877
	/**
	 * Gets the inner html of a node
	 *
	 * - When $this->_parser is not set the node's innertext is returned
	 * - Otherwise the node is copied into a fresh DOMDocument, saved as html
	 * and the node's own opening / closing tag is removed from the result
	 *
	 * @param object $node the node whose contents are wanted
	 *
	 * @return string the html found inside the node
	 */
	private function _get_innerHTML($node)
	{
		if (!$this->_parser)
			return $node->innertext;

		// Serialize a deep copy of just this node
		$scratch = new DOMDocument();
		$copy = $scratch->importNode($node, true);
		$scratch->appendChild($copy);
		$markup = trim($scratch->saveHTML());

		// Drop the wrapping tag pair so only the inside remains
		$name = $node->nodeName;
		$wrapper = '@^<' . $name . '[^>]*>|</' . $name . '>$@';

		return preg_replace($wrapper, '', $markup);
	}
896
897
	/**
	 * Gets the outer html of a node
	 *
	 * - When $this->_parser is not set the node's outertext is returned
	 * - On PHP 5.3.6 and above the node is saved directly from $this->doc
	 * - On older versions the node is copied to a new document, saved, and
	 * any body / html wrapper found around it is stripped before decoding
	 *
	 * @param object $node the node to render
	 *
	 * @return string the html of the node including its own tag
	 */
	private function _get_outerHTML($node)
	{
		if (!$this->_parser)
			return $node->outertext;

		if (version_compare(PHP_VERSION, '5.3.6', '>='))
			return htmlspecialchars_decode($this->doc->saveHTML($node));

		// @todo remove when 5.3.6 min
		$holder = new DOMDocument();
		$holder->appendChild($holder->importNode($node, true));
		$markup = $holder->saveHTML();

		// Only the inserted node is wanted, but the save *may* have wrapped it
		if (preg_match('~<body>(.*)</body>~s', $markup, $inside))
			$markup = $inside[1];
		elseif (preg_match('~<html>(.*)</html>~s', $markup, $inside))
			$markup = $inside[1];

		// Lose the trailing newlines and decode what is left
		$markup = htmlspecialchars_decode(rtrim($markup, "\n"), ENT_QUOTES);

		return html_entity_decode($markup, ENT_QUOTES, 'UTF-8');
	}
929
930
	/**
	 * Escapes markup looking text in html to prevent accidental assignment
	 *
	 * <p>*stuff*</p> should not convert to *stuff* but \*stuff\* since its not to
	 * be converted by md to html as <strong>stuff</strong>
	 *
	 * - Every entry of $this->_textEscapeRegex is applied in turn, the key is
	 * the pattern (wrapped in ~ delimiters here) and the value its replacement
	 *
	 * @param string $value the plain text to protect
	 *
	 * @return string the text after all of the replacements have run
	 */
	private function _escape_text($value)
	{
		// Run the escapes one after the other, each working on the previous result
		$patterns = array_keys($this->_textEscapeRegex);
		foreach ($patterns as $pattern)
		{
			$with = $this->_textEscapeRegex[$pattern];
			$value = preg_replace('~' . $pattern . '~', $with, $value);
		}

		return $value;
	}
946
947
	/**
	 * If inline code contains backticks ` as part of its content, we need to wrap them so
	 * when markdown is run we don't interpret the ` as additional code blocks
	 *
	 * - When the node's parent is a pre, $value itself is returned untouched
	 * - Otherwise returns the run of backticks to wrap the code with, which is one
	 * that does not occur as a run inside $value, or an empty string when $value
	 * has no backticks at all
	 *
	 * @param object $node the code node being converted
	 * @param string $value the text of the code node
	 *
	 * @return string $value for code inside a pre, else the backtick wrapper (may be empty)
	 */
	private function _has_ticks($node, $value)
	{
		$parent_tag = $this->_parser ? $node->parentNode->nodeName : $node->parentNode()->nodeName();

		// Inside of a pre, we don't do anything
		if ($parent_tag === 'pre')
			return $value;

		// Collect every run of backticks found in the code
		preg_match_all('~`+~', $value, $runs);
		if (empty($runs[0]))
			return '';

		// e.g. <code>`bla`</code> will become `` `bla` `` so markdown will deal with it properly
		// Keep growing the wrapper until it is a run that the code does not contain
		$wrapper = '`';
		while (in_array($wrapper, $runs[0]))
			$wrapper .= '`';

		return $wrapper;
	}
983
984
	/**
	 * Breaks a string up so its no more than width characters long
	 *
	 * - The string is split on $break and each piece is wrapped on its own
	 * - Will break before whitespace, a comma or a period
	 * - If no natural break is found will break mid-word
	 * - Continuation lines of a piece that started with > get '> ' put in front
	 * - Empty pieces (e.g. blank lines) add nothing to the output
	 * - A width below 1 returns the string as it was passed
	 *
	 * @param string $string the text to wrap
	 * @param int $width maximum characters per line
	 * @param string $break the line separator to split and re-join on
	 *
	 * @return string the wrapped text
	 */
	private function _utf8_wordwrap($string, $width = 75, $break = "\n")
	{
		// With a width under 1 the cut below would never consume any text, so there is nothing sane to do
		$width = (int) $width;
		if ($width < 1)
			return $string;

		$lines = array();

		foreach (explode($break, $string) as $line)
		{
			$in_quote = isset($line[0]) && $line[0] === '>';

			// Compare to '' rather than using empty(), a remainder of "0" is real content
			while ($line !== '')
			{
				// Get the next #width characters before a break (space, punctuation tab etc)
				// The u modifier makes . count characters and not bytes, so a multi-byte character is never
				// split and the capture lines up with the Util::strlen / Util::substr cut that follows
				// NOTE(review): assumes the Util helpers count the way their use here implies, confirm in Util
				if (preg_match('~^(.{1,' . $width . '})(?:\s|$|,|\.)~u', $line, $matches))
				{
					// Add the #width to the output and set up for the next pass
					$lines[] = ($in_quote && $matches[1][0] !== '>' ? '> ' : '') . $matches[1];
					$line = (string) Util::substr($line, Util::strlen($matches[1]));
				}
				// Just a long word with no place to break so we simply cut it after width characters
				else
				{
					$lines[] = ($in_quote && $line[0] !== '>' ? '> ' : '') . Util::substr($line, 0, $width);

					// Cast so a non string result for "nothing left" still ends the loop
					$line = (string) Util::substr($line, $width);
				}
			}
		}

		// Join all the shortened sections up on our break characters
		return implode($break, $lines);
	}
1023
}