Completed
Pull Request — master (#3490)
by Spuds
10:10
created

Html_2_Md::_check_link_lenght()   A

Complexity

Conditions 3
Paths 4

Size

Total Lines 7
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 5
CRAP Score 3

Importance

Changes 0
Metric Value
cc 3
eloc 3
dl 0
loc 7
rs 10
c 0
b 0
f 0
nc 4
nop 2
ccs 5
cts 5
cp 1
crap 3
1
<?php
2
3
/**
4
 * Converts HTML to Markdown text
5
 *
6
 * @name      ElkArte Forum
7
 * @copyright ElkArte Forum contributors
8
 * @license   BSD http://opensource.org/licenses/BSD-3-Clause
9
 *
10
 * @version 1.1.7
11
 *
12
 */
13
14
/**
15
 * Converts HTML to Markdown text
16
 */
17
class Html_2_Md
18
{
19
	/**
20
	 * The value that will hold our dom object
21
	 * @var object
22
	 */
23
	public $doc;
24
25
	/**
26
	 * The value that will hold if we are using the internal or external parser
27
	 * @var boolean
28
	 */
29
	private $_parser;
30
31
	/**
32
	 * Line end character
33
	 * @var string
34
	 */
35
	public $line_end = "\n";
36
37
	/**
38
	 * Line break character
39
	 * @var string
40
	 */
41
	public $line_break = "\n\n";
42
43
	/**
44
	 * Wordwrap output, set to 0 to skip wrapping
45
	 * @var int
46
	 */
47
	public $body_width = 76;
48
49
	/**
50
	 * Strip remaining tags, set to false to leave them in
51
	 * @var boolean
52
	 */
53
	public $strip_tags = true;
54
55
	/**
56
	 * Regex to run on plain text to prevent markdown from erroneously converting
57
	 * @var string[]
58
	 */
59
	private $_textEscapeRegex = array();
60
61
	/**
62
	 * The passed html string to convert
63
	 * @var string
64
	 */
65
	public $html;
66
67
	/**
68
	 * The markdown equivalent to the  html string
69
	 * @var string
70
	 */
71
	public $markdown;
72
73
	/**
	 * Prepares the supplied HTML and hands it to the DOM parser
	 *
	 * - Removes whitespace found between html tags
	 * - Swaps question mark / angle bracket pairs for placeholders before parsing
	 * - Defines the pattern => replacement pairs stored in $_textEscapeRegex
	 *
	 * @param string $html string of html to convert to MD text
	 */
	public function __construct($html)
	{
		// Whitespace that sits between html tags goes first
		$compact = preg_replace('/(?:(?<=\>)|(?<=\/\>))(\s+)(?=\<\/?)/', '', $html);

		// The XML parser will not deal gracefully with these, so they get placeholders
		$placeholders = array(
			'?<' => '|?|&lt',
			'?>' => '|?|&gt',
			'>?' => '&gt|?|',
			'<?' => '&lt|?|'
		);
		$this->html = strtr($compact, $placeholders);

		// Pick the dom parser and load the prepared HTML in to it
		$this->_set_parser();

		// Patterns that keep plain text from being read as markdown syntax
		// NOTE(review): "(?!<_| )" below parses as a negative lookahead, it may have
		// been intended as the lookbehind "(?<!_| )" -- confirm before changing
		$this->_textEscapeRegex = array(
			// Runs that could turn in to an hr, --- or - - - etc
			'([-*_])([ ]{0,2}\1){2,}' => '\\\\$0|',
			// **stuff** => \*\*stuff\*\*
			'\*\*([^*\s]+)\*\*' => '\*\*$1\*\*',
			// The italic forms, *italic* __italic__ _italic_
			'\*([^*\s]+)\*' => '\*$1\*',
			'__(?! |_)(.+)(?!<_| )__' => '\_\_$1\_\_',
			'_(?! |_)(.+)(?!<_| )_' => '\_$1\_',
			// Inline `code`
			'`(.+)`' => '\`$1\`',
			// Inline and reference style links
			'\[(.+)\](\s*\()' => '\[$1\]$2',
			'\[(.+)\](\s*)\[(.*)\]' => '\[$1\]$2\[$3\]',
		);
	}
112
113
	/**
	 * Set the DOM parser for class, loads the supplied HTML
	 *
	 * - Uses DOMDocument when that class is available, $_parser is set to true
	 * - Otherwise loads simple_html_dom from EXTDIR, $_parser is set to false
	 *
	 * @throws RuntimeException when the external parser returns false instead of a document
	 */
	private function _set_parser()
	{
		// Using PHP built in functions ...
		if (class_exists('DOMDocument'))
		{
			$this->_parser = true;

			// Collect libxml errors internally while the document loads
			$previous = libxml_use_internal_errors(true);

			// Set up basic parameters for DomDocument and load the html
			$this->_setupDOMDocument();

			// Set the error handling back to what it was, and flush
			libxml_use_internal_errors($previous);
			libxml_clear_errors();

			return;
		}

		// Or using the external simple html parser
		$this->_parser = false;
		require_once(EXTDIR . '/simple_html_dom.php');
		$doc = str_get_html($this->html, true, true, 'UTF-8', false);

		// str_get_html() can hand back false rather than a document object, storing that
		// in $doc only postpones the failure to the first method call made on it
		if ($doc === false)
		{
			throw new RuntimeException('Html_2_Md: the external parser was unable to load the supplied HTML');
		}

		$this->doc = $doc;
	}
139
140
	/**
	 * Runs the conversion and returns the markdown for the loaded HTML
	 *
	 * - Converts the body node and everything below it
	 * - Serializes the altered DOM back to a string and cleans it up
	 * - Converts plain text links and, when body_width is set, word wraps the text
	 *
	 * @return string the markdown followed by a newline and a null character
	 */
	public function get_markdown()
	{
		// Starting at the body, find all child elements and convert them
		$this->_convert_childNodes($this->_getBody());

		// Every element is now replaced, get the converted DOM tree back as a string
		if ($this->_parser)
		{
			// The internal DOM methods encode their output, so undo that as well
			$this->markdown = html_entity_decode(htmlspecialchars_decode($this->doc->saveHTML(), ENT_QUOTES), ENT_QUOTES, 'UTF-8');
		}
		else
		{
			$this->markdown = $this->doc->save();
		}

		// Tidy any excess spacing, then turn clear text links in to MD
		$this->_clean_markdown();
		$this->_convert_plaintxt_links();

		// A body_width of 0 skips the wordwrap
		if (!empty($this->body_width))
		{
			$this->_check_line_lenght($this->markdown);
			$this->markdown = $this->_utf8_wordwrap($this->markdown, $this->body_width, $this->line_end);
		}

		// The null character will trigger a base64 version in outbound email
		return $this->markdown . "\n\x00";
	}
175
176
	/**
	 * Finds the node that conversion should start from, after removing any head node
	 *
	 * - Internal parser: the first body element
	 * - External parser: the first body, else the first html, else the document root
	 *
	 * @return object
	 */
	private function _getBody()
	{
		// Head markup is of no use here, remove it
		$this->_clipHead();

		if ($this->_parser)
		{
			return $this->doc->getElementsByTagName('body')->item(0);
		}

		// Most specific wrapper first
		foreach (array('body', 'html') as $wrapper)
		{
			if ($this->doc->find($wrapper, 0) !== null)
			{
				return $this->doc->find($wrapper, 0);
			}
		}

		// No wrapper at all, start from the root
		return $this->doc->root;
	}
210
211
	/**
	 * Removes the first head node from the document, if there is one
	 */
	private function _clipHead()
	{
		// Internal parser, detach the node from its parent
		if ($this->_parser)
		{
			$head = $this->doc->getElementsByTagName('head')->item(0);
			if ($head !== null)
			{
				$head->parentNode->removeChild($head);
			}

			return;
		}

		// External parser, blank out the outer text of the node
		if ($this->doc->find('head', 0) !== null)
		{
			$this->doc->find('head', 0)->outertext = '';
		}
	}
229
230
	/**
	 * Creates the DOMDocument and loads the html in to it, wrapped so that it is read as UTF-8
	 */
	private function _setupDOMDocument()
	{
		// Only the inner text is wanted, an existing body / html wrapper is removed
		$this->html = $this->_returnBodyText($this->html);

		// Processing details for the new document
		$dom = new DOMDocument();
		$dom->preserveWhiteSpace = false;
		$dom->encoding = 'UTF-8';
		$this->doc = $dom;

		// Our own wrapper declares the encoding, doing what we can to ensure UTF-8 processing
		$open = '<?xml encoding="UTF-8"><html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/></head><body>';
		$close = '</body></html>';
		$dom->loadHTML($open . $this->html . $close);
	}
246
247
	/**
	 * Post processes the converted text held in $markdown
	 *
	 * - Drops the body / html wrapper and non breaking spaces
	 * - Strips left over tags when strip_tags is on
	 * - Puts back the sequences that were swapped for placeholders before parsing
	 * - Collapses the runs of blank lines that conversion produced
	 */
	private function _clean_markdown()
	{
		// Just the content, no wrappers
		$text = $this->_returnBodyText($this->markdown);

		// Non breakable spaces become normal ones, one followed by a space collapses to one space
		$text = str_replace(array("\xC2\xA0\x20", "\xC2\xA0"), ' ', $text);

		// Any "bonus" tags go, unless told to keep them
		if ($this->strip_tags)
		{
			$text = strip_tags($text);
		}

		// Bring back the content that was "hidden" from the XML parsers
		$hidden = array(
			'|?|&gt' => '?>',
			'|?|&lt' => '?<',
			'&lt|?|' => '<?',
			'&gt|?|' => '>?'
		);
		$text = strtr($text, $hidden);

		// Restored content can end in a br tag which was not converted
		$text = str_replace('<br />', "\n\n", $text);

		// Strip the chaff, then squeeze the excess blank lines, one pattern after the other
		$text = trim($text);
		$squeeze = array(
			"~(\n(\s)?){3,}~" => "\n\n",
			"~(^\s\s\n){3,}~m" => "  \n  \n",
			"~(^\s\s\r?\n){3,}~m" => "  \n  \n",
			"~(^\s\s(?:\r?\n){2}){3,}~m" => "  \n  \n",
		);
		foreach ($squeeze as $pattern => $replacement)
		{
			$text = preg_replace($pattern, $replacement, $text);
		}

		$this->markdown = $text;
	}
283
284
	/**
	 * Returns what is found between <body></body>, failing that between <html></html>,
	 * and failing both the text as it was passed
	 *
	 * @param string $text
	 *
	 * @return string
	 */
	private function _returnBodyText($text)
	{
		// Checked in this order, the first wrapper that matches wins
		$wrappers = array(
			'~<body>(.*)</body>~su',
			'~<html>(.*)</html>~su',
		);

		foreach ($wrappers as $wrapper)
		{
			if (preg_match($wrapper, $text, $inner))
			{
				return $inner[1];
			}
		}

		return $text;
	}
304
305
	/**
	 * For a given node, checks if it is anywhere nested inside of a code block
	 *  - Prevents converting anything that's inside a code block
	 *
	 * @param object $node
	 * @param boolean $parser flag for internal or external parser
	 *
	 * @return boolean true when any ancestor of the node is a code tag
	 */
	private static function _has_parent_code($node, $parser)
	{
		$parent = $parser ? $node->parentNode : $node->parentNode();

		// The loop condition ends the walk once we run out of ancestors, so no
		// separate null test is needed inside of it
		while ($parent)
		{
			// Anywhere nested inside a code block we don't render tags
			$tag = $parser ? $parent->nodeName : $parent->nodeName();
			if ($tag === 'code')
			{
				return true;
			}

			// Back out another level, until we are done
			$parent = $parser ? $parent->parentNode : $parent->parentNode();
		}

		return false;
	}
337
338
	/**
	 * Get the nesting level when inside a list
	 *
	 * - Counts every ul / ol found among the ancestors of the node
	 *
	 * @param object $node
	 * @param boolean $parser flag for internal or external parser
	 *
	 * @return int number of list ancestors, 0 when not inside of a list
	 */
	private static function _has_parent_list($node, $parser)
	{
		$depth = 0;

		$parent = $parser ? $node->parentNode : $node->parentNode();
		while ($parent)
		{
			// Each list we are nested in adds a level, tag names are compared strictly
			$tag = $parser ? $parent->nodeName : $parent->nodeName();
			if ($tag === 'ul' || $tag === 'ol')
			{
				$depth++;
			}

			// Back out another level
			$parent = $parser ? $parent->parentNode : $parent->parentNode();
		}

		return $depth;
	}
367
368 1
	/**
	 * Depth first walk of a node, children are converted before the node itself
	 *
	 * - A node that sits anywhere inside a code block is left alone
	 *
	 * @param object $node
	 */
	private function _convert_childNodes($node)
	{
		// Inside of a code block nothing is converted
		if (self::_has_parent_code($node, $this->_parser))
		{
			return;
		}

		// Go down to the base of this node first
		if ($node->hasChildNodes())
		{
			$total = $this->_parser ? $node->childNodes->length : count($node->childNodes());
			$index = 0;
			while ($index < $total)
			{
				if ($this->_parser)
				{
					$child = $node->childNodes->item($index);
				}
				else
				{
					$child = $node->childNodes($index);
				}

				$this->_convert_childNodes($child);
				$index++;
			}
		}

		// On the way back out, convert the node itself
		$this->_convert_to_markdown($node);
	}
394
395
	/**
	 * Replaces the supplied node with its markdown equivalent
	 *  - Supports *some* markdown extra tags, namely: table, abbr & dl in a limited fashion
	 *  - Table parts and lightbox anchors are skipped, they stay in the tree as is
	 *  - Tags with no case of their own are replaced with their outer html
	 *
	 * @param object $node
	 */
	private function _convert_to_markdown($node)
	{
		// The tag name decides the conversion
		$tag = $this->_get_name($node);

		switch ($tag)
		{
			case 'a':
				if ($node->getAttribute('data-lightboximage') || $node->getAttribute('data-lightboxmessage'))
				{
					$markdown = '~`skip`~';
				}
				else
				{
					$markdown = $this->line_end . $this->_convert_anchor($node) . $this->line_end;
				}
				break;

			case 'abbr':
				$markdown = $this->_convert_abbr($node);
				break;

			case 'b':
			case 'strong':
				$markdown = '**' . $this->_get_value($node) . '**';
				break;

			case 'blockquote':
				$markdown = $this->_convert_blockquote($node);
				break;

			case 'br':
				// DomDocument strips empty lines, the two non breaking spaces prevent that
				$markdown = "\xC2\xA0\xC2\xA0" . $this->line_break;
				break;

			case 'center':
				$markdown = $this->line_end . $this->_get_value($node) . $this->line_end;
				break;

			case 'code':
				$markdown = $this->_convert_code($node);
				break;

			case 'dt':
				$markdown = str_replace(array("\n", "\r", "\n\r"), '', $this->_get_value($node)) . $this->line_end;
				break;

			case 'dd':
				$markdown = ':   ' . $this->_get_value($node) . $this->line_break;
				break;

			case 'dl':
				$markdown = trim($this->_get_value($node)) . $this->line_break;
				break;

			case 'em':
			case 'i':
				$markdown = '_' . $this->_get_value($node) . '_';
				break;

			case 'hr':
				$markdown = $this->line_end . '---' . $this->line_end;
				break;

			case 'h1':
			case 'h2':
			case 'h3':
			case 'h4':
			case 'h5':
			case 'h6':
				// The tag name itself is passed along, the header level is taken from it
				$markdown = $this->_convert_header($tag, $this->_get_value($node));
				break;

			case 'img':
				$markdown = $this->_convert_image($node) . $this->line_end;
				break;

			case 'ol':
			case 'ul':
				// A nested list is kept tight, a top level one gets room around it
				if (self::_has_parent_list($node, $this->_parser))
				{
					$markdown = rtrim($this->_get_value($node)) . $this->line_end;
				}
				else
				{
					$markdown = $this->line_end . rtrim($this->_get_value($node)) . $this->line_break;
				}
				break;

			case 'li':
				$markdown = $this->_convert_list($node);
				break;

			case 'p':
				if ($node->hasChildNodes())
				{
					$markdown = rtrim($this->_get_value($node)) . $this->line_break;
				}
				else
				{
					// Childless paragraph, put it on one line and escape it
					$markdown = str_replace("\n", ' ', $this->_get_value($node)) . $this->line_break;
					$markdown = $this->_escape_text($markdown);
				}
				break;

			case 'pre':
				$markdown = $this->_get_value($node) . $this->line_break;
				break;

			case 'div':
				$markdown = $this->line_end . $this->_get_value($node) . $this->line_end;
				if (!$node->hasChildNodes())
				{
					$markdown = $this->_escape_text($markdown);
				}
				break;

			case 'title':
				$markdown = '# ' . $this->_get_value($node) . $this->line_break;
				break;

			case 'table':
				$markdown = $this->_convert_table($node) . $this->line_break;
				break;

			case 'th':
			case 'tr':
			case 'td':
			case 'tbody':
			case 'tfoot':
			case 'thead':
				// These are dealt with by the table tag itself, so pass on them here
				$markdown = '~`skip`~';
				break;

			case 'root':
			case 'span':
			case 'body':
				// The tag goes, the text inside of it stays
				$markdown = $this->_get_innerHTML($node);
				break;

			default:
				// Unknown tag or text (there is no #text case), preserve whats there
				$markdown = $this->_get_outerHTML($node);
		}

		// Skipped nodes are left in the tree untouched
		if ($markdown === '~`skip`~')
		{
			return;
		}

		if ($this->_parser)
		{
			// A new text node holding the markdown takes the place of the original node
			$node->parentNode->replaceChild($this->doc->createTextNode($markdown), $node);
		}
		else
		{
			$node->outertext = $markdown;
		}
	}
536
537
	/**
	 * Converts <abbr> tags to markdown (extra)
	 *
	 * html: <abbr title="Hyper Text Markup Language">HTML</abbr>
	 * md:   *[HTML]: Hyper Text Markup Language
	 *
	 * @param object $node
	 * @return string the definition followed by a line break, or '' when the title is empty
	 */
	private function _convert_abbr($node)
	{
		$title = $node->getAttribute('title');
		$value = $this->_get_value($node);

		// Without a title there is nothing to define
		return empty($title) ? '' : '*[' . $value . ']: ' . $title . $this->line_break;
	}
562 1
563
	/**
	 * Converts <a> tags to markdown
	 *
	 * html: <a href='http://somesite.com' title='Title'>Awesome Site</a>
	 * md: [Awesome Site](http://somesite.com "Title")
	 *
	 * - Anchors with a class of target or footnote_return are reduced to their text
	 * - When the link text is empty or is the href itself, the title or $txt['link']
	 * is used as the text
	 *
	 * @param object $node
	 * @return string
	 */
	private function _convert_anchor($node)
	{
		global $txt;

		// Read the attribute once, it is needed raw for the compare and encoded for the link
		$raw_href = $node->getAttribute('href');
		$href = htmlspecialchars_decode($raw_href);
		$href = strtr($href, array('(' => '%28', ')' => '%29', '[' => '%5B', ']' => '%5D', '&' => '%26a'));

		$title = $node->getAttribute('title');
		$class = $node->getAttribute('class');
		$value = $this->_get_value($node);

		// Provide a more compact [name] if none is given.  The compare is strict so that
		// text and href which only look alike as numbers are not treated as the same
		if ($value === $raw_href || empty($value))
		{
			$value = empty($title) ? $txt['link'] : $title;
		}

		// Special processing just for our own footnotes
		if ($class === 'target' || $class === 'footnote_return')
		{
			return $value;
		}

		if (!empty($title))
		{
			return '[' . $value . '](' . $href . ' "' . $title . '")';
		}

		return '[' . $value . ']( ' . $href . ' )';
	}
605
606 1
	/**
	 * Converts blockquotes to markdown > quote style
	 *
	 * html: <blockquote>quote</blockquote>
	 * md: > quote
	 *
	 * @param object $node
	 * @return string every line of the quote prefixed with '> ', closed with an extra line end
	 */
	private function _convert_blockquote($node)
	{
		// The trimmed contents of the block quote, one entry per line
		$lines = preg_split('~\r\n|\r|\n~', trim($this->_get_value($node)));

		// Just like email quotes, '> ' in front and leading tabs removed
		$quoted = array();
		foreach ($lines as $line)
		{
			$quoted[] = '> ' . ltrim($line, "\t") . $this->line_end;
		}

		return implode('', $quoted) . $this->line_end;
	}
636
637
	/**
	 * Converts code tags to markdown span `code` or block code
	 * Converts single line code to inline tick mark
	 * Converts multi line to 4 space indented code
	 *
	 * html: <code>code</code>
	 * md: `code`
	 *
	 * - Side effect: turns the strip_tags option off once a code block contains html tags
	 *
	 * @param object $node
	 * @return string
	 */
	private function _convert_code($node)
	{
		$value = $this->_get_innerHTML($node);

		// If we have a multi line code block, we are working outside to in, and need to convert the br's ourselves
		$value = preg_replace('~<br( /)?' . '>~', "\n", str_replace('&nbsp;', ' ', $value));

		// If there are html tags in this code block, we need to disable strip tags
		// This is NOT the ideal way to handle this, needs something along the lines of preparse and unpreparse.
		if ($this->strip_tags && preg_match('~<[^<]+>~', $value))
		{
			$this->strip_tags = false;
		}

		// Get the number of lines of code that we have
		$lines = preg_split('~\r\n|\r|\n~', $value);
		$total = count($lines);

		// If there's more than one line of code, use leading four space syntax
		if ($total > 1)
		{
			// Compared against '' on purpose, empty() also matches a line that holds only "0"
			// and that is code which must not be dropped
			$first_blank = trim($lines[0]) === '';
			$last_blank = trim($lines[$total - 1]) === '';

			// Remove any leading and trailing blank lines
			if ($first_blank)
			{
				array_shift($lines);
			}

			if ($last_blank)
			{
				array_pop($lines);
			}

			// Convert what remains
			$markdown = '';
			foreach ($lines as $line)
			{
				// Adjust the word wrapping since this has code tags, leave it up to
				// the email client to mess these up ;)
				$this->_check_line_lenght($markdown, 5);

				$markdown .= str_repeat(' ', 4) . $line . $this->line_end;
			}

			// The parser will encode, but we don't want that for our code block
			if ($this->_parser)
			{
				$markdown = html_entity_decode($markdown, ENT_QUOTES, 'UTF-8');
			}

			return $markdown;
		}

		// Single line, back tick and move on
		$code = $lines[0];
		$fence = '`';

		// Account for backticks in the single line code itself
		$ticks = $this->_has_ticks($node, $value);
		if (!empty($ticks))
		{
			$fence = $ticks;

			// If the ticks were at the start/end of the word space it off, substr() is
			// used so that an empty line does not cause an invalid string offset
			if (substr($code, 0, 1) === '`' || substr($code, -1) === '`')
			{
				$code = ' ' . $code . ' ';
			}
		}

		return $fence . ($this->_parser ? html_entity_decode($code, ENT_QUOTES, 'UTF-8') : $code) . $fence;
	}
722
723
	/**
	 * Converts <h1> and <h2> headers to markdown-style headers in setex style,
	 * all other headers are returned as atx style ### h3
	 *
	 * html: <h1>header</h1>
	 * md: header
	 *     ======
	 *
	 * html: <h3>header</h3>
	 * md: ### header
	 *
	 * @param string $level the header tag name, h1 to h6, any leading h characters are removed
	 * @param string $content
	 * @return string
	 */
	private function _convert_header($level, $content)
	{
		$depth = (int) ltrim($level, 'h');

		// Level 3 and above, atx style with one # per level
		if ($depth >= 3)
		{
			return str_repeat('#', $depth) . ' ' . $content . $this->line_break;
		}

		// Below that, setex style underlined for the length of the text, = for level 1 else -
		$rule = str_repeat(($depth === 1) ? '=' : '-', Util::strlen($content));

		return $content . $this->line_end . $rule . $this->line_break;
	}
755
756
	/**
	 * Converts <img> tags to markdown
	 *
	 * html: <img src='source' alt='alt' title='title' />
	 * md: ![alt](source "title")
	 *
	 * @param object $node
	 * @return string
	 */
	private function _convert_image($node)
	{
		$src = $node->getAttribute('src');
		$alt = $node->getAttribute('alt');
		$title = $node->getAttribute('title');

		// A title, when there is one, follows the source in double quotes
		$target = empty($title) ? $src : $src . ' "' . $title . '"';

		return '![' . $alt . '](' . $target . ')';
	}
782
783
	/**
	 * Converts a single <li> of an ordered <ol> or unordered <ul> list to markdown syntax
	 *
	 * html: <ul><li>one</li></ul>
	 * md * one
	 *
	 * - Items of a ul get a bullet, items of any other parent get their position number
	 * - One tab of indent is added for each level of list nesting
	 *
	 * @param object $node
	 * @return string
	 */
	private function _convert_list($node)
	{
		$list_type = $this->_parser ? $node->parentNode->nodeName : $node->parentNode()->nodeName();
		$value = $this->_get_value($node);

		// The first list level gets no indent
		$depth = max(0, $this->_has_parent_list($node, $this->_parser) - 1);

		// A simple bullet, or the number of this item in its list
		$marker = ($list_type === 'ul') ? '* ' : ($this->_get_list_position($node) . '. ');
		$markdown = str_repeat("\t", $depth) . $marker . $value;

		// Text that already ends in whitespace gets no line end of its own
		if (rtrim($value) !== $value)
		{
			return $markdown;
		}

		return $markdown . $this->line_end;
	}
814
815
	/**
	 * Converts tables tags to markdown extra table syntax
	 *
	 * - Have to build top down vs normal inside out due to needing col numbers and widths
	 * - The number of th cells is used as the number of columns for every row
	 * - The first tr is output as the heading row, followed by a row of dashes
	 *
	 * @param object $node
	 * @return string the markdown table, or '' when there is no th or the th is not inside a tr
	 */
	private function _convert_table($node)
	{
		$table_heading = $node->getElementsByTagName('th');

		// No heading cells, no markdown table
		$first_th = $this->_get_item($table_heading, 0);
		if ($first_th === null)
		{
			return '';
		}

		// We only markdown well formed tables, anything else is an explicit empty string
		$th_parent = $this->_parser ? $first_th->parentNode->nodeName : $first_th->parentNode()->nodeName();
		if ($th_parent !== 'tr')
		{
			return '';
		}

		// Set up for a markdown table, then storm the castle
		$align = array();
		$value = array();
		$width = array();
		$max = array();
		$header = array();
		$rows = array();

		// Find out how many columns we are dealing with
		$th_num = $this->_get_length($table_heading);
		for ($col = 0; $col < $th_num; $col++)
		{
			// Get the align and text for each th (html5 this is no longer valid)
			$th = $this->_get_item($table_heading, $col);
			$align_value = ($th !== null) ? strtolower($th->getAttribute('align')) : false;
			$align[0][$col] = $align_value === false ? 'left' : $align_value;
			$value[0][$col] = $this->_get_value($th);
			$width[0][$col] = Util::strlen($value[0][$col]);

			// Seed the max col width
			$max[$col] = $width[0][$col];
		}

		// Get all of the rows
		$table_rows = $node->getElementsByTagName('tr');
		$num_rows = $this->_get_length($table_rows);
		for ($row = 1; $row < $num_rows; $row++)
		{
			// Start at row 1 and get all of the td's in this row
			$row_data = $this->_get_item($table_rows, $row)->getElementsByTagName('td');

			// Simply use the th count as the number of columns, if its not right its not markdown-able anyway
			for ($col = 0; $col < $th_num; $col++)
			{
				// Get the align and text for each td in this row
				$td = $this->_get_item($row_data, $col);
				$align_value = ($td !== null) ? strtolower($td->getAttribute('align')) : false;
				$align[$row][$col] = $align_value === false ? 'left' : $align_value;
				$value[$row][$col] = $this->_get_value($td);
				$width[$row][$col] = Util::strlen($value[$row][$col]);

				// Keep track of the longest col cell as we go
				if ($width[$row][$col] > $max[$col])
				{
					$max[$col] = $width[$row][$col];
				}
			}
		}

		// Done collecting data, we can rebuild it, we can make it better than it was. Better...stronger...faster
		for ($row = 0; $row < $num_rows; $row++)
		{
			$temp = array();
			for ($col = 0; $col < $th_num; $col++)
			{
				// Build the header row once
				if ($row === 0)
				{
					$header[] = str_repeat('-', $max[$col]);
				}

				// Build the data for each col, align/pad as needed
				$temp[] = $this->_align_row_content($align[$row][$col], $width[$row][$col], $value[$row][$col], $max[$col]);
			}

			// Join it all up so we have a nice looking row
			$rows[] = '| ' . implode(' | ', $temp) . ' |';

			// Stuff in the header after the th row
			if ($row === 0)
			{
				$rows[] = '| ' . implode(' | ', $header) . ' | ';
			}
		}

		// Adjust the word wrapping since this has a table, will get mussed by email anyway
		$this->_check_line_lenght($rows[1], 2);

		// Return what we did so it can be swapped in
		return implode($this->line_end, $rows);
	}
919
920
	/**
	 * Helper function for getting a node object
	 *
	 * - Internal parser: $node is read with its item() method
	 * - External parser: $node is read as an array
	 *
	 * @param object|array $node the collection to read from
	 * @param int $item position of the wanted node
	 * @return object|null null when the external parser collection has no such position
	 */
	private function _get_item($node, $item)
	{
		if ($this->_parser)
		{
			return $node->item($item);
		}

		// Reading a position that is not set would raise an undefined offset
		// notice / warning, so check for it and answer with null instead
		return isset($node[$item]) ? $node[$item] : null;
	}
938
939
	/**
	 * Helper function for getting a node length
	 *
	 * - Internal parser: the length property of the collection
	 * - External parser: the count() of the collection
	 *
	 * @param object|array $node
	 * @return int
	 */
	private function _get_length($node)
	{
		return $this->_parser ? $node->length : count($node);
	}
956
957
	/**
	 * Helper function for getting a node value
	 *
	 * - Internal parser: the nodeValue of the node
	 * - External parser: the innertext of the node with its entities decoded
	 *
	 * @param object|null $node
	 * @return string an empty string when no node is supplied
	 */
	private function _get_value($node)
	{
		// No node, no value
		if ($node === null)
		{
			return '';
		}

		return $this->_parser
			? $node->nodeValue
			: html_entity_decode(htmlspecialchars_decode($node->innertext, ENT_QUOTES), ENT_QUOTES, 'UTF-8');
	}
979
980
	/**
	 * Helper function for getting a node name
	 *
	 * - Internal parser: the nodeName property
	 * - External parser: the nodeName() method
	 *
	 * @param object|null $node
	 * @return string an empty string when no node is supplied
	 */
	private function _get_name($node)
	{
		// No node, no name
		if ($node === null)
		{
			return '';
		}

		return $this->_parser ? $node->nodeName : $node->nodeName();
	}
1002
1003
	/**
	 * Helper function for creating ol's
	 *
	 * - Returns the position, counting from 1, that the node holds among all of the
	 * child nodes of its parent
	 *
	 * @param object $node
	 * @return int 1 when the node is not found among the children
	 */
	private function _get_list_position($node)
	{
		$position = 1;

		// Get all of the nodes inside this parent
		$list_node = $this->_parser ? $node->parentNode : $node->parentNode();
		$total_nodes = $this->_parser ? $list_node->childNodes->length : count($list_node->childNodes());

		// Loop through the child nodes and find where we are in this list
		for ($i = 0; $i < $total_nodes; $i++)
		{
			$current_node = $this->_parser ? $list_node->childNodes->item($i) : $list_node->childNodes($i);
			if ($current_node === $node)
			{
				// Found it, the remaining children do not need to be looked at
				$position = $i + 1;
				break;
			}
		}

		return $position;
	}
1031
1032
	/**
	 * Helper function for table creation
	 *
	 * - Pads a cell's content with spaces up to the column width
	 * - left (also the default for any other value) pads on the right
	 * - right pads on the left
	 * - center splits the padding, any odd space goes to the right side
	 * - Content that is already at or beyond $max is returned without padding
	 *
	 * @param string $align left, right or center
	 * @param int $width the current width of $content
	 * @param string $content the cell text
	 * @param int $max the width to pad out to
	 * @return string
	 */
	private function _align_row_content($align, $width, $content, $max)
	{
		// str_repeat needs a non negative integer, so never let the padding drop below zero
		$padding = (int) max(0, $max - $width);

		switch ($align)
		{
			default:
			case 'left':
				$content .= str_repeat(' ', $padding);
				break;
			case 'right':
				$content = str_repeat(' ', $padding) . $content;
				break;
			case 'center':
				// floor() hands back a float, cast it so str_repeat receives an int
				$left = (int) floor($padding / 2);
				$right = $padding - $left;
				$content = str_repeat(' ', $left) . $content . str_repeat(' ', $right);
				break;
		}

		return $content;
	}
1064
1065
	/**
	 * Gets the inner html of a node
	 *
	 * - When $this->_parser is not set the node's innertext property is returned
	 * - Otherwise the node is copied in to a scratch DOMDocument, saved as html and
	 * its own opening / closing tag is stripped from the result
	 *
	 * @param DOMNode|object $node
	 * @return string
	 */
	private function _get_innerHTML($node)
	{
		// The other parser exposes the inner markup directly
		if (!$this->_parser)
		{
			return $node->innertext;
		}

		// Deep copy the node in to its own document so only its markup is saved
		$scratch = new DOMDocument();
		$copy = $scratch->importNode($node, true);
		$scratch->appendChild($copy);
		$markup = trim($scratch->saveHTML());

		// Remove the leading tag (with any attributes) and the trailing closing tag
		$name = $node->nodeName;
		$wrapper = '@^<' . $name . '[^>]*>|</' . $name . '>$@';

		return preg_replace($wrapper, '', $markup);
	}
1087
1088
	/**
	 * Gets the outer html of a node
	 *
	 * - When $this->_parser is not set the node's outertext property is returned
	 * - On PHP 5.3.6 and above the node is saved straight from $this->doc
	 * - On older PHP the node is copied to a scratch document, saved, reduced by
	 * _returnBodyText() and entity decoded
	 *
	 * @param DOMNode|object $node
	 * @return string
	 */
	private function _get_outerHTML($node)
	{
		// The other parser exposes the outer markup directly
		if (!$this->_parser)
		{
			return $node->outertext;
		}

		// saveHTML only accepts a node argument from 5.3.6 onward
		if (version_compare(PHP_VERSION, '5.3.6', '>='))
		{
			$saved = $this->doc->saveHTML($node);

			return htmlspecialchars_decode($saved);
		}

		// @todo remove when 5.3.6 min
		$scratch = new DOMDocument();
		$copy = $scratch->importNode($node, true);
		$scratch->appendChild($copy);

		// We just want the html of the inserted node, it *may* be wrapped
		$markup = $this->_returnBodyText($scratch->saveHTML());

		// Clean it up
		$markup = rtrim($markup, "\n");
		$markup = htmlspecialchars_decode($markup, ENT_QUOTES);

		return html_entity_decode($markup, ENT_QUOTES, 'UTF-8');
	}
1123
1124
	/**
	 * Escapes markup looking text in html to prevent accidental assignment
	 *
	 * <p>*stuff*</p> should not convert to *stuff* but \*stuff\* since its not to
	 * be converted by md to html as <strong>stuff</strong>
	 *
	 * - Runs every pattern => replacement pair of $this->_textEscapeRegex over the
	 * text, one after the other, each pattern wrapped in ~ delimiters
	 *
	 * @param string $value the plain text to escape
	 * @return string
	 */
	private function _escape_text($value)
	{
		$escaped = $value;

		// Apply each rule to the result of the previous one
		foreach (array_keys($this->_textEscapeRegex) as $pattern)
		{
			$escaped = preg_replace('~' . $pattern . '~', $this->_textEscapeRegex[$pattern], $escaped);
		}

		return $escaped;
	}
1143
1144
	/**
	 * If inline code contains backticks ` as part of its content, we need to wrap them so
	 * when markdown is run we don't interpret the ` as additional code blocks
	 *
	 * - When the parent of the node is a pre, $value is handed back untouched
	 * - When $value has no backticks an empty string is returned
	 * - Otherwise returns the shortest run of backticks that does not appear in $value
	 * e.g. <code>`bla`</code> will become `` `bla` `` so markdown will deal with it properly
	 *
	 * @param object $node the code node being converted
	 * @param string $value the content of the code node
	 * @return string
	 */
	private function _has_ticks($node, $value)
	{
		$parent_name = $this->_parser ? $node->parentNode->nodeName : $node->parentNode()->nodeName();

		// Inside of a pre, we don't do anything
		if ($parent_name === 'pre')
		{
			return $value;
		}

		// No ticks in the hair, nothing to wrap with
		if (!preg_match_all('~`+~', $value, $found))
		{
			return '';
		}

		// Index the runs that are in use so each candidate is a simple lookup
		$in_use = array_flip($found[0]);

		// Grow the fence until it is a run length the content does not contain
		$fence = '`';
		while (isset($in_use[$fence]))
		{
			$fence .= '`';
		}

		return $fence;
	}
1185
1186
	/**
	 * Helper function to adjust the wrapping width for long lines
	 *
	 * - Splits the text on $this->line_end and measures every line
	 * - Raises $this->body_width to the longest line (plus buffer) when that is
	 * wider than the current width, it never lowers it
	 *
	 * @param string $markdown the text whose lines are measured
	 * @param bool|int $buffer extra characters to add to every line length
	 */
	private function _check_line_lenght($markdown, $buffer = false)
	{
		$extra = !empty($buffer) ? (int) $buffer : 0;

		// Some Lines can be very long and if we wrap them they break
		foreach (explode($this->line_end, $markdown) as $row)
		{
			$needed = Util::strlen($row) + $extra;
			if ($this->body_width < $needed)
			{
				$this->body_width = $needed;
			}
		}
	}
1205 1
1206
	/**
	 * Helper function to find and wrap plain text links in MD format
	 *
	 * - Looks in $this->markdown for http(s):// or www links, the lookbehinds skip
	 * ones that directly follow "](" or "]( "
	 * - Each hit is rewritten by _plaintxt_callback() and the result is stored
	 * back in $this->markdown
	 */
	private function _convert_plaintxt_links()
	{
		$link_regex = '/((?<!\]\( |\]\()https?:\/\/|(?<!\]\( |\]\(|:\/\/)www)[-\p{L}0-9+&@#\/%?=~_|!:,.;]*[\p{L}0-9+&@#\/%=~_|]/iu';
		$callback = array($this, '_plaintxt_callback');

		$this->markdown = preg_replace_callback($link_regex, $callback, $this->markdown);
	}
1213 1
1214
	/**
	 * Callback function used by _convert_plaintxt_links for plain link to MD
	 *
	 * - Places the link on its own line as [text]( url ), where text is the
	 * $txt['link'] language string and url is the trimmed match
	 *
	 * @param string[] $matches the regex match, index 0 holds the full link
	 * @return string
	 */
	private function _plaintxt_callback($matches)
	{
		global $txt;

		$url = trim($matches[0]);

		return $this->line_end . '[' . $txt['link'] . ']( ' . $url . ' )';
	}
1228
1229
	/**
	 * Breaks a string up so its no more than width characters long
	 *
	 * - Will break at word boundaries (whitespace, comma, period or end of line)
	 * - If no natural space is found will break mid-word
	 * - Lines that start with > have "> " added to each of their continuation lines
	 * - Blank lines are preserved
	 * - A width below 1 can not be wrapped to, the string is returned unchanged
	 *
	 * @param string $string the text to wrap
	 * @param int $width the maximum number of characters per line
	 * @param string $break the line separator to split and rejoin on
	 * @return string
	 */
	private function _utf8_wordwrap($string, $width = 75, $break = "\n")
	{
		// With no positive width no text would ever be consumed below, so do not try
		$width = (int) $width;
		if ($width < 1)
		{
			return $string;
		}

		$lines = array();

		foreach (explode($break, $string) as $segment)
		{
			$in_quote = isset($segment[0]) && $segment[0] === '>';

			// Keep blank lines. Test against '' since empty() is also true for a lone "0"
			if ($segment === '')
			{
				$lines[] = '';
				continue;
			}

			while ($segment !== '')
			{
				// Get the next #width characters before a break (space, punctuation tab etc)
				if (preg_match('~^(.{1,' . $width . '})(?:\s|$|,|\.)~u', $segment, $matches))
				{
					// Add the #width to the output and set up for the next pass
					$lines[] = ($in_quote && $matches[1][0] !== '>' ? '> ' : '') . ltrim($matches[1], ' ');
					$segment = (string) Util::substr($segment, Util::strlen($matches[1]));
				}
				// Humm just a long word with no place to break so we simply cut it after width characters
				else
				{
					$lines[] = ($in_quote && $segment[0] !== '>' ? '> ' : '') . Util::substr($segment, 0, $width);
					$segment = (string) Util::substr($segment, $width);
				}
			}
		}

		// Join it all the shortened sections up on our break characters
		return implode($break, $lines);
	}
1273
}
1274