Completed
Pull Request — patch_1-1-7 (#3482)
by Spuds
06:13
created

Html_2_Md::_check_link_lenght()   A

Complexity

Conditions 3
Paths 4

Size

Total Lines 7
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 3
eloc 3
nc 4
nop 2
dl 0
loc 7
rs 10
c 0
b 0
f 0
1
<?php
2
3
/**
4
 * Converts HTML to Markdown text
5
 *
6
 * @name      ElkArte Forum
7
 * @copyright ElkArte Forum contributors
8
 * @license   BSD http://opensource.org/licenses/BSD-3-Clause
9
 *
10
 * @version 1.1.7
11
 *
12
 */
13
14
/**
15
 * Converts HTML to Markdown text
16
 */
17
class Html_2_Md
18
{
19
	/**
	 * The DOM object holding the HTML being converted
	 *
	 * - A DOMDocument when the built in parser is in use
	 * - Otherwise whatever str_get_html() of the external simple html parser returned
	 *
	 * @var object
	 */
23
	public $doc;
24
25
	/**
	 * Which parser is in use: true for the built in DOMDocument,
	 * false for the external simple html dom parser
	 *
	 * @var boolean
	 */
29
	private $_parser;
30
31
	/**
	 * Line end character, used to end a single line of output
	 *
	 * @var string
	 */
35
	public $line_end = "\n";
36
37
	/**
	 * Line break character(s), used after block level output such as paragraphs
	 *
	 * @var string
	 */
41
	public $line_break = "\n\n";
42
43
	/**
	 * Width to wordwrap the output at, set to 0 to skip wrapping
	 *
	 * @var int
	 */
47
	public $body_width = 76;
48
49
	/**
	 * Strip any tags that remain after the conversion, set to false to leave them in
	 *
	 * @var boolean
	 */
53
	public $strip_tags = true;
54
55
	/**
	 * Regex => replacement pairs to run on plain text so it is not
	 * erroneously read as markdown syntax
	 *
	 * @var string[]
	 */
59
	private $_textEscapeRegex = array();
60
61
	/**
	 * The passed html string to convert
	 *
	 * @var string
	 */
65
	public $html;
66
67
	/**
	 * The markdown equivalent to the html string
	 *
	 * @var string
	 */
71
	public $markdown;
72
73
	/**
	 * Prepares the supplied HTML and loads it in to the built in or external parser
	 *
	 * - Removes the whitespace found between html tags
	 * - Masks the ?< ?> >? <? sequences, which the XML parser does not deal
	 * gracefully with
	 * - Defines the regex => replacement pairs used to escape plain text that
	 * would otherwise be read as markdown syntax
	 *
	 * @param string $html string of html to convert to MD text
	 */
	public function __construct($html)
	{
		// Whitespace sitting between html tags is of no use to us
		$prepared = preg_replace('/(?:(?<=\>)|(?<=\/\>))(\s+)(?=\<\/?)/', '', $html);

		// Hide these from the XML parser, it will not deal gracefully with them
		$masks = array(
			'?<' => '|?|&lt',
			'?>' => '|?|&gt',
			'>?' => '&gt|?|',
			'<?' => '&lt|?|'
		);
		$this->html = strtr($prepared, $masks);

		// Choose the dom parser and hand it the HTML
		$this->_set_parser();

		// Patterns for escaping text areas, so plain text is not taken for markdown syntax
		$this->_textEscapeRegex = array(
			// Runs that could become an hr, like --- or - - -
			'([-*_])([ ]{0,2}\1){2,}' => '\\\\$0|',
			// **stuff** becomes \*\*stuff\*\*
			'\*\*([^*\s]+)\*\*' => '\*\*$1\*\*',
			// The italic forms, *italic* __italic__ _italic_
			'\*([^*\s]+)\*' => '\*$1\*',
			'__(?! |_)(.+)(?!<_| )__' => '\_\_$1\_\_',
			'_(?! |_)(.+)(?!<_| )_' => '\_$1\_',
			// Inline `code`
			'`(.+)`' => '\`$1\`',
			// And both of the link styles
			'\[(.+)\](\s*\()' => '\[$1\]$2',
			'\[(.+)\](\s*)\[(.*)\]' => '\[$1\]$2\[$3\]',
		);
	}
112
113
	/**
	 * Chooses the DOM parser for the class and loads the supplied HTML in to it
	 *
	 * - Uses PHP's DOMDocument when that class is available
	 * - Otherwise falls back to the external simple html dom parser
	 */
	private function _set_parser()
	{
		// Without DOMDocument the external simple html parser does the work
		if (!class_exists('DOMDocument'))
		{
			$this->_parser = false;
			require_once(EXTDIR . '/simple_html_dom.php');

			// NOTE(review): static analysis reports str_get_html() can return false instead
			// of a parser object, nothing here checks for that - confirm and handle
			$this->doc = str_get_html($this->html, true, true, 'UTF-8', false);

			return;
		}

		// PHP built in functions it is
		$this->_parser = true;

		// Have libxml keep the structural errors to itself while the document loads
		$was_internal = libxml_use_internal_errors(true);
		$this->_setupDOMDocument();

		// Put the error handling back as it was found, and flush
		libxml_use_internal_errors($was_internal);
		libxml_clear_errors();
	}
139
140
	/**
	 * Converts the loaded HTML to markdown
	 *
	 * - Sends the body node through the parsing loop so all DOM nodes become markup
	 * - Cleans up the result, converts plain text links and wordwraps when
	 * body_width is set
	 *
	 * @return string the markdown, followed by a newline and a null character
	 */
	public function get_markdown()
	{
		// Convert the body node and everything below it
		$this->_convert_childNodes($this->_getBody());

		// All elements are replaced, turn the converted DOM tree back into a string
		if ($this->_parser)
		{
			// The internal DOM methods leave entities behind, so a little extra work
			$this->markdown = html_entity_decode(htmlspecialchars_decode($this->doc->saveHTML(), ENT_QUOTES), ENT_QUOTES, 'UTF-8');
		}
		else
		{
			$this->markdown = $this->doc->save();
		}

		// Tidy the excess spacing etc
		$this->_clean_markdown();

		// Any clear text links become MD links
		$this->_convert_plaintxt_links();

		// Wrap the output when a width is set
		if (!empty($this->body_width))
		{
			$this->_check_line_lenght($this->markdown);
			$this->markdown = $this->_utf8_wordwrap($this->markdown, $this->body_width, $this->line_end);
		}

		// The null character will trigger a base64 version in outbound email
		return $this->markdown . "\n\x00";
	}
175
176
	/**
	 * Finds the node that conversion should start from, after removing any head node
	 *
	 * - Built in parser: the first body element
	 * - External parser: the first body, else the first html, else the document root
	 *
	 * @return object
	 */
	private function _getBody()
	{
		// Nothing in the head is wanted
		$this->_clipHead();

		if ($this->_parser)
		{
			return $this->doc->getElementsByTagName('body')->item(0);
		}

		// Take the innermost wrapper the external parser can find
		foreach (array('body', 'html') as $wrapper)
		{
			$found = $this->doc->find($wrapper, 0);
			if ($found !== null)
			{
				return $found;
			}
		}

		// No wrappers at all, so the root it is
		return $this->doc->root;
	}
210
211
	/**
	 * Removes the first <head> node, when there is one, from the DOM
	 */
	private function _clipHead()
	{
		// Built in parser, detach the node from its parent
		if ($this->_parser)
		{
			$head = $this->doc->getElementsByTagName('head')->item(0);
			if ($head !== null)
			{
				$head->parentNode->removeChild($head);
			}

			return;
		}

		// External parser, blank the outer text of the node
		$head = $this->doc->find('head', 0);
		if ($head !== null)
		{
			$head->outertext = '';
		}
	}
229
230
	/**
	 * Creates the DOMDocument and loads the html in to it, wrapped so that
	 * the text is processed as UTF-8
	 */
	private function _setupDOMDocument()
	{
		// Work with the inner html only, any existing wrapper is removed
		$this->html = $this->_returnBodyText($this->html);

		// The processing details
		$this->doc = new DOMDocument();
		$dom = $this->doc;
		$dom->preserveWhiteSpace = false;
		$dom->encoding = 'UTF-8';

		// Wrap the html in a document that declares UTF-8 every way we can
		$opening = '<?xml encoding="UTF-8"><html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/></head><body>';
		$closing = '</body></html>';
		$dom->loadHTML($opening . $this->html . $closing);
	}
246
247
	/**
	 * Normalizes the generated markdown
	 *
	 * - Removes the html/body wrapper and non breakable spaces
	 * - Strips remaining tags when strip_tags is on
	 * - Restores the sequences that were hidden from the XML parser
	 * - Collapses the excess blank lines that may have been produced
	 */
	private function _clean_markdown()
	{
		// Content only, the wrappers can go
		$text = $this->_returnBodyText($this->markdown);

		// Non breakable spaces may be hiding in here, the space following one goes with it
		$text = str_replace(array("\xC2\xA0\x20", "\xC2\xA0"), ' ', $text);

		// Any "bonus" tags are removed on request
		if ($this->strip_tags)
		{
			$text = strip_tags($text);
		}

		// Bring back the content that was hidden from the XML parsers
		$text = strtr($text, array(
			'|?|&gt' => '?>',
			'|?|&lt' => '?<',
			'&lt|?|' => '<?',
			'&gt|?|' => '>?'
		));

		// Strip the chaff, then squeeze the excess blank lines, one pattern after the other
		$text = trim($text);
		$squeeze = array(
			"~(\n(\s)?){3,}~" => "\n\n",
			"~(^\s\s\n){3,}~m" => "  \n  \n",
			"~(^\s\s\r?\n){3,}~m" => "  \n  \n",
			"~(^\s\s(?:\r?\n){2}){3,}~m" => "  \n  \n",
		);
		foreach ($squeeze as $pattern => $replacement)
		{
			$text = preg_replace($pattern, $replacement, $text);
		}

		$this->markdown = $text;
	}
280
281
	/**
	 * Returns what is inside of a plain <body> wrapper, or failing that a plain
	 * <html> wrapper. Text with neither wrapper is returned as it was supplied.
	 *
	 * @param string $text
	 *
	 * @return string
	 */
	private function _returnBodyText($text)
	{
		// Checked in this order, the first wrapper that matches wins
		$wrappers = array(
			'~<body>(.*)</body>~su',
			'~<html>(.*)</html>~su',
		);

		foreach ($wrappers as $wrapper)
		{
			if (preg_match($wrapper, $text, $inner))
			{
				return $inner[1];
			}
		}

		return $text;
	}
301
302
	/**
	 * For a given node, checks if it is anywhere nested inside of a code block
	 *  - Prevents converting anything that's inside a code block
	 *
	 * @param object $node
	 * @param boolean $parser flag for internal or external parser
	 *
	 * @return boolean true when any ancestor of the node is a code tag
	 */
	private static function _has_parent_code($node, $parser)
	{
		$parent = $parser ? $node->parentNode : $node->parentNode();

		// The loop condition already ends the walk once we run out of parents
		while ($parent)
		{
			// Anywhere nested inside a code block we don't render tags
			$tag = $parser ? $parent->nodeName : $parent->nodeName();
			if ($tag === 'code')
			{
				return true;
			}

			// Back out another level, until we are done
			$parent = $parser ? $parent->parentNode : $parent->parentNode();
		}

		return false;
	}
334
335
	/**
	 * Counts how many ul / ol tags the node is nested inside of
	 *
	 * @param object $node
	 * @param boolean $parser flag for internal or external parser
	 *
	 * @return int the nesting level, 0 when the node is not inside a list
	 */
	private static function _has_parent_list($node, $parser)
	{
		$levels = 0;

		// Walk up through every ancestor, each list tag on the way adds a level
		for ($ancestor = $parser ? $node->parentNode : $node->parentNode(); $ancestor; $ancestor = $parser ? $ancestor->parentNode : $ancestor->parentNode())
		{
			$name = $parser ? $ancestor->nodeName : $ancestor->nodeName();
			if (in_array($name, array('ul', 'ol')))
			{
				$levels++;
			}
		}

		return $levels;
	}
364
365
	/**
	 * Walks down to the deepest children of a node first, then converts each
	 * node to markup on the way back out
	 *
	 * - Nodes nested anywhere inside a code block are left alone
	 *
	 * @param object $node
	 */
	private function _convert_childNodes($node)
	{
		$use_dom = $this->_parser;

		// Inside a code block nothing is converted
		if (self::_has_parent_code($node, $use_dom))
		{
			return;
		}

		// Children first, the count is taken once before any of them is converted
		if ($node->hasChildNodes())
		{
			$total = $use_dom ? $node->childNodes->length : count($node->childNodes());
			$index = 0;
			while ($index < $total)
			{
				$this->_convert_childNodes($use_dom ? $node->childNodes->item($index) : $node->childNodes($index));
				$index++;
			}
		}

		// Everything below is done, now this node itself
		$this->_convert_to_markdown($node);
	}
391
392
	/**
	 * Replaces the supplied node with its markdown equivalent
	 *
	 *  - Supports *some* markdown extra tags, namely: table, abbr & dl in a limited fashion
	 *  - Lightbox anchors and the inner table tags are left in place
	 *  - Unknown tags and text are replaced by their own outer html
	 *
	 * @param object $node
	 */
	private function _convert_to_markdown($node)
	{
		// Marker for "leave this node where it is"
		$skip = '~`skip`~';

		// The HTML tag decides the conversion
		$tag = $this->_get_name($node);

		switch ($tag)
		{
			case 'a':
				$is_lightbox = $node->getAttribute('data-lightboximage') || $node->getAttribute('data-lightboxmessage');
				if ($is_lightbox)
				{
					$replacement = $skip;
				}
				else
				{
					$replacement = $this->line_end . $this->_convert_anchor($node) . $this->line_end;
				}
				break;
			case 'abbr':
				$replacement = $this->_convert_abbr($node);
				break;
			case 'b':
			case 'strong':
				$replacement = '**' . $this->_get_value($node) . '**';
				break;
			case 'blockquote':
				$replacement = $this->_convert_blockquote($node);
				break;
			case 'br':
				// The non breakable spaces keep DomDocument from stripping the empty line
				$replacement = "\xC2\xA0\xC2\xA0" . $this->line_break;
				break;
			case 'center':
				$replacement = $this->line_end . $this->_get_value($node) . $this->line_end;
				break;
			case 'code':
				$replacement = $this->_convert_code($node);
				break;
			case 'dt':
				$replacement = str_replace(array("\n", "\r", "\n\r"), '', $this->_get_value($node)) . $this->line_end;
				break;
			case 'dd':
				$replacement = ':   ' . $this->_get_value($node) . $this->line_break;
				break;
			case 'dl':
				$replacement = trim($this->_get_value($node)) . $this->line_break;
				break;
			case 'em':
			case 'i':
				$replacement = '_' . $this->_get_value($node) . '_';
				break;
			case 'hr':
				$replacement = $this->line_end . str_repeat('-', 3) . $this->line_end;
				break;
			case 'h1':
			case 'h2':
			case 'h3':
			case 'h4':
			case 'h5':
			case 'h6':
				// The tag name itself is passed along as the level
				$replacement = $this->_convert_header($tag, $this->_get_value($node));
				break;
			case 'img':
				$replacement = $this->_convert_image($node) . $this->line_end;
				break;
			case 'ol':
			case 'ul':
				$replacement = rtrim($this->_get_value($node)) . $this->line_break;
				break;
			case 'li':
				$replacement = $this->_convert_list($node);
				break;
			case 'p':
				if ($node->hasChildNodes())
				{
					$replacement = rtrim($this->_get_value($node)) . $this->line_break;
				}
				else
				{
					// Nothing below this paragraph, flatten it to one line and escape it
					$replacement = $this->_escape_text(str_replace("\n", ' ', $this->_get_value($node)) . $this->line_break);
				}
				break;
			case 'pre':
				$replacement = $this->_get_value($node) . $this->line_break;
				break;
			case 'div':
				$replacement = $this->line_end . $this->_get_value($node) . $this->line_end;
				if (!$node->hasChildNodes())
				{
					$replacement = $this->_escape_text($replacement);
				}
				break;
			case 'title':
				$replacement = '# ' . $this->_get_value($node) . $this->line_break;
				break;
			case 'table':
				$replacement = $this->_convert_table($node) . $this->line_break;
				break;
			case 'th':
			case 'tr':
			case 'td':
			case 'tbody':
			case 'tfoot':
			case 'thead':
				// These are dealt with by the table tag itself
				$replacement = $skip;
				break;
			case 'root':
			case 'span':
			case 'body':
				// The tag goes, the text inside of it stays
				$replacement = $this->_get_innerHTML($node);
				break;
			default:
				// An unknown tag, or text (no case matches #text), is preserved as it is
				$replacement = $this->_get_outerHTML($node);
		}

		if ($replacement === $skip)
		{
			return;
		}

		// Swap the replacement in for the node
		if ($this->_parser)
		{
			// The built in parser needs a text node to replace the original with
			$node->parentNode->replaceChild($this->doc->createTextNode($replacement), $node);
		}
		else
		{
			$node->outertext = $replacement;
		}
	}
531
532
	/**
	 * Converts <abbr> tags to markdown (extra)
	 *
	 * html: <abbr title="Hyper Text Markup Language">HTML</abbr>
	 * md:   *[HTML]: Hyper Text Markup Language
	 *
	 * @param object $node
	 * @return string the abbreviation definition, or an empty string when the tag has no title
	 */
	private function _convert_abbr($node)
	{
		$title = $node->getAttribute('title');
		$value = $this->_get_value($node);

		// Without a title there is nothing to define
		return empty($title) ? '' : '*[' . $value . ']: ' . $title . $this->line_break;
	}
557
558
	/**
	 * Converts <a> tags to markdown
	 *
	 * html: <a href='http://somesite.com' title='Title'>Awesome Site</a>
	 * md: [Awesome Site](http://somesite.com "Title")
	 *
	 * - Anchors with the class target or footnote_return are returned as their text only
	 * - Uses the global $txt for the default link name
	 *
	 * @param object $node
	 * @return string
	 */
	private function _convert_anchor($node)
	{
		global $txt;

		$raw_href = $node->getAttribute('href');

		// Encode the characters that would end the markdown link early
		$href = strtr(htmlspecialchars_decode($raw_href), array('(' => '%28', ')' => '%29', '[' => '%5B', ']' => '%5D', '&' => '%26a'));

		$title = $node->getAttribute('title');
		$class = $node->getAttribute('class');
		$value = $this->_get_value($node);

		// A more compact [name] when the link text is missing or simply repeats the href
		if ($value == $raw_href || empty($value))
		{
			$value = empty($title) ? $txt['link'] : $title;
		}

		// Our own footnotes are returned as text alone
		if ($class === 'target' || $class === 'footnote_return')
		{
			return $value;
		}

		if (!empty($title))
		{
			return '[' . $value . '](' . $href . ' "' . $title . '")';
		}

		return '[' . $value . ']( ' . $href . ' )';
	}
600
601
	/**
	 * Converts blockquotes to markdown > quote style
	 *
	 * html: <blockquote>quote</blockquote>
	 * md: > quote
	 *
	 * @param object $node
	 * @return string the quoted lines, followed by an empty line
	 */
	private function _convert_blockquote($node)
	{
		// Everything in the quote, less the surrounding whitespace, a line at a time
		$lines = preg_split('~\r\n|\r|\n~', trim($this->_get_value($node)));

		// Just like email quotes, every line is prefixed with '> '
		$quoted = '';
		foreach ($lines as $text)
		{
			$quoted .= '> ' . ltrim($text, "\t") . $this->line_end;
		}

		return $quoted . $this->line_end;
	}
631
632
	/**
	 * Converts code tags to markdown span `code` or block code
	 * Converts single line code to inline tick mark
	 * Converts multi line to 4 space indented code
	 *
	 * html: <code>code</code>
	 * md: `code`
	 *
	 * - Turns strip_tags off for the whole conversion when the code itself contains html tags
	 *
	 * @param object $node
	 * @return string
	 */
	private function _convert_code($node)
	{
		$value = $this->_get_innerHTML($node);

		// If we have a multi line code block, we are working outside to in, and need to convert the br's ourselves
		$value = preg_replace('~<br( /)?' . '>~', "\n", str_replace('&nbsp;', ' ', $value));

		// If there are html tags in this code block, we need to disable strip tags
		// This is NOT the ideal way to handle this, needs something along the lines of preparse and unpreparse.
		if ($this->strip_tags && preg_match('~<[^<]+>~', $value))
		{
			$this->strip_tags = false;
		}

		// Get the number of lines of code that we have
		$lines = preg_split('~\r\n|\r|\n~', $value);
		$total = count($lines);

		// If there's more than one line of code, use leading four space syntax
		if ($total > 1)
		{
			// Compare against '' and not empty(), a line holding just "0" is code, not a blank line
			$first_is_blank = trim($lines[0]) === '';
			$last_is_blank = trim($lines[$total - 1]) === '';

			// Remove any leading and trailing blank lines
			if ($first_is_blank)
			{
				array_shift($lines);
			}
			if ($last_is_blank)
			{
				array_pop($lines);
			}

			// Convert what remains
			$markdown = '';
			foreach ($lines as $line)
			{
				// Adjust the word wrapping since this has code tags, leave it up to
				// the email client to mess these up ;)
				$this->_check_line_lenght($markdown, 5);

				$markdown .= str_repeat(' ', 4) . $line . $this->line_end;
			}

			// The parser will encode, but we don't want that for our code block
			if ($this->_parser)
			{
				$markdown = html_entity_decode($markdown, ENT_QUOTES, 'UTF-8');
			}

			return $markdown;
		}

		// Single line, back tick and move on
		$code = $lines[0];

		// Account for backticks in the single line code itself
		$ticks = $this->_has_ticks($node, $value);
		if (empty($ticks))
		{
			$ticks = '`';
		}
		// If the ticks were at the start/end of the word space it off, substr() is safe on an empty line
		elseif (substr($code, 0, 1) === '`' || substr($code, -1) === '`')
		{
			$code = ' ' . $code . ' ';
		}

		// The parser will encode, but we don't want that for our code
		if ($this->_parser)
		{
			$code = html_entity_decode($code, ENT_QUOTES, 'UTF-8');
		}

		return $ticks . $code . $ticks;
	}
717
718
	/**
	 * Converts <h1> and <h2> headers to markdown-style headers in setex style,
	 * all other headers are returned as atx style ### h3
	 *
	 * html: <h1>header</h1>
	 * md: header
	 *     ======
	 *
	 * html: <h3>header</h3>
	 * md: ### header
	 *
	 * @param string|int $level the header tag name, h1 to h6, any leading h is removed
	 * @param string $content
	 * @return string
	 */
	private function _convert_header($level, $content)
	{
		$depth = (int) ltrim($level, 'h');

		// From h3 on the atx style is used, one # for each level
		if ($depth >= 3)
		{
			return str_repeat('#', $depth) . ' ' . $content . $this->line_break;
		}

		// Setex style, the header is underlined over its full length
		$underline = ($depth === 1) ? '=' : '-';

		return $content . $this->line_end . str_repeat($underline, Util::strlen($content)) . $this->line_break;
	}
750
751
	/**
	 * Converts <img> tags to markdown
	 *
	 * html: <img src='source' alt='alt' title='title' />
	 * md: ![alt](source "title")
	 *
	 * @param object $node
	 * @return string
	 */
	private function _convert_image($node)
	{
		$src = $node->getAttribute('src');
		$alt = $node->getAttribute('alt');
		$title = $node->getAttribute('title');

		// No title, then the short form will do
		if (empty($title))
		{
			return '![' . $alt . '](' . $src . ')';
		}

		return '![' . $alt . '](' . $src . ' "' . $title . '")';
	}
777
778
	/**
	 * Converts the <li> of ordered <ol> and unordered <ul> lists to markdown syntax
	 *
	 * html: <ul><li>one</li></ul>
	 * md: * one
	 *
	 * - Nested items are indented with one tab for each level below the first
	 *
	 * @param object $node
	 * @return string
	 */
	private function _convert_list($node)
	{
		$parent_tag = $this->_parser ? $node->parentNode->nodeName : $node->parentNode()->nodeName();
		$value = $this->_get_value($node);

		// The outermost list is not indented
		$depth = max(0, $this->_has_parent_list($node, $this->_parser) - 1);

		// A simple bullet for unordered lists, anything else gets its number
		$marker = ($parent_tag === 'ul') ? '* ' : $this->_get_list_position($node) . '. ';
		$item = str_repeat("\t", $depth) . $marker . $value;

		// A value that already ends in whitespace needs no line end of its own
		if (rtrim($value) === $value)
		{
			$item .= $this->line_end;
		}

		return $item;
	}
809
810
	/**
	 * Converts tables tags to markdown extra table syntax
	 *
	 * - Have to build top down vs normal inside out due to needing col numbers and widths
	 * - Only well formed tables are converted, meaning the first th must sit directly
	 * inside a tr, anything else returns an empty string
	 *
	 * @param object $node
	 * @return string
	 */
	private function _convert_table($node)
	{
		$table_heading = $node->getElementsByTagName('th');
		$first_th = $this->_get_item($table_heading, 0);

		// No heading, no table
		if ($first_th === null)
		{
			return '';
		}

		// We only markdown well formed tables ...
		$th_parent = $this->_parser ? $first_th->parentNode->nodeName : $first_th->parentNode()->nodeName();
		if ($th_parent !== 'tr')
		{
			// Always hand back a string, as documented, rather than falling off the end
			return '';
		}

		// Set up for a markdown table, then storm the castle
		$align = array();
		$value = array();
		$width = array();
		$max = array();
		$header = array();
		$rows = array();

		// Find out how many columns we are dealing with
		$th_num = $this->_get_length($table_heading);

		for ($col = 0; $col < $th_num; $col++)
		{
			// Get the align and text for each th (html5 this is no longer valid)
			$th = $this->_get_item($table_heading, $col);
			$align_value = ($th !== null) ? strtolower($th->getAttribute('align')) : false;
			$text = $this->_get_value($th);

			$align[0][$col] = $align_value === false ? 'left' : $align_value;
			$value[0][$col] = $text;
			$width[0][$col] = Util::strlen($text);

			// Seed the max col width
			$max[$col] = $width[0][$col];
		}

		// Get all of the rows, row 0 is taken to be the th row collected above
		$table_rows = $node->getElementsByTagName('tr');
		$num_rows = $this->_get_length($table_rows);
		for ($row = 1; $row < $num_rows; $row++)
		{
			// All of the td's in this row
			$row_data = $this->_get_item($table_rows, $row)->getElementsByTagName('td');

			// Simply use the th count as the number of columns, if its not right its not markdown-able anyway
			for ($col = 0; $col < $th_num; $col++)
			{
				// Get the align and text for each td in this row
				$td = $this->_get_item($row_data, $col);
				$align_value = ($td !== null) ? strtolower($td->getAttribute('align')) : false;
				$text = $this->_get_value($td);

				$align[$row][$col] = $align_value === false ? 'left' : $align_value;
				$value[$row][$col] = $text;
				$width[$row][$col] = Util::strlen($text);

				// Keep track of the longest col cell as we go
				if ($width[$row][$col] > $max[$col])
				{
					$max[$col] = $width[$row][$col];
				}
			}
		}

		// Done collecting data, we can rebuild it, we can make it better than it was. Better...stronger...faster
		for ($row = 0; $row < $num_rows; $row++)
		{
			$temp = array();
			for ($col = 0; $col < $th_num; $col++)
			{
				// Build the header row once
				if ($row === 0)
				{
					$header[] = str_repeat('-', $max[$col]);
				}

				// Build the data for each col, align/pad as needed
				$temp[] = $this->_align_row_content($align[$row][$col], $width[$row][$col], $value[$row][$col], $max[$col]);
			}

			// Join it all up so we have a nice looking row
			$rows[] = '| ' . implode(' | ', $temp) . ' |';

			// Stuff in the header after the th row
			if ($row === 0)
			{
				$rows[] = '| ' . implode(' | ', $header) . ' | ';
			}
		}

		// Adjust the word wrapping since this has a table, will get mussed by email anyway
		$this->_check_line_lenght($rows[1], 2);

		// Return what we did so it can be swapped in
		return implode($this->line_end, $rows);
	}
914
915
	/**
	 * Helper function for getting a node object out of a list of nodes
	 *
	 * - Built in parser: uses the item() method of the list
	 * - External parser: reads the array offset, a missing offset returns null
	 * and does not raise an undefined offset error
	 *
	 * @param object|array $node the list of nodes to look in
	 * @param int $item position of the wanted node in the list
	 * @return object|null
	 */
	private function _get_item($node, $item)
	{
		if ($this->_parser)
		{
			return $node->item($item);
		}

		// Callers compare the result to null, so a missing entry has to be a quiet null
		return isset($node[$item]) ? $node[$item] : null;
	}
933
934
	/**
	 * Helper function for getting the number of nodes in a list of nodes
	 *
	 * @param object|array $node an object with a length when the built in parser is used, otherwise countable
	 * @return int
	 */
	private function _get_length($node)
	{
		return $this->_parser ? $node->length : count($node);
	}
951
952
	/**
	 * Helper function for getting the text value of a node
	 *
	 * - Built in parser: the nodeValue of the node
	 * - External parser: the innertext of the node with its entities decoded
	 *
	 * @param object|null $node
	 * @return string an empty string when no node was supplied
	 */
	private function _get_value($node)
	{
		if ($node === null)
		{
			return '';
		}

		if (!$this->_parser)
		{
			return html_entity_decode(htmlspecialchars_decode($node->innertext, ENT_QUOTES), ENT_QUOTES, 'UTF-8');
		}

		return $node->nodeValue;
	}
974
975
	/**
	 * Helper function for getting the name of a node
	 *
	 * @param object|null $node
	 * @return string an empty string when no node was supplied
	 */
	private function _get_name($node)
	{
		if ($node === null)
		{
			return '';
		}

		// A property with the built in parser, a method with the external one
		return $this->_parser ? $node->nodeName : $node->nodeName();
	}
997
998
	/**
	 * Helper function for creating ol's
	 *
	 * - Returns the position, counting from 1, that the node holds among all of
	 * the child nodes of its parent
	 *
	 * @param object $node
	 * @return int the position, 1 when the node is not found under its parent
	 */
	private function _get_list_position($node)
	{
		// The parent holds the node and all of its siblings
		$list_node = $this->_parser ? $node->parentNode : $node->parentNode();
		$total_nodes = $this->_parser ? $node->parentNode->childNodes->length : count($list_node->childNodes());

		// Step over the children until we run in to ourselves
		$index = 0;
		while ($index < $total_nodes)
		{
			$sibling = $this->_parser ? $list_node->childNodes->item($index) : $list_node->childNodes($index);
			if ($sibling === $node)
			{
				// A node is in the list only once, no need to look further
				return $index + 1;
			}

			$index++;
		}

		return 1;
	}
1026
1027
	/**
	 * Helper function for table creation
	 *
	 * - Builds td's to a give width, aligned as needed
	 * - Any align value other than right or center is treated as left
	 *
	 * @param string $align left, right or center
	 * @param int $width the current length of the content
	 * @param string $content
	 * @param int $max the width to pad the content out to
	 * @return string
	 */
	private function _align_row_content($align, $width, $content, $max)
	{
		// Content already at, or past, the max width gets no padding at all
		$padding = max(0, $max - $width);

		switch ($align)
		{
			default:
			case 'left':
				return $content . str_repeat(' ', $padding);
			case 'right':
				return str_repeat(' ', $padding) . $content;
			case 'center':
				// floor() returns a float, str_repeat() wants an int. The odd space goes on the right
				$left = (int) floor($padding / 2);
				$right = $padding - $left;

				return str_repeat(' ', $left) . $content . str_repeat(' ', $right);
		}
	}
1059
1060
	/**
	 * Gets the inner html of a node
	 *
	 * - DOM based parser: the node is copied into a scratch document, serialized,
	 * and its own opening / closing tag pair is removed from the result
	 * - Other parser: the node's innertext property is returned as is
	 *
	 * @param DOMNode|object $node
	 * @return string
	 */
	private function _get_innerHTML($node)
	{
		if (!$this->_parser)
		{
			return $node->innertext;
		}

		// Serialize just this node by importing a deep copy in to an empty document
		$doc = new DOMDocument();
		$doc->appendChild($doc->importNode($node, true));
		$html = trim($doc->saveHTML());

		// The tag name goes in to a pattern, so quote anything the regex engine would act on
		$tag = preg_quote($node->nodeName, '@');

		// Remove the leading <tag ...> and the trailing </tag>, leaving what was between them
		return preg_replace('@^<' . $tag . '[^>]*>|</' . $tag . '>$@', '', $html);
	}
1082
1083
	/**
	 * Gets the outer html of a node, i.e. the node's own tag plus its content
	 *
	 * - Other parser: the node's outertext property is returned as is
	 * - DOM based parser on PHP 5.3.6+: the node is serialized straight from the
	 * working document
	 * - DOM based parser on older PHP: the node is copied to a scratch document,
	 * serialized, reduced to its body text and entity decoded
	 *
	 * @param DOMNode|object $node
	 * @return string
	 */
	private function _get_outerHTML($node)
	{
		// The non DOM parser already knows the answer
		if (!$this->_parser)
		{
			return $node->outertext;
		}

		// Serializing a single node is only done directly when PHP is 5.3.6 or newer
		if (version_compare(PHP_VERSION, '5.3.6') >= 0)
		{
			return htmlspecialchars_decode($this->doc->saveHTML($node));
		}

		// @todo remove when 5.3.6 min
		$fragment = new DOMDocument();
		$fragment->appendChild($fragment->importNode($node, true));

		// We just want the html of the inserted node, it *may* be wrapped, then drop trailing newlines
		$html = rtrim($this->_returnBodyText($fragment->saveHTML()), "\n");

		return html_entity_decode(htmlspecialchars_decode($html, ENT_QUOTES), ENT_QUOTES, 'UTF-8');
	}
1118
1119
	/**
	 * Escapes text that merely looks like markdown so it is not treated as markup later
	 *
	 * <p>*stuff*</p> should not convert to *stuff* but \*stuff\* since its not to
	 * be converted by md to html as <strong>stuff</strong>
	 *
	 * - Each entry of $_textEscapeRegex is applied in turn as pattern => replacement
	 * - Patterns are stored without delimiters, ~ is added around them here
	 *
	 * @param string $value the plain text to protect
	 * @return string the text with every escape rule applied
	 */
	private function _escape_text($value)
	{
		$escaped = $value;

		// Run the rules one after the other, each one working on the output of the previous
		foreach ($this->_textEscapeRegex as $pattern => $substitute)
		{
			$escaped = preg_replace('~' . $pattern . '~', $substitute, $escaped);
		}

		return $escaped;
	}
1138
1139
	/**
	 * Works out the backtick fence needed for inline code that itself contains backticks,
	 * so when markdown is run the inner ` are not read as additional code spans
	 *
	 * - Directly inside a pre the passed $value is handed back untouched
	 * - With no backticks in $value an empty string is returned
	 * - Otherwise returns the shortest run of backticks that does not appear in $value
	 * e.g. <code>`bla`</code> needs `` so it can become `` `bla` ``
	 *
	 * @param object $node the code node being converted
	 * @param string $value the text content of that node
	 * @return string
	 */
	private function _has_ticks($node, $value)
	{
		$parent_name = $this->_parser ? $node->parentNode->nodeName : $node->parentNode()->nodeName();

		// Inside of a pre, we don't do anything
		if ($parent_name === 'pre')
		{
			return $value;
		}

		// Collect every run of consecutive backticks found in the code
		preg_match_all('~`+~', $value, $runs);
		if (empty($runs[0]))
		{
			return '';
		}

		// Yup ticks in the hair, lengthen the fence until it differs from every run inside
		$fence = '`';
		while (in_array($fence, $runs[0]))
		{
			$fence .= '`';
		}

		return $fence;
	}
1180
1181
	/**
	 * Helper function to adjust wrapping width for long-ish links
	 *
	 * - Widens $this->body_width to the longest line found in $markdown (plus the
	 * optional buffer) so that line will not be broken by the later word wrap
	 * - Never narrows the width
	 * - Does nothing when wrapping is disabled (body_width of 0), otherwise a long
	 * line would quietly switch wrapping back on
	 *
	 * @param string $markdown the text whose lines are measured
	 * @param bool|int $buffer extra characters to allow for on every line
	 */
	private function _check_line_lenght($markdown, $buffer = false)
	{
		// Wrapping is off, there is no width to adjust
		if (empty($this->body_width))
		{
			return;
		}

		// The allowance is the same for every line, work it out once
		$extra = !empty($buffer) ? (int) $buffer : 0;

		// Some Lines can be very long and if we wrap them they break
		foreach (explode($this->line_end, $markdown) as $line)
		{
			// NOTE(review): Util::strlen is defined elsewhere, presumably a character (not byte) count - confirm
			$line_strlen = Util::strlen($line) + $extra;
			if ($line_strlen > $this->body_width)
			{
				$this->body_width = $line_strlen;
			}
		}
	}
1200
1201
	/**
	 * Helper function to find and wrap plain text links in MD format
	 *
	 * - Works on $this->markdown in place
	 * - Picks up text starting with http:// or https://, or with www when it is not
	 * itself preceded by ://
	 * - Skips anything directly preceded by ]( or ]( followed by a space, as that
	 * is already the target of a markdown link
	 * - Every hit is rewritten by _plaintxt_callback
	 */
	private function _convert_plaintxt_links()
	{
		$link_pattern = '/((?<!\]\( |\]\()https?:\/\/|(?<!\]\( |\]\(|:\/\/)www)[-\p{L}0-9+&@#\/%?=~_|!:,.;]*[\p{L}0-9+&@#\/%=~_|]/iu';

		$this->markdown = preg_replace_callback($link_pattern, array($this, '_plaintxt_callback'), $this->markdown);
	}
1208
1209
	/**
	 * Callback function used by _convert_plaintxt_links for plain link to MD
	 *
	 * - Starts the link on a new line
	 * - Uses the language string $txt['link'] as the link text and the trimmed
	 * match, padded by a space on each side, as the link target
	 *
	 * @param string[] $matches the regex match, only the full match [0] is used
	 * @return string the markdown link
	 */
	private function _plaintxt_callback($matches)
	{
		global $txt;

		$url = trim($matches[0]);

		return $this->line_end . '[' . $txt['link'] . ']( ' . $url . ' )';
	}
1223
1224
	/**
	 * Breaks a string up so its no more than width characters long
	 *
	 * - The text is first split on $break and each piece is wrapped on its own
	 * - Will break before whitespace, a comma or a period when one is in reach
	 * - If no natural break is found will break mid-word after width characters
	 * - Continuation lines of a piece that starts with > are prefixed with "> "
	 * - Blank lines are kept as they are, so paragraph spacing survives the wrap
	 * - A width below 1 returns the string untouched
	 *
	 * @param string $string the text to wrap
	 * @param int $width the maximum line length
	 * @param string $break the line separator, used both to split and to rejoin
	 * @return string
	 */
	private function _utf8_wordwrap($string, $width = 75, $break = "\n")
	{
		$width = (int) $width;

		// Nothing sensible can be done with a zero or negative width, and the loop
		// below would never consume any text, so hand the input back
		if ($width < 1)
		{
			return $string;
		}

		$lines = array();

		foreach (explode($break, $string) as $segment)
		{
			// An empty piece is a blank line in the source, keep it in the output
			if ($segment === '')
			{
				$lines[] = '';
				continue;
			}

			$in_quote = $segment[0] === '>';

			// Compare against '' rather than using empty(), a lone "0" is real content
			while ($segment !== '')
			{
				// Get the next #width characters before a break (space, punctuation tab etc)
				if (preg_match('~^(.{1,' . $width . '})(?:\s|$|,|\.)~u', $segment, $matches))
				{
					// Add the #width to the output and set up for the next pass
					$lines[] = ($in_quote && $matches[1][0] !== '>' ? '> ' : '') . ltrim($matches[1], ' ');

					// NOTE(review): Util::substr / Util::strlen live elsewhere, presumably character based - confirm.
					// The cast guards against a false return once the end of the text is reached
					$segment = (string) Util::substr($segment, Util::strlen($matches[1]));
				}
				// Humm just a long word with no place to break so we simply cut it after width characters
				else
				{
					$lines[] = ($in_quote && $segment[0] !== '>' ? '> ' : '') . Util::substr($segment, 0, $width);
					$segment = (string) Util::substr($segment, $width);
				}
			}
		}

		// Join it all the shortened sections up on our break characters
		return implode($break, $lines);
	}
1264
}
1265