Passed
Push — master ( d9e5dd...36764d )
by Spuds
01:07 queued 26s
created

Html_2_Md::_check_line_lenght()   A

Complexity

Conditions 4
Paths 5

Size

Total Lines 10
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 4
CRAP Score 4

Importance

Changes 2
Bugs 0 Features 0
Metric Value
cc 4
eloc 5
nc 5
nop 2
dl 0
loc 10
ccs 4
cts 4
cp 1
crap 4
rs 10
c 2
b 0
f 0
1
<?php
2
3
/**
4
 * Converts HTML to Markdown text
5
 *
6
 * @name      ElkArte Forum
7
 * @copyright ElkArte Forum contributors
8
 * @license   BSD http://opensource.org/licenses/BSD-3-Clause
9
 *
10
 * @version 1.1.9
11
 *
12
 */
13
14
/**
15
 * Converts HTML to Markdown text
16
 */
17
class Html_2_Md
18
{
19
	/**
20
	 * The value that will hold our dom object
21
	 * @var object
22
	 */
23
	public $doc;
24
25
	/**
26
	 * The value that will hold if we are using the internal or external parser
27
	 * @var boolean
28
	 */
29
	private $_parser;
30
31
	/**
32
	 * Line end character
33
	 * @var string
34
	 */
35
	public $line_end = "\n";
36
37
	/**
38
	 * Line break character
39
	 * @var string
40
	 */
41
	public $line_break = "  \n\n";
42
43
	/**
44
	 * Wordwrap output, set to 0 to skip wrapping
45
	 * @var int
46
	 */
47
	public $body_width = 76;
48
49
	/**
50
	 * Strip remaining tags, set as false to leave them in
51
	 * @var boolean
52
	 */
53
	public $strip_tags = true;
54
55
	/**
56
	 * Regex to run on plain text to prevent markdown from erroneously converting
57
	 * @var string[]
58
	 */
59
	private $_textEscapeRegex;
60
61
	/**
62
	 * The passed html string to convert
63
	 * @var string
64
	 */
65
	public $html;
66
67
	/**
68
	 * The markdown equivalent of the html string
69
	 * @var string
70
	 */
71
	public $markdown;
72
73
	/**
74
	 * Various settings on how to render certain markdown tags
75
	 * @var string[]
76
	 */
77
	public $config = ['heading' => 'atx', 'bullet' => '*', 'em' => '_', 'strong' => '**'];
78 1
79
	/**
80
	 * Gets everything started using the built-in or external parser
81 1
	 *
82
	 * @param string $html string of html to convert to MD text
83
	 */
84 1
	public function __construct($html)
	{
		$this->html = $html;

		// Normalisation passes, applied in order.  preg_replace() returns null when
		// PCRE fails (for example malformed UTF-8 combined with the /u modifier); in
		// that case the previous text is kept instead of wiping the whole message.
		$cleanups = array(
			// Up front, remove whitespace between html tags
			'/(?:(?<=>)|(?<=\/>))(\s+)(?=<\/?)/' => '',
			// Replace invisible (except \n \t) characters with a space
			'~[^\S\n\t]~u' => ' ',
		);
		foreach ($cleanups as $pattern => $replacement)
		{
			$result = preg_replace($pattern, $replacement, $this->html);
			if ($result !== null)
			{
				$this->html = $result;
			}
		}

		// The XML parser will not deal gracefully with these, so mask them with
		// placeholders until the markdown has been generated
		$this->html = strtr($this->html, array(
			'?<' => '|?|&lt',
			'?>' => '|?|&gt',
			'>?' => '&gt|?|',
			'<?' => '&lt|?|'
		));

		// Set the dom parser to use and load the HTML to the parser
		$this->_set_parser();

		// Initialize the regex array to escape text areas so markdown does
		// not interpret plain text as markdown syntax
		$this->_textEscapeRegex = array(
			'~([*_\\[\\]\\\\])~' => '\\\\$1',
			'~^-~m' => '\\-',
			'~^\+ ~m' => '\\+ ',
			'~^(=+)~m' => '\\\\$1',
			'~^(#{1,6}) ~m' => '\\\\$1 ',
			'~`~' => '\\`',
			'~^>~m' => '\\>',
			'~^(\d+)\. ~m' => '$1\\. ',
		);
	}
116 1
117
	/**
118
	 * Set the DOM parser for class, loads the supplied HTML
119 1
	 */
120 1
	private function _set_parser()
121 1
	{
122 1
		// Using PHP built in functions ...
123
		if (class_exists('DOMDocument'))
124
		{
125 1
			$this->_parser = true;
126
			$previous = libxml_use_internal_errors(true);
127
128 1
			// Set up basic parameters for DomDocument, including silencing structural errors
129 1
			$this->_setupDOMDocument();
130 1
131
			// Set the error handle back to what it was, and flush
132
			libxml_use_internal_errors($previous);
133
			libxml_clear_errors();
134
		}
135
		// Or using the external simple html parser
136
		else
137
		{
138 1
			$this->_parser = false;
139
			require_once(EXTDIR . '/simple_html_dom.php');
140
			$this->doc = str_get_html($this->html, true, true, 'UTF-8', false);
0 ignored issues
show
Documentation Bug introduced by
It seems like str_get_html($this->html..., true, 'UTF-8', false) can also be of type false. However, the property $doc is declared as type object. Maybe add an additional type check?

Our type inference engine has found a suspicious assignment of a value to a property. This check raises an issue when a value that can be of a mixed type is assigned to a property that is type hinted more strictly.

For example, imagine you have a variable $accountId that can either hold an Id object or false (if there is no account id yet). Your code now assigns that value to the id property of an instance of the Account class. This class holds a proper account, so the id value must no longer be false.

Either this assignment is in error or a type check should be added for that assignment.

class Id
{
    public $id;

    public function __construct($id)
    {
        $this->id = $id;
    }

}

class Account
{
    /** @var  Id $id */
    public $id;
}

$account_id = false;

if (starsAreRight()) {
    $account_id = new Id(42);
}

$account = new Account();
if ($account instanceof Id)
{
    $account->id = $account_id;
}
Loading history...
141
		}
142
	}
143
144 1
	/**
145
	 * Loads the html body and sends it to the parsing loop to convert all
146
	 * DOM nodes to markup
147 1
	 */
148 1
	public function get_markdown()
	{
		// Walk the body and turn every child element into its markdown form
		$this->_convert_childNodes($this->_getBody());

		// Serialise the converted DOM tree back into a string
		if ($this->_parser)
		{
			// The internal DOM methods hand back entity encoded text, so undo that
			$serialized = $this->doc->saveHTML();
			$this->markdown = html_entity_decode(htmlspecialchars_decode($serialized, ENT_QUOTES), ENT_QUOTES, 'UTF-8');
		}
		else
		{
			$this->markdown = $this->doc->save();
		}

		// Tidy up wrappers, stray tags and surplus blank lines
		$this->_clean_markdown();

		// Wrap the output unless wrapping was disabled (width of 0)
		if (!empty($this->body_width))
		{
			$this->markdown = $this->_utf8_wordwrap($this->markdown, $this->body_width, $this->line_end);
		}

		// The null character will trigger a base64 version in outbound email
		return $this->markdown . "\n\x00";
	}
175
176
	/**
177
	 * Returns just the body of the HTML, as best possible, so we are not dealing with head
178
	 * and above head markup
179
	 *
180 1
	 * @return object
181
	 */
182
	private function _getBody()
	{
		// Anything in <head> is of no use to us, drop it first
		$this->_clipHead();

		// Internal parser: simply ask the DOM for the first body element
		if ($this->_parser)
		{
			return $this->doc->getElementsByTagName('body')->item(0);
		}

		// External parser: prefer <body>, then <html> ...
		foreach (array('body', 'html') as $wrapper)
		{
			$found = $this->doc->find($wrapper, 0);
			if ($found !== null)
			{
				return $found;
			}
		}

		// ... and when neither exists, work from the document root
		return $this->doc->root;
	}
210
211
	/**
212 1
	 * Remove any <head> node from the DOM
213
	 */
214 1
	private function _clipHead()
	{
		// Internal parser: detach the first <head> element from its parent
		if ($this->_parser)
		{
			$head = $this->doc->getElementsByTagName('head')->item(0);
			if ($head !== null)
			{
				$head->parentNode->removeChild($head);
			}

			return;
		}

		// External parser: blank the first <head> element's outer text
		if ($this->doc->find('head', 0) !== null)
		{
			$this->doc->find('head', 0)->outertext = '';
		}
	}
229
230
	/**
231 1
	 * Sets up processing parameters for DOMDocument to ensure that text is processed as UTF-8
232
	 */
233
	private function _setupDOMDocument()
	{
		// Strip any existing <body>/<html> wrapper, we add our own below
		$this->html = $this->_returnBodyText($this->html);

		// Configure a fresh document
		$dom = new DOMDocument();
		$dom->preserveWhiteSpace = false;
		$dom->encoding = 'UTF-8';
		$this->doc = $dom;

		// Do what we can to ensure this is processed as UTF-8: an xml encoding
		// hint plus a charset meta tag wrapped around the supplied markup
		$opening = '<?xml encoding="UTF-8"><html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/></head><body>';
		$closing = '</body></html>';
		$this->doc->loadHTML($opening . $this->html . $closing);
	}
246
247
	/**
248 1
	 * Normalize any spacing and excess blank lines that may have been generated
249
	 */
250
	private function _clean_markdown()
	{
		// Only the content is wanted, not the <body>/<html> wrappers
		$text = $this->_returnBodyText($this->markdown);

		// Optionally drop whatever tags survived the conversion
		if ($this->strip_tags)
		{
			$text = strip_tags($text);
		}

		// Put back the sequences that were masked to keep them away from the XML parsers
		$unmask = array(
			'|?|&gt' => '?>',
			'|?|&lt' => '?<',
			'&lt|?|' => '<?',
			'&gt|?|' => '>?'
		);
		$text = strtr($text, $unmask);

		// Unmasking can expose content ending in ?<br />, turn those breaks into blank lines
		$text = str_replace('<br />', "\n\n", $text);

		// Trim the edges and collapse runs of blank lines into a single one
		$text = trim($text);
		$this->markdown = preg_replace("~(?:\s?\n\s?){3,6}~", "\n\n", $text);
	}
275 1
276 1
	/**
277 1
	 * Looks for the text inside <body> and then <html>, returning just the inner content
278
	 *
279
	 * @param $text
280
	 *
281
	 * @return string
282
	 */
283
	private function _returnBodyText($text)
	{
		// Tried in order, the first pattern that matches wins
		$wrappers = array(
			// A complete body element
			'~<body.*?>(.*)</body>~su',
			// A complete html element
			'~<html.*?>(.*)</html>~su',
			// Parsers may have clipped the ending body or html tag off with the quote/signature
			'~<body.*?>(.*)~su',
		);

		foreach ($wrappers as $wrapper)
		{
			$inner = array();
			if (preg_match($wrapper, $text, $inner))
			{
				return $inner[1];
			}
		}

		// No wrapper found, hand the text back untouched
		return $text;
	}
303
304
	/**
305
	 * For a given node, checks if it is anywhere nested inside a code block
306
	 *  - Prevents converting anything that's inside a code block
307
	 *
308
	 * @param object $node
309 1
	 * @param boolean $parser flag for internal or external parser
310
	 *
311 1
	 * @return boolean
312 1
	 */
313
	private static function _has_parent_code($node, $parser)
	{
		// Tags whose content must be left unrendered
		$code_tags = array('pre', 'code');

		// Climb from the direct parent towards the top of the tree
		$ancestor = $parser ? $node->parentNode : $node->parentNode();
		while ($ancestor)
		{
			$ancestor_name = $parser ? $ancestor->nodeName : $ancestor->nodeName();
			if (in_array($ancestor_name, $code_tags))
			{
				return true;
			}

			// Not a code container, try one level further out
			$ancestor = $parser ? $ancestor->parentNode : $ancestor->parentNode();
		}

		// Ran out of ancestors without meeting a code container
		return false;
	}
330 1
331
	/**
332
	 * Get the nesting level when inside a list
333
	 *
334
	 * @param object $node
335
	 * @param boolean $parser flag for internal or external parser
336
	 *
337
	 * @return int
338
	 */
339
	private static function _has_parent_list($node, $parser)
	{
		$list_tags = array('ul', 'ol');
		$level = 0;

		// Count every list container found between this node and the top of the tree
		$ancestor = $parser ? $node->parentNode : $node->parentNode();
		while ($ancestor)
		{
			$ancestor_name = $parser ? $ancestor->nodeName : $ancestor->nodeName();
			if (in_array($ancestor_name, $list_tags))
			{
				$level++;
			}

			// Move out one level
			$ancestor = $parser ? $ancestor->parentNode : $ancestor->parentNode();
		}

		return $level;
	}
360 1
361
	/**
362
	 * Traverse each node to its base, then convert tags to markup on the way back out
363
	 *
364
	 * @param object $node
365
	 */
366
	private function _convert_childNodes($node)
	{
		// Content nested in a code block is left as is, only a <code> tag itself is still converted
		$inside_code = self::_has_parent_code($node, $this->_parser);
		if ($inside_code && $this->_get_name($node) !== 'code')
		{
			return;
		}

		// Descend first so the deepest nodes are converted before their parents.
		// The child count is taken once, children are then fetched by index.
		if ($node->hasChildNodes())
		{
			$count = $this->_parser ? $node->childNodes->length : count($node->childNodes());
			$index = 0;
			while ($index < $count)
			{
				$child = $this->_parser ? $node->childNodes->item($index) : $node->childNodes($index);
				$this->_convert_childNodes($child);
				$index++;
			}
		}

		// All descendants are done, now convert this node itself
		$this->_convert_to_markdown($node);
	}
387 1
388 1
	/**
389
	 * Convert the supplied node into its markdown equivalent
390
	 *  - Supports *some* markdown extra tags, namely: table, abbr & dl in a limited fashion
391
	 *
392
	 * @param object $node
393
	 */
394
	private function _convert_to_markdown($node)
395
	{
396 1
		// HTML tag we are dealing with
397
		$tag = $this->_get_name($node);
398
399 1
		// Based on the tag, determine how to convert
400
		switch ($tag)
401
		{
402
			case 'a':
403
				$markdown = $this->_convert_anchor($node);
404 1
				break;
405 1
			case 'abbr':
406 1
				$markdown = $this->_convert_abbr($node);
407 1
				break;
408
			case 'b':
409
			case 'strong':
410 1
				$markdown = $this->config['strong'] . trim($this->_get_value($node)) . $this->config['strong'];
411 1
				break;
412 1
			case 'blockquote':
413 1
				$markdown = $this->_convert_blockquote($node);
414 1
				break;
415 1
			case 'br':
416 1
				$markdown = $this->line_break;
417 1
				break;
418
			case 'center':
419 1
				$markdown = $this->line_end . $this->_get_value($node) . $this->line_end;
420 1
				break;
421 1
			case 'cite':
422
				$markdown = $this->_convert_cite($node);
423
				break;
424 1
			case 'code':
425 1
				$markdown = $this->_convert_code($node);
426 1
				break;
427 1
			case 'dt':
428
				$markdown = str_replace(array("\n", "\r", "\n\r"), '', $this->_get_value($node)) . $this->line_end;
429
				break;
430 1
			case 'dd':
431
				$markdown = ':   ' . $this->_get_value($node) . $this->line_break;
432
				break;
433 1
			case 'dl':
434
				$markdown = trim($this->_get_value($node)) . $this->line_break;
435
				break;
436 1
			case 'em':
437 1
			case 'i':
438 1
				$markdown = $this->config['em'] . trim($this->_get_value($node)) . $this->config['em'];
439 1
				break;
440 1
			case 'hr':
441
				$markdown = $this->line_end . '---' . $this->line_end;
442
				break;
443 1
			case 'h1':
444 1
			case 'h2':
445 1
			case 'h3':
446 1
			case 'h4':
447 1
			case 'h5':
448 1
			case 'h6':
449 1
				$markdown = $this->_convert_header($tag, $this->_get_value($node));
0 ignored issues
show
Bug introduced by
$tag of type string is incompatible with the type integer expected by parameter $level of Html_2_Md::_convert_header(). ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-type  annotation

449
				$markdown = $this->_convert_header(/** @scrutinizer ignore-type */ $tag, $this->_get_value($node));
Loading history...
450 1
				break;
451 1
			case 'img':
452
				$markdown = $this->_convert_image($node) . $this->line_end;
453
				break;
454 1
			case 'ol':
455 1
			case 'ul':
456 1
				if ($this->_has_parent_list($node, $this->_parser))
457 1
					$markdown = trim($this->_get_value($node));
458 1
				else
459 1
					$markdown = $this->line_end . $this->_get_value($node) . $this->line_end;
460 1
				break;
461 1
			case 'li':
462 1
				$markdown = $this->_convert_list($node);
463 1
				break;
464
			case 'p':
465
				$markdown = $this->line_end . rtrim($this->_get_value($node)) . $this->line_end;
466
				$markdown = $this->_convert_plaintxt_links($markdown, $node);
467
				$markdown = $this->_utf8_wordwrap($markdown, $this->body_width, $this->line_end);
468
				break;
469 1
			case 'pre':
470
				$markdown = $this->_get_innerHTML($node) . $this->line_break;
471 1
				break;
472 1
			case 'div':
473 1
				$markdown = $this->line_end . rtrim($this->_get_value($node));
474 1
				$markdown = $this->_utf8_wordwrap($markdown, $this->body_width, $this->line_end) . $this->line_break;
475 1
				break;
476
			case '#text':
477
				$markdown = $this->_escape_text($this->_get_value($node));
478
				$markdown = $this->_convert_plaintxt_links($markdown, $node);
479
				break;
480
			case 'title':
481
				$markdown = '# ' . $this->_get_value($node) . $this->line_break;
482
				break;
483
			case 'table':
484
				$markdown = $this->_convert_table($node) . $this->line_break;
485 1
				break;
486
			case 'th':
487
			case 'tr':
488 1
			case 'td':
489 1
			case 'tbody':
490 1
			case 'tfoot':
491 1
			case 'thead':
492 1
				// Just skip over these as we handle them in the table tag itself
493 1
				$markdown = '~`skip`~';
494 1
				break;
495 1
			case 'span':
496 1
				$markdown = $this->_convert_span($node);
497
				break;
498 1
			case 'root':
499 1
			case 'body':
500 1
				// Remove these tags and simply replace with the text inside the tags
501 1
				$markdown = $this->_get_innerHTML($node);
502 1
				break;
503
			default:
504 1
				// Don't know you or text, so just preserve whats there
505 1
				$markdown = $this->_get_outerHTML($node) . $this->line_end;
506 1
		}
507
508 1
		// Replace the node with our markdown replacement, or with the node itself if none was found
509 1
		if ($markdown !== '~`skip`~')
510
		{
511
			if ($this->_parser)
512 1
			{
513 1
				// Create a new text node with our markdown tag and replace the original node
514 1
				$markdown_node = $this->doc->createTextNode($markdown);
515 1
				$node->parentNode->replaceChild($markdown_node, $node);
516
			}
517 1
			else
518 1
			{
519 1
				$node->outertext = $markdown;
520
			}
521
		}
522
	}
523
524 1
	/**
525 1
	 * Converts <abbr> tags to markdown (extra)
526
	 *
527
	 * html: <abbr title="Hyper Text Markup Language">HTML</abbr>
528
	 * md:   *[HTML]: Hyper Text Markup Language
529
	 *
530
	 * @param object $node
531
	 * @return string
532
	 */
533
	private function _convert_abbr($node)
	{
		$title = $node->getAttribute('title');
		$value = $this->_get_value($node);

		// Without a title there is no definition to emit
		if (empty($title))
		{
			return '';
		}

		return '*[' . $value . ']: ' . $title;
	}
541
542
	/**
543
	 * Converts <a> tags to markdown
544
	 *
545
	 * html: <a href='http://somesite.com' title='Title'>Awesome Site</a>
546
	 * md: [Awesome Site](http://somesite.com "Title")
547
	 *
548
	 * @param object $node
549
	 * @return string
550
	 */
551
	private function _convert_anchor($node)
	{
		global $txt;

		// Anchors carrying lightbox data attributes are skipped
		$is_lightbox = $node->getAttribute('data-lightboximage') || $node->getAttribute('data-lightboxmessage');
		if ($is_lightbox)
		{
			return '~`skip`~';
		}

		// Collect the pieces, undoing underscore escaping in the url and the link text
		$raw_href = $node->getAttribute('href');
		$href = str_replace('\_', '_', htmlspecialchars_decode($raw_href));
		$title = $node->getAttribute('title');
		$class = $node->getAttribute('class');
		$value = str_replace('\_', '_', trim($this->_get_value($node), "\t\n\r\0\x0B"));

		// Provide a more compact [name] if none is given
		if ($value == $raw_href || empty($value))
		{
			$value = empty($title) ? $txt['link'] : $title;
		}

		$is_footnote = $class === 'target' || $class === 'footnote_return';
		if ($is_footnote)
		{
			// Special processing just for our own footnotes, text only
			$markdown = $value;
		}
		elseif (!empty($title))
		{
			$markdown = '[' . $value . '](' . $href . ' "' . $title . '")';
		}
		else
		{
			// No title supplied, the generic link text is used as the title
			$markdown = '[' . ($value === $txt['link'] ? 'X' : $value) . ']('  . $href . ' "' . $txt['link'] . '")';
		}

		$this->_check_line_length($markdown, $this->get_buffer($node));

		return $markdown . $this->line_end;
	}
587
588 1
	/**
589
	 * Converts blockquotes to markdown > quote style
590
	 *
591
	 * html: <blockquote>quote</blockquote>
592 1
	 * md: > quote
593
	 *
594 1
	 * @param object $node
595
	 * @return string
596
	 */
597
	private function _convert_blockquote($node)
	{
		// Split the trimmed quote content on any style of line ending
		$lines = preg_split('~\r\n|\r|\n~', trim($this->_get_value($node)));

		// Each line gets a '> ' in front of it (leading tabs removed), just like email quotes
		$quoted = array();
		foreach ($lines as $line)
		{
			$quoted[] = '> ' . ltrim($line, "\t") . $this->line_end;
		}

		return $this->line_end . implode('', $quoted) . $this->line_end;
	}
615 1
616
	/**
617
	 * Converts cites to markdown with the assumption that they are in a blockquote
618 1
	 *
619
	 * html: <cite><a href="link">author</a></cite>
620 1
	 * md: author
621 1
	 *
622
	 * @param object $node
623 1
	 * @return string
624
	 */
625 1
	private function _convert_cite($node)
	{
		// Everything inside the cite, without surrounding whitespace
		$citation = trim($this->_get_value($node));

		// When it holds a markdown link [bla](link), keep only the bla part
		$link = array();
		if (preg_match('~\[(.*?)\]\(.*?\)~', $citation, $link))
		{
			$citation = $link[1];
		}

		return $this->line_end . $citation . $this->line_end;
	}
638
639 1
	/**
640
	 * Converts code tags to markdown span `code` or block code
641 1
	 * Converts single line code to inline tick mark
642
	 * Converts multi line to 4 space indented code
643
	 *
644 1
	 * html: <code>code</code>
645
	 * md: `code`
646
	 *
647
	 * @param object $node
648 1
	 * @return string
649 1
	 */
650
	private function _convert_code($node)
	{
		$value = (string) html_entity_decode($this->_get_innerHTML($node), ENT_COMPAT, 'UTF-8');

		// Empty block.  Compared against '' rather than empty() so a block whose
		// entire content is "0" is still rendered as code.
		if ($value === '')
		{
			return '``';
		}

		// Turn off things that may mangle code tags (this stays in effect for the
		// remainder of the conversion)
		$this->strip_tags = false;
		$this->body_width = 0;

		// If we have a multi line code block, we are working outside to in, and need to convert the br's ourselves
		$value = preg_replace('~<br( /)?' . '>~', $this->line_end, str_replace('&nbsp;', ' ', $value));

		// Break the code into its individual lines
		$lines = preg_split('~\r\n|\r|\n~', $value);

		// Remove leading and trailing blank lines.  The emptiness guard matters:
		// whitespace-only content would otherwise drain the array and loop forever.
		while (!empty($lines) && trim($lines[0]) === '')
		{
			array_shift($lines);
		}
		while (!empty($lines) && trim($lines[count($lines) - 1]) === '')
		{
			array_pop($lines);
		}

		// Nothing but blank lines, same result as an empty block
		if (empty($lines))
		{
			return '``';
		}

		// If there's more than one line of code, use fenced code syntax
		if (count($lines) > 1)
		{
			$fence = $this->line_end . '```' . $this->line_end;

			// Every remaining line is emitted followed by a line end
			$markdown = '';
			foreach ($lines as $line)
			{
				$markdown .= $line . $this->line_end;
			}

			return $fence . $markdown . $fence;
		}

		// Single line: use the delimiter supplied by _has_ticks() when it returns one
		$ticks = $this->_has_ticks($value);
		if (!empty($ticks))
		{
			// If the ticks were at the start/end of the word space it off
			if ($lines[0][0] === '`' || substr($lines[0], -1) === '`')
			{
				$lines[0] = ' ' . $lines[0] . ' ';
			}

			return $ticks . $lines[0] . $ticks;
		}

		return '`' . $lines[0] . '`';
	}
711 1
712
	/**
713
	 * Converts <h1> and <h2> headers to markdown-style headers in setext style,
714
	 * all other headers are returned as atx style ### h3
715
	 *
716
	 * html: <h1>header</h1>
717
	 * md: header
718
	 *     ======
719
	 *
720
	 * html: <h3>header</h3>
721
	 * md: ### header
722
	 *
723
	 * @param int $level
724
	 * @param string $content
725
	 * @return string
726
	 */
727
	private function _convert_header($level, $content)
	{
		// Callers hand in the tag name ("h1" .. "h6"); a bare number works as well
		$level = (int) ltrim((string) $level, 'h');

		// Setext syntax only exists for two levels: '=' underlines a first level
		// header and '-' a second level one.  Deeper headers have no setext form
		// and drop through to atx, otherwise they would all render as <h1>.
		if ($this->config['heading'] === 'setext' && $level <= 2)
		{
			$underline = $level === 2 ? '-' : '=';
			$length = Util::strlen($content);

			return $this->line_end . $content . $this->line_end . str_repeat($underline, $length) . $this->line_break;
		}

		// atx style: one '#' per level, a space, then the text
		return $this->line_end . str_repeat('#', $level) . ' ' . $content . $this->line_break;
	}
740
741 1
	/**
742
	 * Converts <img> tags to markdown
743
	 *
744 1
	 * html: <img src='source' alt='alt' title='title' />
745
	 * md: ![alt](source "title")
746
	 *
747
	 * @param object $node
748
	 * @return string
749
	 */
750
	private function _convert_image($node)
	{
		$src = $node->getAttribute('src');
		$alt = $node->getAttribute('alt');
		$title = $node->getAttribute('title');
		$parent = $this->_parser ? $node->parentNode : $node->parentNode();

		// A plain linked image (not a lightbox anchor): only hand back a label
		// for the surrounding link, alt first, then title, then a placeholder
		if ($this->_get_name($parent) === 'a' && !($parent->getAttribute('data-lightboximage') || $parent->getAttribute('data-lightboxmessage')))
		{
			if (!empty($alt))
			{
				return $alt;
			}

			return !empty($title) ? $title : 'xXx';
		}

		// Standalone image, the title part is only added when there is one
		$suffix = !empty($title) ? ' "' . $title . '")' : ')';
		$markdown = '![' . $alt . '](' . $src . $suffix;

		$this->_check_line_length($markdown, $this->get_buffer($node));

		return $markdown . $this->line_end;
	}
776
777
	/**
778
	 * Converts ordered <ol> and unordered <ul> lists to markdown syntax
779
	 *
780
	 * html: <ul><li>one</li></ul>
781
	 * md: * one
782
	 *
783
	 * @param object $node
784
	 * @return string
785
	 */
786
	private function _convert_list($node)
	{
		$list_type = $this->_parser ? $node->parentNode->nodeName : $node->parentNode()->nodeName();
		$value = (string) $this->_get_value($node);
		$depth = $this->_has_parent_list($node, $this->_parser);

		// An item whose text opens with a line end keeps that leading line end.
		// A prefix comparison is used instead of $value[0] so an empty <li> does
		// not trigger a string offset error, and so a line_end longer than one
		// character can still match.
		$loose = substr($value, 0, strlen($this->line_end)) === $this->line_end ? $this->line_end : '';

		// Keep multi line list items indented the same as the list depth
		$indent = str_repeat('   ', $depth);
		$value = rtrim(implode($this->line_end . $indent, explode($this->line_end, trim($value))));

		// Unordered lists get a simple bullet
		if ($list_type === 'ul')
		{
			return $loose . $this->config['bullet'] . '   ' . $value . $this->line_end;
		}

		// Ordered lists need a number: the list's start attribute (when positive)
		// offsets the item's position inside its parent
		$start = (int) ($this->_parser ? $node->parentNode->getAttribute('start') : $node->parentNode()->getAttribute('start'));
		$start = $start > 0 ? $start - 1 : 0;
		$number = $start + $this->_get_list_position($node);

		return $loose . $number . '. ' . $value . $this->line_end;
	}
811
812
	/**
813
	 * Generally returns the innerHTML
814
	 *
815
	 * @param object $node
816
	 * @return string
817
	 */
818
	private function _convert_span($node)
	{
		// A span has no markdown equivalent, so only its inner markup is kept
		$inner = $this->_get_innerHTML($node);

		return $inner;
	}
822 1
823 1
	/**
824 1
	 * Converts tables tags to markdown extra table syntax
825
	 *
826
	 * - Have to build top down vs normal inside out due to needing col numbers and widths
827
	 *
828 1
	 * @param object $node
829
	 * @return string
830
	 */
831 1
	private function _convert_table($node)
	{
		// No header cells, no markdown table
		$table_heading = $node->getElementsByTagName('th');
		$first_th = $this->_get_item($table_heading, 0);
		if ($first_th === null)
		{
			return '';
		}

		// We only markdown well formed tables, meaning the header cells sit directly
		// inside a row.  Return an explicit empty string otherwise, so the declared
		// string return type holds instead of falling off the end with null.
		$th_parent = $this->_parser ? $first_th->parentNode->nodeName : $first_th->parentNode()->nodeName();
		if ($th_parent !== 'tr')
		{
			return '';
		}

		// Set up for a markdown table, then storm the castle
		$align = array();
		$value = array();
		$width = array();
		$max = array();

		// The number of th cells is the number of columns we are dealing with
		$th_num = $this->_get_length($table_heading);
		for ($col = 0; $col < $th_num; $col++)
		{
			// Get the align and text for each th (html5 this is no longer valid)
			$th = $this->_get_item($table_heading, $col);
			$align_value = ($th !== null) ? strtolower($th->getAttribute('align')) : false;
			$text = $this->_get_value($th);

			$align[0][$col] = $align_value === false ? 'left' : $align_value;
			$value[0][$col] = $text;
			$width[0][$col] = Util::strlen($text);

			// Seed the max col width
			$max[$col] = $width[0][$col];
		}

		// Get all of the rows; row 0 is taken to be the header row handled above
		$table_rows = $node->getElementsByTagName('tr');
		$num_rows = $this->_get_length($table_rows);
		for ($row = 1; $row < $num_rows; $row++)
		{
			$row_data = $this->_get_item($table_rows, $row)->getElementsByTagName('td');

			// Simply use the th count as the number of columns, if its not right its not markdown-able anyway
			for ($col = 0; $col < $th_num; $col++)
			{
				// Get the align and text for each td in this row, a missing cell becomes an empty one
				$td = $this->_get_item($row_data, $col);
				$align_value = ($td !== null) ? strtolower($td->getAttribute('align')) : false;
				$text = $this->_get_value($td);

				$align[$row][$col] = $align_value === false ? 'left' : $align_value;
				$value[$row][$col] = $text;
				$width[$row][$col] = Util::strlen($text);

				// Keep track of the longest col cell as we go
				if ($width[$row][$col] > $max[$col])
				{
					$max[$col] = $width[$row][$col];
				}
			}
		}

		// The separator under the header row, one run of dashes per column at its final width
		$header = array();
		for ($col = 0; $col < $th_num; $col++)
		{
			$header[] = str_repeat('-', $max[$col]);
		}

		// Done collecting data, rebuild every row with its cells aligned/padded to the column width
		$rows = array();
		for ($row = 0; $row < $num_rows; $row++)
		{
			$cells = array();
			for ($col = 0; $col < $th_num; $col++)
			{
				$cells[] = $this->_align_row_content($align[$row][$col], $width[$row][$col], $value[$row][$col], $max[$col]);
			}

			// Join it all up so we have a nice looking row
			$rows[] = '| ' . implode(' | ', $cells) . ' |';

			// Stuff in the separator after the th row
			if ($row === 0)
			{
				$rows[] = '| ' . implode(' | ', $header) . ' | ';
			}
		}

		// Adjust the word wrapping since this has a table, will get mussed by email anyway
		$this->_check_line_length($rows[1], 2);

		// Return what we did so it can be swapped in
		return implode($this->line_end, $rows);
	}
927 1
928 1
	/**
929
	 * Helper function for getting a node object
930
	 *
931
	 * @param object $node
932
	 * @param int $item
933
	 * @return object
934
	 */
935
	private function _get_item($node, $item)
	{
		// Internal parser: the collection exposes item(), as used throughout this class
		if ($this->_parser)
		{
			return $node->item($item);
		}

		// External parser: the collection is indexed like an array.  A missing index
		// yields null (the value callers already test for) rather than an undefined
		// offset error.
		return isset($node[$item]) ? $node[$item] : null;
	}
946 1
947
	/**
948
	 * Helper function for getting a node length
949
	 *
950
	 * @param object|array $node
951
	 * @return int
952
	 */
953
	private function _get_length($node)
	{
		// Internal parser collections expose ->length, external parser ones are counted
		return $this->_parser ? $node->length : count($node);
	}
964
965
	/**
966
	 * Helper function for getting a node value
967 1
	 *
968 1
	 * @param object $node
969 1
	 * @return string
970
	 */
971
	private function _get_value($node)
	{
		// A missing node has no value
		if ($node === null)
		{
			return '';
		}

		// External parser: decode the entities of the inner text ourselves
		if (!$this->_parser)
		{
			$decoded = htmlspecialchars_decode($node->innertext, ENT_QUOTES);

			return html_entity_decode($decoded, ENT_QUOTES, 'UTF-8');
		}

		// Internal parser: the DOM node value is used as is
		return $node->nodeValue;
	}
987
988
	/**
989
	 * Helper function for getting a node name
990 1
	 *
991 1
	 * @param object $node
992 1
	 * @return string
993
	 */
994
	private function _get_name($node)
	{
		// A missing node has no name
		if ($node === null)
		{
			return '';
		}

		// The name is a property on internal parser nodes and a method on external parser ones
		return $this->_parser ? $node->nodeName : $node->nodeName();
	}
1010 1
1011
	/**
	 * Helper function for creating ol's
	 *
	 * - Returns the absolute number of an <li> inside an <ol>
	 * - Only <li> children of the parent are counted, so text nodes or other
	 * stray children sitting between the items do not inflate the number
	 * - Returns 1 when the node can not be located under its parent
	 *
	 * @param object $node the list item to locate
	 * @return int the 1 based position of the item in its list
	 */
	private function _get_list_position($node)
	{
		// Get all of the nodes inside this parent
		$list_node = $this->_parser ? $node->parentNode : $node->parentNode();
		$total_nodes = $this->_parser ? $list_node->childNodes->length : count($list_node->childNodes());

		// Walk the children, counting the list items that sit in front of our node
		$preceding = 0;
		for ($i = 0; $i < $total_nodes; $i++)
		{
			$current_node = $this->_parser ? $list_node->childNodes->item($i) : $list_node->childNodes($i);

			// Found ourselves, our number is one past the items already seen
			if ($current_node === $node)
			{
				return $preceding + 1;
			}

			// Anything that is not a list item (e.g. a text node) does not get a number
			$name = $this->_parser ? $current_node->nodeName : $current_node->nodeName();
			if (strtolower((string) $name) === 'li')
			{
				$preceding++;
			}
		}

		// Never found it, treat it as the first item
		return 1;
	}
1040 1
1041
	/**
	 * Helper function for table creation
	 *
	 * - Pads a cell with spaces so it fills the column, placing the padding
	 * according to the requested alignment
	 * - 'right' pads in front, 'center' splits the padding with the odd space
	 * going to the end, 'left' and anything unrecognized pad at the end
	 *
	 * @param string $align left, right or center
	 * @param int $width the width of this cell's content
	 * @param string $content the cell content
	 * @param int $max the width of the column
	 * @return string the padded cell
	 */
	private function _align_row_content($align, $width, $content, $max)
	{
		// How many spaces this cell is short of the column width
		$padding = $max - $width;

		if ($align == 'left')
		{
			return $content . str_repeat(' ', $padding);
		}

		if ($align == 'right')
		{
			return str_repeat(' ', $padding) . $content;
		}

		if ($align == 'center')
		{
			$before = (int) floor($padding / 2);
			$after = $padding - $before;

			return str_repeat(' ', $before) . $content . str_repeat(' ', $after);
		}

		// Unrecognized alignment, pad it like a left aligned cell
		return $content . str_repeat(' ', $padding);
	}
1073 1
1074 1
	/**
	 * Gets the inner html of a node
	 *
	 * - When the parser flag is set, the node is copied into a scratch document,
	 * serialized, and its own opening / closing tag is stripped from the result
	 * - Otherwise the node's innertext property is returned
	 *
	 * @param DOMNode|object $node
	 * @return string
	 */
	private function _get_innerHTML($node)
	{
		if (!$this->_parser)
		{
			return $node->innertext;
		}

		// Copy the node into its own document so only its markup gets serialized
		$fragment = new DOMDocument();
		$fragment->preserveWhiteSpace = true;
		$fragment->appendChild($fragment->importNode($node, true));

		// Peel off the node's own tags, leaving just what was inside of them
		$wrapper = $node->nodeName;
		$pattern = '@^<' . $wrapper . '[^>]*>|</' . $wrapper . '>$@';

		return preg_replace($pattern, '', trim($fragment->saveHTML()));
	}
1097 1
1098
	/**
	 * Gets the outer html of a node
	 *
	 * - Without the parser flag set, the node's outertext property is returned
	 * - On PHP 5.3.6 and above the node is serialized straight from the working
	 * document and run through htmlspecialchars_decode
	 * - On older PHP the node is copied to a scratch document, serialized,
	 * reduced to its body text and entity decoded
	 *
	 * @param DOMNode|object $node
	 * @return string
	 */
	private function _get_outerHTML($node)
	{
		if (!$this->_parser)
		{
			return $node->outertext;
		}

		if (version_compare(PHP_VERSION, '5.3.6', '>='))
		{
			return htmlspecialchars_decode($this->doc->saveHTML($node));
		}

		// @todo remove when 5.3.6 min
		$copy = new DOMDocument();
		$copy->appendChild($copy->importNode($node, true));

		// We just want the html of the inserted node, it *may* be wrapped
		$markup = $this->_returnBodyText($copy->saveHTML());

		// Clean it up
		$markup = rtrim($markup, "\n");
		$markup = htmlspecialchars_decode($markup, ENT_QUOTES);

		return html_entity_decode($markup, ENT_QUOTES, 'UTF-8');
	}
1133
1134
	/**
	 * Escapes markup looking text in html to prevent accidental assignment
	 *
	 * - <p>*stuff*</p> should not convert to *stuff* but \*stuff\* since its not
	 * meant to be turned by markdown into <strong>stuff</strong>
	 * - Every pattern => replacement pair held in _textEscapeRegex is applied in
	 * turn, each one working on the output of the one before it
	 *
	 * @param string $value the plain text to protect
	 * @return string
	 */
	private function _escape_text($value)
	{
		foreach ($this->_textEscapeRegex as $pattern => $escaped)
		{
			$value = preg_replace($pattern, $escaped, $value);
		}

		return $value;
	}
1153
1154
	/**
	 * Works out the backtick fence needed to wrap inline code that itself
	 * contains backticks, so markdown does not read them as more code spans
	 *
	 * - e.g. <code>`bla`</code> will become `` `bla` ``
	 * - Returns an empty string when the content holds no backticks at all
	 * - Otherwise returns the shortest run of backticks that does not appear
	 * as a run inside the content
	 *
	 * @param string $value the code content to inspect
	 * @return string the fence to use, or '' when none is required
	 */
	private function _has_ticks($value)
	{
		// Collect every run of backticks found in the content
		preg_match_all('~`+~', $value, $found);
		if (empty($found[0]))
		{
			return '';
		}

		// Yup ticks in the hair, keep growing the fence until it is unlike any run
		$fence = '`';
		while (in_array($fence, $found[0]))
		{
			$fence .= '`';
		}

		return $fence;
	}
1187
1188
	/**
	 * Helper function to adjust wrapping width for long-ish links
	 *
	 * - Does nothing when body_width is 0
	 * - Otherwise body_width is raised to the length of the longest line in the
	 * supplied markdown (plus the buffer) whenever that exceeds its current value
	 *
	 * @param string $markdown the text whose lines are measured
	 * @param bool|int $buffer extra characters to add to each measured line
	 */
	private function _check_line_length($markdown, $buffer = false)
	{
		// Off we do nothing
		if ($this->body_width === 0)
		{
			return;
		}

		$padding = empty($buffer) ? 0 : (int) $buffer;

		// Some Lines can be very long and if we wrap them they break
		foreach (explode($this->line_end, $markdown) as $row)
		{
			$needed = Util::strlen($row) + $padding;
			if ($this->body_width < $needed)
			{
				$this->body_width = $needed;
			}
		}
	}
1213 1
1214
	/**
	 * Helper function to find and wrap plain text links in MD format
	 *
	 * - Text whose parent node is an a, code or pre tag is returned untouched
	 * - Text showing any sign of a backtick wrapped section is returned untouched
	 * - When at least one link is converted, the wrapping width is checked
	 * against the new text so the link is protected from wrapping
	 *
	 * @param string $text the plain text to scan
	 * @param object $node the node the text belongs to
	 * @return string
	 */
	private function _convert_plaintxt_links($text, $node)
	{
		$parent = $this->_parser ? $node->parentNode : $node->parentNode();

		// Already inside a link or a code container, nothing for us to do
		if (in_array($this->_get_name($parent), array('a', 'code', 'pre')))
		{
			return $text;
		}

		// Any evidence of a code block we skip
		if (preg_match('~`.*`~s', $text) === 1)
		{
			return $text;
		}

		// Link finding regex that will skip our markdown [link](xx) constructs
		$pattern = '/((?<!\\\\\( |]\()https?:\/\/|(?<!\\\\\( |]\(|:\/\/)www)[-\p{L}0-9+&@#\/%?=~_|!:,.;]*[\p{L}0-9+&@#\/%=~_|]/ui';
		$converted = 0;
		$text = preg_replace_callback(
			$pattern,
			function ($found) {
				return $this->_plaintxt_callback($found);
			},
			$text,
			-1,
			$converted
		);

		// If we made changes, lets protect that link from wrapping
		if ($converted > 0)
		{
			$this->_check_line_length($text);
		}

		return $text;
	}
1246 1
1247
	/**
	 * Callback function used by _convert_plaintxt_links for plain link to MD
	 *
	 * - Builds a [text](url) construct, using the 'link' entry of the global
	 * $txt array as the text and the matched url as the target
	 * - Escaped underscores in the url are restored and surrounding whitespace
	 * is trimmed off
	 *
	 * @param string[] $matches the regex match, index 0 holds the full url
	 * @return string
	 */
	private function _plaintxt_callback($matches)
	{
		global $txt;

		// Put back any underscores that were escaped so the url stays valid
		$url = trim(str_replace('\_', '_', $matches[0]));

		return '[' . $txt['link'] . '](' . $url . ')';
	}
1259
1260
	/**
	 * Breaks a string up so its no more than width characters long
	 *
	 * - Will break at word boundaries (whitespace, comma, period or end of line)
	 * - If no natural space is found will break mid-word
	 * - A width below 76 returns the string untouched
	 * - Lines that start with > (quotes) have '> ' put in front of every
	 * continuation line that does not already start with >
	 * - Emptiness is tested strictly, so a line holding only "0" is kept and
	 * not mistaken for a blank line
	 *
	 * @param string $string the text to wrap
	 * @param int $width the maximum number of characters per line
	 * @param string $break the line separator used to split and rejoin the text
	 * @return string
	 */
	private function _utf8_wordwrap($string, $width = 76, $break = "\n")
	{
		if ($width < 76)
		{
			return $string;
		}

		$lines = array();

		foreach (explode($break, $string) as $segment)
		{
			// A blank line stays a blank line
			if ($segment === '')
			{
				$lines[] = '';
				continue;
			}

			$in_quote = $segment[0] === '>';

			// Strict check, empty() would wrongly stop on (and lose) a remaining "0"
			while ($segment !== '')
			{
				// Get the next #width characters before a break (space, punctuation tab etc)
				if (preg_match('~^(.{1,' . $width . '})(?:\s|$|,|\.)~u', $segment, $matches))
				{
					// Add the #width to the output and set up for the next pass
					$lines[] = ($in_quote && $matches[1][0] !== '>' ? '> ' : '') . $matches[1];
					$segment = (string) Util::substr($segment, Util::strlen($matches[1]));
				}
				// Humm just a long word with no place to break, so we simply cut it after width characters
				else
				{
					$lines[] = ($in_quote && $segment[0] !== '>' ? '> ' : '') . Util::substr($segment, 0, $width);
					$segment = (string) Util::substr($segment, $width);
				}
			}
		}

		// Join it all the shortened sections up on our break characters
		return implode($break, $lines);
	}
1309
1310
	/**
	 * Gets the length of html in front of a given node and its parent.
	 *
	 * - Used to add needed buffer to adjust length wrapping
	 * - Returns 0 when the parent is the body tag, or when the node's html can
	 * not be located inside the parent's html
	 *
	 * @param object $node
	 * @return int the number of characters that sit in front of the node
	 */
	private function get_buffer($node)
	{
		$parent = $this->_parser ? $node->parentNode : $node->parentNode();

		// Nodes sitting directly in the body have nothing in front to account for
		if ($this->_get_name($parent) === 'body')
		{
			return 0;
		}

		$cut = (string) $this->_get_outerHTML($node);
		$string = (string) $this->_get_innerHTML($parent);

		// Find where this node starts inside of its parent, an empty needle can't be searched for
		$position = $cut === '' ? false : strpos($string, $cut);
		if ($position === false || $position === 0)
		{
			return 0;
		}

		// Measure what is in front of the node in characters, not bytes
		return Util::strlen(substr($string, 0, $position));
	}
1332
}
1333