Completed
Push — patch_1-0-10 ( 7a6c4f...f03c6d )
by Emanuele
17:19 queued 17:09
created

Html_2_Md   D

Complexity

Total Complexity 179

Size/Duplication

Total Lines 1118
Duplicated Lines 0 %

Coupling/Cohesion

Components 1
Dependencies 1

Importance

Changes 0
Metric Value
dl 0
loc 1118
rs 4.4102
c 0
b 0
f 0
wmc 179
lcom 1
cbo 1

32 Methods

Rating   Name   Duplication   Size   Complexity  
B __construct() 0 34 1
A _set_parser() 0 23 2
B get_markdown() 0 24 4
B _getBody() 0 28 4
A _clipHead() 0 15 4
A _setupDOMDocument() 0 13 1
B _clean_markdown() 0 28 2
A _returnBodyText() 0 9 3
B _has_parent_code() 0 19 7
B _has_parent_list() 0 19 6
B _convert_childNodes() 0 19 6
D _convert_to_markdown() 0 124 43
A _convert_abbr() 0 12 2
C _convert_anchor() 0 26 7
A _convert_blockquote() 0 19 2
C _convert_code() 0 62 13
A _convert_header() 0 15 3
A _convert_image() 0 16 2
A _convert_list() 0 20 4
F _convert_table() 0 88 18
A _get_item() 0 7 2
A _get_length() 0 7 2
A _get_value() 0 10 3
A _get_name() 0 10 3
B _get_list_position() 0 18 6
A _align_row_content() 0 21 4
A _get_innerHTML() 0 14 2
B _get_outerHTML() 0 24 3
A _escape_text() 0 8 2
B _has_ticks() 0 29 6
A _check_link_lenght() 0 9 3
D _utf8_wordwrap() 0 29 9

How to fix   Complexity   

Complex Class

Complex classes like Html_2_Md often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes. You can also have a look at the cohesion graph to spot any un-connected, or weakly-connected components.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use Html_2_Md, and based on these observations, apply Extract Interface, too.

1
<?php
2
3
/**
4
 * Converts HTML to Markdown text
5
 *
6
 * @name      ElkArte Forum
7
 * @copyright ElkArte Forum contributors
8
 * @license   BSD http://opensource.org/licenses/BSD-3-Clause
9
 *
10
 * @version 1.0.10
11
 *
12
 */
13
14
// Refuse to run when the ELK constant has not been defined
if (!defined('ELK'))
{
	die('No access...');
}
16
17
/**
18
 * Converts HTML to Markdown text
19
 */
20
class Html_2_Md
21
{
22
	/**
23
	 * The value that will hold our dom object
24
	 * @var object
25
	 */
26
	public $doc;
27
28
	/**
29
	 * The value that will hold if we are using the internal or external parser
30
	 * @var boolean
31
	 */
32
	private $_parser;
33
34
	/**
35
	 * Line end character
36
	 * @var string
37
	 */
38
	public $line_end = "\n";
39
40
	/**
41
	 * Line break character
42
	 * @var string
43
	 */
44
	public $line_break = "\n\n";
45
46
	/**
47
	 * Wordwrap output, set to 0 to skip wrapping
48
	 * @var int
49
	 */
50
	public $body_width = 76;
51
52
	/**
53
	 * Strip remaining tags, set to false to leave them in
54
	 * @var boolean
55
	 */
56
	public $strip_tags = true;
57
58
	/**
59
	 * Regex to run on plain text to prevent markdown from erroneously converting
60
	 * @var array<string, string> regex pattern => replacement string
61
	 */
62
	private $_textEscapeRegex = array();
63
64
	/**
65
	 * The passed html string to convert
66
	 * @var string
67
	 */
68
	public $html;
69
70
	/**
71
	 * The markdown equivalent to the html string
72
	 * @var string
73
	 */
74
	public $markdown;
75
76
	/**
77
	 * Gets everything started using the built in or external parser
78
	 *
79
	 * @param string $html string of html to convert to MD text
80
	 */
81
	public function __construct($html)
	{
		// Drop any run of whitespace that sits directly between two tags
		$between_tags = '/(?:(?<=\>)|(?<=\/\>))(\s+)(?=\<\/?)/';
		$this->html = preg_replace($between_tags, '', $html);

		// Mask the character pairs the XML parser will not deal gracefully with;
		// _clean_markdown() maps the same placeholders back once conversion is done
		$masked = array(
			'?<' => '|?|&lt',
			'?>' => '|?|&gt',
			'>?' => '&gt|?|',
			'<?' => '&lt|?|'
		);
		$this->html = strtr($this->html, $masked);

		// Choose the DOM parser and load the prepared HTML into it
		$this->_set_parser();

		// Escape rules applied to plain text so it is not read back as markdown syntax.
		// NOTE: this is a string keyed map (regex pattern => replacement), not a list;
		// each key is wrapped in ~ delimiters when it is applied.
		$escapes = array(
			// Things that may convert to an hr --- or - - - etc
			'([-*_])([ ]{0,2}\1){2,}' => '\\\\$0|',
			// or **stuff** => \*\*stuff\*\*
			'\*\*([^*\s]+)\*\*' => '\*\*$1\*\*',
			// or versions of *italic* __italic__ _italic_
			'\*([^*\s]+)\*' => '\*$1\*',
			'__(?! |_)(.+)(?!<_| )__' => '\_\_$1\_\_',
			'_(?! |_)(.+)(?!<_| )_' => '\_$1\_',
			// nor `code`
			'`(.+)`' => '\`$1\`',
			// or links
			'\[(.+)\](\s*\()' => '\[$1\]$2',
			'\[(.+)\](\s*)\[(.*)\]' => '\[$1\]$2\[$3\]',
		);
		$this->_textEscapeRegex = $escapes;
	}
115
116
	/**
117
	 * Set the DOM parser for class, loads the supplied HTML
118
	 */
119
	private function _set_parser()
	{
		// true => PHP's DOMDocument, false => the bundled simple html parser
		$this->_parser = class_exists('DOMDocument');

		if (!$this->_parser)
		{
			require_once(EXTDIR . '/simple_html_dom.php');
			$this->doc = str_get_html($this->html, true, true, 'UTF-8', false);

			return;
		}

		// Keep libxml quiet while the markup is loaded ...
		$previous = libxml_use_internal_errors(true);
		$this->_setupDOMDocument();

		// ... then restore the previous error setting and flush what was collected
		libxml_use_internal_errors($previous);
		libxml_clear_errors();
	}
142
143
	/**
144
	 * Loads the html body and sends it to the parsing loop to convert all
145
	 * DOM nodes to markup
146
	 */
147
	public function get_markdown()
	{
		// Walk the body, swapping every node for its markdown equivalent
		$this->_convert_childNodes($this->_getBody());

		// Serialise the converted tree back to a string
		if ($this->_parser)
		{
			// The internal DOM methods hand back encoded entities, so decode them
			$serialised = $this->doc->saveHTML();
			$this->markdown = html_entity_decode(htmlspecialchars_decode($serialised, ENT_QUOTES), ENT_QUOTES, 'UTF-8');
		}
		else
		{
			$this->markdown = $this->doc->save();
		}

		// Tidy spacing, wrappers and leftover tags
		$this->_clean_markdown();

		// Only wrap when a width has been set
		if (!empty($this->body_width))
		{
			$this->markdown = $this->_utf8_wordwrap($this->markdown, $this->body_width, $this->line_end);
		}

		return $this->markdown;
	}
171
172
	/**
173
	 * Returns just the body of the HTML, as best possible, so we are not dealing with head
174
	 * and above head markup
175
	 *
176
	 * @return object
177
	 */
178
	private function _getBody()
	{
		// The head is of no use to the conversion, remove it first
		$this->_clipHead();

		// DOMDocument: simply the first body element
		if ($this->_parser)
		{
			return $this->doc->getElementsByTagName('body')->item(0);
		}

		// External parser: prefer body, then html, otherwise the document root
		foreach (array('body', 'html') as $wrapper)
		{
			$found = $this->doc->find($wrapper, 0);
			if ($found !== null)
			{
				return $found;
			}
		}

		return $this->doc->root;
	}
206
207
	/**
208
	 * Remove any <head> node from the DOM
209
	 */
210
	private function _clipHead()
	{
		// Internal parser: detach the first <head> element from its parent, if there is one
		if ($this->_parser)
		{
			$head = $this->doc->getElementsByTagName('head')->item(0);
			if ($head !== null)
			{
				$head->parentNode->removeChild($head);
			}

			return;
		}

		// External parser: look the node up first so a document without a <head>
		// does not end up reading ->outertext on null
		$head = $this->doc->find('head', 0);
		if ($head !== null)
		{
			$head->outertext = '';
		}
	}
225
226
	/**
227
	 * Sets up processing parameters for DOMDocument to ensure that text is processed as UTF-8
228
	 */
229
	private function _setupDOMDocument()
	{
		// Work with the inner content only, we add our own wrapper below
		$this->html = $this->_returnBodyText($this->html);

		// Fresh document with our processing preferences
		$dom = new DOMDocument();
		$this->doc = $dom;
		$dom->preserveWhiteSpace = false;
		$dom->encoding = 'UTF-8';

		// Wrap the content with an encoding declaration and a charset meta tag so
		// it is processed as UTF-8
		$opening = '<?xml encoding="UTF-8"><html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/></head><body>';
		$closing = '</body></html>';
		$dom->loadHTML($opening . $this->html . $closing);
	}
242
243
	/**
244
	 * Normalize any spacing and excess blank lines that may have been generated
245
	 */
246
	private function _clean_markdown()
	{
		// Content only, no html/body wrappers
		$text = $this->_returnBodyText($this->markdown);

		// Non breakable spaces become plain spaces (nbsp + space collapses to one space)
		$text = str_replace(array("\xC2\xA0\x20", "\xC2\xA0"), ' ', $text);

		// Drop whatever tags are still around, unless told to keep them
		if ($this->strip_tags)
		{
			$text = strip_tags($text);
		}

		// Put back the sequences the constructor masked from the XML parser
		$unmask = array(
			'|?|&gt' => '?>',
			'|?|&lt' => '?<',
			'&lt|?|' => '<?',
			'&gt|?|' => '>?'
		);
		$text = trim(strtr($text, $unmask));

		// Squeeze runs of three or more blank (or two-space only) lines, applied in this order
		$squeeze = array(
			"~(\n(\s)?){3,}~" => "\n\n",
			"~(^\s\s\n){3,}~m" => "  \n  \n",
			"~(^\s\s\r?\n){3,}~m" => "  \n  \n",
			"~(^\s\s(?:\r?\n){2}){3,}~m" => "  \n  \n",
		);
		foreach ($squeeze as $pattern => $replacement)
		{
			$text = preg_replace($pattern, $replacement, $text);
		}

		$this->markdown = $text;
	}
274
275
	/**
276
	 * Looks for the text inside of <body> and then <html>, returning just the inner content
277
	 *
278
	 * @param string $text
279
	 *
280
	 * @return string
281
	 */
282
	private function _returnBodyText($text)
	{
		// Try the <body> wrapper first, then <html>; attribute-less tags only
		foreach (array('body', 'html') as $wrapper)
		{
			if (preg_match('~<' . $wrapper . '>(.*)</' . $wrapper . '>~su', $text, $inner))
			{
				return $inner[1];
			}
		}

		// No wrapper found, the text is returned untouched
		return $text;
	}
291
292
	/**
293
	 * For a given node, checks if it is anywhere nested inside of a code block
294
	 *  - Prevents converting anything that's inside a code block
295
	 *
296
	 * @param object $node
297
	 * @param boolean $parser flag for internal or external parser
298
	 *
299
	 * @return boolean
300
	 */
301
	private static function _has_parent_code($node, $parser)
	{
		// Climb the ancestor chain until it runs out or a code element is met.
		// $parser selects property access (DOMDocument) or method access (external parser).
		for ($ancestor = $parser ? $node->parentNode : $node->parentNode(); $ancestor; $ancestor = $parser ? $ancestor->parentNode : $ancestor->parentNode())
		{
			$name = $parser ? $ancestor->nodeName : $ancestor->nodeName();

			// Anywhere nested inside a code block we don't render tags
			if ($name === 'code')
			{
				return true;
			}
		}

		return false;
	}
320
321
	/**
322
	 * Get the nesting level when inside a list
323
	 *
324
	 * @param object $node
325
	 * @param boolean $parser flag for internal or external parser
326
	 *
327
	 * @return int
328
	 */
329
	private static function _has_parent_list($node, $parser)
	{
		$levels = 0;
		$ancestor = $parser ? $node->parentNode : $node->parentNode();

		// Every ul / ol ancestor on the way up adds one level of nesting
		while ($ancestor)
		{
			$name = $parser ? $ancestor->nodeName : $ancestor->nodeName();
			if ($name === 'ul' || $name === 'ol')
			{
				$levels++;
			}

			$ancestor = $parser ? $ancestor->parentNode : $ancestor->parentNode();
		}

		return $levels;
	}
348
349
	/**
350
	 * Traverse each node to its base, then convert tags to markup on the way back out
351
	 *
352
	 * @param object $node
353
	 */
354
	private function _convert_childNodes($node)
	{
		// Nothing below a code block is converted
		if (self::_has_parent_code($node, $this->_parser))
		{
			return;
		}

		// Depth first: handle every child before the node itself
		if ($node->hasChildNodes())
		{
			// The child count is taken once, before any child is converted
			$count = $this->_parser ? $node->childNodes->length : count($node->childNodes());
			$index = 0;
			while ($index < $count)
			{
				$this->_convert_childNodes($this->_parser ? $node->childNodes->item($index) : $node->childNodes($index));
				$index++;
			}
		}

		// Children are done, now convert this node
		$this->_convert_to_markdown($node);
	}
373
374
	/**
375
	 * Convert the supplied node into its markdown equivalent
376
	 *  - Supports *some* markdown extra tags, namely: table, abbr & dl in a limited fashion
377
	 *
378
	 * @param object $node
379
	 */
380
	private function _convert_to_markdown($node)
	{
		// Marker meaning "leave this node exactly where it is"
		$skip = '~`skip`~';

		// Which tag is this?
		$name = $this->_get_name($node);

		switch ($name)
		{
			// Inline emphasis
			case 'strong':
			case 'b':
				$md = '**' . $this->_get_value($node) . '**';
				break;
			case 'i':
			case 'em':
				$md = '_' . $this->_get_value($node) . '_';
				break;

			// Tags with a dedicated converter
			case 'a':
				$md = $this->line_end . $this->_convert_anchor($node);
				break;
			case 'abbr':
				$md = $this->_convert_abbr($node);
				break;
			case 'blockquote':
				$md = $this->_convert_blockquote($node);
				break;
			case 'code':
				$md = $this->_convert_code($node);
				break;
			case 'img':
				$md = $this->_convert_image($node);
				break;
			case 'li':
				$md = $this->_convert_list($node);
				break;
			case 'table':
				$md = $this->_convert_table($node) . $this->line_break;
				break;
			case 'h1':
			case 'h2':
			case 'h3':
			case 'h4':
			case 'h5':
			case 'h6':
				$md = $this->_convert_header($name, $this->_get_value($node));
				break;

			// Breaks and rules
			case 'br':
				// DomDocument strips empty lines, the two nbsp prevent that
				$md = "\xC2\xA0\xC2\xA0" . $this->line_break;
				break;
			case 'hr':
				$md = $this->line_end . '---' . $this->line_end;
				break;

			// Definition lists
			case 'dt':
				$md = str_replace(array("\n", "\r", "\n\r"), '', $this->_get_value($node)) . $this->line_end;
				break;
			case 'dd':
				$md = ':   ' . $this->_get_value($node) . $this->line_break;
				break;
			case 'dl':
				$md = trim($this->_get_value($node)) . $this->line_break;
				break;

			// Block level containers
			case 'ul':
			case 'ol':
				$md = rtrim($this->_get_value($node)) . $this->line_break;
				break;
			case 'p':
				if ($node->hasChildNodes())
				{
					$md = rtrim($this->_get_value($node)) . $this->line_break;
				}
				else
				{
					// Childless paragraph: flatten newlines, then escape markdown look-alikes
					$md = $this->_escape_text(str_replace("\n", ' ', $this->_get_value($node)) . $this->line_break);
				}
				break;
			case 'pre':
				$md = $this->_get_value($node) . $this->line_break;
				break;
			case 'center':
				$md = $this->line_end . $this->_get_value($node) . $this->line_end;
				break;
			case 'div':
				$md = $this->line_end . $this->_get_value($node) . $this->line_end;
				if (!$node->hasChildNodes())
				{
					$md = $this->_escape_text($md);
				}
				break;
			case 'title':
				$md = '# ' . $this->_get_value($node) . $this->line_break;
				break;

			// Table internals are handled by the table tag itself
			case 'thead':
			case 'tbody':
			case 'tfoot':
			case 'tr':
			case 'th':
			case 'td':
				$md = $skip;
				break;

			// Wrappers: keep what is inside, drop the tag
			case 'body':
			case 'root':
			case 'span':
				$md = $this->_get_innerHTML($node);
				break;

			// Anything without a case above is preserved as it is
			default:
				$md = $this->_get_outerHTML($node);
		}

		if ($md === $skip)
		{
			return;
		}

		// External parser: overwrite the node's outer text
		if (!$this->_parser)
		{
			$node->outertext = $md;

			return;
		}

		// DOMDocument: swap the node for a text node holding the markdown
		$node->parentNode->replaceChild($this->doc->createTextNode($md), $node);
	}
504
505
	/**
506
	 * Converts <abbr> tags to markdown (extra)
507
	 *
508
	 * html: <abbr title="Hyper Text Markup Language">HTML</abbr>
509
	 * md:   *[HTML]: Hyper Text Markup Language
510
	 *
511
	 * @param object $node
512
	 * @return string
513
	 */
514
	private function _convert_abbr($node)
	{
		$title = $node->getAttribute('title');
		$value = $this->_get_value($node);

		// Without a title there is nothing to define, so nothing is output
		return empty($title) ? '' : '*[' . $value . ']: ' . $title . $this->line_break;
	}
526
527
	/**
528
	 * Converts <a> tags to markdown
529
	 *
530
	 * html: <a href='http://somesite.com' title='Title'>Awesome Site</a>
531
	 * md: [Awesome Site](http://somesite.com 'Title')
532
	 *
533
	 * @param object $node
534
	 * @return string
535
	 */
536
	private function _convert_anchor($node)
	{
		global $txt;

		$raw_href = $node->getAttribute('href');
		$href = htmlentities($raw_href, ENT_COMPAT, 'UTF-8', false);
		$title = $node->getAttribute('title');
		$class = $node->getAttribute('class');
		$value = $this->_get_value($node);

		// Provide a more compact [name] when the link text is missing or is just the url.
		// Strict checks on purpose: empty() would throw away a link whose text is "0", and
		// a loose == compares numeric looking strings as numbers.
		if ($value === null || $value === '' || $value === $raw_href)
			$value = empty($title) ? $txt['link'] : $title;

		// Special processing just for our own footnotes, only the text is kept
		if ($class === 'target' || $class === 'footnote_return')
			$markdown = $value;
		elseif (!empty($title))
			$markdown = '[' . $value . '](' . $href . ' "' . $title . '")';
		else
			$markdown = '[' . $value . '](' . $href . ')';

		// Some links can be very long and if we wrap them they break
		$this->_check_link_lenght($markdown);

		return $markdown;
	}
562
563
	/**
564
	 * Converts blockquotes to markdown > quote style
565
	 *
566
	 * html: <blockquote>quote</blockquote>
567
	 * md: > quote
568
	 *
569
	 * @param object $node
570
	 * @return string
571
	 */
572
	private function _convert_blockquote($node)
	{
		// The trimmed text of the quote, split on any style of line ending
		$body = trim($this->_get_value($node));

		// Prefix each line with '> ' after dropping its leading tabs
		$quoted = array();
		foreach (preg_split('~\r\n|\r|\n~', $body) as $line)
		{
			$quoted[] = '> ' . ltrim($line, "\t");
		}

		// Every quoted line is terminated, then one extra line end closes the quote
		return implode($this->line_end, $quoted) . $this->line_end . $this->line_end;
	}
591
592
	/**
593
	 * Converts code tags to markdown span `code` or block code
594
	 * Converts single line code to inline tick mark
595
	 * Converts multi line to 4 space indented code
596
	 *
597
	 * html: <code>code</code>
598
	 * md: `code`
599
	 *
600
	 * @param object $node
601
	 * @return string
602
	 */
603
	private function _convert_code($node)
	{
		$value = $this->_get_innerHTML($node);

		// If we have a multi line code block, we are working outside to in, and need to convert the br's ourselves
		$value = preg_replace('~<br( /)?' . '>~', "\n", str_replace('&nbsp;', ' ', $value));

		// If there are html tags in this code block, we need to disable strip tags
		// This is NOT the ideal way to handle this: the flag stays off for the rest of the conversion
		if ($this->strip_tags && preg_match('~<[^<]+>~', $value))
			$this->strip_tags = false;

		// Get the number of lines of code that we have
		$lines = preg_split('~\r\n|\r|\n~', $value);
		$total = count($lines);

		// If there's more than one line of code, use leading four space syntax
		if ($total > 1)
		{
			// Decide on both ends before touching the array, shifting renumbers the keys.
			// Compared against '' rather than empty(), which would also discard a line holding just "0"
			$blank_first = trim($lines[0]) === '';
			$blank_last = trim($lines[$total - 1]) === '';

			// Remove one leading and one trailing blank line
			if ($blank_first)
				array_shift($lines);
			if ($blank_last)
				array_pop($lines);

			// Convert what remains
			$markdown = '';
			foreach ($lines as $line)
			{
				// Adjust the word wrapping since this has code tags, leave it up to
				// the email client to mess these up ;)
				$this->_check_link_lenght($markdown, 5);

				$markdown .= str_repeat(' ', 4) . $line . $this->line_end;
			}

			// The parser will encode, but we don't want that for our code block
			if ($this->_parser)
				$markdown = html_entity_decode($markdown, ENT_QUOTES, 'UTF-8');

			return $markdown;
		}

		// Single line, back tick and move on
		$code = $lines[0];

		// Account for backticks in the single line code itself
		$ticks = $this->_has_ticks($node, $value);
		if (!empty($ticks))
		{
			// If the ticks were at the start/end of the word space it off.
			// substr() keeps this safe when the line is empty, unlike a [0] string offset
			if (substr($code, 0, 1) === '`' || substr($code, -1) === '`')
				$code = ' ' . $code . ' ';
		}
		else
			$ticks = '`';

		return $ticks . ($this->_parser ? html_entity_decode($code, ENT_QUOTES, 'UTF-8') : $code) . $ticks;
	}
665
666
	/**
667
	 * Converts <h1> and <h2> headers to markdown-style headers in setext style,
668
	 * all other headers are returned as atx style ### h3
669
	 *
670
	 * html: <h1>header</h1>
671
	 * md: header
672
	 *     ======
673
	 *
674
	 * html: <h3>header</h3>
675
	 * md: ### header
676
	 *
677
	 * @param string $level the header tag name, h1 to h6
678
	 * @param string $content
679
	 * @return string
680
	 */
681
	private function _convert_header($level, $content)
	{
		// $level arrives as the tag name, strip the leading h to get the number
		$depth = (int) ltrim($level, 'h');

		// Level 3 and deeper: prefix with that many # characters
		if ($depth >= 3)
		{
			return str_repeat('#', $depth) . ' ' . $content . $this->line_break;
		}

		// Levels below 3: underline the text with = (level 1) or - (otherwise), one per character
		$rule = str_repeat(($depth === 1) ? '=' : '-', Util::strlen($content));

		return $content . $this->line_end . $rule . $this->line_break;
	}
696
697
	/**
698
	 * Converts <img> tags to markdown
699
	 *
700
	 * html: <img src='source' alt='alt' title='title' />
701
	 * md: ![alt](source 'title')
702
	 *
703
	 * @param object $node
704
	 * @return string
705
	 */
706
	private function _convert_image($node)
	{
		$src = $node->getAttribute('src');
		$alt = $node->getAttribute('alt');
		$title = $node->getAttribute('title');

		// The title, when there is one, follows the source inside double quotes
		$target = empty($title) ? $src : $src . ' "' . $title . '"';
		$markdown = '![' . $alt . '](' . $target . ')';

		// Adjust width if needed to maintain the image
		$this->_check_link_lenght($markdown);

		return $markdown;
	}
722
723
	/**
724
	 * Converts ordered <ol> and unordered <ul> lists to markdown syntax
725
	 *
726
	 * html: <ul><li>one</li></ul>
727
	 * md: * one
728
	 *
729
	 * @param object $node
730
	 * @return string
731
	 */
732
	private function _convert_list($node)
	{
		$list_type = $this->_parser ? $node->parentNode->nodeName : $node->parentNode()->nodeName();
		$value = $this->_get_value($node);

		// An item whose text already ends in whitespace gets no extra line end
		$loose = rtrim($value) !== $value;

		// One tab of indent per level of nesting below the outermost list
		$depth = max(0, $this->_has_parent_list($node, $this->_parser) - 1);

		// Bullet for ul, the item's position number for everything else
		$marker = ($list_type === 'ul') ? '* ' : $this->_get_list_position($node) . '. ';
		$markdown = str_repeat("\t", $depth) . $marker . $value;

		return $loose ? $markdown : $markdown . $this->line_end;
	}
752
753
	/**
754
	 * Converts tables tags to markdown extra table syntax
755
	 *
756
	 * - Have to build top down vs normal inside out due to needing col numbers and widths
757
	 *
758
	 * @param object $node
759
	 * @return string
760
	 */
761
	private function _convert_table($node)
	{
		// No th cells at all: nothing to build a markdown table from
		$table_heading = $node->getElementsByTagName('th');
		$first_th = $this->_get_item($table_heading, 0);
		if ($first_th === null)
			return '';

		// We only markdown well formed tables, the heading cells must sit directly in a tr.
		// Return an empty string (not an implicit null) so the result is always a string
		$th_parent = $this->_parser ? $first_th->parentNode->nodeName : $first_th->parentNode()->nodeName();
		if ($th_parent !== 'tr')
			return '';

		// Set up for a markdown table, then storm the castle
		$align = array();
		$value = array();
		$width = array();
		$max = array();
		$header = array();
		$rows = array();

		// Find out how many columns we are dealing with
		$th_num = $this->_get_length($table_heading);

		for ($col = 0; $col < $th_num; $col++)
		{
			// Get the align and text for each th (html5 this is no longer valid)
			$th = $this->_get_item($table_heading, $col);
			$align[0][$col] = ($th !== null) ? strtolower($th->getAttribute('align')) : 'left';
			$value[0][$col] = $this->_get_value($th);
			$width[0][$col] = Util::strlen($value[0][$col]);

			// Seed the max col width
			$max[$col] = $width[0][$col];
		}

		// Get all of the rows, row 0 is taken to be the heading row
		$table_rows = $node->getElementsByTagName('tr');
		$num_rows = $this->_get_length($table_rows);
		for ($row = 1; $row < $num_rows; $row++)
		{
			// All of the td's in this row
			$row_data = $this->_get_item($table_rows, $row)->getElementsByTagName('td');

			// Simply use the th count as the number of columns, if its not right its not markdown-able anyway
			for ($col = 0; $col < $th_num; $col++)
			{
				// A missing td yields an empty, left aligned cell
				$td = $this->_get_item($row_data, $col);
				$align[$row][$col] = ($td !== null) ? strtolower($td->getAttribute('align')) : 'left';
				$value[$row][$col] = $this->_get_value($td);
				$width[$row][$col] = Util::strlen($value[$row][$col]);

				// Keep track of the longest col cell as we go
				if ($width[$row][$col] > $max[$col])
					$max[$col] = $width[$row][$col];
			}
		}

		// Done collecting data, now rebuild it row by row with every column padded to its max width
		for ($row = 0; $row < $num_rows; $row++)
		{
			$temp = array();
			for ($col = 0; $col < $th_num; $col++)
			{
				// Build the dashed separator once, alongside the heading row
				if ($row === 0)
					$header[] = str_repeat('-', $max[$col]);

				// Build the data for each col, align/pad as needed
				$temp[] = $this->_align_row_content($align[$row][$col], $width[$row][$col], $value[$row][$col], $max[$col]);
			}

			// Join it all up so we have a nice looking row
			$rows[] = '| ' . implode(' | ', $temp) . ' |';

			// Stuff in the separator after the th row
			if ($row === 0)
				$rows[] = '| ' . implode(' | ', $header) . ' | ';
		}

		// Adjust the word wrapping since this has a table, will get mussed by email anyway
		$this->_check_link_lenght($rows[1], 2);

		// Return what we did so it can be swapped in
		return implode($this->line_end, $rows);
	}
849
850
	/**
851
	 * Helper function for getting a node object
852
	 *
853
	 * @param object $node
854
	 * @param int $item
855
	 */
856
	private function _get_item($node, $item)
	{
		// DOMDocument collections are read through item()
		if ($this->_parser)
			return $node->item($item);

		// The external parser hands back a plain array: answer null for a missing index
		// rather than reading an undefined offset, callers test the result against null
		return isset($node[$item]) ? $node[$item] : null;
	}
863
864
	/**
865
	 * Helper function for getting a node length
866
	 *
867
	 * @param object|array $node
868
	 * @return int
869
	 */
870
	private function _get_length($node)
	{
		// A DOMDocument collection exposes ->length, the external parser result is counted
		return $this->_parser ? $node->length : count($node);
	}
877
878
	/**
879
	 * Helper function for getting a node value
880
	 *
881
	 * @param object $node
882
	 * @return string
883
	 */
884
	private function _get_value($node)
	{
		// No node, no value
		if ($node === null)
		{
			return '';
		}

		// The external parser returns encoded inner text, so decode it ourselves
		if (!$this->_parser)
		{
			return html_entity_decode(htmlspecialchars_decode($node->innertext, ENT_QUOTES), ENT_QUOTES, 'UTF-8');
		}

		return $node->nodeValue;
	}
894
895
	/**
896
	 * Helper function for getting a node name
897
	 *
898
	 * @param object $node
899
	 * @return string
900
	 */
901
	private function _get_name($node)
	{
		// No node, no name
		if ($node === null)
		{
			return '';
		}

		// Property on a DOMDocument node, method on an external parser node
		return $this->_parser ? $node->nodeName : $node->nodeName();
	}
911
912
	/**
913
	 * Helper function for creating ol's
914
	 *
915
	 * - Returns the absolute number of an <li> inside an <ol>
916
	 *
917
	 * @param object $node
918
	 * @return int
919
	 */
920
	private function _get_list_position($node)
	{
		// The list holding this item, and how many child nodes it has
		$list = $this->_parser ? $node->parentNode : $node->parentNode();
		$count = $this->_parser ? $list->childNodes->length : count($list->childNodes());

		// Defaults to 1 should the node not be found among the children
		$found_at = 1;

		// Check every child of the list, counting from one
		$index = 0;
		while ($index < $count)
		{
			$candidate = $this->_parser ? $list->childNodes->item($index) : $list->childNodes($index);
			$index++;

			if ($candidate === $node)
			{
				$found_at = $index;
			}
		}

		return $found_at;
	}
938
939
	/**
940
	 * Helper function for table creation
941
	 *
942
	 * - Builds td's to a given width, aligned as needed
943
	 *
944
	 * @param string $align
945
	 * @param int $width
946
	 * @param string $content
947
	 * @param int $max
948
	 * @return string
949
	 */
950
	private function _align_row_content($align, $width, $content, $max)
	{
		// How many spaces are missing to reach the column width
		$padding = $max - $width;

		// Right aligned: all of the padding goes in front
		if ($align === 'right')
		{
			return str_repeat(' ', $padding) . $content;
		}

		// Centered: split the padding, the odd space goes to the right
		if ($align === 'center')
		{
			$left = floor($padding / 2);
			$right = $padding - $left;

			return str_repeat(' ', $left) . $content . str_repeat(' ', $right);
		}

		// Left, and any other value: all of the padding goes behind
		return $content . str_repeat(' ', $padding);
	}
971
972
	/**
	 * Gets the inner html of a node
	 *
	 * - With the DOM parser the node is copied to a scratch document, serialized,
	 * and its own opening and closing tags are removed from the result
	 * - Otherwise the node's innertext property is returned
	 *
	 * @param DOMNode|object $node
	 * @return string
	 */
	private function _get_innerHTML($node)
	{
		// The non DOM parser exposes the inner markup directly
		if (!$this->_parser)
		{
			return $node->innertext;
		}

		// Serialize a deep copy of the node on its own
		$scratch = new DOMDocument();
		$copy = $scratch->importNode($node, true);
		$scratch->appendChild($copy);
		$markup = trim($scratch->saveHTML());

		// Remove the leading opening tag and the trailing closing tag of the node itself
		$name = $node->nodeName;
		$pattern = '@^<' . $name . '[^>]*>|</' . $name . '>$@';

		return preg_replace($pattern, '', $markup);
	}
992
993
	/**
	 * Gets the outer html of a node
	 *
	 * - Without the DOM parser the node's outertext property is returned
	 * - With the DOM parser the node is serialized by the working document, or on
	 * PHP older than 5.3.6 through a scratch document whose body text is extracted
	 *
	 * @param DOMNode|object $node
	 * @return string
	 */
	private function _get_outerHTML($node)
	{
		if (!$this->_parser)
		{
			return $node->outertext;
		}

		// saveHTML() can serialize a single node as of PHP 5.3.6
		if (version_compare(PHP_VERSION, '5.3.6', '>='))
		{
			return htmlspecialchars_decode($this->doc->saveHTML($node));
		}

		// @todo remove when 5.3.6 min
		$scratch = new DOMDocument();
		$scratch->appendChild($scratch->importNode($node, true));

		// We just want the html of the inserted node, it *may* be wrapped
		$markup = $this->_returnBodyText($scratch->saveHTML());

		// Drop trailing newlines and decode any entities
		$markup = rtrim($markup, "\n");

		return html_entity_decode(htmlspecialchars_decode($markup, ENT_QUOTES), ENT_QUOTES, 'UTF-8');
	}
1023
1024
	/**
	 * Escapes markup looking text in html to prevent accidental assignment
	 *
	 * <p>*stuff*</p> should not convert to *stuff* but \*stuff\* since its not to
	 * be converted by md to html as <strong>stuff</strong>
	 *
	 * - Applies every pattern => replacement pair of $this->_textEscapeRegex in turn,
	 * each one working on the output of the one before it
	 *
	 * @param string $value
	 * @return string
	 */
	private function _escape_text($value)
	{
		$escaped = $value;

		// Each pattern is stored without delimiters, wrap it and run it
		foreach ($this->_textEscapeRegex as $pattern => $substitute)
		{
			$escaped = preg_replace('~' . $pattern . '~', $substitute, $escaped);
		}

		return $escaped;
	}
1041
1042
	/**
	 * If inline code contains backticks ` as part of its content, we need to wrap them so
	 * when markdown is run we don't interpret the ` as additional code blocks
	 *
	 * - Returns the backtick fence to wrap the code in, which is the shortest run of
	 * backticks that does not appear in the code, e.g. <code>`bla`</code> needs ``
	 * - Returns an empty string when the code holds no backticks
	 * - When the parent of the node is a pre, $value is handed back unchanged
	 *
	 * @param object $node
	 * @param string $value
	 * @return string
	 */
	private function _has_ticks($node, $value)
	{
		$parent_tag = $this->_parser ? $node->parentNode->nodeName : $node->parentNode()->nodeName();

		// Inside of a pre, we don't do anything
		if ($parent_tag === 'pre')
		{
			return $value;
		}

		// Find every run of backticks in the code
		preg_match_all('~`+~', $value, $runs);
		if (empty($runs[0]))
		{
			return '';
		}

		// Keyed by the run itself so each length in use is a simple lookup
		$used = array_flip($runs[0]);

		// Lengthen the fence until no run in the code is the same length
		$fence = '`';
		while (isset($used[$fence]))
		{
			$fence .= '`';
		}

		return $fence;
	}
1079
1080
	/**
	 * Helper function to adjust wrapping width for long-ish links
	 *
	 * - Widens $this->body_width when the supplied text, plus any buffer, is longer
	 * than the current width, it never narrows it
	 * - Nothing is returned, the width is updated in place
	 *
	 * @param string $markdown
	 * @param bool|int $buffer extra characters to allow for, ignored when empty
	 */
	private function _check_link_lenght($markdown, $buffer = false)
	{
		// Some links can be very long and if we wrap them they break
		$padding = empty($buffer) ? 0 : (int) $buffer;
		$needed = Util::strlen($markdown) + $padding;

		if ($needed > $this->body_width)
		{
			$this->body_width = $needed;
		}
	}
1096
1097
	/**
	 * Breaks a string up so its no more than width characters long
	 *
	 * - Will break at word boundaries (whitespace, comma, period or end of line)
	 * - If no natural space is found will break mid-word
	 * - Lines that start with > are treated as quotes, continuation lines created
	 * by the wrap are prefixed with '> '
	 * - Blank lines in the input are kept so paragraphs stay separated
	 * - A width below 1 can not be wrapped to, the string is returned unchanged
	 *
	 * @param string $string
	 * @param int $width
	 * @param string $break
	 * @return string
	 */
	private function _utf8_wordwrap($string, $width = 75, $break = "\n")
	{
		// A zero or negative width makes an invalid {1,n} quantifier and would never consume input
		$width = (int) $width;
		if ($width < 1)
		{
			return $string;
		}

		$strings = explode($break, $string);
		$lines = array();

		foreach ($strings as $string)
		{
			// An empty line carries meaning (paragraph break), pass it through
			if ($string === '')
			{
				$lines[] = '';
				continue;
			}

			$in_quote = $string[0] === '>';

			// Compare with '' and not empty(), otherwise a remaining "0" would be thrown away
			while ($string !== '')
			{
				// Get the next #width characters before a break (space, punctuation tab etc)
				if (preg_match('~^(.{1,' . $width . '})(?:\s|$|,|\.)~u', $string, $matches))
				{
					// Add the #width to the output and set up for the next pass
					$lines[] = ($in_quote && $matches[1][0] !== '>' ? '> ' : '') . ltrim($matches[1], ' ');
					$string = (string) Util::substr($string, Util::strlen($matches[1]));
				}
				// Humm just a long word with no place to break so we simply cut it after width characters
				else
				{
					$lines[] = ($in_quote && $string[0] !== '>' ? '> ' : '') . Util::substr($string, 0, $width);

					// Cast so a non string "nothing left" result still ends the loop
					$string = (string) Util::substr($string, $width);
				}
			}
		}

		// Join it all the shortened sections up on our break characters
		return implode($break, $lines);
	}
1137
}
1138