Completed
Push — master ( 556b72...aa5532 )
by Josh
17:56
created

TemplateHelper::replaceTokensInText()   B

Complexity

Conditions 4
Paths 4

Size

Total Lines 45
Code Lines 22

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 45
rs 8.5806
c 0
b 0
f 0
cc 4
eloc 22
nc 4
nop 3
1
<?php
2
3
/**
4
* @package   s9e\TextFormatter
5
* @copyright Copyright (c) 2010-2018 The s9e Authors
6
* @license   http://www.opensource.org/licenses/mit-license.php The MIT License
7
*/
8
namespace s9e\TextFormatter\Configurator\Helpers;
9
10
use DOMAttr;
11
use DOMCharacterData;
12
use DOMDocument;
13
use DOMElement;
14
use DOMNode;
15
use DOMProcessingInstruction;
16
use DOMText;
17
use DOMXPath;
18
use RuntimeException;
19
use s9e\TextFormatter\Configurator\Helpers\RegexpBuilder;
20
21
abstract class TemplateHelper
{
	/**
	* XSL namespace
	*/
	const XMLNS_XSL = 'http://www.w3.org/1999/XSL/Transform';

	/**
	* Return all attributes (literal or generated) whose name matches given regexp
	*
	* Three kinds of nodes are collected: literal attributes, <xsl:attribute/> elements whose
	* "name" attribute matches, and <xsl:copy-of/> elements whose "select" attribute is a single
	* attribute reference such as "@foo"
	*
	* @param  DOMDocument $dom    Document
	* @param  string      $regexp Regexp
	* @return array               Array of DOMNode instances
	*/
	public static function getAttributesByRegexp(DOMDocument $dom, $regexp)
	{
		$xpath = new DOMXPath($dom);
		$nodes = [];

		// Get literal attributes
		foreach ($xpath->query('//@*') as $attribute)
		{
			if (preg_match($regexp, $attribute->name))
			{
				$nodes[] = $attribute;
			}
		}

		// Get generated attributes
		foreach ($xpath->query('//xsl:attribute') as $attribute)
		{
			if (preg_match($regexp, $attribute->getAttribute('name')))
			{
				$nodes[] = $attribute;
			}
		}

		// Get attributes created with <xsl:copy-of/>
		foreach ($xpath->query('//xsl:copy-of') as $node)
		{
			$expr = $node->getAttribute('select');

			if (preg_match('/^@(\\w+)$/', $expr, $m)
			 && preg_match($regexp, $m[1]))
			{
				$nodes[] = $node;
			}
		}

		return $nodes;
	}

	/**
	* Return all DOMNodes whose content is CSS
	*
	* That is every attribute and every element named "style" (case-insensitive), literal or
	* generated
	*
	* @param  DOMDocument $dom Document
	* @return array            Array of DOMNode instances
	*/
	public static function getCSSNodes(DOMDocument $dom)
	{
		$regexp = '/^style$/i';

		return array_merge(
			self::getAttributesByRegexp($dom, $regexp),
			self::getElementsByRegexp($dom, $regexp)
		);
	}

	/**
	* Return all elements (literal or generated) whose name matches given regexp
	*
	* Three kinds of nodes are collected: literal elements (matched on their local name),
	* <xsl:element/> elements whose "name" attribute matches, and <xsl:copy-of/> elements whose
	* "select" attribute is a single element name
	*
	* @param  DOMDocument $dom    Document
	* @param  string      $regexp Regexp
	* @return array               Array of DOMNode instances
	*/
	public static function getElementsByRegexp(DOMDocument $dom, $regexp)
	{
		$xpath = new DOMXPath($dom);
		$nodes = [];

		// Get literal elements
		foreach ($xpath->query('//*') as $element)
		{
			if (preg_match($regexp, $element->localName))
			{
				$nodes[] = $element;
			}
		}

		// Get generated elements
		foreach ($xpath->query('//xsl:element') as $element)
		{
			if (preg_match($regexp, $element->getAttribute('name')))
			{
				$nodes[] = $element;
			}
		}

		// Get elements created with <xsl:copy-of/>
		// NOTE: this method of creating elements is disallowed by default
		foreach ($xpath->query('//xsl:copy-of') as $node)
		{
			$expr = $node->getAttribute('select');

			if (preg_match('/^\\w+$/', $expr)
			 && preg_match($regexp, $expr))
			{
				$nodes[] = $node;
			}
		}

		return $nodes;
	}

	/**
	* Return all DOMNodes whose content is JavaScript
	*
	* That is every attribute whose name starts with "on" or is exactly
	* "data-s9e-livepreview-postprocess", plus every "script" element (all case-insensitive)
	*
	* @param  DOMDocument $dom Document
	* @return array            Array of DOMNode instances
	*/
	public static function getJSNodes(DOMDocument $dom)
	{
		$regexp = '/^(?>data-s9e-livepreview-postprocess$|on)/i';

		return array_merge(
			self::getAttributesByRegexp($dom, $regexp),
			self::getElementsByRegexp($dom, '/^script$/i')
		);
	}

	/**
	* Return all <embed/> attributes and <object/> params whose name matches given regexp
	*
	* Will return all <param/> descendants of <object/> and all attributes of <embed/> whose name
	* matches given regexp. This method will NOT catch <param/> elements whose 'name' attribute is
	* set via an <xsl:attribute/>
	*
	* The returned array can mix DOMAttr instances (literal attributes of <embed/>) and DOMElement
	* instances (<param/>, <xsl:attribute/> and <xsl:copy-of/>)
	*
	* @param  DOMDocument $dom    Document
	* @param  string      $regexp
	* @return array               Array of DOMNode instances
	*/
	public static function getObjectParamsByRegexp(DOMDocument $dom, $regexp)
	{
		$xpath = new DOMXPath($dom);
		$nodes = [];

		// Collect attributes from <embed/> elements
		foreach (self::getAttributesByRegexp($dom, $regexp) as $attribute)
		{
			if ($attribute->nodeType === XML_ATTRIBUTE_NODE)
			{
				if (strtolower($attribute->parentNode->localName) === 'embed')
				{
					$nodes[] = $attribute;
				}
			}
			elseif ($xpath->evaluate('boolean(ancestor::embed)', $attribute))
			{
				// Assuming <xsl:attribute/> or <xsl:copy-of/>. The expression is wrapped in
				// boolean() because a bare location path evaluates to a DOMNodeList, which is
				// an object and therefore always truthy, even when empty
				$nodes[] = $attribute;
			}
		}

		// Collect <param/> descendants of <object/> elements
		foreach ($dom->getElementsByTagName('object') as $object)
		{
			foreach ($object->getElementsByTagName('param') as $param)
			{
				if (preg_match($regexp, $param->getAttribute('name')))
				{
					$nodes[] = $param;
				}
			}
		}

		return $nodes;
	}

	/**
	* Return a list of parameters in use in given XSL
	*
	* Variables are collected from the match, select and test attributes of XSL elements as well
	* as from the attribute value templates of non-XSL elements. Variables that are declared
	* locally via <xsl:variable/> are ignored
	*
	* @param  string $xsl XSL source
	* @return array       Alphabetically sorted list of unique parameter names
	*/
	public static function getParametersFromXSL($xsl)
	{
		$paramNames = [];

		// Wrap the XSL in boilerplate code because it might not have a root element
		$xsl = '<xsl:stylesheet xmlns:xsl="' . self::XMLNS_XSL . '">'
		     . '<xsl:template>'
		     . $xsl
		     . '</xsl:template>'
		     . '</xsl:stylesheet>';

		$dom = new DOMDocument;
		$dom->loadXML($xsl);

		$xpath = new DOMXPath($dom);

		// Start by collecting XPath expressions in XSL elements
		$query = '//xsl:*/@match | //xsl:*/@select | //xsl:*/@test';
		foreach ($xpath->query($query) as $attribute)
		{
			foreach (XPathHelper::getVariables($attribute->value) as $varName)
			{
				if (!self::isLocalVariable($xpath, $attribute, $varName))
				{
					$paramNames[] = $varName;
				}
			}
		}

		// Collecting XPath expressions in attribute value templates
		$query = '//*[namespace-uri() != "' . self::XMLNS_XSL . '"]'
		       . '/@*[contains(., "{")]';
		foreach ($xpath->query($query) as $attribute)
		{
			foreach (AVTHelper::parse($attribute->value) as $token)
			{
				if ($token[0] !== 'expression')
				{
					continue;
				}

				foreach (XPathHelper::getVariables($token[1]) as $varName)
				{
					if (!self::isLocalVariable($xpath, $attribute, $varName))
					{
						$paramNames[] = $varName;
					}
				}
			}
		}

		// Dedupe and sort names
		$paramNames = array_unique($paramNames);
		sort($paramNames);

		return $paramNames;
	}

	/**
	* Return all DOMNodes whose content is an URL
	*
	* NOTE: it will also return HTML4 nodes whose content is an URI
	*
	* @param  DOMDocument $dom Document
	* @return array            Array of DOMNode instances
	*/
	public static function getURLNodes(DOMDocument $dom)
	{
		$regexp = '/(?>^(?>action|background|c(?>ite|lassid|odebase)|data|formaction|href|icon|longdesc|manifest|p(?>ing|luginspage|oster|rofile)|usemap)|src)$/i';
		$nodes  = self::getAttributesByRegexp($dom, $regexp);

		/**
		* @link http://helpx.adobe.com/flash/kb/object-tag-syntax-flash-professional.html
		* @link http://www.sitepoint.com/control-internet-explorer/
		*/
		foreach (self::getObjectParamsByRegexp($dom, '/^(?:dataurl|movie)$/i') as $param)
		{
			if ($param instanceof DOMAttr)
			{
				// Literal attribute of an <embed/> element: the attribute itself holds the URL.
				// DOMAttr has no getAttributeNode() method so it must not fall through
				$nodes[] = $param;
			}
			elseif ($param instanceof DOMElement)
			{
				// <param/> elements hold the URL in their "value" attribute. Generated
				// attributes (<xsl:attribute/>, <xsl:copy-of/>) have none and are skipped
				$node = $param->getAttributeNode('value');
				if ($node)
				{
					$nodes[] = $node;
				}
			}
		}

		return $nodes;
	}

	/**
	* Highlight the source of a node inside of a template
	*
	* The node is temporarily tagged with a unique token so that its serialized form can be
	* located unambiguously in the serialized document. Both the token and the document's
	* formatOutput setting are restored before returning
	*
	* @param  DOMNode $node    Node to highlight
	* @param  string  $prepend HTML to prepend
	* @param  string  $append  HTML to append
	* @return string           Template's source, as HTML
	*/
	public static function highlightNode(DOMNode $node, $prepend, $append)
	{
		// Add a unique token to the node
		$uniqid = uniqid('_');
		if ($node instanceof DOMAttr)
		{
			$node->value .= $uniqid;
		}
		elseif ($node instanceof DOMElement)
		{
			$node->setAttribute($uniqid, '');
		}
		elseif ($node instanceof DOMCharacterData
		     || $node instanceof DOMProcessingInstruction)
		{
			$node->data .= $uniqid;
		}

		// Serialize with formatting on, remembering the original setting so it can be restored
		$dom          = $node->ownerDocument;
		$formatOutput = $dom->formatOutput;
		$dom->formatOutput = true;

		$docXml = self::innerXML($dom->documentElement);
		$docXml = trim(str_replace("\n  ", "\n", $docXml));

		$nodeHtml = htmlspecialchars(trim($dom->saveXML($node)));
		$docHtml  = htmlspecialchars($docXml);

		$dom->formatOutput = $formatOutput;

		// Enclose the node's representation in our hilighting HTML
		$html = str_replace($nodeHtml, $prepend . $nodeHtml . $append, $docHtml);

		// Remove the unique token from HTML and from the node
		if ($node instanceof DOMAttr)
		{
			$node->value = substr($node->value, 0, -strlen($uniqid));
			$html = str_replace($uniqid, '', $html);
		}
		elseif ($node instanceof DOMElement)
		{
			$node->removeAttribute($uniqid);
			$html = str_replace(' ' . $uniqid . '=&quot;&quot;', '', $html);
		}
		elseif ($node instanceof DOMCharacterData
		     || $node instanceof DOMProcessingInstruction)
		{
			// Strip the token that was appended above
			$node->data = substr($node->data, 0, -strlen($uniqid));
			$html = str_replace($uniqid, '', $html);
		}

		return $html;
	}

	/**
	* Load a template as an xsl:template node
	*
	* Will attempt to load it as XML first, then as XML with its entities fixed, then as HTML as
	* a fallback. Either way, an xsl:template node is returned
	*
	* @param  string      $template
	* @return DOMDocument
	*
	* @throws RuntimeException if the template contains an XSL element and cannot be loaded as XML
	*/
	public static function loadTemplate($template)
	{
		$dom = self::loadTemplateAsXML($template);
		if ($dom)
		{
			return $dom;
		}

		$dom = self::loadTemplateAsXML(self::fixEntities($template));
		if ($dom)
		{
			return $dom;
		}

		// If the template contains an XSL element, abort now. Otherwise, try reparsing it as HTML
		if (strpos($template, '<xsl:') !== false)
		{
			// libxml_get_last_error() returns false if the error buffer is empty
			$error   = libxml_get_last_error();
			$message = ($error) ? $error->message : 'unknown error';

			throw new RuntimeException('Invalid XSL: ' . $message);
		}

		return self::loadTemplateAsHTML($template);
	}

	/**
	* Replace simple templates (in an array, in-place) with a common template
	*
	* In some situations, renderers can take advantage of multiple tags having the same template. In
	* any configuration, there's almost always a number of "simple" tags that are rendered as an
	* HTML element of the same name with no HTML attributes. For instance, the system tag "p" used
	* for paragraphs, "B" tags used for "b" HTML elements, etc... This method replaces those
	* templates with a common template that uses a dynamic element name based on the tag's name,
	* either its nodeName or localName depending on whether the tag is namespaced, and normalized to
	* lowercase using XPath's translate() function
	*
	* @param  array<string> &$templates Associative array of [tagName => template]
	* @param  integer       $minCount   Minimum number of simple tags required for the replacement
	* @return void
	*/
	public static function replaceHomogeneousTemplates(array &$templates, $minCount = 3)
	{
		$tagNames = [];

		// Prepare the XPath expression used for the element's name
		$expr = 'name()';

		// Identify "simple" tags, whose template is one element of the same name. Their template
		// can be replaced with a dynamic template shared by all the simple tags
		foreach ($templates as $tagName => $template)
		{
			// Generate the element name based on the tag's localName, lowercased
			$elName = strtolower(preg_replace('/^[^:]+:/', '', $tagName));

			if ($template === '<' . $elName . '><xsl:apply-templates/></' . $elName . '>')
			{
				$tagNames[] = $tagName;

				// Use local-name() if any of the tags are namespaced
				if (strpos($tagName, ':') !== false)
				{
					$expr = 'local-name()';
				}
			}
		}

		// We only bother replacing their template if there are at least $minCount simple tags.
		// Otherwise it only makes the stylesheet bigger
		if (count($tagNames) < $minCount)
		{
			return;
		}

		// Generate a list of uppercase characters from the tags' names
		$chars = preg_replace('/[^A-Z]+/', '', count_chars(implode('', $tagNames), 3));

		if (is_string($chars) && $chars !== '')
		{
			$expr = 'translate(' . $expr . ",'" . $chars . "','" . strtolower($chars) . "')";
		}

		// Prepare the common template
		$template = '<xsl:element name="{' . $expr . '}">'
		          . '<xsl:apply-templates/>'
		          . '</xsl:element>';

		// Replace the templates
		foreach ($tagNames as $tagName)
		{
			$templates[$tagName] = $template;
		}
	}

	/**
	* Replace parts of a template that match given regexp
	*
	* Treats attribute values as plain text. Replacements within XPath expression is unsupported.
	* The callback must return an array with two elements. The first must be either of 'expression',
	* 'literal' or 'passthrough', and the second element depends on the first.
	*
	*  - 'expression' indicates that the replacement must be treated as an XPath expression such as
	*    '@foo', which must be passed as the second element.
	*  - 'literal' indicates a literal (plain text) replacement, passed as its second element.
	*  - 'passthrough' indicates that the replacement should be the tag's content. It works
	*    differently whether it is inside an attribute's value or a text node. Within an
	*    attribute's value, the replacement will be the text content of the tag. Within a text
	*    node, the replacement becomes an <xsl:apply-templates/> node.
	*
	* The callback receives the array of matched strings as its first argument and the node being
	* processed as its second.
	*
	* @param  string   $template Original template
	* @param  string   $regexp   Regexp for matching parts that need replacement
	* @param  callback $fn       Callback used to get the replacement
	* @return string             Processed template
	*/
	public static function replaceTokens($template, $regexp, $fn)
	{
		$dom   = self::loadTemplate($template);
		$xpath = new DOMXPath($dom);

		foreach ($xpath->query('//@*') as $attribute)
		{
			self::replaceTokensInAttribute($attribute, $regexp, $fn);
		}
		foreach ($xpath->query('//text()') as $node)
		{
			self::replaceTokensInText($node, $regexp, $fn);
		}

		return self::saveTemplate($dom);
	}

	/**
	* Replace parts of an attribute that match given regexp
	*
	* Expressions are inserted as attribute value templates, e.g. "{@foo}", and passthrough
	* replacements are inserted as "{.}"
	*
	* @param  DOMAttr  $attribute Attribute
	* @param  string   $regexp    Regexp for matching parts that need replacement
	* @param  callback $fn        Callback used to get the replacement
	* @return void
	*/
	protected static function replaceTokensInAttribute(DOMAttr $attribute, $regexp, $fn)
	{
		$attrValue = preg_replace_callback(
			$regexp,
			function ($m) use ($fn, $attribute)
			{
				$replacement = $fn($m, $attribute);
				if ($replacement[0] === 'expression')
				{
					return '{' . $replacement[1] . '}';
				}
				elseif ($replacement[0] === 'passthrough')
				{
					return '{.}';
				}
				else
				{
					return $replacement[1];
				}
			},
			$attribute->value
		);
		$attribute->value = htmlspecialchars($attrValue, ENT_COMPAT, 'UTF-8');
	}

	/**
	* Replace parts of a text node that match given regexp
	*
	* The original node is replaced by a sequence of text nodes, <xsl:value-of/> elements (for
	* 'expression' replacements) and <xsl:apply-templates/> elements (for 'passthrough'
	* replacements)
	*
	* @param  DOMText  $node     Text node
	* @param  string   $regexp   Regexp for matching parts that need replacement
	* @param  callback $fn       Callback used to get the replacement
	* @return void
	*/
	protected static function replaceTokensInText(DOMText $node, $regexp, $fn)
	{
		// Grab the node's parent so that we can rebuild the text with added variables right
		// before the node, using DOM's insertBefore(). Technically, it would make more sense
		// to create a document fragment, append nodes then replace the node with the fragment
		// but it leads to namespace redeclarations, which looks ugly
		$parentNode = $node->parentNode;
		$dom        = $node->ownerDocument;

		// The node itself is never modified so its content can be read once
		$content = $node->textContent;

		preg_match_all($regexp, $content, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
		$lastPos = 0;
		foreach ($matches as $m)
		{
			$pos = $m[0][1];

			// Catch-up to current position
			$text = substr($content, $lastPos, $pos - $lastPos);
			$parentNode->insertBefore($dom->createTextNode($text), $node);
			$lastPos = $pos + strlen($m[0][0]);

			// Get the replacement for this token. array_column() strips the offsets that
			// PREG_OFFSET_CAPTURE adds, so the callback sees plain matched strings
			$replacement = $fn(array_column($m, 0), $node);
			if ($replacement[0] === 'expression')
			{
				$newNode = $dom->createElementNS(self::XMLNS_XSL, 'xsl:value-of');
				$newNode->setAttribute('select', $replacement[1]);
			}
			elseif ($replacement[0] === 'passthrough')
			{
				$newNode = $dom->createElementNS(self::XMLNS_XSL, 'xsl:apply-templates');
			}
			else
			{
				$newNode = $dom->createTextNode($replacement[1]);
			}
			$parentNode->insertBefore($newNode, $node);
		}

		// Append the rest of the text
		$text = substr($content, $lastPos);
		$parentNode->insertBefore($dom->createTextNode($text), $node);

		// Now remove the old text node
		$parentNode->removeChild($node);
	}

	/**
	* Serialize a loaded template back into a string
	*
	* NOTE: removes the root node created by loadTemplate()
	*
	* @param  DOMDocument $dom
	* @return string
	*/
	public static function saveTemplate(DOMDocument $dom)
	{
		return self::innerXML($dom->documentElement);
	}

	/**
	* Replace HTML entities and unescaped ampersands in given template
	*
	* Ampersands that do not start an entity are escaped as &amp; first, then named entities
	* other than the five predefined XML entities are decoded to UTF-8
	*
	* @param  string $template
	* @return string
	*/
	protected static function fixEntities($template)
	{
		return preg_replace_callback(
			'(&(?!quot;|amp;|apos;|lt;|gt;)\\w+;)',
			function ($m)
			{
				return html_entity_decode($m[0], ENT_NOQUOTES, 'UTF-8');
			},
			preg_replace('(&(?![A-Za-z0-9]+;|#\\d+;|#x[A-Fa-f0-9]+;))', '&amp;', $template)
		);
	}

	/**
	* Get the XML content of an element
	*
	* @param  DOMElement $element
	* @return string
	*/
	protected static function innerXML(DOMElement $element)
	{
		// Serialize the XML then remove the outer element
		$xml = $element->ownerDocument->saveXML($element);

		// The content starts after the first ">" and ends before the last "<"
		$pos = 1 + strpos($xml, '>');
		$len = strrpos($xml, '<') - $pos;

		// If the template is empty, return an empty string
		if ($len < 1)
		{
			return '';
		}

		return substr($xml, $pos, $len);
	}

	/**
	* Test whether given variable name is declared locally, relative to given node
	*
	* A variable is considered local if an <xsl:variable/> of the same name is a preceding sibling
	* of the context node's element or of any of its ancestors
	*
	* @param  DOMXPath $xpath   XPath instance bound to the node's document
	* @param  DOMNode  $context Node where the variable is used
	* @param  string   $varName Variable name
	* @return bool
	*/
	private static function isLocalVariable(DOMXPath $xpath, DOMNode $context, $varName)
	{
		$query = 'ancestor-or-self::*/'
		       . 'preceding-sibling::xsl:variable[@name="' . $varName . '"]';

		return ($xpath->query($query, $context)->length > 0);
	}

	/**
	* Load given HTML template in a DOM document
	*
	* The template is parsed as HTML, serialized as XML then reloaded with an xsl:template element
	* as its root
	*
	* @param  string      $template Original template
	* @return DOMDocument
	*/
	protected static function loadTemplateAsHTML($template)
	{
		$dom  = new DOMDocument;
		$html = '<?xml version="1.0" encoding="utf-8" ?><html><body><div>' . $template . '</div></body></html>';

		$useErrors = libxml_use_internal_errors(true);
		$dom->loadHTML($html);
		self::removeInvalidAttributes($dom);
		libxml_use_internal_errors($useErrors);

		// Now dump the thing as XML then reload it with the proper root element
		$xml = '<?xml version="1.0" encoding="utf-8" ?><xsl:template xmlns:xsl="' . self::XMLNS_XSL . '">' . self::innerXML($dom->documentElement->firstChild->firstChild) . '</xsl:template>';

		$useErrors = libxml_use_internal_errors(true);
		$dom->loadXML($xml);
		libxml_use_internal_errors($useErrors);

		return $dom;
	}

	/**
	* Load given XSL template in a DOM document
	*
	* libxml errors are collected internally rather than emitted as PHP warnings, and the
	* previous error handling setting is restored before returning
	*
	* @param  string           $template Original template
	* @return bool|DOMDocument           DOMDocument on success, FALSE otherwise
	*/
	protected static function loadTemplateAsXML($template)
	{
		$xml = '<?xml version="1.0" encoding="utf-8" ?><xsl:template xmlns:xsl="' . self::XMLNS_XSL . '">' . $template . '</xsl:template>';

		$useErrors = libxml_use_internal_errors(true);
		$dom       = new DOMDocument;
		$success   = $dom->loadXML($xml);
		self::removeInvalidAttributes($dom);
		libxml_use_internal_errors($useErrors);

		return ($success) ? $dom : false;
	}

	/**
	* Remove attributes with an invalid name from given DOM document
	*
	* A valid name is made of word characters and dashes, does not start its local part with a
	* digit, and may have a prefix
	*
	* @param  DOMDocument $dom
	* @return void
	*/
	protected static function removeInvalidAttributes(DOMDocument $dom)
	{
		$xpath = new DOMXPath($dom);
		foreach ($xpath->query('//@*') as $attribute)
		{
			if (!preg_match('(^(?:[-\\w]+:)?(?!\\d)[-\\w]+$)D', $attribute->nodeName))
			{
				$attribute->parentNode->removeAttributeNode($attribute);
			}
		}
	}
}