HtmlFormatter::formatElement()   B
last analyzed

Complexity

Conditions 8
Paths 19

Size

Total Lines 29
Code Lines 16

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 18
CRAP Score 8

Importance

Changes 0
Metric Value
cc 8
eloc 16
nc 19
nop 4
dl 0
loc 29
ccs 18
cts 18
cp 1
crap 8
rs 8.4444
c 0
b 0
f 0
1
<?php
2
3
namespace Elgg\Views;
4
5
use Elgg\EventsService;
6
use Elgg\Exceptions\Configuration\RegistrationException;
7
use Elgg\Exceptions\InvalidArgumentException;
8
use Elgg\Traits\Loggable;
9
use Elgg\ViewsService;
10
use Pelago\Emogrifier\CssInliner;
11
use Pelago\Emogrifier\HtmlProcessor\CssToAttributeConverter;
12
13
/**
14
 * Various helper methods for formatting and sanitizing output
15
 */
16
class HtmlFormatter {
17
18
	use Loggable;
19
	
20
	/**
21
	 * Mentions regex
22
	 *
23
	 * Match anchor tag with all attributes and wrapped html
24
	 * we want to exclude matches that have already been wrapped in an anchor
25
	 * '<a[^>]*?>.*?<\/a>'
26
	 *
27
	 * Match tag name and attributes
28
	 * we want to exclude matches that are found within tag attributes
29
	 * '<.*?>'
30
	 *
31
	 * Match at least one space or punctuation char before a match
32
	 * '(^|\s|\!|\.|\?|>|\G)+'
33
	 *
34
	 * Match @ followed by username
35
	 * @see \Elgg\Users\Accounts::assertValidUsername()
36
	 * '(@([^\s<&]+))'
37
	 */
38
	// Three alternatives, tried left to right: a complete anchor element, any other tag, or a mention.
	// Only the mention alternative fills the capture groups:
	// 1 = preceding character (a repeated group, so only its last repetition is kept),
	// 2 = '@' followed by the username, 3 = the username alone.
	// For the two tag alternatives group 3 stays empty, which parseMentions() uses to leave the match untouched.
	public const MENTION_REGEX = '/<a[^>]*?>.*?<\/a>|<.*?>|(^|\s|\!|\.|\?|>|\G)+(@([^\s<&]+))/iu';
39
40
	/**
	 * Create the formatter
	 *
	 * All three services are stored as protected properties through constructor promotion.
	 *
	 * @param ViewsService  $views  Views service
	 * @param EventsService $events Events service
	 * @param AutoParagraph $autop  Paragraph wrapper
	 */
	public function __construct(
		protected ViewsService $views,
		protected EventsService $events,
		protected AutoParagraph $autop,
	) {
	}
53
54
	/**
	 * Run an HTML string through the enabled formatting steps
	 *
	 * The given options are merged over the defaults (every step enabled) and passed, together
	 * with the HTML, through the 'prepare', 'html' event. The html and options found in the event
	 * result are the ones that are actually used.
	 *
	 * Enabled steps are applied in this order: urls, emails, mentions, sanitize, autop.
	 *
	 * @param string $html    HTML string
	 * @param array  $options Formatting options
	 *
	 * @option bool $parse_urls     Replace URLs with anchor tags
	 * @option bool $parse_emails   Replace email addresses with anchor tags
	 * @option bool $parse_mentions Replace @ mentions with anchor tags
	 * @option bool $sanitize       Sanitize HTML tags
	 * @option bool $autop          Add paragraphs instead of new lines
	 *
	 * @return string
	 */
	public function formatBlock(string $html, array $options = []): string {
		$defaults = [
			'parse_urls' => true,
			'parse_emails' => true,
			'parse_mentions' => true,
			'sanitize' => true,
			'autop' => true,
		];

		$prepared = $this->events->triggerResults('prepare', 'html', [], [
			'options' => array_merge($defaults, $options),
			'html' => $html,
		]);

		$html = (string) elgg_extract('html', $prepared);
		$options = (array) elgg_extract('options', $prepared);

		// option name => formatting step, listed in the order the steps have to run
		$steps = [
			'parse_urls' => fn($value) => $this->parseUrls($value),
			'parse_emails' => fn($value) => $this->parseEmails($value),
			'parse_mentions' => fn($value) => $this->parseMentions($value),
			'sanitize' => fn($value) => elgg_sanitize_input($value),
			'autop' => fn($value) => $this->addParagaraphs($value),
		];

		foreach ($steps as $option => $step) {
			if (elgg_extract($option, $options)) {
				$html = $step($html);
			}
		}

		return $html;
	}
108
109
	/**
	 * Turns URLs found in a text into anchor tags
	 *
	 * The work is delegated to Linkify; the generated links get rel="nofollow".
	 *
	 * @param string $text The input string
	 *
	 * @return string The output string with formatted links
	 */
	public function parseUrls(string $text): string {
		$link_options = ['attr' => ['rel' => 'nofollow']];

		return (new \Misd\Linkify\Linkify())->processUrls($text, $link_options);
	}
122
123
	/**
	 * Turns email addresses found in a text into anchor tags
	 *
	 * The work is delegated to Linkify; the generated links get rel="nofollow".
	 *
	 * @param string $text The input string
	 *
	 * @return string The output string with formatted links
	 * @since 2.3
	 */
	public function parseEmails(string $text): string {
		$link_options = ['attr' => ['rel' => 'nofollow']];

		return (new \Misd\Linkify\Linkify())->processEmails($text, $link_options);
	}
136
	
137
	/**
	 * Takes a string and turns any @ mentions into a formatted link
	 *
	 * Every match of self::MENTION_REGEX is inspected. Matches without a username (existing anchors
	 * and other tags), invalid usernames and unknown users are returned unchanged. For a known user
	 * the '@username' token is replaced by a link to the user, labelled with either '@username' or
	 * the display name depending on the 'mentions_display_format' config value.
	 *
	 * @param string $text The input string
	 *
	 * @return string The text with mentions replaced; the original text if the regex replacement fails
	 * @since 5.0
	 */
	public function parseMentions(string $text): string {
		$callback = function (array $matches) {
			$source = (string) elgg_extract(0, $matches);
			$mention = (string) elgg_extract(2, $matches);
			$username = elgg_extract(3, $matches);
			
			if (empty($username)) {
				// matched an anchor or another tag, nothing to replace
				return $source;
			}
			
			try {
				_elgg_services()->accounts->assertValidUsername($username);
			} catch (RegistrationException $e) {
				return $source;
			}
			
			$user = elgg_get_user_by_username($username);
			
			// Catch the trailing period when used as punctuation and not a username.
			$period = '';
			if (!$user && str_ends_with($username, '.')) {
				$user = elgg_get_user_by_username(substr($username, 0, -1));
				$period = '.';
			}
			
			if (!$user) {
				return $source;
			}
			
			// The prefix group of the regex is repeated, so capture group 1 only holds its last repetition.
			// The '@username' token always ends the match, therefore everything in front of it is the
			// complete prefix. Using it keeps text like the '!!' in 'Hi!! @user' instead of dropping it.
			$prefix = substr($source, 0, strlen($source) - strlen($mention));
			
			if (elgg_get_config('mentions_display_format') === 'username') {
				$replacement = elgg_view_url($user->getURL(), "@{$user->username}");
			} else {
				$replacement = elgg_view_url($user->getURL(), $user->getDisplayName());
			}
			
			return $prefix . $replacement . $period;
		};
		
		// preg_replace_callback() returns null on a PCRE error, in that case the text is left as is
		return preg_replace_callback(self::MENTION_REGEX, $callback, $text) ?? $text;
	}
185
186
	/**
	 * Create paragraphs from text with line spacing
	 *
	 * Delegates to the AutoParagraph service. When the service throws a \RuntimeException (which is
	 * logged as a warning) or returns false, the original string is returned.
	 *
	 * @param string $string The string
	 *
	 * @return string
	 */
	public function addParagaraphs(string $string): string {
		try {
			$processed = $this->autop->process($string);
		} catch (\RuntimeException $e) {
			$this->getLogger()->warning('AutoParagraph failed to process the string: ' . $e->getMessage());
			
			return $string;
		}
		
		return $processed === false ? $string : $processed;
	}
205
206
	/**
	 * Converts an associative array into a string of well-formed HTML/XML attributes
	 * Returns a concatenated string of HTML attributes to be inserted into a tag (e.g., <tag $attrs>)
	 *
	 * Attribute value can be a scalar value, an array of scalar values, or true.
	 * Attribute names are lowercased, values are HTML-escaped without double encoding.
	 *
	 * The following attributes are left out of the result:
	 * - names containing an underscore, unless the name starts with 'data-'
	 * - values that are null, false or an empty array
	 * - values that are not scalar and not an array, and arrays containing a non-scalar value
	 *
	 * An example of the attributes:
	 * <code>
	 *     $attrs = [
	 *         'class' => ['elgg-input', 'elgg-input-text'], // will be imploded with spaces
	 *         'style' => ['margin-left:10px;', 'color: #666;'], // will be imploded with spaces
	 *         'alt' => 'Alt text', // will be left as is
	 *         'disabled' => true, // will be converted to disabled="disabled"
	 *         'data-options' => json_encode(['foo' => 'bar']), // will be output as an escaped JSON string
	 *         'batch' => <\ElggBatch>, // will be ignored
	 *         'items' => [<\ElggObject>], // will be ignored
	 *     ];
	 * </code>
	 *
	 * @param array $attrs An array of attribute => value pairs
	 *
	 * @return string
	 *
	 * @see elgg_format_element()
	 */
	public function formatAttributes(array $attrs = []): string {
		$formatted = [];

		foreach ($attrs as $name => $value) {
			if (str_contains($name, '_') && !str_starts_with($name, 'data-')) {
				// this is probably a view $vars variable not meant for output
				continue;
			}

			if ($value === null || $value === false) {
				continue;
			}

			$name = strtolower($name);

			if ($value === true) {
				// e.g. checked => true ==> checked="checked"
				$value = $name;
			}

			$parts = is_scalar($value) ? [$value] : $value;
			if (!is_array($parts) || $parts === []) {
				// unsupported value type or e.g. ['class' => []]
				continue;
			}

			// skip the whole attribute as soon as one non-scalar value is found
			foreach ($parts as $part) {
				if (!is_scalar($part)) {
					continue 2;
				}
			}

			$escaped = htmlspecialchars(implode(' ', $parts), ENT_QUOTES, 'UTF-8', false);
			$formatted[] = "{$name}=\"{$escaped}\"";
		}

		return implode(' ', $formatted);
	}
283
284
	/**
	 * Format an HTML element
	 *
	 * @param string $tag_name   The element tagName. e.g. "div". This will not be validated.
	 * @param array  $attributes The element attributes.
	 * @param string $text       The contents of the element. Assumed to be HTML unless encode_text is true.
	 *                           Not used when the element is void.
	 * @param array  $options    Options array with keys:
	 *
	 *                           - encode_text   => (bool, default false) If true, $text will be HTML-escaped. Already-escaped entities
	 *                           will not be double-escaped.
	 *
	 *                           - double_encode => (bool, default false) If true, the $text HTML escaping will be allowed to double
	 *                           encode HTML entities: '&times;' will become '&amp;times;'
	 *
	 *                           - is_void       => (bool) If given, this determines whether the function will return just the open tag.
	 *                           Otherwise this will be determined by the tag name according to this list:
	 *                           http://www.w3.org/html/wg/drafts/html/master/single-page.html#void-elements
	 *
	 *                           - is_xml        => (bool, default false) If true, void elements will be formatted like "<tag />"
	 *
	 * @return string
	 * @since 1.9.0
	 * @throws InvalidArgumentException when $tag_name is an empty string
	 */
	public function formatElement(string $tag_name, array $attributes = [], string $text = '', array $options = []): string {
		if ($tag_name === '') {
			throw new InvalidArgumentException('$tag_name is required');
		}
		
		// from http://www.w3.org/TR/html-markup/syntax.html#syntax-elements
		$void_elements = [
			'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem',
			'meta', 'param', 'source', 'track', 'wbr',
		];
		
		// an explicit is_void option wins over the lookup by tag name
		$is_void = $options['is_void'] ?? in_array(strtolower($tag_name), $void_elements, true);
		
		if (!empty($options['encode_text'])) {
			$text = htmlspecialchars($text, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8', !empty($options['double_encode']));
		}
		
		$attrs = empty($attributes) ? '' : $this->formatAttributes($attributes);
		if ($attrs !== '') {
			// separate the attributes from the tag name
			$attrs = " {$attrs}";
		}
		
		if (!$is_void) {
			return "<{$tag_name}{$attrs}>{$text}</{$tag_name}>";
		}
		
		if (empty($options['is_xml'])) {
			return "<{$tag_name}{$attrs}>";
		}
		
		return "<{$tag_name}{$attrs} />";
	}
341
342
	/**
	 * Strip tags and offer plugins the chance to alter the result.
	 *
	 * The string is run through strip_tags() and the result is passed as the value of the
	 * 'format', 'strip_tags' event. The event params hold the original string in
	 * $params['original_string'] and the allowed tags in $params['allowable_tags'].
	 *
	 * @param string      $string         Formatted string
	 * @param string|null $allowable_tags Optional parameter to specify tags which should not be stripped
	 *
	 * @return string String run through strip_tags() and any event.
	 */
	public function stripTags(string $string, ?string $allowable_tags = null): string {
		$params = [
			'original_string' => $string,
			'allowable_tags' => $allowable_tags,
		];

		$string = strip_tags($string, $allowable_tags);

		return (string) $this->events->triggerResults('format', 'strip_tags', $params, $string);
	}
361
362
	/**
	 * Decode HTML markup into a raw text string
	 *
	 * This applies html_entity_decode() to a string while re-entitising HTML
	 * special char entities to prevent them from being decoded back to their
	 * unsafe original forms.
	 *
	 * The special char entities (&gt; &lt; &amp; &quot; &#039;) get an extra level of
	 * encoding before html_entity_decode() runs and are restored afterwards, so they
	 * come out of this method still encoded while all other entities are decoded.
	 *
	 * This relies on html_entity_decode() not translating entities when
	 * doing so leaves behind another entity, e.g. &amp;gt; if decoded would
	 * create &gt; which is another entity itself. This seems to escape the
	 * usual behaviour where any two paired entities creating a HTML tag are
	 * usually decoded, i.e. a lone &gt; is not decoded, but &lt;foo&gt; would
	 * be decoded to <foo> since it creates a full tag.
	 *
	 * Note: html_entity_decode() is poorly explained in the manual - which is really
	 * bad given its potential for misuse on user input already escaped elsewhere.
	 * Stackoverflow is littered with advice to use this function in the precise
	 * way that would lead to user input being capable of injecting arbitrary HTML.
	 *
	 * @param string $string Encoded HTML
	 *
	 * @return string
	 *
	 * @author Pádraic Brady
	 * @copyright Copyright (c) 2010 Pádraic Brady (http://blog.astrumfutura.com)
	 * @license Released under dual-license GPL2/MIT by explicit permission of Pádraic Brady
	 */
	public function decode(string $string): string {
		// str_replace() handles the pairs one after another, so the order of both lists matters
		$special_chars = ['&gt;', '&lt;', '&amp;', '&quot;', '&#039;'];
		$protected_chars = ['&amp;gt;', '&amp;lt;', '&amp;amp;', '&amp;quot;', '&amp;#039;'];

		$shielded = str_replace($special_chars, $protected_chars, $string);
		$decoded = html_entity_decode($shielded, ENT_NOQUOTES, 'UTF-8');

		return str_replace($protected_chars, $special_chars, $decoded);
	}
402
	
403
	/**
	 * Adds inline style to html content
	 *
	 * The css is inlined with Emogrifier's CssInliner (parsing of style blocks in the html is
	 * disabled), after which CssToAttributeConverter converts css to visual html attributes.
	 * When the html or the css is empty the html is returned unchanged.
	 *
	 * @param string $html      html content
	 * @param string $css       style text
	 * @param bool   $body_only toggle to return the body contents instead of a full html
	 *
	 * @return string
	 *
	 * @since 4.0
	 */
	public function inlineCss(string $html, string $css, bool $body_only = false): string {
		if (empty($html) || empty($css)) {
			return $html;
		}
		
		$inliner = CssInliner::fromHtml($html)->disableStyleBlocksParsing()->inlineCss($css);
		$converter = CssToAttributeConverter::fromHtml($inliner->render())->convertCssToVisualAttributes();
		
		if ($body_only) {
			return $converter->renderBodyContent();
		}
		
		return $converter->render();
	}
424
	
425
	/**
	 * Replaces relative urls in href or src attributes in text
	 *
	 * Every quoted href/src value found in the text is passed through elgg_normalize_url()
	 * and each occurrence of the quoted value in the text is replaced by the result.
	 *
	 * @param string $text source content
	 *
	 * @return string
	 *
	 * @since 4.0
	 */
	public function normalizeUrls(string $text): string {
		// group 1 holds the attribute value including its wrapping quotes
		$found = [];
		preg_match_all('/\s(?:href|src)=([\'"]\S+[\'"])/i', $text, $found);
		
		foreach (array_unique($found[1] ?? []) as $quoted_url) {
			// the url without the wrapping quotes
			$bare_url = substr($quoted_url, 1, -1);
			$normalized_url = elgg_normalize_url($bare_url);
			
			// keep the original quotes around the normalized url
			$quoted_replacement = str_replace($bare_url, $normalized_url, $quoted_url);
			
			$text = str_replace($quoted_url, $quoted_replacement, $text);
		}
		
		return $text;
	}
463
}
464