HTMLText::exists() - Code Metrics - Inspection of "Merge pull request #8144 from open-sausages/pulls/..." - silverstripe/silverstripe-framework - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — 3 ( aecac8...3139b2 )

by Daniel

created 2018-06-06 14:51 UTC

HTMLText::exists() A

↳ Parent: HTMLText

Complexity

Conditions	4
Paths	4

Size

Total Lines	21
Code Lines	9

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	4
eloc	9
nc	4
nop	0
dl	0
loc	21
rs	9.0534
c	0
b	0
f	0

<?php
/**
 * Represents a large text field that contains HTML content.
 * This behaves similarly to {@link Text}, but the template processor won't escape any HTML content within it.
 *
 * @see HTMLVarchar
 * @see Text
 * @see Varchar
 *
 * @package framework
 * @subpackage model
 */
class HTMLText extends Text {
	private static $escape_type = 'xml';

	private static $casting = array(
		"AbsoluteLinks" => "HTMLText",
		"BigSummary" => "HTMLText",
		"ContextSummary" => "HTMLText",
		"FirstParagraph" => "HTMLText",
		"FirstSentence" => "HTMLText",
		"LimitCharacters" => "HTMLText",
		"LimitSentences" => "HTMLText",
		"Lower" => "HTMLText",
		"LowerCase" => "HTMLText",
		"Summary" => "HTMLText",
		"Upper" => "HTMLText",
		"UpperCase" => "HTMLText",
		'EscapeXML' => 'HTMLText',
		'LimitWordCount' => 'HTMLText',
		'LimitWordCountXML' => 'HTMLText',
		'NoHTML' => 'Text',
	);

	/**
	 * Whether shortcodes in the value are parsed by {@link RAW()}.
	 *
	 * @var bool
	 */
	protected $processShortcodes = true;

	/**
	 * Check if shortcodes are enabled
	 *
	 * @return bool
	 */
	public function getProcessShortcodes() {
		return $this->processShortcodes;
	}

	/**
	 * Set shortcodes on or off by default
	 *
	 * @param bool $process
	 * @return $this
	 */
	public function setProcessShortcodes($process) {
		$this->processShortcodes = (bool)$process;
		return $this;
	}

	/**
	 * Element names (plus the optional 'text()' directive) allowed through {@link whitelistContent()},
	 * or false when no whitelist filtering is applied.
	 *
	 * @var array|bool
	 */
	protected $whitelist = false;

	/**
	 * @param string $name
	 * @param array|string $options Option map; a plain string is treated as the 'whitelist' option.
	 */
	public function __construct($name = null, $options = array()) {
		if(is_string($options)) {
			$options = array('whitelist' => $options);
		}

		// Constructors have no meaningful return value, so the parent call is not returned
		parent::__construct($name, $options);
	}

	/**
	 * @param array $options
	 *
	 * Options accepted in addition to those provided by Text:
	 *
	 *   - shortcodes: If true, shortcodes will be turned into the appropriate HTML.
	 *                 If false, shortcodes will not be processed.
	 *
	 *   - whitelist: If provided, a comma-separated list of elements that will be allowed to be stored
	 *                (be careful on relying on this for XSS protection - some seemingly-safe elements allow
	 *                attributes that can be exploited, for instance <img onload="exploiting_code();" src="..." />)
	 *                Text nodes outside of HTML tags are filtered out by default, but may be included by adding
	 *                the text() directive. E.g. 'link,meta,text()' will allow only <link /> <meta /> and text at
	 *                the root level.
	 *                An array of element names is also accepted. An empty list disables whitelist filtering.
	 */
	public function setOptions(array $options = array()) {
		parent::setOptions($options);

		if(array_key_exists("shortcodes", $options)) {
			$this->processShortcodes = (bool)$options["shortcodes"];
		}

		if(array_key_exists("whitelist", $options)) {
			$whitelist = $options['whitelist'];

			if(!is_array($whitelist)) {
				// Split on commas, tolerate whitespace around each entry and drop empty entries,
				// so '' or a trailing comma can never produce an empty tag name in the XPath query
				$whitelist = array_values(array_filter(
					array_map('trim', explode(',', (string)$whitelist)),
					'strlen'
				));
			}

			// An empty list means "no whitelist"; keep the property's documented false sentinel
			$this->whitelist = $whitelist ?: false;
		}
	}

	/**
	 * Create a summary of the content. This will be some section of the first paragraph, limited by
	 * $maxWords. All internal tags are stripped out - the return value is a string
	 *
	 * This is sort of the HTML aware equivalent to Text#Summary, although the logic for summarising is not exactly
	 * the same
	 *
	 * @param int $maxWords Maximum number of words to return - may return less, but never more. Pass -1 for no limit
	 * @param int $flex Number of words to search through when looking for a nice cut point
	 * @param string $add What to add to the end of the summary if we cut at a less-than-ideal cut point
	 * @return string A nice(ish) summary with no html tags (but possibly still some html entities)
	 *
	 * @see framework/core/model/fieldtypes/Text#Summary($maxWords)
	 */
	public function Summary($maxWords = 50, $flex = 15, $add = '...') {
		$str = false;

		/* First we need the text of the first paragraph, without tags. Try using DOM + SimpleXML first */
		if (class_exists('SimpleXMLElement') && class_exists('DOMDocument')) {
			$doc = new DOMDocument();

			// Nonbreaking spaces get converted into weird characters, so strip them.
			// This is evaluated BEFORE the throwing error handler is installed, so a warning raised while
			// parsing shortcodes is not turned into an uncaught exception that skips restore_error_handler()
			$value = str_replace('&nbsp;', ' ', $this->RAW());

			// Catch warnings thrown by loadHTML and turn them into a failure boolean rather than a SilverStripe error
			set_error_handler(function($no, $message) {
				throw new Exception("HTML Parse Error: " . $message);
			}, E_ALL);
			try {
				$res = $doc->loadHTML('<meta content="text/html; charset=utf-8" http-equiv="Content-type"/>' . $value);
			}
			catch (Exception $e) { $res = false; }
			restore_error_handler();

			if ($res) {
				$xml = simplexml_import_dom($doc);
				// simplexml_import_dom() can fail; only query when an element was returned
				if ($xml) {
					$paragraphs = $xml->xpath('//p');
					if (!empty($paragraphs)) $str = strip_tags($paragraphs[0]->asXML());
				}
			}
		}

		/* If that failed, most likely the passed HTML is broken. use a simple regex + a custom more brutal strip_tags.
		 * We don't use strip_tags because that does very badly on broken HTML.
		 * The check is explicit: only "no paragraph found" (false) or an empty paragraph triggers the fallback,
		 * so a paragraph whose text is "0" is kept instead of being mistaken for a failure. */
		if ($str === false || $str === '') {
			/* See if we can pull a paragraph out*/

			// Strip out any images in case there's one at the beginning. Not doing this will return a blank paragraph
			$str = preg_replace('{^\s*(<.+?>)*<img[^>]*>}u', '', $this->value);
			if (preg_match('{<p(\s[^<>]*)?>(.*[A-Za-z]+.*)</p>}u', $str, $matches)) $str = $matches[2];

			/* If _that_ failed, just use the whole text */
			if (!$str) $str = $this->value;

			/* Now pull out all the html-alike stuff */
			/* Take out anything that is obviously a tag */
			$str = preg_replace('{</?[a-zA-Z]+[^<>]*>}', '', $str);
			/* Strip out any left over looking bits. Textual < or > should already be encoded to &lt; or &gt; */
			$str = preg_replace('{</|<|>}', '', $str);
		}

		/* Now split into words. If we are under the maxWords limit, just return the whole string (re-implode for
		 * whitespace normalization) */
		$words = preg_split('/\s+/u', $str);
		if ($maxWords == -1 || count($words) <= $maxWords) return implode(' ', $words);

		/* Otherwise work backwards for a looking for a sentence ending (we try to avoid abbreviations, but aren't
		 * very good at it) */
		for ($i = $maxWords; $i >= $maxWords - $flex && $i >= 0; $i--) {
			if (preg_match('/\.$/', $words[$i]) && !preg_match('/(Dr|Mr|Mrs|Ms|Miss|Sr|Jr|No)\.$/i', $words[$i])) {
				return implode(' ', array_slice($words, 0, $i+1));
			}
		}

		// If we didn't find a sentence ending quickly enough, just cut at the maxWords point and add '...' to the end
		return implode(' ', array_slice($words, 0, $maxWords)) . $add;
	}

	/**
	 * Returns the first sentence from the first paragraph. If it can't figure out what the first paragraph is (or
	 * there isn't one), it returns the same as Summary()
	 *
	 * This is the HTML aware equivalent to Text#FirstSentence
	 *
	 * @see framework/core/model/fieldtypes/Text#FirstSentence()
	 * @return string
	 */
	public function FirstSentence() {
		/* Use summary's html processing logic to get the first paragraph */
		$paragraph = $this->Summary(-1);

		/* Then look for the first sentence ending. We could probably use a nice regex, but for now this will do */
		$words = preg_split('/\s+/u', $paragraph);
		foreach ($words as $i => $word) {
			if (preg_match('/(!|\?|\.)$/', $word) && !preg_match('/(Dr|Mr|Mrs|Ms|Miss|Sr|Jr|No)\.$/i', $word)) {
				return implode(' ', array_slice($words, 0, $i+1));
			}
		}

		/* If we didn't find a sentence ending, use the summary. We re-call rather than using paragraph so that
		 * Summary will limit the result this time */
		return $this->Summary();
	}

	/**
	 * Return the stored value, run through the active shortcode parser when shortcode processing is enabled.
	 *
	 * @return string
	 */
	public function RAW() {
		if ($this->processShortcodes) {
			return ShortcodeParser::get_active()->parse($this->value);
		}

		return $this->value;
	}

	/**
	 * Return the value of the field with relative links converted to absolute urls (with placeholders parsed).
	 * @return string
	 */
	public function AbsoluteLinks() {
		return HTTP::absoluteURLs($this->forTemplate());
	}

	/**
	 * Value used when the field is rendered in a template; identical to {@link RAW()}.
	 *
	 * @return string
	 */
	public function forTemplate() {
		return $this->RAW();
	}

	/**
	 * Apply the whitelist filter to the value before handing it to the parent implementation.
	 *
	 * @param string $value
	 * @return mixed Whatever the parent prepValueForDB() returns for the filtered value
	 */
	public function prepValueForDB($value) {
		return parent::prepValueForDB($this->whitelistContent($value));
	}

	/**
	 * Filter the given $value string through the whitelist filter
	 *
	 * @param string $value Input html content
	 * @return string Value with all non-whitelisted content stripped (if applicable)
	 */
	public function whitelistContent($value) {
		if(!$this->whitelist) {
			return $value;
		}

		$dom = Injector::inst()->create('HTMLValue', $value);

		$query = array();
		$textFilter = ' | //body/text()';
		foreach ($this->whitelist as $tag) {
			if($tag === 'text()') {
				$textFilter = ''; // Disable text filter if allowed
			} else {
				$query[] = 'not(self::'.$tag.')';
			}
		}

		// With no element names whitelisted (e.g. only 'text()'), every element is disallowed.
		// An empty predicate "[]" would be invalid XPath, so the predicate is omitted in that case.
		$predicate = $query ? '['.implode(' and ', $query).']' : '';

		foreach($dom->query('//body//*'.$predicate.$textFilter) as $el) {
			if ($el->parentNode) $el->parentNode->removeChild($el);
		}

		return $dom->getContent();
	}

	/**
	 * Returns true if the field has meaningful content.
	 * Excludes null content like <h1></h1>, <p></p> ,etc
	 *
	 * @return boolean
	 */
	public function exists() {
		$value = $this->value;

		// isPopulated() is not defined in this class - presumably inherited; it gates everything below
		if (!$this->isPopulated($value)) {
			return false;
		}

		// If it's got a content tag
		if(preg_match('/<(img|embed|object|iframe|meta|source|link)[^>]*>/i', $value)) {
			return true;
		}

		// If it's just one or two tags on its own (and not the above) it's empty.
		// This might be <p></p> or <h1></h1> or whatever.
		if(preg_match('/^[\\s]*(<[^>]+>[\\s]*){1,2}$/u', $value)) {
			return false;
		}

		// Otherwise its content is genuine content
		return true;
	}

	/**
	 * Build the default editing field for this column.
	 *
	 * @param string $title
	 * @param array $params Unused here
	 * @return HtmlEditorField
	 */
	public function scaffoldFormField($title = null, $params = null) {
		return new HtmlEditorField($this->name, $title);
	}

	/**
	 * Build the default search field for this column.
	 *
	 * @param string $title
	 * @param array $params Unused here
	 * @return TextField
	 */
	public function scaffoldSearchField($title = null, $params = null) {
		return new TextField($this->name, $title);
	}

}




1			<?php
2			/**
3			* Represents a large text field that contains HTML content.
4			* This behaves similarly to {@link Text}, but the template processor won't escape any HTML content within it.
5			*
6			* @see HTMLVarchar
7			* @see Text
8			* @see Varchar
9			*
10			* @package framework
11			* @subpackage model
12			*/
13			class HTMLText extends Text {
14			private static $escape_type = 'xml';
15
16			private static $casting = array(
17			"AbsoluteLinks" => "HTMLText",
18			"BigSummary" => "HTMLText",
19			"ContextSummary" => "HTMLText",
20			"FirstParagraph" => "HTMLText",
21			"FirstSentence" => "HTMLText",
22			"LimitCharacters" => "HTMLText",
23			"LimitSentences" => "HTMLText",
24			"Lower" => "HTMLText",
25			"LowerCase" => "HTMLText",
26			"Summary" => "HTMLText",
27			"Upper" => "HTMLText",
28			"UpperCase" => "HTMLText",
29			'EscapeXML' => 'HTMLText',
30			'LimitWordCount' => 'HTMLText',
31			'LimitWordCountXML' => 'HTMLText',
32			'NoHTML' => 'Text',
33			);
34
35			protected $processShortcodes = true;
36
37			/**
38			* Check if shortcodes are enabled
39			*
40			* @return bool
41			*/
42			public function getProcessShortcodes() {
43			return $this->processShortcodes;
44			}
45
46			/**
47			* Set shortcodes on or off by default
48			*
49			* @param bool $process
50			* @return $this
51			*/
52			public function setProcessShortcodes($process) {
53			$this->processShortcodes = (bool)$process;
54			return $this;
55			}
56
57			protected $whitelist = false;
58
59			public function __construct($name = null, $options = array()) {
60			if(is_string($options)) {
61			$options = array('whitelist' => $options);
62			}
63
64			return parent::__construct($name, $options);
			0 ignored issues – show Bug introduced 2016-09-13 22:09 UTC by Report Bug Copy Issue Report Constructors do not have meaningful return values, anything that is returned from here is discarded. Are you sure this is correct? Loading history...
65			}
66
67			/**
68			* @param array $options
69			*
70			* Options accepted in addition to those provided by Text:
71			*
72			* - shortcodes: If true, shortcodes will be turned into the appropriate HTML.
73			* If false, shortcodes will not be processed.
74			*
75			* - whitelist: If provided, a comma-separated list of elements that will be allowed to be stored
76			* (be careful on relying on this for XSS protection - some seemingly-safe elements allow
77			* attributes that can be exploited, for instance <img onload="exploiting_code();" src="..." />)
78			* Text nodes outside of HTML tags are filtered out by default, but may be included by adding
79			* the text() directive. E.g. 'link,meta,text()' will allow only <link /> <meta /> and text at
80			* the root level.
81			*/
82			public function setOptions(array $options = array()) {
83			parent::setOptions($options);
84
85			if(array_key_exists("shortcodes", $options)) {
86			$this->processShortcodes = !!$options["shortcodes"];
87			}
88
89			if(array_key_exists("whitelist", $options)) {
90			if(is_array($options['whitelist'])) {
91			$this->whitelist = $options['whitelist'];
			0 ignored issues – show Documentation Bug introduced 2016-06-23 23:55 UTC by Report Bug Copy Issue Report It seems like `$options['whitelist']` of type `array` is incompatible with the declared type `boolean` of property `$whitelist`. Our type inference engine has found an assignment to a property that is incompatible with the declared type of that property. Either this assignment is in error or the assigned type should be added to the documentation/type hint for that property. Loading history...
92			}
93			else {
94			$this->whitelist = preg_split('/,\s*/', $options['whitelist']);
			0 ignored issues – show Documentation Bug introduced 2016-06-23 23:55 UTC by Report Bug Copy Issue Report It seems like `preg_split('/,\\s*/', $options['whitelist'])` of type `array` is incompatible with the declared type `boolean` of property `$whitelist`. Our type inference engine has found an assignment to a property that is incompatible with the declared type of that property. Either this assignment is in error or the assigned type should be added to the documentation/type hint for that property. Loading history...
95			}
96			}
97			}
98
99			/**
100			* Create a summary of the content. This will be some section of the first paragraph, limited by
101			* $maxWords. All internal tags are stripped out - the return value is a string
102			*
103			* This is sort of the HTML aware equivilent to Text#Summary, although the logic for summarising is not exactly
104			* the same
105			*
106			* @param int $maxWords Maximum number of words to return - may return less, but never more. Pass -1 for no limit
107			* @param int $flex Number of words to search through when looking for a nice cut point
108			* @param string $add What to add to the end of the summary if we cut at a less-than-ideal cut point
109			* @return string A nice(ish) summary with no html tags (but possibly still some html entities)
110			*
111			* @see framework/core/model/fieldtypes/Text#Summary($maxWords)
112			*/
113			public function Summary($maxWords = 50, $flex = 15, $add = '...') {
114			$str = false;
115
116			/* First we need the text of the first paragraph, without tags. Try using SimpleXML first */
117			if (class_exists('SimpleXMLElement')) {
118			$doc = new DOMDocument();
119
120			// Catch warnings thrown by loadHTML and turn them into a failure boolean rather than a SilverStripe error
121			set_error_handler(function($no, $str) {
122			throw new Exception("HTML Parse Error: " . $str);
123			}, E_ALL);
124			// Nonbreaking spaces get converted into weird characters, so strip them
125			$value = str_replace(' ', ' ', $this->RAW());
126			try {
127			$res = $doc->loadHTML('<meta content="text/html; charset=utf-8" http-equiv="Content-type"/>' . $value);
128			}
129			catch (Exception $e) { $res = false; }
130			restore_error_handler();
131
132			if ($res) {
133			$xml = simplexml_import_dom($doc);
134			$res = $xml->xpath('//p');
135			if (!empty($res)) $str = strip_tags($res[0]->asXML());
136			}
137			}
138
139			/* If that failed, most likely the passed HTML is broken. use a simple regex + a custom more brutal strip_tags.
140			* We don't use strip_tags because that does very badly on broken HTML */
141			if (!$str) {
			0 ignored issues – show Bug Best Practice introduced 2016-06-23 23:55 UTC by Report Bug Copy Issue Report The expression `$str` of type `string|false` is loosely compared to `false`; this is ambiguous if the string can be empty. You might want to explicitly use `=== false` instead. In PHP, under loose comparison (like `==`, or `!=`, or `switch` conditions), values of different types might be equal. For `string` values, the empty string `''` is a special case; in particular the following results might be unexpected: `'' == false // true`; `'' == null // true`; `'ab' == false // false`; `'ab' == null // false`. It is often better to use strict comparison: `'' === false // false`; `'' === null // false`. Loading history...
142			/* See if we can pull a paragraph out*/
143
144			// Strip out any images in case there's one at the beginning. Not doing this will return a blank paragraph
145			$str = preg_replace('{^\s(<.+?>)<img[^>]*>}u', '', $this->value);
146			if (preg_match('{<p(\s[^<>])?>(.[A-Za-z]+.*)</p>}u', $str, $matches)) $str = $matches[2];
147
148			/* If _that_ failed, just use the whole text */
149			if (!$str) $str = $this->value;
150
151			/* Now pull out all the html-alike stuff */
152			/* Take out anything that is obviously a tag */
153			$str = preg_replace('{</?[a-zA-Z]+[^<>]*>}', '', $str);
154			/* Strip out any left over looking bits. Textual < or > should already be encoded to < or > */
155			$str = preg_replace('{</\|<\|>}', '', $str);
156			}
157
158			/* Now split into words. If we are under the maxWords limit, just return the whole string (re-implode for
159			* whitespace normalization) */
160			$words = preg_split('/\s+/u', $str);
161			if ($maxWords == -1 \|\| count($words) <= $maxWords) return implode(' ', $words);
162
163			/* Otherwise work backwards for a looking for a sentence ending (we try to avoid abbreviations, but aren't
164			* very good at it) */
165			for ($i = $maxWords; $i >= $maxWords - $flex && $i >= 0; $i--) {
166			if (preg_match('/\.$/', $words[$i]) && !preg_match('/(Dr\|Mr\|Mrs\|Ms\|Miss\|Sr\|Jr\|No)\.$/i', $words[$i])) {
167			return implode(' ', array_slice($words, 0, $i+1));
168			}
169			}
170
171			// If we didn't find a sentence ending quickly enough, just cut at the maxWords point and add '...' to the end
172			return implode(' ', array_slice($words, 0, $maxWords)) . $add;
173			}
174
175			/**
176			* Returns the first sentence from the first paragraph. If it can't figure out what the first paragraph is (or
177			* there isn't one), it returns the same as Summary()
178			*
179			* This is the HTML aware equivilent to Text#FirstSentence
180			*
181			* @see framework/core/model/fieldtypes/Text#FirstSentence()
182			*/
183			public function FirstSentence() {
184			/* Use summary's html processing logic to get the first paragraph */
185			$paragraph = $this->Summary(-1);
186
187			/* Then look for the first sentence ending. We could probably use a nice regex, but for now this will do */
188			$words = preg_split('/\s+/u', $paragraph);
189			foreach ($words as $i => $word) {
190			if (preg_match('/(!\|\?\|\.)$/', $word) && !preg_match('/(Dr\|Mr\|Mrs\|Ms\|Miss\|Sr\|Jr\|No)\.$/i', $word)) {
191			return implode(' ', array_slice($words, 0, $i+1));
192			}
193			}
194
195			/* If we didn't find a sentence ending, use the summary. We re-call rather than using paragraph so that
196			* Summary will limit the result this time */
197			return $this->Summary();
198			}
199
200			public function RAW() {
201			if ($this->processShortcodes) {
202			return ShortcodeParser::get_active()->parse($this->value);
203			}
204			else {
205			return $this->value;
206			}
207			}
208
209			/**
210			* Return the value of the field with relative links converted to absolute urls (with placeholders parsed).
211			* @return string
212			*/
213			public function AbsoluteLinks() {
214			return HTTP::absoluteURLs($this->forTemplate());
215			}
216
217			public function forTemplate() {
218			return $this->RAW();
219			}
220
221			public function prepValueForDB($value) {
222			return parent::prepValueForDB($this->whitelistContent($value));
223			}
224
225			/**
226			* Filter the given $value string through the whitelist filter
227			*
228			* @param string $value Input html content
229			* @return string Value with all non-whitelisted content stripped (if applicable)
230			*/
231			public function whitelistContent($value) {
232			if($this->whitelist) {
233			$dom = Injector::inst()->create('HTMLValue', $value);
234
235			$query = array();
236			$textFilter = ' \| //body/text()';
237			foreach ($this->whitelist as $tag) {
			0 ignored issues – show Bug introduced 2016-06-23 23:55 UTC by Report Bug Copy Issue Report The expression `$this->whitelist` of type `boolean` is not traversable. Loading history...
238			if($tag === 'text()') {
239			$textFilter = ''; // Disable text filter if allowed
240			} else {
241			$query[] = 'not(self::'.$tag.')';
242			}
243			}
244
245			foreach($dom->query('//body//*['.implode(' and ', $query).']'.$textFilter) as $el) {
246			if ($el->parentNode) $el->parentNode->removeChild($el);
247			}
248
249			$value = $dom->getContent();
250			}
251			return $value;
252			}
253
254			/**
255			* Returns true if the field has meaningful content.
256			* Excludes null content like <h1></h1>, <p></p> ,etc
257			*
258			* @return boolean
259			*/
260			public function exists() {
261			$value = $this->value;
262
263			if (!$this->isPopulated($value)) {
264			return false;
265			}
266
267			// If it's got a content tag
268			if(preg_match('/<(img\|embed\|object\|iframe\|meta\|source\|link)[^>]*>/i', $value)) {
269			return true;
270			}
271
272			// If it's just one or two tags on its own (and not the above) it's empty.
273			// This might be <p></p> or <h1></h1> or whatever.
274			if(preg_match('/^[\\s](<[^>]+>[\\s]){1,2}$/u', $value)) {
275			return false;
276			}
277
278			// Otherwise its content is genuine content
279			return true;
280			}
281
282			public function scaffoldFormField($title = null, $params = null) {
283			return new HtmlEditorField($this->name, $title);
284			}
285
286			public function scaffoldSearchField($title = null, $params = null) {
287			return new TextField($this->name, $title);
288			}
289
290			}
291
292
293

silverstripe / silverstripe-framework

Push — 3 ( aecac8...3139b2 )

HTMLText::exists() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like