Completed
Push — 3 ( aecac8...3139b2 )
by Daniel
17:56
created

HTMLText::exists()   A

Complexity

Conditions 4
Paths 4

Size

Total Lines 21
Code Lines 9

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 4
eloc 9
nc 4
nop 0
dl 0
loc 21
rs 9.0534
c 0
b 0
f 0
1
<?php
2
/**
 * Represents a large text field that contains HTML content.
 * This behaves similarly to {@link Text}, but the template processor won't escape any HTML content within it.
 *
 * @see HTMLVarchar
 * @see Text
 * @see Varchar
 *
 * @package framework
 * @subpackage model
 */
class HTMLText extends Text {
	private static $escape_type = 'xml';

	private static $casting = array(
		"AbsoluteLinks" => "HTMLText",
		"BigSummary" => "HTMLText",
		"ContextSummary" => "HTMLText",
		"FirstParagraph" => "HTMLText",
		"FirstSentence" => "HTMLText",
		"LimitCharacters" => "HTMLText",
		"LimitSentences" => "HTMLText",
		"Lower" => "HTMLText",
		"LowerCase" => "HTMLText",
		"Summary" => "HTMLText",
		"Upper" => "HTMLText",
		"UpperCase" => "HTMLText",
		'EscapeXML' => 'HTMLText',
		'LimitWordCount' => 'HTMLText',
		'LimitWordCountXML' => 'HTMLText',
		'NoHTML' => 'Text',
	);

	/**
	 * Whether shortcodes in the value are parsed by {@link RAW()}.
	 *
	 * @var bool
	 */
	protected $processShortcodes = true;

	/**
	 * List of allowed element names (optionally including the 'text()' directive) applied by
	 * {@link whitelistContent()}, or false / an empty array when no whitelist filtering is done.
	 *
	 * @var array|bool
	 */
	protected $whitelist = false;

	/**
	 * Check if shortcodes are enabled
	 *
	 * @return bool
	 */
	public function getProcessShortcodes() {
		return $this->processShortcodes;
	}

	/**
	 * Set shortcodes on or off by default
	 *
	 * @param bool $process
	 * @return $this
	 */
	public function setProcessShortcodes($process) {
		$this->processShortcodes = (bool)$process;
		return $this;
	}

	/**
	 * @param string $name Passed through unchanged to the parent constructor
	 * @param array|string $options Either an options array (see {@link setOptions()}), or a string which is
	 *                              treated as shorthand for array('whitelist' => $options)
	 */
	public function __construct($name = null, $options = array()) {
		if(is_string($options)) {
			$options = array('whitelist' => $options);
		}

		// Constructors have no meaningful return value, so the parent call is not returned
		parent::__construct($name, $options);
	}

	/**
	 * @param array $options
	 *
	 * Options accepted in addition to those provided by Text:
	 *
	 *   - shortcodes: If true, shortcodes will be turned into the appropriate HTML.
	 *                 If false, shortcodes will not be processed.
	 *
	 *   - whitelist: If provided, an array or a comma-separated list of elements that will be allowed to be
	 *                stored (be careful on relying on this for XSS protection - some seemingly-safe elements
	 *                allow attributes that can be exploited, for instance
	 *                <img onload="exploiting_code();" src="..." />)
	 *                Text nodes outside of HTML tags are filtered out by default, but may be included by adding
	 *                the text() directive. E.g. 'link,meta,text()' will allow only <link /> <meta /> and text at
	 *                the root level.
	 */
	public function setOptions(array $options = array()) {
		parent::setOptions($options);

		if(array_key_exists("shortcodes", $options)) {
			$this->processShortcodes = (bool)$options["shortcodes"];
		}

		if(array_key_exists("whitelist", $options)) {
			$whitelist = $options['whitelist'];
			if(!is_array($whitelist)) {
				// Drop surrounding whitespace and empty entries: an empty tag name would otherwise
				// produce an invalid XPath expression in whitelistContent()
				$whitelist = preg_split('/\s*,\s*/', trim((string)$whitelist), -1, PREG_SPLIT_NO_EMPTY);
			}
			$this->whitelist = $whitelist;
		}
	}

	/**
	 * Split a string into words on runs of whitespace.
	 *
	 * @param string $text
	 * @return array List of words; may contain empty strings if $text starts or ends with whitespace
	 */
	protected function splitIntoWords($text) {
		$text = (string)$text;
		$words = preg_split('/\s+/u', $text);

		// With the "u" modifier preg_split() returns false on malformed UTF-8, so retry byte-wise
		if ($words === false) {
			$words = preg_split('/\s+/', $text);
		}

		return $words;
	}

	/**
	 * Create a summary of the content. This will be some section of the first paragraph, limited by
	 * $maxWords. All internal tags are stripped out - the return value is a string
	 *
	 * This is sort of the HTML aware equivalent to Text#Summary, although the logic for summarising is not exactly
	 * the same
	 *
	 * @param int $maxWords Maximum number of words to return - may return less, but never more. Pass -1 for no limit
	 * @param int $flex Number of words to search through when looking for a nice cut point
	 * @param string $add What to add to the end of the summary if we cut at a less-than-ideal cut point
	 * @return string A nice(ish) summary with no html tags (but possibly still some html entities)
	 *
	 * @see framework/core/model/fieldtypes/Text#Summary($maxWords)
	 */
	public function Summary($maxWords = 50, $flex = 15, $add = '...') {
		$str = false;

		/* First we need the text of the first paragraph, without tags. Try a real HTML parser first */
		if (class_exists('DOMDocument') && class_exists('SimpleXMLElement')) {
			$doc = new DOMDocument();

			// Nonbreaking spaces get converted into weird characters, so strip them.
			// This is evaluated before the throwing error handler below is installed, so that a notice raised
			// while building the value cannot escape as an exception and leave that handler installed.
			$value = str_replace('&nbsp;', ' ', $this->RAW());

			// Catch warnings thrown by loadHTML and turn them into a failure boolean rather than a SilverStripe error
			set_error_handler(function($errno, $errstr) {
				throw new Exception("HTML Parse Error: " . $errstr);
			}, E_ALL);
			try {
				$res = $doc->loadHTML('<meta content="text/html; charset=utf-8" http-equiv="Content-type"/>' . $value);
			}
			catch (Exception $e) {
				$res = false;
			}
			restore_error_handler();

			if ($res) {
				$xml = simplexml_import_dom($doc);
				if ($xml instanceof SimpleXMLElement) {
					$paragraphs = $xml->xpath('//p');
					if (!empty($paragraphs)) $str = strip_tags($paragraphs[0]->asXML());
				}
			}
		}

		/* If that failed, most likely the passed HTML is broken. use a simple regex + a custom more brutal strip_tags.
		 * We don't use strip_tags because that does very badly on broken HTML */
		if ($str === false || $str === '') {
			/* See if we can pull a paragraph out*/

			// Strip out any images in case there's one at the beginning. Not doing this will return a blank paragraph
			$str = preg_replace('{^\s*(<.+?>)*<img[^>]*>}u', '', (string)$this->value);
			if ($str !== null && preg_match('{<p(\s[^<>]*)?>(.*[A-Za-z]+.*)</p>}u', $str, $matches)) {
				$str = $matches[2];
			}

			/* If _that_ failed (preg_replace returns null on error), just use the whole text */
			if ($str === null || $str === '') $str = (string)$this->value;

			/* Now pull out all the html-alike stuff */
			/* Take out anything that is obviously a tag */
			$str = preg_replace('{</?[a-zA-Z]+[^<>]*>}', '', $str);
			/* Strip out any left over looking bits. Textual < or > should already be encoded to &lt; or &gt; */
			$str = preg_replace('{</|<|>}', '', $str);
		}

		/* Now split into words. If we are under the maxWords limit, just return the whole string (re-implode for
		 * whitespace normalization) */
		$words = $this->splitIntoWords($str);
		if ($maxWords == -1 || count($words) <= $maxWords) return implode(' ', $words);

		/* Otherwise work backwards for a looking for a sentence ending (we try to avoid abbreviations, but aren't
		 * very good at it). The search starts at the last word that is still within the limit (index
		 * $maxWords - 1) so that the result never contains more than $maxWords words. */
		$abbreviation = '/(Dr|Mr|Mrs|Ms|Miss|Sr|Jr|No)\.$/i';
		for ($i = $maxWords - 1; $i >= $maxWords - $flex && $i >= 0; $i--) {
			if (preg_match('/\.$/', $words[$i]) && !preg_match($abbreviation, $words[$i])) {
				return implode(' ', array_slice($words, 0, $i + 1));
			}
		}

		// If we didn't find a sentence ending quickly enough, just cut at the maxWords point and add '...' to the end
		return implode(' ', array_slice($words, 0, $maxWords)) . $add;
	}

	/**
	 * Returns the first sentence from the first paragraph. If it can't figure out what the first paragraph is (or
	 * there isn't one), it returns the same as Summary()
	 *
	 * This is the HTML aware equivalent to Text#FirstSentence
	 *
	 * @return string
	 *
	 * @see framework/core/model/fieldtypes/Text#FirstSentence()
	 */
	public function FirstSentence() {
		/* Use summary's html processing logic to get the first paragraph */
		$paragraph = $this->Summary(-1);

		/* Then look for the first sentence ending. We could probably use a nice regex, but for now this will do */
		$words = $this->splitIntoWords($paragraph);
		foreach ($words as $i => $word) {
			if (preg_match('/(!|\?|\.)$/', $word) && !preg_match('/(Dr|Mr|Mrs|Ms|Miss|Sr|Jr|No)\.$/i', $word)) {
				return implode(' ', array_slice($words, 0, $i + 1));
			}
		}

		/* If we didn't find a sentence ending, use the summary. We re-call rather than using paragraph so that
		 * Summary will limit the result this time */
		return $this->Summary();
	}

	/**
	 * Return the field value, run through the active ShortcodeParser when shortcode processing is enabled.
	 *
	 * @return string
	 */
	public function RAW() {
		if ($this->processShortcodes) {
			return ShortcodeParser::get_active()->parse($this->value);
		}

		return $this->value;
	}

	/**
	 * Return the value of the field with relative links converted to absolute urls (with placeholders parsed).
	 * @return string
	 */
	public function AbsoluteLinks() {
		return HTTP::absoluteURLs($this->forTemplate());
	}

	/**
	 * Return the value used when this field is rendered; identical to {@link RAW()}.
	 *
	 * @return string
	 */
	public function forTemplate() {
		return $this->RAW();
	}

	/**
	 * Apply the whitelist filter to $value before handing it to the parent implementation.
	 *
	 * @param string $value
	 * @return mixed Whatever the parent prepValueForDB() returns
	 */
	public function prepValueForDB($value) {
		return parent::prepValueForDB($this->whitelistContent($value));
	}

	/**
	 * Filter the given $value string through the whitelist filter
	 *
	 * @param string $value Input html content
	 * @return string Value with all non-whitelisted content stripped (if applicable)
	 */
	public function whitelistContent($value) {
		if($this->whitelist) {
			$dom = Injector::inst()->create('HTMLValue', $value);

			$query = array();
			$textFilter = ' | //body/text()';
			foreach ($this->whitelist as $tag) {
				if($tag === 'text()') {
					$textFilter = ''; // Disable text filter if allowed
				} else {
					$query[] = 'not(self::'.$tag.')';
				}
			}

			// Select every element that is not whitelisted. If the whitelist names no elements at all
			// (e.g. only 'text()'), every element is selected; an empty "[]" predicate would be invalid XPath.
			$elementFilter = '//body//*';
			if($query) {
				$elementFilter .= '['.implode(' and ', $query).']';
			}

			foreach($dom->query($elementFilter.$textFilter) as $el) {
				if ($el->parentNode) $el->parentNode->removeChild($el);
			}

			$value = $dom->getContent();
		}
		return $value;
	}

	/**
	 * Returns true if the field has meaningful content.
	 * Excludes null content like <h1></h1>, <p></p> ,etc
	 *
	 * NOTE(review): isPopulated() is not defined in this class - presumably inherited; confirm.
	 *
	 * @return boolean
	 */
	public function exists() {
		$value = $this->value;

		if (!$this->isPopulated($value)) {
			return false;
		}

		// If it's got a content tag
		if(preg_match('/<(img|embed|object|iframe|meta|source|link)[^>]*>/i', $value)) {
			return true;
		}

		// If it's just one or two tags on its own (and not the above) it's empty.
		// This might be <p></p> or <h1></h1> or whatever.
		if(preg_match('/^[\\s]*(<[^>]+>[\\s]*){1,2}$/u', $value)) {
			return false;
		}

		// Otherwise its content is genuine content
		return true;
	}

	/**
	 * Build the form field used to edit this value: an HtmlEditorField named after this field.
	 *
	 * @param string $title Passed as the field title
	 * @param array $params Unused here
	 * @return HtmlEditorField
	 */
	public function scaffoldFormField($title = null, $params = null) {
		return new HtmlEditorField($this->name, $title);
	}

	/**
	 * Build the form field used to search on this value: a TextField named after this field.
	 *
	 * @param string $title Passed as the field title
	 * @param array $params Unused here
	 * @return TextField
	 */
	public function scaffoldSearchField($title = null, $params = null) {
		return new TextField($this->name, $title);
	}

}
291
292
293