Duplicate code is one of the most pungent code smells. A commonly used rule of thumb is to restructure code once it is duplicated in three or more places.
Common duplication problems and their corresponding solutions are:
Complex classes like HTMLText often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use HTMLText, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
13 | class HTMLText extends Text { |
||
14 | private static $escape_type = 'xml'; |
||
|
|||
15 | |||
16 | private static $casting = array( |
||
17 | "AbsoluteLinks" => "HTMLText", |
||
18 | "BigSummary" => "HTMLText", |
||
19 | "ContextSummary" => "HTMLText", |
||
20 | "FirstParagraph" => "HTMLText", |
||
21 | "FirstSentence" => "HTMLText", |
||
22 | "LimitCharacters" => "HTMLText", |
||
23 | "LimitSentences" => "HTMLText", |
||
24 | "Lower" => "HTMLText", |
||
25 | "LowerCase" => "HTMLText", |
||
26 | "Summary" => "HTMLText", |
||
27 | "Upper" => "HTMLText", |
||
28 | "UpperCase" => "HTMLText", |
||
29 | 'EscapeXML' => 'HTMLText', |
||
30 | 'LimitWordCount' => 'HTMLText', |
||
31 | 'LimitWordCountXML' => 'HTMLText', |
||
32 | 'NoHTML' => 'Text', |
||
33 | ); |
||
34 | |||
35 | protected $processShortcodes = true; |
||
36 | |||
37 | protected $whitelist = false; |
||
38 | |||
39 | public function __construct($name = null, $options = array()) { |
||
40 | if(is_string($options)) { |
||
41 | $options = array('whitelist' => $options); |
||
42 | } |
||
43 | |||
44 | return parent::__construct($name, $options); |
||
45 | } |
||
46 | |||
47 | /** |
||
48 | * @param array $options |
||
49 | * |
||
50 | * Options accepted in addition to those provided by Text: |
||
51 | * |
||
52 | * - shortcodes: If true, shortcodes will be turned into the appropriate HTML. |
||
53 | * If false, shortcodes will not be processed. |
||
54 | * |
||
55 | * - whitelist: If provided, a comma-separated list of elements that will be allowed to be stored |
||
56 | * (be careful about relying on this for XSS protection - some seemingly-safe elements allow |
||
57 | * attributes that can be exploited, for instance <img onload="exploiting_code();" src="..." />) |
||
58 | * Text nodes outside of HTML tags are filtered out by default, but may be included by adding |
||
59 | * the text() directive. E.g. 'link,meta,text()' will allow only <link /> <meta /> and text at |
||
60 | * the root level. |
||
61 | */ |
||
62 | public function setOptions(array $options = array()) { |
||
78 | |||
79 | /** |
||
80 | * Create a summary of the content. This will be some section of the first paragraph, limited by |
||
81 | * $maxWords. All internal tags are stripped out - the return value is a string |
||
82 | * |
||
83 | * This is sort of the HTML aware equivalent to Text#Summary, although the logic for summarising is not exactly |
||
84 | * the same |
||
85 | * |
||
86 | * @param int $maxWords Maximum number of words to return - may return less, but never more. Pass -1 for no limit |
||
87 | * @param int $flex Number of words to search through when looking for a nice cut point |
||
88 | * @param string $add What to add to the end of the summary if we cut at a less-than-ideal cut point |
||
89 | * @return string A nice(ish) summary with no html tags (but possibly still some html entities) |
||
90 | * |
||
91 | * @see framework/core/model/fieldtypes/Text#Summary($maxWords) |
||
92 | */ |
||
93 | public function Summary($maxWords = 50, $flex = 15, $add = '...') { |
||
94 | $str = false; |
||
95 | |||
96 | /* First we need the text of the first paragraph, without tags. Try using SimpleXML first */ |
||
97 | if (class_exists('SimpleXMLElement')) { |
||
98 | $doc = new DOMDocument(); |
||
99 | |||
100 | // Catch warnings thrown by loadHTML and turn them into a failure boolean rather than a SilverStripe error |
||
101 | set_error_handler(create_function('$no, $str', 'throw new Exception("HTML Parse Error: ".$str);'), E_ALL); |
||
102 | // Nonbreaking spaces get converted into weird characters, so strip them |
||
103 | $value = str_replace('&nbsp;', ' ', $this->value); |
||
104 | try { |
||
105 | $res = $doc->loadHTML('<meta content="text/html; charset=utf-8" http-equiv="Content-type"/>' . $value); |
||
106 | } |
||
107 | catch (Exception $e) { $res = false; } |
||
108 | restore_error_handler(); |
||
109 | |||
110 | if ($res) { |
||
111 | $xml = simplexml_import_dom($doc); |
||
112 | $res = $xml->xpath('//p'); |
||
113 | if (!empty($res)) $str = strip_tags($res[0]->asXML()); |
||
114 | } |
||
115 | } |
||
116 | |||
117 | /* If that failed, most likely the passed HTML is broken. Use a simple regex + a custom more brutal strip_tags. |
||
118 | * We don't use strip_tags because that does very badly on broken HTML */ |
||
119 | if (!$str) { |
||
120 | /* See if we can pull a paragraph out*/ |
||
121 | |||
122 | // Strip out any images in case there's one at the beginning. Not doing this will return a blank paragraph |
||
123 | $str = preg_replace('{^\s*(<.+?>)*<img[^>]*>}', '', $this->value); |
||
124 | if (preg_match('{<p(\s[^<>]*)?>(.*[A-Za-z]+.*)</p>}', $str, $matches)) $str = $matches[2]; |
||
125 | |||
126 | /* If _that_ failed, just use the whole text */ |
||
127 | if (!$str) $str = $this->value; |
||
128 | |||
129 | /* Now pull out all the html-alike stuff */ |
||
130 | /* Take out anything that is obviously a tag */ |
||
131 | $str = preg_replace('{</?[a-zA-Z]+[^<>]*>}', '', $str); |
||
132 | /* Strip out any left over looking bits. Textual < or > should already be encoded to &lt; or &gt; */ |
||
133 | $str = preg_replace('{</|<|>}', '', $str); |
||
134 | } |
||
135 | |||
136 | /* Now split into words. If we are under the maxWords limit, just return the whole string (re-implode for |
||
137 | * whitespace normalization) */ |
||
138 | $words = preg_split('/\s+/', $str); |
||
139 | if ($maxWords == -1 || count($words) <= $maxWords) return implode(' ', $words); |
||
140 | |||
141 | /* Otherwise work backwards looking for a sentence ending (we try to avoid abbreviations, but aren't |
||
142 | * very good at it) */ |
||
143 | for ($i = $maxWords; $i >= $maxWords - $flex && $i >= 0; $i--) { |
||
144 | if (preg_match('/\.$/', $words[$i]) && !preg_match('/(Dr|Mr|Mrs|Ms|Miss|Sr|Jr|No)\.$/i', $words[$i])) { |
||
145 | return implode(' ', array_slice($words, 0, $i+1)); |
||
146 | } |
||
147 | } |
||
148 | |||
149 | // If we didn't find a sentence ending quickly enough, just cut at the maxWords point and add '...' to the end |
||
150 | return implode(' ', array_slice($words, 0, $maxWords)) . $add; |
||
151 | } |
||
152 | |||
153 | /** |
||
154 | * Returns the first sentence from the first paragraph. If it can't figure out what the first paragraph is (or |
||
155 | * there isn't one), it returns the same as Summary() |
||
156 | * |
||
157 | * This is the HTML aware equivalent to Text#FirstSentence |
||
158 | * |
||
159 | * @see framework/core/model/fieldtypes/Text#FirstSentence() |
||
160 | */ |
||
161 | public function FirstSentence() { |
||
177 | |||
178 | /** |
||
179 | * Return the value of the field with relative links converted to absolute urls (with placeholders parsed). |
||
180 | * @return string |
||
181 | */ |
||
182 | public function AbsoluteLinks() { |
||
185 | |||
186 | public function forTemplate() { |
||
187 | if ($this->processShortcodes) { |
||
188 | return ShortcodeParser::get_active()->parse($this->value); |
||
189 | } |
||
190 | else { |
||
191 | return $this->value; |
||
192 | } |
||
193 | } |
||
194 | |||
195 | public function prepValueForDB($value) { |
||
198 | |||
199 | /** |
||
200 | * Filter the given $value string through the whitelist filter |
||
201 | * |
||
202 | * @param string $value Input html content |
||
203 | * @return string Value with all non-whitelisted content stripped (if applicable) |
||
204 | */ |
||
205 | public function whitelistContent($value) { |
||
206 | if($this->whitelist) { |
||
207 | $dom = Injector::inst()->create('HTMLValue', $value); |
||
208 | |||
209 | $query = array(); |
||
210 | $textFilter = ' | //body/text()'; |
||
211 | foreach ($this->whitelist as $tag) { |
||
212 | if($tag === 'text()') { |
||
213 | $textFilter = ''; // Disable text filter if allowed |
||
214 | } else { |
||
215 | $query[] = 'not(self::'.$tag.')'; |
||
216 | } |
||
217 | } |
||
218 | |||
219 | foreach($dom->query('//body//*['.implode(' and ', $query).']'.$textFilter) as $el) { |
||
220 | if ($el->parentNode) $el->parentNode->removeChild($el); |
||
221 | } |
||
222 | |||
223 | $value = $dom->getContent(); |
||
224 | } |
||
225 | return $value; |
||
226 | } |
||
227 | |||
228 | /** |
||
229 | * Returns true if the field has meaningful content. |
||
230 | * Excludes null content like <h1></h1>, <p></p>, etc. |
||
231 | * |
||
232 | * @return boolean |
||
233 | */ |
||
234 | public function exists() { |
||
254 | |||
255 | public function scaffoldFormField($title = null, $params = null) { |
||
258 | |||
259 | public function scaffoldSearchField($title = null, $params = null) { |
||
262 | |||
263 | } |
||
264 | |||
265 | |||
266 |