Duplicate code is one of the most pungent code smells. A rule that is often used is to re-structure code once it is duplicated in three or more places.
Common duplication problems and their corresponding solutions are:
Complex classes like HTMLText often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields and methods that share the same prefixes or suffixes. You can also look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use HTMLText and, based on these observations, to apply Extract Interface as well.
| 1 | <?php |
||
| 13 | class HTMLText extends Text { |
||
| 14 | private static $escape_type = 'xml'; |
||
|
|
|||
| 15 | |||
| 16 | private static $casting = array( |
||
| 17 | "AbsoluteLinks" => "HTMLText", |
||
| 18 | "BigSummary" => "HTMLText", |
||
| 19 | "ContextSummary" => "HTMLText", |
||
| 20 | "FirstParagraph" => "HTMLText", |
||
| 21 | "FirstSentence" => "HTMLText", |
||
| 22 | "LimitCharacters" => "HTMLText", |
||
| 23 | "LimitSentences" => "HTMLText", |
||
| 24 | "Lower" => "HTMLText", |
||
| 25 | "LowerCase" => "HTMLText", |
||
| 26 | "Summary" => "HTMLText", |
||
| 27 | "Upper" => "HTMLText", |
||
| 28 | "UpperCase" => "HTMLText", |
||
| 29 | 'EscapeXML' => 'HTMLText', |
||
| 30 | 'LimitWordCount' => 'HTMLText', |
||
| 31 | 'LimitWordCountXML' => 'HTMLText', |
||
| 32 | 'NoHTML' => 'Text', |
||
| 33 | ); |
||
| 34 | |||
| 35 | protected $processShortcodes = true; |
||
| 36 | |||
| 37 | protected $whitelist = false; |
||
| 38 | |||
| 39 | public function __construct($name = null, $options = array()) { |
||
| 40 | if(is_string($options)) { |
||
| 41 | $options = array('whitelist' => $options); |
||
| 42 | } |
||
| 43 | |||
| 44 | return parent::__construct($name, $options); |
||
| 45 | } |
||
| 46 | |||
| 47 | /** |
||
| 48 | * @param array $options |
||
| 49 | * |
||
| 50 | * Options accepted in addition to those provided by Text: |
||
| 51 | * |
||
| 52 | * - shortcodes: If true, shortcodes will be turned into the appropriate HTML. |
||
| 53 | * If false, shortcodes will not be processed. |
||
| 54 | * |
||
| 55 | * - whitelist: If provided, a comma-separated list of elements that will be allowed to be stored |
||
| 56 | * (be careful on relying on this for XSS protection - some seemingly-safe elements allow |
||
| 57 | * attributes that can be exploited, for instance <img onload="exploiting_code();" src="..." />) |
||
| 58 | * Text nodes outside of HTML tags are filtered out by default, but may be included by adding |
||
| 59 | * the text() directive. E.g. 'link,meta,text()' will allow only <link /> <meta /> and text at |
||
| 60 | * the root level. |
||
| 61 | */ |
||
| 62 | public function setOptions(array $options = array()) { |
||
| 78 | |||
| 79 | /** |
||
| 80 | * Create a summary of the content. This will be some section of the first paragraph, limited by |
||
| 81 | * $maxWords. All internal tags are stripped out - the return value is a string |
||
| 82 | * |
||
| 83 | * This is sort of the HTML aware equivalent to Text#Summary, although the logic for summarising is not exactly |
||
| 84 | * the same |
||
| 85 | * |
||
| 86 | * @param int $maxWords Maximum number of words to return - may return less, but never more. Pass -1 for no limit |
||
| 87 | * @param int $flex Number of words to search through when looking for a nice cut point |
||
| 88 | * @param string $add What to add to the end of the summary if we cut at a less-than-ideal cut point |
||
| 89 | * @return string A nice(ish) summary with no html tags (but possibly still some html entities) |
||
| 90 | * |
||
| 91 | * @see framework/core/model/fieldtypes/Text#Summary($maxWords) |
||
| 92 | */ |
||
| 93 | public function Summary($maxWords = 50, $flex = 15, $add = '...') { |
||
| 94 | $str = false; |
||
| 95 | |||
| 96 | /* First we need the text of the first paragraph, without tags. Try using SimpleXML first */ |
||
| 97 | if (class_exists('SimpleXMLElement')) { |
||
| 98 | $doc = new DOMDocument(); |
||
| 99 | |||
| 100 | // Catch warnings thrown by loadHTML and turn them into a failure boolean rather than a SilverStripe error |
||
| 101 | set_error_handler(create_function('$no, $str', 'throw new Exception("HTML Parse Error: ".$str);'), E_ALL); |
||
| 102 | // Nonbreaking spaces get converted into weird characters, so strip them |
||
| 103 | $value = str_replace('&nbsp;', ' ', $this->value); |
||
| 104 | try { |
||
| 105 | $res = $doc->loadHTML('<meta content="text/html; charset=utf-8" http-equiv="Content-type"/>' . $value); |
||
| 106 | } |
||
| 107 | catch (Exception $e) { $res = false; } |
||
| 108 | restore_error_handler(); |
||
| 109 | |||
| 110 | if ($res) { |
||
| 111 | $xml = simplexml_import_dom($doc); |
||
| 112 | $res = $xml->xpath('//p'); |
||
| 113 | if (!empty($res)) $str = strip_tags($res[0]->asXML()); |
||
| 114 | } |
||
| 115 | } |
||
| 116 | |||
| 117 | /* If that failed, most likely the passed HTML is broken. use a simple regex + a custom more brutal strip_tags. |
||
| 118 | * We don't use strip_tags because that does very badly on broken HTML */ |
||
| 119 | if (!$str) { |
||
| 120 | /* See if we can pull a paragraph out*/ |
||
| 121 | |||
| 122 | // Strip out any images in case there's one at the beginning. Not doing this will return a blank paragraph |
||
| 123 | $str = preg_replace('{^\s*(<.+?>)*<img[^>]*>}', '', $this->value); |
||
| 124 | if (preg_match('{<p(\s[^<>]*)?>(.*[A-Za-z]+.*)</p>}', $str, $matches)) $str = $matches[2]; |
||
| 125 | |||
| 126 | /* If _that_ failed, just use the whole text */ |
||
| 127 | if (!$str) $str = $this->value; |
||
| 128 | |||
| 129 | /* Now pull out all the html-alike stuff */ |
||
| 130 | /* Take out anything that is obviously a tag */ |
||
| 131 | $str = preg_replace('{</?[a-zA-Z]+[^<>]*>}', '', $str); |
||
| 132 | /* Strip out any left over looking bits. Textual < or > should already be encoded to &lt; or &gt; */ |
||
| 133 | $str = preg_replace('{</|<|>}', '', $str); |
||
| 134 | } |
||
| 135 | |||
| 136 | /* Now split into words. If we are under the maxWords limit, just return the whole string (re-implode for |
||
| 137 | * whitespace normalization) */ |
||
| 138 | $words = preg_split('/\s+/', $str); |
||
| 139 | if ($maxWords == -1 || count($words) <= $maxWords) return implode(' ', $words); |
||
| 140 | |||
| 141 | /* Otherwise work backwards looking for a sentence ending (we try to avoid abbreviations, but aren't |
||
| 142 | * very good at it) */ |
||
| 143 | for ($i = $maxWords; $i >= $maxWords - $flex && $i >= 0; $i--) { |
||
| 144 | if (preg_match('/\.$/', $words[$i]) && !preg_match('/(Dr|Mr|Mrs|Ms|Miss|Sr|Jr|No)\.$/i', $words[$i])) { |
||
| 145 | return implode(' ', array_slice($words, 0, $i+1)); |
||
| 146 | } |
||
| 147 | } |
||
| 148 | |||
| 149 | // If we didn't find a sentence ending quickly enough, just cut at the maxWords point and add '...' to the end |
||
| 150 | return implode(' ', array_slice($words, 0, $maxWords)) . $add; |
||
| 151 | } |
||
| 152 | |||
| 153 | /** |
||
| 154 | * Returns the first sentence from the first paragraph. If it can't figure out what the first paragraph is (or |
||
| 155 | * there isn't one), it returns the same as Summary() |
||
| 156 | * |
||
| 157 | * This is the HTML aware equivalent to Text#FirstSentence |
||
| 158 | * |
||
| 159 | * @see framework/core/model/fieldtypes/Text#FirstSentence() |
||
| 160 | */ |
||
| 161 | public function FirstSentence() { |
||
| 177 | |||
| 178 | /** |
||
| 179 | * Return the value of the field with relative links converted to absolute urls (with placeholders parsed). |
||
| 180 | * @return string |
||
| 181 | */ |
||
| 182 | public function AbsoluteLinks() { |
||
| 185 | |||
| 186 | public function forTemplate() { |
||
| 187 | if ($this->processShortcodes) { |
||
| 188 | return ShortcodeParser::get_active()->parse($this->value); |
||
| 189 | } |
||
| 190 | else { |
||
| 191 | return $this->value; |
||
| 192 | } |
||
| 193 | } |
||
| 194 | |||
| 195 | public function prepValueForDB($value) { |
||
| 198 | |||
| 199 | /** |
||
| 200 | * Filter the given $value string through the whitelist filter |
||
| 201 | * |
||
| 202 | * @param string $value Input html content |
||
| 203 | * @return string Value with all non-whitelisted content stripped (if applicable) |
||
| 204 | */ |
||
| 205 | public function whitelistContent($value) { |
||
| 206 | if($this->whitelist) { |
||
| 207 | $dom = Injector::inst()->create('HTMLValue', $value); |
||
| 208 | |||
| 209 | $query = array(); |
||
| 210 | $textFilter = ' | //body/text()'; |
||
| 211 | foreach ($this->whitelist as $tag) { |
||
| 212 | if($tag === 'text()') { |
||
| 213 | $textFilter = ''; // Disable text filter if allowed |
||
| 214 | } else { |
||
| 215 | $query[] = 'not(self::'.$tag.')'; |
||
| 216 | } |
||
| 217 | } |
||
| 218 | |||
| 219 | foreach($dom->query('//body//*['.implode(' and ', $query).']'.$textFilter) as $el) { |
||
| 220 | if ($el->parentNode) $el->parentNode->removeChild($el); |
||
| 221 | } |
||
| 222 | |||
| 223 | $value = $dom->getContent(); |
||
| 224 | } |
||
| 225 | return $value; |
||
| 226 | } |
||
| 227 | |||
| 228 | /** |
||
| 229 | * Returns true if the field has meaningful content. |
||
| 230 | * Excludes null content like <h1></h1>, <p></p>, etc. |
||
| 231 | * |
||
| 232 | * @return boolean |
||
| 233 | */ |
||
| 234 | public function exists() { |
||
| 254 | |||
| 255 | public function scaffoldFormField($title = null, $params = null) { |
||
| 258 | |||
| 259 | public function scaffoldSearchField($title = null, $params = null) { |
||
| 262 | |||
| 263 | } |
||
| 264 | |||
| 265 | |||
| 266 |