| Total Complexity | 50 |
| Total Lines | 297 |
| Duplicated Lines | 0 % |
| Changes | 2 | ||
| Bugs | 0 | Features | 0 |
Complex classes like HtmlParser often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use HtmlParser, and based on these observations, apply Extract Interface, too.
| 1 | <?php |
||
| 5 | class HtmlParser { |
||
| 6 | const ORIGINAL_TEXT_PLACEHOLDER = '#####__ORIGINAL_TEXT__#####'; |
||
| 7 | const LT_PLACEHOLDER = '#####__LT_PLACEHOLDER__#####'; |
||
| 8 | const GT_PLACEHOLDER = '#####__GT_PLACEHOLDER__#####'; |
||
| 9 | |||
| 10 | /** |
||
| 11 | * This solution is taken from here and then modified: |
||
| 12 | * https://www.php.net/manual/fr/regexp.reference.recursive.php#95568 |
||
| 13 | * |
||
| 14 | * @param string $html |
||
| 15 | * |
||
| 16 | * @return array |
||
| 17 | */ |
||
| 18 | public static function parse( $html ) { |
||
| 29 | } |
||
| 30 | |||
/**
 * Replace every "<" that does not open a real html tag with LT_PLACEHOLDER.
 *
 * All "<" and ">" symbols are located first; each "<" is then checked against
 * the bracket that follows it. It is kept only when that bracket is a ">" and
 * the text spanning both brackets is recognised by Strings::isHtmlString().
 *
 * Example:
 *
 * <div id="1">< Ciao <<div id="2"></div></div>
 *
 * becomes:
 *
 * <div id="1">#####__LT_PLACEHOLDER__##### Ciao #####__LT_PLACEHOLDER__#####<div id="2"></div></div>
 *
 * @param string $html
 *
 * @return string
 */
private static function protectNotHtmlLessThanSymbols( $html ) {
    preg_match_all( '/<|>/iu', $html, $matches, PREG_OFFSET_CAPTURE );

    $brackets       = $matches[ 0 ];
    $shift          = 0;    // how many bytes $html has grown because of the replacements done so far
    $following      = null; // symbol of the bracket after the current one
    $followingStart = 0;    // position of that bracket inside the (already modified) $html

    foreach ( $brackets as $index => $bracket ) {

        $symbol   = $brackets[ $index ][ 0 ];
        $position = $brackets[ $index ][ 1 ];

        // Look ahead. On the last bracket there is nothing to look at, and the
        // values computed during the previous iteration are intentionally kept.
        if ( isset( $brackets[ $index + 1 ][ 0 ] ) ) {
            $following      = $brackets[ $index + 1 ][ 0 ];
            $followingStart = $brackets[ $index + 1 ][ 1 ] + $shift;
        }

        if ( $symbol !== '<' || !isset( $following ) ) {
            continue;
        }

        $symbolLength = strlen( $bracket[ 0 ] );
        $start        = $position + $shift;

        // text going from the current "<" up to (and including) the following bracket
        $candidate = substr( $html, $start, ( $followingStart - $start + 1 ) );

        // a "<" closed by ">" that forms a valid html string is a real tag: leave it alone
        if ( $following === '>' && Strings::isHtmlString( $candidate ) ) {
            continue;
        }

        $html  = substr_replace( $html, self::LT_PLACEHOLDER, $start, $symbolLength );
        $shift = $shift + strlen( self::LT_PLACEHOLDER ) - $symbolLength;
    }

    if ( is_array( $html ) ) {
        return implode( $html );
    }

    return $html;
}
||
| 82 | |||
| 83 | /** |
||
| 84 | * Protect not closed html tags. |
||
| 85 | * |
||
| 86 | * Example: |
||
| 87 | * |
||
| 88 | * Ciao <div> this div is not closed. <div>Instead, this is a closed div.</div> |
||
| 89 | * |
||
| 90 | * is converted to: |
||
| 91 | * |
||
| 92 | * Ciao #####__LT_PLACEHOLDER__#####div#####__GT_PLACEHOLDER__##### this div is not closed. <div>Instead, this is a closed div.</div> |
||
| 93 | * |
||
| 94 | * @param string $html |
||
| 95 | * |
||
| 96 | * @return string |
||
| 97 | */ |
||
| 98 | private static function protectNotClosedHtmlTags( $html ) { |
||
| 169 | } |
||
| 170 | |||
/**
 * Find every html node in $html and describe each one as an object.
 *
 * Every object carries these keys: node, start, end, terminator, offset,
 * tagname, attributes, base64_decoded, self_closed, omittag, inner_html,
 * has_children, original_text, stripped_text.
 *
 * @param string $html
 * @param bool   $toBeEscaped when true the terminator is returned as an html entity, the original
 *                            text goes through Strings::escapeOnlyHTMLTags() and the flag is
 *                            forwarded to rebuildNode() and getInnerHtml()
 *
 * @return array list of node objects, in the order the nodes were matched
 */
private static function extractHtmlNode( $html, $toBeEscaped = false ) {
    $pattern = "/<([a-zA-Z0-9._-]+)([^>]|[^<]*?)(([\s]*\/>)|" .
            "(>((([^<]*?|<\!\-\-.*?\-\->)|(?R))*)<\/\\1[\s]*>))/sm";
    preg_match_all( $pattern, $html, $matches, PREG_OFFSET_CAPTURE );

    $elements = [];

    foreach ( $matches[ 0 ] as $key => $match ) {

        $attributes    = isset( $matches[ 2 ][ $key ][ 0 ] ) ? self::getAttributes( $matches[ 2 ][ $key ][ 0 ] ) : [];
        $base64Decoded = ( isset( $attributes[ 'equiv-text' ] ) ) ? base64_decode( str_replace( "base64:", "", $attributes[ 'equiv-text' ] ) ) : null;
        $tagName       = $matches[ 1 ][ $key ][ 0 ];

        // Group 6 is the content between the opening and the closing tag.
        // Compare against '' instead of using empty(): empty( "0" ) is true and
        // would silently drop the text of a node such as <b>0</b>.
        $hasText      = isset( $matches[ 6 ][ $key ][ 0 ] ) && $matches[ 6 ][ $key ][ 0 ] !== '';
        $text         = $hasText ? $matches[ 6 ][ $key ][ 0 ] : "";
        $originalText = $text;
        $strippedText = strip_tags( $text );

        // get start and end tags
        if ( $hasText ) {
            // Split on the captured offset of the inner text. Replacing the text with a
            // placeholder and exploding breaks as soon as the same text also appears
            // inside the opening tag (e.g. <a title="x">x</a>).
            $textOffset = $matches[ 6 ][ $key ][ 1 ] - $match[ 1 ];
            $start      = substr( $match[ 0 ], 0, $textOffset );
            $end        = substr( $match[ 0 ], $textOffset + strlen( $text ) );
        } else {
            // no inner text: the whole match is the start tag and there is no end tag
            $start = $match[ 0 ];
            $end   = "";
        }

        // inner_html
        $inner_html = self::getInnerHtml( $matches, $key, $toBeEscaped );

        // node
        $node = self::rebuildNode( $originalText, $toBeEscaped, $start, $end );

        // terminator: both branches used to be the same literal, the escaped one must be the entity
        // NOTE(review): '&gt;' is inferred from the escaping done elsewhere when $toBeEscaped is true — confirm
        $terminator = ( $toBeEscaped ) ? '&gt;' : '>';

        // self closed
        $selfClosed = Strings::contains( '/>', trim( $start ) );

        // original text, escaped only on request
        $originalTextToRestore = ( $toBeEscaped ) ? Strings::escapeOnlyHTMLTags( $originalText ) : $originalText;

        $elements[] = (object)[
                'node'           => self::restoreLessThanAndGreaterThanSymbols( $node ),
                'start'          => self::restoreLessThanAndGreaterThanSymbols( $start ),
                'end'            => self::restoreLessThanAndGreaterThanSymbols( $end ),
                'terminator'     => $terminator,
                'offset'         => $match[ 1 ],
                'tagname'        => $tagName,
                'attributes'     => $attributes,
                'base64_decoded' => $base64Decoded,
                'self_closed'    => $selfClosed,
                'omittag'        => ( $matches[ 4 ][ $key ][ 1 ] > -1 ), // boolean: true when the "/>" group took part in the match
                'inner_html'     => $inner_html,
                'has_children'   => is_array( $inner_html ),
                'original_text'  => self::restoreLessThanAndGreaterThanSymbols( $originalTextToRestore ),
                'stripped_text'  => self::restoreLessThanAndGreaterThanSymbols( $strippedText ),
        ];
    }

    return $elements;
}
||
| 231 | |||
| 232 | /** |
||
| 233 | * @param $text |
||
| 234 | * |
||
| 235 | * @return string|string[] |
||
| 236 | */ |
||
| 237 | private static function restoreLessThanAndGreaterThanSymbols( $text ) { |
||
| 239 | } |
||
| 240 | |||
| 241 | /** |
||
| 242 | * @param string $originalText |
||
| 243 | * @param bool $toBeEscaped |
||
| 244 | * @param string $start |
||
| 245 | * @param string $end |
||
| 246 | * |
||
| 247 | * @return string |
||
| 248 | */ |
||
| 249 | private static function rebuildNode( $originalText, $toBeEscaped, $start = null, $end = null ) { |
||
| 263 | } |
||
| 264 | |||
/**
 * Parse the attribute portion of a tag into a name => value map.
 *
 * A value may be delimited by ", ' or \" and must be closed by the very same
 * delimiter that opened it, so a different quote character inside the value
 * (e.g. title="it's ok") stays part of the value instead of ending it.
 *
 * @param string $content text holding the attributes, e.g. ' id="1" class="a b"'
 *
 * @return array attribute names (trimmed) mapped to their unquoted values; empty when nothing matches
 */
public static function getAttributes( $content ) {
    // \2 is a back-reference to the opening delimiter
    $pattern = '/(.*?)=("|\'|\\\")(.*?)\2/';

    preg_match_all( $pattern, $content, $matches );

    $attributes = [];

    if ( empty( $matches[ 1 ] ) ) {
        return $attributes;
    }

    foreach ( $matches[ 1 ] as $key => $name ) {
        $attributes[ trim( $name ) ] = $matches[ 3 ][ $key ];
    }

    return $attributes;
}
||
| 285 | |||
| 286 | /** |
||
| 287 | * @param array $matches |
||
| 288 | * @param string $key |
||
| 289 | * |
||
| 290 | * @param bool $toBeEscaped |
||
| 291 | * |
||
| 292 | * @return array|mixed|string |
||
| 293 | */ |
||
| 294 | private static function getInnerHtml( $matches, $key, $toBeEscaped = false ) { |
||
| 302 | } |
||
| 303 | } |
||
| 304 |