Complex classes like RegexHelper often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also have a look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often faster to apply.
While breaking up the class, it is a good idea to analyze how other classes use RegexHelper and, based on these observations, to apply Extract Interface as well.
| 1 | <?php |
||
| 22 | final class RegexHelper |
||
| 23 | { |
||
| 24 | /** @deprecated Use PARTIAL_ESCAPABLE instead */ |
||
| 25 | const ESCAPABLE = 0; |
||
| 26 | |||
| 27 | /** @deprecated Use PARTIAL_ESCAPED_CHAR instead */ |
||
| 28 | const ESCAPED_CHAR = 1; |
||
| 29 | |||
| 30 | /** @deprecated Use PARTIAL_IN_DOUBLE_QUOTES instead */ |
||
| 31 | const IN_DOUBLE_QUOTES = 2; |
||
| 32 | |||
| 33 | /** @deprecated Use PARTIAL_IN_SINGLE_QUOTES instead */ |
||
| 34 | const IN_SINGLE_QUOTES = 3; |
||
| 35 | |||
| 36 | /** @deprecated Use PARTIAL_IN_PARENS instead */ |
||
| 37 | const IN_PARENS = 4; |
||
| 38 | |||
| 39 | /** @deprecated Use PARTIAL_REG_CHAR instead */ |
||
| 40 | const REG_CHAR = 5; |
||
| 41 | |||
| 42 | /** @deprecated Use PARTIAL_IN_PARENS_NOSP instead */ |
||
| 43 | const IN_PARENS_NOSP = 6; |
||
| 44 | |||
| 45 | /** @deprecated Use PARTIAL_TAGNAME instead */ |
||
| 46 | const TAGNAME = 7; |
||
| 47 | |||
| 48 | /** @deprecated Use PARTIAL_BLOCKTAGNAME instead */ |
||
| 49 | const BLOCKTAGNAME = 8; |
||
| 50 | |||
| 51 | /** @deprecated Use PARTIAL_ATTRIBUTENAME instead */ |
||
| 52 | const ATTRIBUTENAME = 9; |
||
| 53 | |||
| 54 | /** @deprecated Use PARTIAL_UNQUOTEDVALUE instead */ |
||
| 55 | const UNQUOTEDVALUE = 10; |
||
| 56 | |||
| 57 | /** @deprecated Use PARTIAL_SINGLEQUOTEDVALUE instead */ |
||
| 58 | const SINGLEQUOTEDVALUE = 11; |
||
| 59 | |||
| 60 | /** @deprecated Use PARTIAL_DOUBLEQUOTEDVALUE instead */ |
||
| 61 | const DOUBLEQUOTEDVALUE = 12; |
||
| 62 | |||
| 63 | /** @deprecated Use PARTIAL_ATTRIBUTEVALUE instead */ |
||
| 64 | const ATTRIBUTEVALUE = 13; |
||
| 65 | |||
| 66 | /** @deprecated Use PARTIAL_ATTRIBUTEVALUESPEC instead */ |
||
| 67 | const ATTRIBUTEVALUESPEC = 14; |
||
| 68 | |||
| 69 | /** @deprecated Use PARTIAL_ATTRIBUTE instead */ |
||
| 70 | const ATTRIBUTE = 15; |
||
| 71 | |||
| 72 | /** @deprecated Use PARTIAL_OPENTAG instead */ |
||
| 73 | const OPENTAG = 16; |
||
| 74 | |||
| 75 | /** @deprecated Use PARTIAL_CLOSETAG instead */ |
||
| 76 | const CLOSETAG = 17; |
||
| 77 | |||
| 78 | /** @deprecated Use PARTIAL_OPENBLOCKTAG instead */ |
||
| 79 | const OPENBLOCKTAG = 18; |
||
| 80 | |||
| 81 | /** @deprecated Use PARTIAL_CLOSEBLOCKTAG instead */ |
||
| 82 | const CLOSEBLOCKTAG = 19; |
||
| 83 | |||
| 84 | /** @deprecated Use PARTIAL_HTMLCOMMENT instead */ |
||
| 85 | const HTMLCOMMENT = 20; |
||
| 86 | |||
| 87 | /** @deprecated Use PARTIAL_PROCESSINGINSTRUCTION instead */ |
||
| 88 | const PROCESSINGINSTRUCTION = 21; |
||
| 89 | |||
| 90 | /** @deprecated Use PARTIAL_DECLARATION instead */ |
||
| 91 | const DECLARATION = 22; |
||
| 92 | |||
| 93 | /** @deprecated Use PARTIAL_CDATA instead */ |
||
| 94 | const CDATA = 23; |
||
| 95 | |||
| 96 | /** @deprecated Use PARTIAL_HTMLTAG instead */ |
||
| 97 | const HTMLTAG = 24; |
||
| 98 | |||
| 99 | /** @deprecated Use PARTIAL_HTMLBLOCKOPEN instead */ |
||
| 100 | const HTMLBLOCKOPEN = 25; |
||
| 101 | |||
| 102 | /** @deprecated Use PARTIAL_LINK_TITLE instead */ |
||
| 103 | const LINK_TITLE = 26; |
||
| 104 | |||
| 105 | // Partial regular expressions (wrap with `/` on each side before use) |
||
| 106 | const PARTIAL_ENTITY = '&(?:#x[a-f0-9]{1,8}|#[0-9]{1,8}|[a-z][a-z0-9]{1,31});'; |
||
| 107 | const PARTIAL_ESCAPABLE = '[!"#$%&\'()*+,.\/:;<=>?@[\\\\\]^_`{|}~-]'; |
||
| 108 | const PARTIAL_ESCAPED_CHAR = '\\\\' . self::PARTIAL_ESCAPABLE; |
||
| 109 | const PARTIAL_IN_DOUBLE_QUOTES = '"(' . self::PARTIAL_ESCAPED_CHAR . '|[^"\x00])*"'; |
||
| 110 | const PARTIAL_IN_SINGLE_QUOTES = '\'(' . self::PARTIAL_ESCAPED_CHAR . '|[^\'\x00])*\''; |
||
| 111 | const PARTIAL_IN_PARENS = '\\((' . self::PARTIAL_ESCAPED_CHAR . '|[^)\x00])*\\)'; |
||
| 112 | const PARTIAL_REG_CHAR = '[^\\\\()\x00-\x20]'; |
||
| 113 | const PARTIAL_IN_PARENS_NOSP = '\((' . self::PARTIAL_REG_CHAR . '|' . self::PARTIAL_ESCAPED_CHAR . '|\\\\)*\)'; |
||
| 114 | const PARTIAL_TAGNAME = '[A-Za-z][A-Za-z0-9-]*'; |
||
| 115 | const PARTIAL_BLOCKTAGNAME = '(?:address|article|aside|base|basefont|blockquote|body|caption|center|col|colgroup|dd|details|dialog|dir|div|dl|dt|fieldset|figcaption|figure|footer|form|frame|frameset|h1|head|header|hr|html|iframe|legend|li|link|main|menu|menuitem|meta|nav|noframes|ol|optgroup|option|p|param|section|source|title|summary|table|tbody|td|tfoot|th|thead|title|tr|track|ul)'; |
||
| 116 | const PARTIAL_ATTRIBUTENAME = '[a-zA-Z_:][a-zA-Z0-9:._-]*'; |
||
| 117 | const PARTIAL_UNQUOTEDVALUE = '[^"\'=<>`\x00-\x20]+'; |
||
| 118 | const PARTIAL_SINGLEQUOTEDVALUE = '\'[^\']*\''; |
||
| 119 | const PARTIAL_DOUBLEQUOTEDVALUE = '"[^"]*"'; |
||
| 120 | const PARTIAL_ATTRIBUTEVALUE = '(?:' . self::PARTIAL_UNQUOTEDVALUE . '|' . self::PARTIAL_SINGLEQUOTEDVALUE . '|' . self::PARTIAL_DOUBLEQUOTEDVALUE . ')'; |
||
| 121 | const PARTIAL_ATTRIBUTEVALUESPEC = '(?:' . '\s*=' . '\s*' . self::PARTIAL_ATTRIBUTEVALUE . ')'; |
||
| 122 | const PARTIAL_ATTRIBUTE = '(?:' . '\s+' . self::PARTIAL_ATTRIBUTENAME . self::PARTIAL_ATTRIBUTEVALUESPEC . '?)'; |
||
| 123 | const PARTIAL_OPENTAG = '<' . self::PARTIAL_TAGNAME . self::PARTIAL_ATTRIBUTE . '*' . '\s*\/?>'; |
||
| 124 | const PARTIAL_CLOSETAG = '<\/' . self::PARTIAL_TAGNAME . '\s*[>]'; |
||
| 125 | const PARTIAL_OPENBLOCKTAG = '<' . self::PARTIAL_BLOCKTAGNAME . self::PARTIAL_ATTRIBUTE . '*' . '\s*\/?>'; |
||
| 126 | const PARTIAL_CLOSEBLOCKTAG = '<\/' . self::PARTIAL_BLOCKTAGNAME . '\s*[>]'; |
||
| 127 | const PARTIAL_HTMLCOMMENT = '<!---->|<!--(?:-?[^>-])(?:-?[^-])*-->'; |
||
| 128 | const PARTIAL_PROCESSINGINSTRUCTION = '[<][?].*?[?][>]'; |
||
| 129 | const PARTIAL_DECLARATION = '<![A-Z]+' . '\s+[^>]*>'; |
||
| 130 | const PARTIAL_CDATA = '<!\[CDATA\[[\s\S]*?]\]>'; |
||
| 131 | const PARTIAL_HTMLTAG = '(?:' . self::PARTIAL_OPENTAG . '|' . self::PARTIAL_CLOSETAG . '|' . self::PARTIAL_HTMLCOMMENT . '|' . |
||
| 132 | self::PARTIAL_PROCESSINGINSTRUCTION . '|' . self::PARTIAL_DECLARATION . '|' . self::PARTIAL_CDATA . ')'; |
||
| 133 | const PARTIAL_HTMLBLOCKOPEN = '<(?:' . self::PARTIAL_BLOCKTAGNAME . '(?:[\s\/>]|$)' . '|' . |
||
| 134 | '\/' . self::PARTIAL_BLOCKTAGNAME . '(?:[\s>]|$)' . '|' . '[?!])'; |
||
| 135 | const PARTIAL_LINK_TITLE = '^(?:"(' . self::PARTIAL_ESCAPED_CHAR . '|[^"\x00])*"' . |
||
| 136 | '|' . '\'(' . self::PARTIAL_ESCAPED_CHAR . '|[^\'\x00])*\'' . |
||
| 137 | '|' . '\((' . self::PARTIAL_ESCAPED_CHAR . '|[^)\x00])*\))'; |
||
| 138 | |||
| 139 | /** @deprecated Use PARTIAL_ESCAPABLE instead */ |
||
| 140 | const REGEX_ESCAPABLE = self::PARTIAL_ESCAPABLE; |
||
| 141 | |||
| 142 | /** @deprecated Use PARTIAL_ENTITY instead */ |
||
| 143 | const REGEX_ENTITY = self::PARTIAL_ENTITY; |
||
| 144 | |||
| 145 | const REGEX_PUNCTUATION = '/^[\x{2000}-\x{206F}\x{2E00}-\x{2E7F}\p{Pc}\p{Pd}\p{Pe}\p{Pf}\p{Pi}\p{Po}\p{Ps}\\\\\'!"#\$%&\(\)\*\+,\-\.\\/:;<=>\?@\[\]\^_`\{\|\}~]/u'; |
||
| 146 | const REGEX_UNSAFE_PROTOCOL = '/^javascript:|vbscript:|file:|data:/i'; |
||
| 147 | const REGEX_SAFE_DATA_PROTOCOL = '/^data:image\/(?:png|gif|jpeg|webp)/i'; |
||
| 148 | const REGEX_NON_SPACE = '/[^ \t\f\v\r\n]/'; |
||
| 149 | |||
| 150 | const REGEX_WHITESPACE_CHAR = '/^[ \t\n\x0b\x0c\x0d]/'; |
||
| 151 | const REGEX_WHITESPACE = '/[ \t\n\x0b\x0c\x0d]+/'; |
||
| 152 | const REGEX_UNICODE_WHITESPACE_CHAR = '/^\pZ|\s/u'; |
||
| 153 | const REGEX_THEMATIC_BREAK = '/^(?:(?:\*[ \t]*){3,}|(?:_[ \t]*){3,}|(?:-[ \t]*){3,})[ \t]*$/'; |
||
| 154 | const REGEX_LINK_DESTINATION_BRACES = '/^(?:' . '[<](?:[^ <>\\t\\n\\\\\\x00]' . '|' . self::PARTIAL_ESCAPED_CHAR . '|' . '\\\\)*[>]' . ')/'; |
||
| 155 | |||
| 156 | /** |
||
| 157 | * @deprecated Instance methods will be removed in 0.18 or 1.0 (whichever comes first) |
||
| 158 | */ |
||
| 159 | protected static $instance; |
||
| 160 | |||
| 161 | /** |
||
| 162 | * @return RegexHelper |
||
| 163 | * |
||
| 164 | * @deprecated Instances are no longer needed and will be removed in 0.18 or 1.0 |
||
| 165 | */ |
||
| 166 | public static function getInstance() |
||
| 176 | |||
| 177 | /** |
||
| 178 | * Returns a partial regex |
||
| 179 | * |
||
| 180 | * It'll need to be wrapped with /.../ before use |
||
| 181 | * |
||
| 182 | * @param int $const |
||
| 183 | * |
||
| 184 | * @return string |
||
| 185 | * |
||
| 186 | * @deprecated Just grab the constant directly |
||
| 187 | */ |
||
| 188 | public function getPartialRegex($const) |
||
| 222 | |||
| 223 | /** |
||
| 224 | * @return string |
||
| 225 | * |
||
| 226 | * @deprecated Use PARTIAL_HTMLTAG and wrap it yourself instead |
||
| 227 | */ |
||
| 228 | public function getHtmlTagRegex() |
||
| 234 | |||
| 235 | /** |
||
| 236 | * @return string |
||
| 237 | * |
||
| 238 | * @deprecated Use PARTIAL_LINK_TITLE and wrap it yourself instead |
||
| 239 | */ |
||
| 240 | public function getLinkTitleRegex() |
||
| 246 | |||
| 247 | /** |
||
| 248 | * @return string |
||
| 249 | * |
||
| 250 | * @deprecated Use REGEX_LINK_DESTINATION_BRACES instead |
||
| 251 | */ |
||
| 252 | public function getLinkDestinationBracesRegex() |
||
| 258 | |||
| 259 | /** |
||
| 260 | * @return string |
||
| 261 | * |
||
| 262 | * @deprecated Use the REGEX_THEMATIC_BREAK constant directly |
||
| 263 | */ |
||
| 264 | public function getThematicBreakRegex() |
||
| 270 | |||
| 271 | /** |
||
| 272 | * Attempt to match a regex in string s at offset offset |
||
| 273 | * |
||
| 274 | * @param string $regex |
||
| 275 | * @param string $string |
||
| 276 | * @param int $offset |
||
| 277 | * |
||
| 278 | * @return int|null Index of match, or null |
||
| 279 | */ |
||
| 280 | 1776 | public static function matchAt($regex, $string, $offset = 0) |
|
| 293 | |||
| 294 | /** |
||
| 295 | * Functional wrapper around preg_match_all |
||
| 296 | * |
||
| 297 | * @param string $pattern |
||
| 298 | * @param string $subject |
||
| 299 | * @param int $offset |
||
| 300 | * |
||
| 301 | * @return array|null |
||
| 302 | */ |
||
| 303 | 1875 | public static function matchAll($pattern, $subject, $offset = 0) |
|
| 324 | |||
| 325 | /** |
||
| 326 | * Replace backslash escapes with literal characters |
||
| 327 | * |
||
| 328 | * @param string $string |
||
| 329 | * |
||
| 330 | * @return string |
||
| 331 | */ |
||
| 332 | 498 | public static function unescape($string) |
|
| 343 | |||
| 344 | /** |
||
| 345 | * @param int $type HTML block type |
||
| 346 | * |
||
| 347 | * @return string|null |
||
| 348 | */ |
||
| 349 | 279 | public static function getHtmlBlockOpenRegex($type) |
|
| 368 | |||
| 369 | /** |
||
| 370 | * @param int $type HTML block type |
||
| 371 | * |
||
| 372 | * @return string|null |
||
| 373 | */ |
||
| 374 | 60 | public static function getHtmlBlockCloseRegex($type) |
|
| 389 | |||
| 390 | /** |
||
| 391 | * @param string $url |
||
| 392 | * |
||
| 393 | * @return bool |
||
| 394 | */ |
||
| 395 | 30 | public static function isLinkPotentiallyUnsafe($url) |
|
| 399 | } |
||
| 400 |
If you suppress an error, we recommend checking for the error condition explicitly: