| Total Complexity | 66 |
| Total Lines | 263 |
| Duplicated Lines | 14.83 % |
| Changes | 1 | ||
| Bugs | 0 | Features | 0 |
Duplicate code is one of the most pungent code smells. A commonly used rule of thumb is to restructure code once it is duplicated in three or more places.
Common duplication problems and their corresponding solutions are:
Complex classes like HtmlBlockPreprocessor often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often faster.
| 1 | """ |
||
class HtmlBlockPreprocessor(Preprocessor):
    """Remove html blocks from the text and store them for later retrieval.

    The text is split into blocks on blank lines.  Blocks that open with a
    block-level tag (or a comment / processing instruction) are handed to
    ``self.markdown.htmlStash`` and replaced by whatever that call returns;
    every other block is passed through unchanged.
    """

    # Closing-tag templates tried in order: a regular ``</tag>`` first, then
    # a bare ``tag>`` (this is what matches ``-->`` for the ``--`` pseudo-tag).
    right_tag_patterns = ["</%s>", "%s>"]
    # Verbose-mode pattern; the closing quotes stay on their own line so the
    # trailing ``# attr`` comment ends before anything is appended to it.
    attrs_pattern = r"""
        \s+(?P<attr>[^>"'/= ]+)=(?P<q>['"])(?P<value>.*?)(?P=q)   # attr="value"
        |                                                         # OR
        \s+(?P<attr1>[^>"'/= ]+)=(?P<value1>[^> ]+)               # attr=value
        |                                                         # OR
        \s+(?P<attr2>[^>"'/= ]+)                                  # attr
        """
    left_tag_pattern = r'^\<(?P<tag>[^> ]+)(?P<attrs>(%s)*)\s*\/?\>?' % \
        attrs_pattern
    attrs_re = re.compile(attrs_pattern, re.VERBOSE)
    left_tag_re = re.compile(left_tag_pattern, re.VERBOSE)
    # When true, elements carrying a ``markdown`` attribute are stored with
    # ``store_tag`` and their content is left in the output blocks.
    markdown_in_raw = False

    def _get_left_tag(self, block):
        """Parse the opening tag at the start of ``block``.

        Returns a ``(tag, length, attrs)`` tuple: the tag name, the number of
        characters the opening tag occupies, and a dict of its attributes
        (value-less attributes map to ``""``).
        """
        m = self.left_tag_re.match(block)
        if not m:
            # Fallback: take everything between '<' and the first '>'.
            tag = block[1:].split(">", 1)[0].lower()
            return tag, len(tag) + 2, {}

        attrs = {}
        raw_attrs = m.group('attrs')
        if raw_attrs:
            for ma in self.attrs_re.finditer(raw_attrs):
                if ma.group('attr'):        # attr="value"
                    attrs[ma.group('attr').strip()] = ma.group('value') or ""
                elif ma.group('attr1'):     # attr=value
                    attrs[ma.group('attr1').strip()] = \
                        ma.group('value1') or ""
                elif ma.group('attr2'):     # bare attr
                    attrs[ma.group('attr2').strip()] = ""
        return m.group('tag'), len(m.group(0)), attrs

    def _recursive_tagfind(self, ltag, rtag, start_index, block):
        """Find the ``rtag`` that balances an already-opened ``ltag``.

        Searches ``block`` from ``start_index`` and returns the index just
        past the matching ``rtag``, or -1 when none is found.
        """
        while True:
            i = block.find(rtag, start_index)
            if i == -1:
                return -1
            j = block.find(ltag, start_index)
            # if no ltag, or rtag found before another ltag, return index
            if j > i or j == -1:
                return i + len(rtag)
            # another ltag found before rtag, use end of ltag as starting
            # point and search again
            j = block.find('>', j)
            start_index = self._recursive_tagfind(ltag, rtag, j + 1, block)
            if start_index == -1:
                # HTML potentially malformed - ltag has no corresponding
                # rtag
                return -1

    def _get_right_tag(self, left_tag, left_index, block):
        """Locate the closing tag for ``left_tag`` inside ``block``.

        Returns ``(right_tag, index)`` where ``index`` is the position just
        past the closing tag.  When no closing tag is found, the tail of the
        block is returned as the tag together with ``len(block)``.
        """
        for p in self.right_tag_patterns:
            tag = p % left_tag
            i = self._recursive_tagfind(
                "<%s" % left_tag, tag, left_index, block
            )
            if i > 2:
                return tag.lstrip("<").rstrip(">"), i
        return block.rstrip()[-left_index:-1].lower(), len(block)

    def _equal_tags(self, left_tag, right_tag):
        """Return True when ``right_tag`` closes ``left_tag``."""
        # Slicing instead of indexing so an empty tag cannot raise.
        if left_tag[:1] in ('?', '@', '%'):  # handle PHP, etc.
            return True
        if left_tag == "--" and right_tag == "--":  # comment block
            return True
        return "/" + left_tag == right_tag

    def _is_oneliner(self, tag):
        """Return True for tags that never have a closing counterpart."""
        return tag in ('hr', 'hr/')

    def _stringindex_to_listindex(self, stringindex, items):
        """
        Same effect as concatenating the strings in items,
        finding the character to which stringindex refers in that string,
        and returning the index of the item in which that character resides.

        ``items`` is not modified.
        """
        # Pad a private copy so an index equal to the total length still
        # resolves, without appending to the caller's list.
        padded = list(items)
        padded.append('dummy')
        i, count = 0, 0
        while count <= stringindex:
            count += len(padded[i])
            i += 1
        return i - 1

    def _nested_markdown_in_html(self, items):
        """Find and process html child elements of the given element block.

        ``items`` is edited in place (entries are inserted, trimmed and
        deleted while it is being enumerated) and is also returned.
        """
        for i, item in enumerate(items):
            if self.left_tag_re.match(item):
                left_tag, left_index, attrs = \
                    self._get_left_tag(''.join(items[i:]))
                right_tag, data_index = self._get_right_tag(
                    left_tag, left_index, ''.join(items[i:]))
                right_listindex = \
                    self._stringindex_to_listindex(data_index, items[i:]) + i
                if 'markdown' in attrs:
                    items[i] = items[i][left_index:]  # remove opening tag
                    placeholder = self.markdown.htmlStash.store_tag(
                        left_tag, attrs, i + 1, right_listindex + 1)
                    items.insert(i, placeholder)
                    if len(items) - right_listindex <= 1:  # last nest, no tail
                        right_listindex -= 1
                    items[right_listindex] = items[right_listindex][
                        :-len(right_tag) - 2]  # remove closing tag
                else:  # raw html
                    if len(items) - right_listindex <= 1:  # last element
                        right_listindex -= 1
                    if right_listindex <= i:
                        right_listindex = i + 1
                    placeholder = self.markdown.htmlStash.store('\n\n'.join(
                        items[i:right_listindex]))
                    del items[i:right_listindex]
                    items.insert(i, placeholder)
        return items

    def _stash_items(self, items, left_tag, left_index, right_tag, attrs):
        """Stash a collected multi-block element; return its replacement blocks.

        Shared by both places in ``run`` that finish an element spanning
        several blocks.  ``items`` may be edited in place.
        """
        stash = self.markdown.htmlStash
        if not (self.markdown_in_raw and 'markdown' in attrs):
            return [stash.store('\n\n'.join(items))]

        items[0] = items[0][left_index:]             # drop the opening tag
        items[-1] = items[-1][:-len(right_tag) - 2]  # drop the closing tag
        if items[-1]:  # not a newline/empty string
            right_index = len(items) + 3
        else:
            right_index = len(items) + 2
        blocks = [stash.store_tag(left_tag, attrs, 0, right_index)]
        placeholderslen = len(stash.tag_data)
        blocks.extend(self._nested_markdown_in_html(items))
        # Correct the outer entry's right_index by the number of entries the
        # nested pass added to tag_data.
        nests = len(stash.tag_data) - placeholderslen
        stash.tag_data[-1 - nests]['right_index'] += nests - 2
        return blocks

    def run(self, lines):
        """Replace html blocks in ``lines`` with stash placeholders.

        ``lines`` is a list of text lines; the processed text is returned as
        a list of lines as well.
        """
        text = "\n".join(lines)
        new_blocks = []
        text = text.rsplit("\n\n")
        items = []          # blocks of the element currently being collected
        left_tag = ''
        right_tag = ''
        left_index = 0
        attrs = {}
        in_tag = False      # True while inside an unclosed block-level tag

        while text:
            block = text[0]
            if block.startswith("\n"):
                block = block[1:]
            text = text[1:]

            if block.startswith("\n"):
                block = block[1:]

            if not in_tag:
                if not (block.startswith("<") and len(block.strip()) > 1):
                    new_blocks.append(block)
                    continue

                if block[1:4] == "!--":
                    # is a comment block
                    left_tag, left_index, attrs = "--", 2, {}
                else:
                    left_tag, left_index, attrs = self._get_left_tag(block)
                right_tag, data_index = self._get_right_tag(
                    left_tag, left_index, block)
                # keep checking conditions below and maybe just append

                if data_index < len(block) and (
                        util.isBlockLevel(left_tag) or left_tag == '--'):
                    # text follows the closing tag: requeue it as a new block
                    text.insert(0, block[data_index:])
                    block = block[:data_index]

                if not (util.isBlockLevel(left_tag) or
                        block[1] in ["!", "?", "@", "%"]):
                    new_blocks.append(block)
                    continue

                if self._is_oneliner(left_tag):
                    new_blocks.append(block.strip())
                    continue

                if block.rstrip().endswith(">") \
                        and self._equal_tags(left_tag, right_tag):
                    # complete element contained in a single block
                    if self.markdown_in_raw and 'markdown' in attrs:
                        block = block[left_index:-len(right_tag) - 2]
                        new_blocks.append(
                            self.markdown.htmlStash.store_tag(
                                left_tag, attrs, 0, 2))
                        new_blocks.append(block)
                    else:
                        new_blocks.append(
                            self.markdown.htmlStash.store(block.strip()))
                elif (not self._equal_tags(left_tag, right_tag)) and \
                        (util.isBlockLevel(left_tag) or left_tag == "--"):
                    # block level tag that is not complete: start collecting
                    items.append(block.strip())
                    in_tag = True
                else:
                    new_blocks.append(
                        self.markdown.htmlStash.store(block.strip()))
                continue

            items.append(block)

            # Need to evaluate all items so we can calculate relative to the
            # left index.
            right_tag, data_index = self._get_right_tag(
                left_tag, left_index, ''.join(items))
            # Adjust data_index: relative to items -> relative to last block
            data_index -= sum(len(item) for item in items[:-1])

            if self._equal_tags(left_tag, right_tag):
                # if find closing tag

                if data_index < len(block):
                    # we have more text after right_tag
                    items[-1] = block[:data_index]
                    text.insert(0, block[data_index:])

                in_tag = False
                new_blocks.extend(self._stash_items(
                    items, left_tag, left_index, right_tag, attrs))
                items = []

        if items:
            # input ended while still inside an unclosed element
            new_blocks.extend(self._stash_items(
                items, left_tag, left_index, right_tag, attrs))
            new_blocks.append('\n')

        new_text = "\n\n".join(new_blocks)
        return new_text.split("\n")
||
| 322 | |||
| 353 |