Total Complexity | 66 |
Total Lines | 263 |
Duplicated Lines | 14.83 % |
Changes | 1 | ||
Bugs | 0 | Features | 0 |
Duplicate code is one of the most pungent code smells. A rule that is often used is to re-structure code once it is duplicated in three or more places.
Common duplication problems and their corresponding solutions are:
Complex classes like HtmlBlockPreprocessor often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
1 | """ |
||
class HtmlBlockPreprocessor(Preprocessor):
    """Remove html blocks from the text and store them for later retrieval."""

    # Templates tried, in order, to build the closing tag for a given
    # opening tag name (e.g. "div" -> "</div>", then "div>").
    right_tag_patterns = ["</%s>", "%s>"]
    # Verbose regex matching a single attribute in one of three spellings.
    attrs_pattern = r"""
        \s+(?P<attr>[^>"'/= ]+)=(?P<q>['"])(?P<value>.*?)(?P=q)   # attr="value"
        |                                                         # OR
        \s+(?P<attr1>[^>"'/= ]+)=(?P<value1>[^> ]+)               # attr=value
        |                                                         # OR
        \s+(?P<attr2>[^>"'/= ]+)                                  # attr
        """
    # Opening tag at the start of a block: name, any attributes, optional
    # "/" and optional ">".
    left_tag_pattern = r'^\<(?P<tag>[^> ]+)(?P<attrs>(%s)*)\s*\/?\>?' % \
                       attrs_pattern
    attrs_re = re.compile(attrs_pattern, re.VERBOSE)
    left_tag_re = re.compile(left_tag_pattern, re.VERBOSE)
    # When true, blocks carrying a ``markdown`` attribute are stored with
    # ``store_tag`` and their content is kept in the output instead of
    # being stashed as raw html.
    markdown_in_raw = False

    def _get_left_tag(self, block):
        """Parse the opening tag at the start of ``block``.

        Returns a ``(tag, length, attrs)`` tuple: the tag name, the number
        of characters taken by the opening tag, and a dict of its
        attributes (valueless attributes map to ``""``).
        """
        m = self.left_tag_re.match(block)
        if not m:
            # Fallback when the regex does not match: everything between
            # the first character and the first ">" is the tag name.
            tag = block[1:].split(">", 1)[0].lower()
            return tag, len(tag) + 2, {}

        attrs = {}
        raw_attrs = m.group('attrs')
        if raw_attrs:
            # Each alternative of attrs_pattern has its own group names;
            # the first non-empty name group identifies the one that hit.
            group_pairs = (
                ('attr', 'value'), ('attr1', 'value1'), ('attr2', None))
            for ma in self.attrs_re.finditer(raw_attrs):
                for name_group, value_group in group_pairs:
                    name = ma.group(name_group)
                    if not name:
                        continue
                    value = ma.group(value_group) if value_group else None
                    attrs[name.strip()] = value or ""
                    break
        return m.group('tag'), len(m.group(0)), attrs

    def _recursive_tagfind(self, ltag, rtag, start_index, block):
        """Find the end of the ``rtag`` that balances an open ``ltag``.

        Searches ``block`` from ``start_index`` and recurses past nested
        ``ltag`` occurrences. Returns the index just past the matching
        ``rtag``, or ``-1`` when none is found.
        """
        while True:
            i = block.find(rtag, start_index)
            if i == -1:
                return -1
            j = block.find(ltag, start_index)
            # if no ltag, or rtag found before another ltag, return index
            if j > i or j == -1:
                return i + len(rtag)
            # another ltag found before rtag, use end of ltag as starting
            # point and search again
            j = block.find('>', j)
            start_index = self._recursive_tagfind(ltag, rtag, j + 1, block)
            if start_index == -1:
                # HTML potentially malformed- ltag has no corresponding
                # rtag
                return -1

    def _get_right_tag(self, left_tag, left_index, block):
        """Locate the closing tag for ``left_tag`` in ``block``.

        Returns ``(right_tag, data_index)`` where ``right_tag`` is the
        closing tag text without its angle brackets and ``data_index`` is
        the index just past it. When no closing tag is found, returns a
        guess taken from the end of the block and ``len(block)``.
        """
        for p in self.right_tag_patterns:
            tag = p % left_tag
            i = self._recursive_tagfind(
                "<%s" % left_tag, tag, left_index, block
            )
            if i > 2:
                return tag.lstrip("<").rstrip(">"), i
        return block.rstrip()[-left_index:-1].lower(), len(block)

    def _equal_tags(self, left_tag, right_tag):
        """Return True when ``right_tag`` closes ``left_tag``."""
        # Slicing (rather than indexing) keeps an empty left_tag from
        # raising IndexError.
        if left_tag[:1] in ('?', '@', '%'):  # handle PHP, etc.
            return True
        # "--" pairs are the comment markers set by run() for "<!--".
        return ("/" + left_tag) == right_tag or \
            (left_tag == "--" and right_tag == "--")

    def _is_oneliner(self, tag):
        """Return True for tags that never have a closing tag (``hr``)."""
        return tag in ('hr', 'hr/')

    def _stringindex_to_listindex(self, stringindex, items):
        """
        Same effect as concatenating the strings in items,
        finding the character to which stringindex refers in that string,
        and returning the index of the item in which that character resides.

        An index at or just past the end of the concatenation maps to
        ``len(items)``. ``items`` itself is not modified.
        """
        # Work on lengths only; the extra entry stands in for a trailing
        # 'dummy' item so an index at the very end still resolves.
        lengths = [len(item) for item in items]
        lengths.append(len('dummy'))
        i, count = 0, 0
        while count <= stringindex:
            count += lengths[i]
            i += 1
        return i - 1

    def _nested_markdown_in_html(self, items):
        """Find and process html child elements of the given element block.

        ``items`` is modified in place (child elements are replaced by
        stash placeholders) and also returned.
        """
        stash = self.markdown.htmlStash
        # NOTE: items is deliberately mutated while being enumerated; the
        # statement order below is significant.
        for i, item in enumerate(items):
            if not self.left_tag_re.match(item):
                continue
            remainder = ''.join(items[i:])
            left_tag, left_index, attrs = self._get_left_tag(remainder)
            right_tag, data_index = self._get_right_tag(
                left_tag, left_index, remainder)
            right_listindex = \
                self._stringindex_to_listindex(data_index, items[i:]) + i
            if 'markdown' in attrs:
                items[i] = items[i][left_index:]  # remove opening tag
                placeholder = stash.store_tag(
                    left_tag, attrs, i + 1, right_listindex + 1)
                items.insert(i, placeholder)
                if len(items) - right_listindex <= 1:  # last nest, no tail
                    right_listindex -= 1
                items[right_listindex] = items[right_listindex][
                    :-len(right_tag) - 2]  # remove closing tag
            else:  # raw html
                if len(items) - right_listindex <= 1:  # last element
                    right_listindex -= 1
                if right_listindex <= i:
                    right_listindex = i + 1
                placeholder = stash.store(
                    '\n\n'.join(items[i:right_listindex]))
                del items[i:right_listindex]
                items.insert(i, placeholder)
        return items

    def _stash_tag_items(self, items, left_tag, left_index, right_tag,
                         attrs, new_blocks):
        """Stash the blocks collected for one html element.

        Appends the resulting placeholder(s) to ``new_blocks``. With
        ``markdown_in_raw`` set and a ``markdown`` attribute present, only
        the tag is stored and the (tag-stripped) items are kept, after
        nested elements are processed; otherwise all items are stored as
        one raw html string. ``items`` may be modified in place.
        """
        stash = self.markdown.htmlStash
        if not (self.markdown_in_raw and 'markdown' in attrs):
            new_blocks.append(stash.store('\n\n'.join(items)))
            return

        items[0] = items[0][left_index:]
        items[-1] = items[-1][:-len(right_tag) - 2]
        if items[-1]:  # not a newline/empty string
            right_index = len(items) + 3
        else:
            right_index = len(items) + 2
        new_blocks.append(
            stash.store_tag(left_tag, attrs, 0, right_index))
        placeholderslen = len(stash.tag_data)
        new_blocks.extend(self._nested_markdown_in_html(items))
        # Widen the outer tag's span by the number of nested tags stored.
        nests = len(stash.tag_data) - placeholderslen
        stash.tag_data[-1 - nests]['right_index'] += nests - 2

    def run(self, lines):
        """Replace html blocks in ``lines`` with stash placeholders.

        ``lines`` is a list of text lines; the return value is the
        processed text, again as a list of lines.
        """
        new_blocks = []
        text = "\n".join(lines).split("\n\n")
        items = []
        left_tag = ''
        right_tag = ''
        in_tag = False  # True while inside an unclosed block-level tag

        while text:
            block = text.pop(0)
            # Drop up to two leading newlines left over from the split.
            if block.startswith("\n"):
                block = block[1:]
            if block.startswith("\n"):
                block = block[1:]

            if not in_tag:
                if not (block.startswith("<") and len(block.strip()) > 1):
                    new_blocks.append(block)
                    continue

                if block[1:4] == "!--":
                    # is a comment block
                    left_tag, left_index, attrs = "--", 2, {}
                else:
                    left_tag, left_index, attrs = self._get_left_tag(block)
                right_tag, data_index = self._get_right_tag(
                    left_tag, left_index, block)
                # keep checking conditions below and maybe just append

                if data_index < len(block) and (
                        util.isBlockLevel(left_tag) or left_tag == '--'):
                    # Text after the closing tag is handled as a new block.
                    text.insert(0, block[data_index:])
                    block = block[:data_index]

                if not (util.isBlockLevel(left_tag) or
                        block[1] in ["!", "?", "@", "%"]):
                    new_blocks.append(block)
                    continue

                if self._is_oneliner(left_tag):
                    new_blocks.append(block.strip())
                    continue

                tags_match = self._equal_tags(left_tag, right_tag)
                if block.rstrip().endswith(">") and tags_match:
                    if self.markdown_in_raw and 'markdown' in attrs:
                        block = block[left_index:-len(right_tag) - 2]
                        new_blocks.append(
                            self.markdown.htmlStash.store_tag(
                                left_tag, attrs, 0, 2))
                        new_blocks.append(block)
                    else:
                        new_blocks.append(
                            self.markdown.htmlStash.store(block.strip()))
                elif not tags_match and (
                        util.isBlockLevel(left_tag) or left_tag == "--"):
                    # block level tag that is not complete: start
                    # collecting blocks until its closing tag shows up
                    items.append(block.strip())
                    in_tag = True
                else:
                    new_blocks.append(
                        self.markdown.htmlStash.store(block.strip()))
                continue

            items.append(block)

            # Need to evaluate all items so we can calculate relative to
            # the left index.
            right_tag, data_index = self._get_right_tag(
                left_tag, left_index, ''.join(items))
            # Adjust data_index: relative to items -> relative to last block
            data_index -= sum(len(item) for item in items[:-1])

            if self._equal_tags(left_tag, right_tag):
                # found the closing tag
                if data_index < len(block):
                    # we have more text after right_tag
                    items[-1] = block[:data_index]
                    text.insert(0, block[data_index:])

                in_tag = False
                self._stash_tag_items(
                    items, left_tag, left_index, right_tag, attrs,
                    new_blocks)
                items = []

        if items:
            # Input ended inside an unclosed tag; items is only non-empty
            # after an opening tag set left_index and attrs.
            self._stash_tag_items(
                items, left_tag, left_index, right_tag, attrs, new_blocks)
            new_blocks.append('\n')

        new_text = "\n\n".join(new_blocks)
        return new_text.split("\n")
||
322 | |||
353 |