Completed
Push — master ( 32cfa8...ec62d3 )
by Dongxin
48s
created

HtmlBlockPreprocessor.run()   F

Complexity

Conditions 31

Size

Total Lines 133

Duplication

Lines 39
Ratio 29.32 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 31
c 1
b 0
f 0
dl 39
loc 133
rs 2

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

Complexity

Complex methods like HtmlBlockPreprocessor.run() often do a lot of different things, which usually means the enclosing class has taken on too many responsibilities. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
"""
2
PRE-PROCESSORS
3
=============================================================================
4
5
Preprocessors work on source text before we start doing anything too
6
complicated.
7
"""
8
9
from __future__ import absolute_import
10
from __future__ import unicode_literals
11
from . import util
12
from . import odict
13
import re
14
15
16
def build_preprocessors(md_instance, **kwargs):
    """
    Assemble the default, ordered set of preprocessors used by Markdown.

    The HTML block preprocessor is registered only when
    `md_instance.safeMode` is something other than 'escape'. Extra keyword
    arguments are accepted but not used here.

    Returns an `odict.OrderedDict` mapping preprocessor names to instances.

    """
    registry = odict.OrderedDict()
    registry['normalize_whitespace'] = NormalizeWhitespace(md_instance)
    escaping_html = md_instance.safeMode == 'escape'
    if not escaping_html:
        registry["html_block"] = HtmlBlockPreprocessor(md_instance)
    registry["reference"] = ReferencePreprocessor(md_instance)
    return registry
24
25
26
class Preprocessor(util.Processor):
    """
    Base class for preprocessors, which run after the text is broken into
    lines.

    A preprocessor exposes a single `run` method. It receives the document
    as a list of lines, changes it as needed, and hands back either that
    same list object or a newly built list.

    Preprocessors must extend markdown.Preprocessor.

    """
    def run(self, lines):
        """
        Process the document; meant to be overridden by every subclass.

        `lines` is the document as a list of strings split by newlines. An
        override returns the (possibly modified) list of lines. This base
        implementation does nothing and returns None.

        """
        return None  # pragma: no cover
45
46
47
class NormalizeWhitespace(Preprocessor):
    """ Normalize whitespace for consistent parsing. """

    def run(self, lines):
        """
        Return `lines` re-split after whitespace normalization.

        The lines are joined with newlines, then, in order: the `util.STX`
        and `util.ETX` marker strings are removed, "\\r\\n" and "\\r" line
        endings become "\\n", two newlines are appended, tabs are expanded
        to `self.markdown.tab_length` columns, and lines consisting only of
        spaces (when preceded by a newline) are emptied.

        """
        text = '\n'.join(lines)
        for marker in (util.STX, util.ETX):
            text = text.replace(marker, "")
        # Order matters: collapse "\r\n" pairs before lone "\r" characters.
        text = text.replace("\r\n", "\n")
        text = text.replace("\r", "\n")
        text += "\n\n"
        text = text.expandtabs(self.markdown.tab_length)
        text = re.sub(r'(?<=\n) +\n', '\n', text)
        return text.split('\n')
57
58
59
class HtmlBlockPreprocessor(Preprocessor):
    """Remove html blocks from the text and store them for later retrieval."""

    # Templates (filled with the opening tag name) for candidate closing
    # tags, tried in this order by `_get_right_tag`.
    right_tag_patterns = ["</%s>", "%s>"]
    attrs_pattern = r"""
        \s+(?P<attr>[^>"'/= ]+)=(?P<q>['"])(?P<value>.*?)(?P=q) # attr="value"
        |                                                       # OR
        \s+(?P<attr1>[^>"'/= ]+)=(?P<value1>[^> ]+)             # attr=value
        |                                                       # OR
        \s+(?P<attr2>[^>"'/= ]+)                                # attr
        """
    left_tag_pattern = r'^\<(?P<tag>[^> ]+)(?P<attrs>(%s)*)\s*\/?\>?' % \
                       attrs_pattern
    attrs_re = re.compile(attrs_pattern, re.VERBOSE)
    left_tag_re = re.compile(left_tag_pattern, re.VERBOSE)
    # When true, elements carrying a `markdown` attribute are stored with
    # `htmlStash.store_tag` and their content is kept in the output instead
    # of being stashed as one raw html string.
    markdown_in_raw = False

    def _get_left_tag(self, block):
        """
        Parse the opening tag at the start of `block`.

        Returns a tuple `(tag, length, attrs)`: the tag name, the number of
        characters the opening tag occupies, and a dict of its attributes
        (attributes without a value map to an empty string).

        If `left_tag_re` does not match, the tag is taken to be the
        lower-cased text between the first character and the first ">",
        with no attributes.
        """
        m = self.left_tag_re.match(block)
        if not m:
            tag = block[1:].split(">", 1)[0].lower()
            return tag, len(tag) + 2, {}

        attrs = {}
        raw_attrs = m.group('attrs')
        if raw_attrs:
            for ma in self.attrs_re.finditer(raw_attrs):
                # Exactly one of the three alternatives matched; a missing
                # or empty value is normalized to "".
                if ma.group('attr'):
                    attrs[ma.group('attr').strip()] = ma.group('value') or ""
                elif ma.group('attr1'):
                    attrs[ma.group('attr1').strip()] = \
                        ma.group('value1') or ""
                elif ma.group('attr2'):
                    attrs[ma.group('attr2').strip()] = ""
        return m.group('tag'), len(m.group(0)), attrs

    def _recursive_tagfind(self, ltag, rtag, start_index, block):
        """
        Find the `rtag` that closes the element opened before `start_index`.

        Nested occurrences of `ltag` are skipped together with their own
        closing tags. Returns the index just past the matching `rtag`, or
        -1 when no matching `rtag` exists.
        """
        while True:
            i = block.find(rtag, start_index)
            if i == -1:
                return -1
            j = block.find(ltag, start_index)
            # if no ltag, or rtag found before another ltag, return index
            if (j > i or j == -1):
                return i + len(rtag)
            # another ltag found before rtag, use end of ltag as starting
            # point and search again
            j = block.find('>', j)
            start_index = self._recursive_tagfind(ltag, rtag, j + 1, block)
            if start_index == -1:
                # HTML potentially malformed- ltag has no corresponding
                # rtag
                return -1

    def _get_right_tag(self, left_tag, left_index, block):
        """
        Locate the closing tag for `left_tag` in `block`.

        Returns `(right_tag, index)`, where `index` is the position just
        past the closing tag. When none of `right_tag_patterns` is found,
        falls back to the lower-cased tail of the block and `len(block)`.
        """
        for p in self.right_tag_patterns:
            tag = p % left_tag
            i = self._recursive_tagfind(
                "<%s" % left_tag, tag, left_index, block
            )
            if i > 2:
                return tag.lstrip("<").rstrip(">"), i
        return block.rstrip()[-left_index:-1].lower(), len(block)

    def _equal_tags(self, left_tag, right_tag):
        """
        Return True when `right_tag` closes `left_tag`.

        Tags starting with "?", "@" or "%" always count as closed, as does
        a "--" comment pair. An empty `left_tag` is never a prefix match
        (slicing is used so it does not raise).
        """
        if left_tag[:1] in ('?', '@', '%'):  # handle PHP, etc.
            return True
        if ("/" + left_tag) == right_tag:
            return True
        return right_tag == "--" and left_tag == "--"

    def _is_oneliner(self, tag):
        """Return True for tags that never have a closing tag (`hr`)."""
        return tag in ('hr', 'hr/')

    def _stringindex_to_listindex(self, stringindex, items):
        """
        Same effect as concatenating the strings in items,
        finding the character to which stringindex refers in that string,
        and returning the index of the item in which that character resides.

        `items` is not modified; a padded copy is walked instead so that an
        index equal to the total length still resolves.
        """
        padded = list(items)
        padded.append('dummy')
        i, count = 0, 0
        while count <= stringindex:
            count += len(padded[i])
            i += 1
        return i - 1

    def _nested_markdown_in_html(self, items):
        """
        Find and process html child elements of the given element block.

        `items` is modified in place (placeholders are inserted, raw html
        runs are replaced by a single placeholder) and also returned.
        """
        # NOTE: `items` is edited while being enumerated; the iteration
        # deliberately continues over the updated list.
        for i, item in enumerate(items):
            if self.left_tag_re.match(item):
                remainder = ''.join(items[i:])
                left_tag, left_index, attrs = self._get_left_tag(remainder)
                right_tag, data_index = self._get_right_tag(
                    left_tag, left_index, remainder)
                right_listindex = \
                    self._stringindex_to_listindex(data_index, items[i:]) + i
                if 'markdown' in attrs:
                    items[i] = items[i][left_index:]  # remove opening tag
                    placeholder = self.markdown.htmlStash.store_tag(
                        left_tag, attrs, i + 1, right_listindex + 1)
                    items.insert(i, placeholder)
                    if len(items) - right_listindex <= 1:  # last nest, no tail
                        right_listindex -= 1
                    items[right_listindex] = items[right_listindex][
                        :-len(right_tag) - 2]  # remove closing tag
                else:  # raw html
                    if len(items) - right_listindex <= 1:  # last element
                        right_listindex -= 1
                    if right_listindex <= i:
                        right_listindex = i + 1
                    placeholder = self.markdown.htmlStash.store('\n\n'.join(
                        items[i:right_listindex]))
                    del items[i:right_listindex]
                    items.insert(i, placeholder)
        return items

    def _store_html_items(self, items, left_tag, left_index, right_tag,
                          attrs, new_blocks):
        """
        Stash the collected blocks of one html element.

        Appends the resulting placeholder(s) to `new_blocks`. With
        `markdown_in_raw` set and a `markdown` attribute present, the
        opening and closing tags are cut off, the element is registered via
        `htmlStash.store_tag`, nested elements are processed, and the
        stored `right_index` is adjusted by the number of nested tags.
        Otherwise the blocks are stashed as one raw html string.

        `items` is modified in place in the markdown case.
        """
        stash = self.markdown.htmlStash
        if not (self.markdown_in_raw and 'markdown' in attrs):
            new_blocks.append(stash.store('\n\n'.join(items)))
            return

        items[0] = items[0][left_index:]
        items[-1] = items[-1][:-len(right_tag) - 2]
        if items[-1]:  # not a newline/empty string
            right_index = len(items) + 3
        else:
            right_index = len(items) + 2
        new_blocks.append(stash.store_tag(left_tag, attrs, 0, right_index))
        placeholderslen = len(stash.tag_data)
        new_blocks.extend(self._nested_markdown_in_html(items))
        nests = len(stash.tag_data) - placeholderslen
        stash.tag_data[-1 - nests]['right_index'] += nests - 2

    def run(self, lines):
        """
        Replace html blocks in the document with stash placeholders.

        `lines` is the document as a list of lines. The text is processed
        in blank-line-separated blocks: blocks that open a block-level
        element (or a comment / processing instruction) are stashed in
        `self.markdown.htmlStash`, spanning several blocks when the closing
        tag comes later; everything else passes through unchanged.

        Returns the resulting document as a new list of lines.
        """
        # rsplit (not split) on purpose: with overlapping separators such
        # as "\n\n\n" the two produce different pieces.
        text = "\n".join(lines).rsplit("\n\n")
        new_blocks = []
        items = []  # blocks of the element currently being collected
        left_tag = ''
        right_tag = ''
        left_index = 0
        attrs = {}
        in_tag = False  # flag

        while text:
            block = text[0]
            text = text[1:]
            # Strip at most two leading newlines left over from the split.
            for _ in range(2):
                if block.startswith("\n"):
                    block = block[1:]

            if not in_tag:
                if block.startswith("<") and len(block.strip()) > 1:

                    if block[1:4] == "!--":
                        # is a comment block
                        left_tag, left_index, attrs = "--", 2, {}
                    else:
                        left_tag, left_index, attrs = self._get_left_tag(block)
                    right_tag, data_index = self._get_right_tag(left_tag,
                                                                left_index,
                                                                block)
                    # keep checking conditions below and maybe just append

                    if data_index < len(block) and \
                            (util.isBlockLevel(left_tag) or left_tag == '--'):
                        # text follows the closing tag: process it as its
                        # own block on the next iteration
                        text.insert(0, block[data_index:])
                        block = block[:data_index]

                    if not (util.isBlockLevel(left_tag) or
                            block[1] in ["!", "?", "@", "%"]):
                        new_blocks.append(block)
                        continue

                    if self._is_oneliner(left_tag):
                        new_blocks.append(block.strip())
                        continue

                    if block.rstrip().endswith(">") \
                            and self._equal_tags(left_tag, right_tag):
                        if self.markdown_in_raw and 'markdown' in attrs:
                            block = block[left_index:-len(right_tag) - 2]
                            new_blocks.append(self.markdown.htmlStash.
                                              store_tag(left_tag, attrs, 0, 2))
                            new_blocks.append(block)
                        else:
                            new_blocks.append(
                                self.markdown.htmlStash.store(block.strip()))
                        continue
                    else:
                        # if is block level tag and is not complete
                        if (not self._equal_tags(left_tag, right_tag)) and \
                           (util.isBlockLevel(left_tag) or left_tag == "--"):
                            items.append(block.strip())
                            in_tag = True
                        else:
                            new_blocks.append(
                                self.markdown.htmlStash.store(block.strip())
                            )
                        continue

                else:
                    new_blocks.append(block)

            else:
                items.append(block)

                # Need to evaluate all items so we can calculate relative to
                # the left index.
                right_tag, data_index = self._get_right_tag(
                    left_tag, left_index, ''.join(items))
                # Adjust data_index: relative to items -> relative to last
                # block
                data_index -= sum(len(item) for item in items[:-1])

                if self._equal_tags(left_tag, right_tag):
                    # if find closing tag

                    if data_index < len(block):
                        # we have more text after right_tag
                        items[-1] = block[:data_index]
                        text.insert(0, block[data_index:])

                    in_tag = False
                    self._store_html_items(items, left_tag, left_index,
                                           right_tag, attrs, new_blocks)
                    items = []

        if items:
            # The document ended inside an unclosed element: stash whatever
            # was collected.
            self._store_html_items(items, left_tag, left_index, right_tag,
                                   attrs, new_blocks)
            new_blocks.append('\n')

        new_text = "\n\n".join(new_blocks)
        return new_text.split("\n")
322
323
324
class ReferencePreprocessor(Preprocessor):
    """ Remove reference definitions from text and store for later use. """

    # A title in double quotes, single quotes or parentheses, optionally
    # surrounded by spaces.
    TITLE = r'[ ]*(\"(.*)\"|\'(.*)\'|\((.*)\))[ ]*'
    # `[id]: link` with an optional title on the same line.
    RE = re.compile(
        r'^[ ]{0,3}\[([^\]]*)\]:\s*([^ ]*)[ ]*(%s)?$' % TITLE, re.DOTALL
    )
    # A line holding nothing but a title.
    TITLE_RE = re.compile(r'^%s$' % TITLE)

    def run(self, lines):
        """
        Strip reference definitions out of `lines` and record them.

        Each definition is stored in `self.markdown.references` under its
        stripped, lower-cased id as a `(link, title)` tuple; `title` is
        None when no title is given. A title may also sit alone on the line
        following the definition, in which case that line is consumed too.

        Returns a new list with the remaining lines. The input list is
        left untouched.
        """
        new_text = []
        total = len(lines)
        index = 0
        while index < total:
            line = lines[index]
            index += 1
            m = self.RE.match(line)
            if not m:
                new_text.append(line)
                continue

            ref_id = m.group(1).strip().lower()
            link = m.group(2).lstrip('<').rstrip('>')
            title = m.group(5) or m.group(6) or m.group(7)
            # Check next line for title, but only if there is a next line:
            # a definition may be the very last line of the document.
            if not title and index < total:
                tm = self.TITLE_RE.match(lines[index])
                if tm:
                    index += 1
                    title = tm.group(2) or tm.group(3) or tm.group(4)
            self.markdown.references[ref_id] = (link, title)

        return new_text
353