Completed
Push — master ( 32cfa8...ec62d3 )
by Dongxin
48s
created

pythonx/markdown_lib/blockprocessors.py (2 issues)

1
"""
2
CORE MARKDOWN BLOCKPARSER
3
===========================================================================
4
5
This parser handles basic parsing of Markdown blocks.  It doesn't concern
6
itself with inline elements such as **bold** or *italics*, but rather just
7
catches blocks, lists, quotes, etc.
8
9
The BlockParser is made up of a bunch of BlockProcessors, each handling a
10
different type of block. Extensions may add/replace/remove BlockProcessors
11
as they need to alter how markdown blocks are parsed.
12
"""
13
14
from __future__ import absolute_import
15
from __future__ import division
16
from __future__ import unicode_literals
17
import logging
18
import re
19
from . import util
20
from .blockparser import BlockParser
21
22
logger = logging.getLogger('MARKDOWN')
23
24
25
def build_block_parser(md_instance, **kwargs):
    """ Build the default block parser used by Markdown.

    A ``BlockParser`` is created for ``md_instance`` and one instance of each
    default block processor is stored in ``parser.blockprocessors`` under its
    name, in the order listed below. Extra keyword arguments are accepted
    but not used.

    Returns the populated parser.
    """
    parser = BlockParser(md_instance)
    # (registration key, processor class) pairs, in registration order.
    default_processors = (
        ('empty', EmptyBlockProcessor),
        ('indent', ListIndentProcessor),
        ('code', CodeBlockProcessor),
        ('hashheader', HashHeaderProcessor),
        ('setextheader', SetextHeaderProcessor),
        ('hr', HRProcessor),
        ('olist', OListProcessor),
        ('ulist', UListProcessor),
        ('quote', BlockQuoteProcessor),
        ('paragraph', ParagraphProcessor),
    )
    for key, processor_class in default_processors:
        parser.blockprocessors[key] = processor_class(parser)
    return parser
39
40
41
class BlockProcessor(object):
    """ Base class for block processors.

    A block processor pairs a ``test`` method, which reports whether a block
    of text belongs to this processor, with a ``run`` method, which consumes
    the block and edits the element tree. Subclasses must supply their own
    ``test`` and ``run``; when ``test`` passes, the parser calls ``run``.

    The remaining methods are small helpers shared by the subclasses.

    """

    def __init__(self, parser):
        self.parser = parser
        # Indent width (in spaces) that counts as one "tab".
        self.tab_length = parser.markdown.tab_length

    def lastChild(self, parent):
        """ Return the last child of an etree element, or None if empty. """
        return parent[-1] if len(parent) else None

    def detab(self, text):
        """ Remove a tab from the front of each line of the given text.

        Lines are consumed from the top for as long as they are indented by
        at least ``tab_length`` spaces or are blank. Returns a 2-tuple of
        (detabbed leading lines, untouched remaining lines), each joined
        back together with newlines.
        """
        indent = ' ' * self.tab_length
        lines = text.split('\n')
        detabbed = []
        for line in lines:
            if line.startswith(indent):
                detabbed.append(line[self.tab_length:])
            elif line.strip():
                # First non-blank line without the indent ends the run.
                break
            else:
                detabbed.append('')
        remainder = lines[len(detabbed):]
        return '\n'.join(detabbed), '\n'.join(remainder)

    def looseDetab(self, text, level=1):
        """ Remove a tab from front of lines but allowing dedented lines.

        Every line that starts with ``level`` tabs worth of spaces loses
        that prefix; all other lines are passed through unchanged.
        """
        width = self.tab_length * level
        indent = ' ' * width
        return '\n'.join(
            line[width:] if line.startswith(indent) else line
            for line in text.split('\n')
        )

    def test(self, parent, block):
        """ Test for block type. Must be overridden by subclasses.

        The parser calls ``test`` on each processor in turn to find out
        whether the given block of text is of that processor's type. The
        method must return a boolean ``True`` or ``False``. How the check is
        made is up to the block type: it may be as simple as
        ``block.startswith(some_string)`` or a complex regular expression.
        Because the block type can depend on where the block sits (i.e.
        inside a list), the parent etree element is passed as well.

        Keywords:

        * ``parent``: A etree element which will be the parent of the block.
        * ``block``: A block of text from the source which has been split at
            blank lines.
        """
        pass  # pragma: no cover

    def run(self, parent, blocks):
        """ Run processor. Must be overridden by subclasses.

        Called by the parser once it has determined that this processor
        handles the block. The method should parse the lines of the block
        and append the result to the etree.

        Both arguments are references to objects that must be edited in
        place; there is no mechanism to hand back replacement objects. In
        practice this means adding SubElements or text to ``parent`` and
        removing (``pop``) or adding (``insert``) entries in ``blocks``.

        Keywords:

        * ``parent``: A etree element which is the parent of the current block.
        * ``blocks``: A list of all remaining blocks of the document.
        """
        pass  # pragma: no cover
127
128
129
class ListIndentProcessor(BlockProcessor):
    """ Process children of list items.

    Example:
        * a list item
            process this part

            or this part

    """

    ITEM_TYPES = ['li']
    LIST_TYPES = ['ul', 'ol']

    def __init__(self, *args):
        super(ListIndentProcessor, self).__init__(*args)
        # Matches one or more whole tabs of leading spaces.
        self.INDENT_RE = re.compile(r'^(([ ]{%s})+)' % self.tab_length)

    def test(self, parent, block):
        """ Accept indented blocks that sit in or directly after a list.

        The block must start with a full tab of spaces, the parser must not
        already be in the 'detabbed' state, and the parent must either be a
        list item or have a list as its last child.
        """
        if not block.startswith(' ' * self.tab_length):
            return False
        if self.parser.state.isstate('detabbed'):
            return False
        if parent.tag in self.ITEM_TYPES:
            return True
        return (len(parent) and parent[-1] is not None and
                parent[-1].tag in self.LIST_TYPES)

    def run(self, parent, blocks):
        """ Detab the first block and parse it under the right list element. """
        block = blocks.pop(0)
        level, sibling = self.get_level(parent, block)
        block = self.looseDetab(block, level)

        self.parser.state.set('detabbed')
        if parent.tag in self.ITEM_TYPES:
            # The parent may hold a 'ul' or 'ol' child list with a member.
            # If so, that list is the real parent. This catches the edge
            # case of an indented list whose first member was parsed before
            # this point (see OListProcessor). Otherwise the parent is
            # already a li and the child block is parsed directly into it.
            if len(parent) and parent[-1].tag in self.LIST_TYPES:
                target = parent[-1]
            else:
                target = parent
            self.parser.parseBlocks(target, [block])
        elif sibling.tag in self.ITEM_TYPES:
            # The sibling is a li. Use it as parent.
            self.parser.parseBlocks(sibling, [block])
        elif len(sibling) and sibling[-1].tag in self.ITEM_TYPES:
            # The sibling is a list (``ol`` or ``ul``) which has children.
            # Assume its last child li is the parent of this block.
            last_item = sibling[-1]
            if last_item.text:
                # Bare text in the li has to move into a p. The p is
                # inserted at position 0 because other children (i.e. a
                # nested sublist) may already exist.
                paragraph = util.etree.Element('p')
                paragraph.text = last_item.text
                last_item.text = ''
                last_item.insert(0, paragraph)
            self.parser.parseChunk(last_item, block)
        else:
            self.create_item(sibling, block)
        self.parser.state.reset()

    def create_item(self, parent, block):
        """ Create a new li and parse the block with it as the parent. """
        new_item = util.etree.SubElement(parent, 'li')
        self.parser.parseBlocks(new_item, [block])

    def get_level(self, parent, block):
        """ Get level of indent based on list level.

        Returns a ``(level, element)`` tuple: the list-nesting level reached
        and the element found at that level while walking down through the
        last children of ``parent``.
        """
        # Indent of the block, measured in tabs.
        indent_match = self.INDENT_RE.match(block)
        if indent_match:
            indent_level = len(indent_match.group(1)) / self.tab_length
        else:
            indent_level = 0
        # In a tight list we already are at the correct parent (level 1);
        # in a loose list the parent still has to be found (level 0).
        level = 1 if self.parser.state.isstate('list') else 0
        # Step through children of tree to find matching indent level.
        while indent_level > level:
            child = self.lastChild(parent)
            if child is None or (child.tag not in self.LIST_TYPES and
                                 child.tag not in self.ITEM_TYPES):
                # No more child levels. If we're short of indent_level,
                # we have a code block. So we stop here.
                break
            if child.tag in self.LIST_TYPES:
                level += 1
            parent = child
        return level, parent
222
223
224
class CodeBlockProcessor(BlockProcessor):
    """ Process code blocks. """

    def test(self, parent, block):
        """ A code block starts with a full tab of leading spaces. """
        return block.startswith(' ' * self.tab_length)

    def run(self, parent, blocks):
        """ Add the first block to a new or preceding ``pre``/``code`` pair. """
        previous = self.lastChild(parent)
        code_text, remainder = self.detab(blocks.pop(0))
        code_text = code_text.rstrip()
        if (previous is not None and previous.tag == "pre" and
                len(previous) and previous[0].tag == "code"):
            # The previous block was a code block. As blank lines do not
            # start new code blocks, append this block to the previous one,
            # adding back the linebreaks removed by the split into a list.
            code = previous[0]
            code.text = util.AtomicString(
                '%s\n%s\n' % (code.text, code_text)
            )
        else:
            # This is a new codeblock. Create the elements and insert text.
            pre = util.etree.SubElement(parent, 'pre')
            code = util.etree.SubElement(pre, 'code')
            code.text = util.AtomicString('%s\n' % code_text)
        if remainder:
            # The block contained unindented line(s) after the indented
            # ones. Put them back at the front of the master blocks list
            # for future processing.
            blocks.insert(0, remainder)
255
256
257
class BlockQuoteProcessor(BlockProcessor):
    """ Process blockquotes (lines introduced by ``>``). """

    # ``>`` after up to three spaces at the start of any line of the block;
    # group 2 holds the text that follows the marker.
    RE = re.compile(r'(^|\n)[ ]{0,3}>[ ]?(.*)')

    def test(self, parent, block):
        return bool(self.RE.search(block))

    def run(self, parent, blocks):
        """ Parse the first block into a new or preceding blockquote. """
        block = blocks.pop(0)
        found = self.RE.search(block)
        if found:
            quote_start = found.start()
            # Lines before the blockquote are passed in recursively so that
            # they are parsed first.
            self.parser.parseBlocks(parent, [block[:quote_start]])
            # Remove ``> `` from the beginning of each remaining line.
            quoted_lines = block[quote_start:].split('\n')
            block = '\n'.join(self.clean(line) for line in quoted_lines)
        previous = self.lastChild(parent)
        if previous is not None and previous.tag == "blockquote":
            # Previous block was a blockquote, so it becomes the parent.
            quote = previous
        else:
            # This is a new blockquote. Create a new parent element.
            quote = util.etree.SubElement(parent, 'blockquote')
        # Recursively parse the block with the blockquote as parent. The
        # parser state is changed so blockquotes embedded in lists use p tags.
        self.parser.state.set('blockquote')
        self.parser.parseChunk(quote, block)
        self.parser.state.reset()

    def clean(self, line):
        """ Remove ``>`` from beginning of a line. """
        if line.strip() == ">":
            return ""
        found = self.RE.match(line)
        return found.group(2) if found else line
297
298
299
class OListProcessor(BlockProcessor):
    """ Process ordered list blocks. """

    TAG = 'ol'
    # The integer (python string) with which the list starts (default=1)
    # Eg: If the list is initialized as
    #   3. Item
    # the ol tag will get a start="3" attribute
    STARTSWITH = '1'
    # List of allowed sibling tags.
    SIBLING_TAGS = ['ol', 'ul']
    # Leading digits of a list marker. Compiled once here rather than on
    # every ``get_items`` call.
    _INTEGER_RE = re.compile(r'(\d+)')

    def __init__(self, parser):
        super(OListProcessor, self).__init__(parser)
        # Detect an item (``1. item``). ``group(1)`` contains contents of item.
        self.RE = re.compile(r'^[ ]{0,%d}\d+\.[ ]+(.*)' % (self.tab_length - 1))
        # Detect items on secondary lines. they can be of either list type.
        self.CHILD_RE = re.compile(r'^[ ]{0,%d}((\d+\.)|[*+-])[ ]+(.*)' %
                                   (self.tab_length - 1))
        # Detect indented (nested) items of either type
        self.INDENT_RE = re.compile(r'^[ ]{%d,%d}((\d+\.)|[*+-])[ ]+.*' %
                                    (self.tab_length, self.tab_length * 2 - 1))

    def test(self, parent, block):
        """ Return True when the block starts with a list item marker. """
        return bool(self.RE.match(block))

    def run(self, parent, blocks):
        """ Parse the first block as list items under the right list element.

        The items are added to the preceding sibling list if there is one,
        to ``parent`` itself if it is a list, or to a newly created list.
        """
        # Check for multiple items in one block.
        items = self.get_items(blocks.pop(0))
        sibling = self.lastChild(parent)

        if sibling is not None and sibling.tag in self.SIBLING_TAGS:
            # Previous block was a list, so continue that list.
            lst = sibling
            last_item = lst[-1]
            # Make sure the previous item is in a p - if the item has text,
            # then it isn't in a p.
            if last_item.text:
                # Other children may already exist for this item, so the p
                # can't just be appended; insert it as the first child.
                p = util.etree.Element('p')
                p.text = last_item.text
                last_item.text = ''
                last_item.insert(0, p)
            # If the last child of the item has a tail, the tail needs to be
            # put in a p. Likely only when a header is not followed by a
            # blank line.
            lch = self.lastChild(last_item)
            if lch is not None and lch.tail:
                p = util.etree.SubElement(last_item, 'p')
                p.text = lch.tail.lstrip()
                lch.tail = ''

            # Parse first item differently as it gets wrapped in a p.
            li = util.etree.SubElement(lst, 'li')
            self.parser.state.set('looselist')
            firstitem = items.pop(0)
            self.parser.parseBlocks(li, [firstitem])
            self.parser.state.reset()
        elif parent.tag in ['ol', 'ul']:
            # This catches the edge case of a multi-item indented list whose
            # first item is in a blank parent-list item:
            # * * subitem1
            #     * subitem2
            # see also ListIndentProcessor
            lst = parent
        else:
            # This is a new list so create parent with appropriate tag.
            lst = util.etree.SubElement(parent, self.TAG)
            # Check if a custom start integer is set
            if not self.parser.markdown.lazy_ol and self.STARTSWITH != '1':
                lst.attrib['start'] = self.STARTSWITH

        self.parser.state.set('list')
        # Loop through items in block, recursively parsing each with the
        # appropriate parent.
        for item in items:
            if item.startswith(' ' * self.tab_length):
                # Item is indented. Parse with last item as parent
                self.parser.parseBlocks(lst[-1], [item])
            else:
                # New item. Create li and parse with it as parent
                li = util.etree.SubElement(lst, 'li')
                self.parser.parseBlocks(li, [item])
        self.parser.state.reset()

    def get_items(self, block):
        """ Break a block into list items.

        Returns a list of strings, one per item. As a side effect, when
        ``TAG`` is ``'ol'`` the number of the first item is stored in
        ``self.STARTSWITH``.
        """
        items = []
        for line in block.split('\n'):
            m = self.CHILD_RE.match(line)
            if m:
                # This is a new list item
                # Check first item for the start index
                if not items and self.TAG == 'ol':
                    # Detect the integer value of first list item
                    self.STARTSWITH = self._INTEGER_RE.match(
                        m.group(1)).group()
                # Append to the list
                items.append(m.group(3))
            elif self.INDENT_RE.match(line):
                # This is an indented (possibly nested) item.
                if items[-1].startswith(' ' * self.tab_length):
                    # Previous item was indented. Append to that item.
                    items[-1] = '%s\n%s' % (items[-1], line)
                else:
                    items.append(line)
            else:
                # This is another line of previous item. Append to that item.
                items[-1] = '%s\n%s' % (items[-1], line)
        return items
409
410
411
class UListProcessor(OListProcessor):
    """ Process unordered list blocks. """

    TAG = 'ul'

    def __init__(self, parser):
        super(UListProcessor, self).__init__(parser)
        # A bullet may be indented by at most one space less than a tab.
        max_indent = self.tab_length - 1
        # Detect an item (``* item``, ``+ item`` or ``- item``).
        # ``group(1)`` contains contents of item.
        self.RE = re.compile(r'^[ ]{0,%d}[*+-][ ]+(.*)' % max_indent)
420
421
422
class HashHeaderProcessor(BlockProcessor):
    """ Process Hash Headers. """

    # Detect a header at start of any line in block
    RE = re.compile(r'(^|\n)(?P<level>#{1,6})(?P<header>.*?)#*(\n|$)')

    def test(self, parent, block):
        """ Return True when any line of the block is a hash header. """
        return bool(self.RE.search(block))

    def run(self, parent, blocks):
        """ Turn the first hash header of the first block into an ``hN``.

        Lines before the header are parsed into ``parent`` first; lines
        after it are put back at the front of ``blocks`` for later parsing.
        """
        block = blocks.pop(0)
        m = self.RE.search(block)
        if m:
            before = block[:m.start()]  # All lines before header
            after = block[m.end():]     # All lines after header
            if before:
                # As the header was not the first line of the block and the
                # lines before the header must be parsed first,
                # recursively parse these lines as a block.
                self.parser.parseBlocks(parent, [before])
            # Create header using named groups from RE; the number of
            # hashes selects the heading level.
            h = util.etree.SubElement(parent, 'h%d' % len(m.group('level')))
            h.text = m.group('header').strip()
            if after:
                # Insert remaining lines as first block for future parsing.
                blocks.insert(0, after)
        else:  # pragma: no cover
            # This should never happen, but just in case...
            # ``Logger.warn`` is deprecated; ``warning`` with a lazy
            # argument emits the same message.
            logger.warning("We've got a problem header: %r", block)
451
452
453
class SetextHeaderProcessor(BlockProcessor):
    """ Process Setext-style Headers. """

    # Detect Setext-style header. Must be first 2 lines of block.
    RE = re.compile(r'^.*?\n[=-]+[ ]*(\n|$)', re.MULTILINE)

    def test(self, parent, block):
        return bool(self.RE.match(block))

    def run(self, parent, blocks):
        """ Build a header from the first two lines of the first block. """
        lines = blocks.pop(0).split('\n')
        title, underline = lines[0], lines[1]
        # Determine level. ``=`` is 1 and ``-`` is 2.
        level = 1 if underline.startswith('=') else 2
        header = util.etree.SubElement(parent, 'h%d' % level)
        header.text = title.strip()
        if len(lines) > 2:
            # Block contains additional lines. Add to master blocks for later.
            blocks.insert(0, '\n'.join(lines[2:]))
474
475
476
class HRProcessor(BlockProcessor):
    """ Process Horizontal Rules. """

    RE = r'^[ ]{0,3}((-+[ ]{0,2}){3,}|(_+[ ]{0,2}){3,}|(\*+[ ]{0,2}){3,})[ ]*'
    # Detect hr on any line of a block.
    SEARCH_RE = re.compile(RE, re.MULTILINE)

    def test(self, parent, block):
        """ Return True if a line of the block is a horizontal rule.

        On success the match object is stored on ``self.match`` so that
        ``run`` can reuse it.
        """
        found = self.SEARCH_RE.search(block)
        if not found:
            return False
        # No atomic grouping in python so we simulate it here for
        # performance. The regex only matches what would be in the atomic
        # group - the HR. The match must then run to the end of the block
        # or be followed directly by a newline.
        end = found.end()
        if end != len(block) and block[end] != '\n':
            return False
        # Save match object on class instance so we can use it later.
        self.match = found
        return True

    def run(self, parent, blocks):
        """ Insert an ``hr`` and hand on the text around the rule. """
        block = blocks.pop(0)
        rule = self.match
        # Check for lines in block before hr.
        leading = block[:rule.start()].rstrip('\n')
        if leading:
            # Recursively parse lines before hr so they get parsed first.
            self.parser.parseBlocks(parent, [leading])
        # create hr
        util.etree.SubElement(parent, 'hr')
        # check for lines in block after hr.
        trailing = block[rule.end():].lstrip('\n')
        if trailing:
            # Add lines after hr to master blocks for later parsing.
            blocks.insert(0, trailing)
509
510
511
class EmptyBlockProcessor(BlockProcessor):
    """ Process blocks that are empty or start with an empty line. """

    def test(self, parent, block):
        if not block:
            return True
        return block.startswith('\n')

    def run(self, parent, blocks):
        """ Drop the empty part of the first block.

        If the last child of ``parent`` is a ``pre``/``code`` pair, the
        dropped whitespace is appended to the code text instead of being
        discarded.
        """
        block = blocks.pop(0)
        if block:
            # Starts with empty line: only replace a single line and save
            # the rest for later.
            filler = '\n'
            remainder = block[1:]
            if remainder:
                # Add remaining lines to master blocks for later.
                blocks.insert(0, remainder)
        else:
            filler = '\n\n'
        previous = self.lastChild(parent)
        if (previous is not None and previous.tag == 'pre' and
                len(previous) and previous[0].tag == 'code'):
            # Last block is a codeblock. Append to preserve whitespace.
            code = previous[0]
            code.text = util.AtomicString('%s%s' % (code.text, filler))
536
537
538
class ParagraphProcessor(BlockProcessor):
    """ Process Paragraph blocks. """

    def test(self, parent, block):
        """ Accept every block. """
        return True

    def run(self, parent, blocks):
        """ Add the first block to ``parent`` as paragraph text.

        Blank blocks are thrown away. Outside the 'list' state the text is
        wrapped in a new ``p``; inside it the text is attached directly to
        the parent or to the tail of its last child.
        """
        block = blocks.pop(0)
        if not block.strip():
            # Blank block: nothing to add.
            return

        if not self.parser.state.isstate('list'):
            # Create a regular paragraph
            paragraph = util.etree.SubElement(parent, 'p')
            paragraph.text = block.lstrip()
            return

        # The parent is a tight-list.
        #
        # Check for any children. This will likely only happen in a
        # tight-list when a header isn't followed by a blank line.
        # For example:
        #
        #     * # Header
        #     Line 2 of list item - not part of header.
        previous = self.lastChild(parent)
        if previous is not None:
            # Insert after the last child, via its tail.
            if previous.tail:
                previous.tail = '%s\n%s' % (previous.tail, block)
            else:
                previous.tail = '\n%s' % block
        elif parent.text:
            # Append to parent.text
            parent.text = '%s\n%s' % (parent.text, block)
        else:
            parent.text = block.lstrip()
574