Completed
Push — master ( 32cfa8...ec62d3 )
by Dongxin
48s
created

EscapePattern   A

Complexity

Total Complexity 2

Size/Duplication

Total Lines 9
Duplicated Lines 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
c 1
b 0
f 0
dl 0
loc 9
rs 10
wmc 2

1 Method

Rating   Name   Duplication   Size   Complexity  
A handleMatch() 0 6 2
1
"""
2
INLINE PATTERNS
3
=============================================================================
4
5
Inline patterns such as *emphasis* are handled by means of auxiliary
6
objects, one per pattern.  Pattern objects must be instances of classes
7
that extend markdown.Pattern.  Each pattern object uses a single regular
8
expression and needs to support the following methods:
9
10
    pattern.getCompiledRegExp() # returns a regular expression
11
12
    pattern.handleMatch(m) # takes a match object and returns
13
                           # an ElementTree element or just plain text
14
15
All of python markdown's built-in patterns subclass from Pattern,
16
but you can add additional patterns that don't.
17
18
Also note that all the regular expressions used by inline must
19
capture the whole block.  For this reason, they all start with
20
'^(.*?)' and end with '(.*)$'.  For the built-in expressions,
20
Pattern takes care of adding the "^(.*?)" and "(.*)$".
22
23
Finally, the order in which regular expressions are applied is very
24
important - e.g. if we first replace http://.../ links with <a> tags
25
and _then_ try to replace inline html, we would end up with a mess.
26
So, we apply the expressions in the following order:
27
28
* escape and backticks have to go before everything else, so
29
  that we can preempt any markdown patterns by escaping them.
30
31
* then we handle auto-links (must be done before inline html)
32
33
* then we handle inline HTML.  At this point we will simply
34
  replace all inline HTML strings with a placeholder and add
35
  the actual HTML to a hash.
36
37
* then inline images (must be done before links)
38
39
* then bracketed links, first regular then reference-style
40
41
* finally we apply strong and emphasis
42
"""
43
44
from __future__ import absolute_import
45
from __future__ import unicode_literals
46
from . import util
47
from . import odict
48
import re
49
try:  # pragma: no cover
50
    from urllib.parse import urlparse, urlunparse
51
except ImportError:  # pragma: no cover
52
    from urlparse import urlparse, urlunparse
53
try:  # pragma: no cover
54
    from html import entities
55
except ImportError:  # pragma: no cover
56
    import htmlentitydefs as entities
57
58
59
def build_inlinepatterns(md_instance, **kwargs):
    """Build the default, ordered set of inline patterns for Markdown.

    Keyword arguments:

    * md_instance: the Markdown instance; its `safeMode` and
      `smart_emphasis` attributes select which patterns are registered,
      and it is handed to every pattern class that accepts one.
    * kwargs: accepted but not used here.

    Returns an `odict.OrderedDict` mapping pattern names to pattern
    objects, in the order they were registered.

    """
    inlinePatterns = odict.OrderedDict()

    registrations = [
        ("backtick", BacktickPattern(BACKTICK_RE)),
        ("escape", EscapePattern(ESCAPE_RE, md_instance)),
        ("reference", ReferencePattern(REFERENCE_RE, md_instance)),
        ("link", LinkPattern(LINK_RE, md_instance)),
        ("image_link", ImagePattern(IMAGE_LINK_RE, md_instance)),
        ("image_reference",
         ImageReferencePattern(IMAGE_REFERENCE_RE, md_instance)),
        ("short_reference", ReferencePattern(SHORT_REF_RE, md_instance)),
        ("autolink", AutolinkPattern(AUTOLINK_RE, md_instance)),
        ("automail", AutomailPattern(AUTOMAIL_RE, md_instance)),
        ("linebreak", SubstituteTagPattern(LINE_BREAK_RE, 'br')),
    ]

    # Raw inline HTML is only registered when safe mode is not 'escape'.
    if md_instance.safeMode != 'escape':
        registrations.append(("html", HtmlPattern(HTML_RE, md_instance)))

    registrations.extend([
        ("entity", HtmlPattern(ENTITY_RE, md_instance)),
        ("not_strong", SimpleTextPattern(NOT_STRONG_RE)),
        ("em_strong", DoubleTagPattern(EM_STRONG_RE, 'strong,em')),
        ("strong_em", DoubleTagPattern(STRONG_EM_RE, 'em,strong')),
        ("strong", SimpleTagPattern(STRONG_RE, 'strong')),
        ("emphasis", SimpleTagPattern(EMPHASIS_RE, 'em')),
    ])

    # Underscore emphasis uses one of two expressions depending on the
    # `smart_emphasis` setting; both are registered under the same name.
    if md_instance.smart_emphasis:
        underscore_re = SMART_EMPHASIS_RE
    else:
        underscore_re = EMPHASIS_2_RE
    registrations.append(("emphasis2", SimpleTagPattern(underscore_re, 'em')))

    for name, pattern_obj in registrations:
        inlinePatterns[name] = pattern_obj
    return inlinePatterns
89
90
91
"""
92
The actual regular expressions for patterns
93
-----------------------------------------------------------------------------
94
"""
95
96
# NOTE: Pattern.__init__ wraps every expression below as
# "^(.*?)<expression>(.*)$".  Group 1 is therefore always the text before
# the match, the groups written in these expressions are numbered from 2,
# and back-references such as \2, \3 and \12 rely on that offset.

# Any run of characters other than '[' and ']'.
NOBRACKET = r'[^\]\[]*'
97
# Bracketed text "[...]".  The six repeated sub-expressions allow nested
# brackets up to six levels deep; together with the outer group they
# contribute seven capturing groups.
BRK = (
98
    r'\[(' +
99
    (NOBRACKET + r'(\[')*6 +
100
    (NOBRACKET + r'\])*')*6 +
101
    NOBRACKET + r')\]'
102
)
103
# Negative look-behind: the match must not be preceded by '!'.
NOIMG = r'(?<!\!)'
104
105
# `e=f()` or ``e=f("`")``
# The first alternative matches a run of doubled backslashes that is
# followed by backticks; the second matches a backtick-delimited span.
106
BACKTICK_RE = r'(?:(?<!\\)((?:\\{2})+)(?=`+)|(?<!\\)(`+)(.+?)(?<!`)\3(?!`))'
107
108
# \<  (a backslash followed by any single character)
109
ESCAPE_RE = r'\\(.)'
110
111
# *emphasis*
112
EMPHASIS_RE = r'(\*)([^\*]+)\2'
113
114
# **strong** or __strong__
115
STRONG_RE = r'(\*{2}|_{2})(.+?)\2'
116
117
# ***strongem*** or ***em*strong**
118
EM_STRONG_RE = r'(\*|_)\2{2}(.+?)\2(.*?)\2{2}'
119
120
# ***strong**em*
121
STRONG_EM_RE = r'(\*|_)\2{2}(.+?)\2{2}(.*?)\2'
122
123
# _smart_emphasis_  (underscores must not touch word characters outside)
124
SMART_EMPHASIS_RE = r'(?<!\w)(_)(?!_)(.+?)(?<!_)\2(?!\w)'
125
126
# _emphasis_
127
EMPHASIS_2_RE = r'(_)(.+?)\2'
128
129
# [text](url) or [text](<url>) or [text](url "title")
130
LINK_RE = NOIMG + BRK + \
131
    r'''\(\s*(<.*?>|((?:(?:\(.*?\))|[^\(\)]))*?)\s*((['"])(.*?)\12\s*)?\)'''
132
133
# ![alttxt](http://x.com/) or ![alttxt](<http://x.com/>)
134
IMAGE_LINK_RE = r'\!' + BRK + r'\s*\(\s*(<.*?>|([^"\)\s]+\s*"[^"]*"|[^\)\s]*))\s*\)'
135
136
# [Google][3]
137
REFERENCE_RE = NOIMG + BRK + r'\s?\[([^\]]*)\]'
138
139
# [Google]
140
SHORT_REF_RE = NOIMG + r'\[([^\]]+)\]'
141
142
# ![alt text][2]
143
IMAGE_REFERENCE_RE = r'\!' + BRK + r'\s?\[([^\]]*)\]'
144
145
# stand-alone * or _  (at the start/end of the text or surrounded by spaces)
146
NOT_STRONG_RE = r'((^| )(\*|_)( |$))'
147
148
# <http://www.123.com>  (ftp, ftps, http or https, matched case-insensitively)
149
AUTOLINK_RE = r'<((?:[Ff]|[Hh][Tt])[Tt][Pp][Ss]?://[^>]*)>'
150
151
# <me@example.com>
152
AUTOMAIL_RE = r'<([^> \!]*@[^> ]*)>'
153
154
# <...>  (a tag or an HTML comment)
155
HTML_RE = r'(\<([a-zA-Z/][^\>]*?|\!--.*?--)\>)'
156
157
# &amp;
158
ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)'
159
160
# two spaces at end of line
161
LINE_BREAK_RE = r'  \n'
162
163
164
def dequote(string):
    """Remove one pair of matching quotes from around a string.

    The string is returned without its first and last characters when both
    are the same quote character (`"` or `'`).  Anything else, including
    the empty string, mismatched quotes and a string consisting of a single
    quote character, is returned unchanged.

    """
    # Require two characters: a lone quote both "starts" and "ends" the
    # string, but it is not a quoted pair and must not be reduced to ''.
    if len(string) < 2:
        return string
    first, last = string[0], string[-1]
    if first == last and first in ('"', "'"):
        return string[1:-1]
    return string
171
172
173
ATTR_RE = re.compile(r"\{@([^\}]*)=([^\}]*)}")  # {@id=123}


def handleAttributes(text, parent):
    """Apply inline attribute definitions ({@id=123}) found in `text`.

    Every `{@name=value}` definition is set as an attribute on `parent`
    (newlines in the value become spaces) and removed from the text.

    Keyword arguments:

    * text: the string to scan for attribute definitions.
    * parent: an object with a `set(name, value)` method, e.g. an
      ElementTree element.

    Returns `text` with all attribute definitions stripped out.

    """
    def attributeCallback(match):
        name = match.group(1)
        value = match.group(2).replace('\n', ' ')
        parent.set(name, value)
        # The definition itself is dropped from the resulting text.
        return ''

    return ATTR_RE.sub(attributeCallback, text)
181
182
183
"""
184
The pattern classes
185
-----------------------------------------------------------------------------
186
"""
187
188
189
class Pattern(object):
    """Base class that all built-in inline patterns subclass."""

    def __init__(self, pattern, markdown_instance=None):
        """
        Create an instance of an inline pattern.

        Keyword arguments:

        * pattern: A regular expression that matches a pattern
        * markdown_instance: optional Markdown instance; when given (and
          truthy) it is stored as `self.markdown`

        """
        self.pattern = pattern
        # Wrap the expression so that the text before and after the match
        # is captured as the first and the last group respectively.
        wrapped = "^(.*?)%s(.*)$" % pattern
        self.compiled_re = re.compile(wrapped, re.DOTALL | re.UNICODE)

        # Api for Markdown to pass safe_mode into instance
        self.safe_mode = False
        if markdown_instance:
            self.markdown = markdown_instance

    def getCompiledRegExp(self):
        """ Return the compiled (wrapped) regular expression. """
        return self.compiled_re

    def handleMatch(self, m):
        """Return a ElementTree element from the given match.

        Subclasses should override this method; the base implementation
        does nothing and returns None.

        Keyword arguments:

        * m: A re match object containing a match of the pattern.

        """
        pass  # pragma: no cover

    def type(self):
        """ Return the class name, which identifies the pattern type. """
        return self.__class__.__name__

    def unescape(self, text):
        """ Return `text` with inline placeholders replaced by their text.

        Placeholders are looked up in the stash of the 'inline' tree
        processor.  A stashed string is substituted as-is; any other stashed
        value is treated as an etree Element and only its text content is
        used.  If there is no 'inline' tree processor, `text` is returned
        untouched.

        """
        try:
            stash = self.markdown.treeprocessors['inline'].stashed_nodes
        except KeyError:  # pragma: no cover
            return text

        def itertext(el):  # pragma: no cover
            ' Reimplement Element.itertext for older python versions '
            tag = el.tag
            if tag is not None and not isinstance(tag, util.string_type):
                return
            if el.text:
                yield el.text
            for child in el:
                for chunk in itertext(child):
                    yield chunk
                if child.tail:
                    yield child.tail

        def get_stash(m):
            key = m.group(1)
            if key not in stash:
                # Unknown placeholder id: no replacement value is produced.
                return None
            stashed = stash.get(key)
            if isinstance(stashed, util.string_type):
                return stashed
            # An etree Element - return text content only
            return ''.join(itertext(stashed))

        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)
260
261
262
class SimpleTextPattern(Pattern):
    """ Inline pattern whose result is the plain text of group(2). """

    def handleMatch(self, m):
        """ Return the text captured by the second group, unchanged. """
        captured = m.group(2)
        return captured
266
267
268
class EscapePattern(Pattern):
    """ Replace a backslash-escaped character with a placeholder. """

    def handleMatch(self, m):
        """ Return the STX/ETX-wrapped code point of the escaped character.

        Characters that are not listed in the Markdown instance's
        `ESCAPED_CHARS` are not handled and yield None.

        """
        escaped = m.group(2)
        if escaped not in self.markdown.ESCAPED_CHARS:
            return None
        return '%s%s%s' % (util.STX, ord(escaped), util.ETX)
277
278
279
class SimpleTagPattern(Pattern):
    """
    Return element of type `tag` whose text is group(3) of a Pattern.

    """

    def __init__(self, pattern, tag):
        """
        Keyword arguments:

        * pattern: A regular expression that matches a pattern
        * tag: name of the element created for each match

        """
        Pattern.__init__(self, pattern)
        self.tag = tag

    def handleMatch(self, m):
        """ Build a `self.tag` element holding the text of group(3). """
        element = util.etree.Element(self.tag)
        element.text = m.group(3)
        return element
293
294
295
class SubstituteTagPattern(SimpleTagPattern):
    """ Return an element of type `tag` with no text and no children. """

    def handleMatch(self, m):
        """ Return a fresh, empty `self.tag` element; the match is unused. """
        empty_element = util.etree.Element(self.tag)
        return empty_element
299
300
301
class BacktickPattern(Pattern):
    """ Return a `<code>` element containing the matching text. """

    def __init__(self, pattern):
        """
        Keyword arguments:

        * pattern: A regular expression that matches a pattern

        """
        Pattern.__init__(self, pattern)
        # Placeholder that stands in for an escaped (doubled) backslash.
        self.ESCAPED_BSLASH = '%s%s%s' % (util.STX, ord('\\'), util.ETX)
        self.tag = 'code'

    def handleMatch(self, m):
        """ Return a code element, or text with escaped backslashes.

        When group(4) (the code span content) matched, a `self.tag` element
        holding the stripped content as an AtomicString is returned.
        Otherwise group(2) is returned with every doubled backslash replaced
        by the escaped-backslash placeholder.

        """
        code = m.group(4)
        if not code:
            return m.group(2).replace('\\\\', self.ESCAPED_BSLASH)
        element = util.etree.Element(self.tag)
        element.text = util.AtomicString(code.strip())
        return element
315
316
317
class DoubleTagPattern(SimpleTagPattern):
    """Return a ElementTree element for tag2 nested inside tag1.

    `self.tag` holds both tag names separated by a comma ("tag1,tag2").
    Useful for strong emphasis etc.

    """

    def handleMatch(self, m):
        """ Build `<tag1><tag2>group(3)</tag2>group(4)</tag1>`.

        group(4) is only used as the inner element's tail when the match
        has exactly five groups.

        """
        outer_tag, inner_tag = self.tag.split(",")
        outer = util.etree.Element(outer_tag)
        inner = util.etree.SubElement(outer, inner_tag)
        inner.text = m.group(3)
        has_tail_group = len(m.groups()) == 5
        if has_tail_group:
            inner.tail = m.group(4)
        return outer
331
332
333
class HtmlPattern(Pattern):
    """ Store raw inline html and return a placeholder. """

    def handleMatch(self, m):
        """ Stash the matched raw HTML and return its placeholder.

        Keyword arguments:

        * m: A re match object; group(2) holds the raw HTML.

        """
        rawhtml = self.unescape(m.group(2))
        place_holder = self.markdown.htmlStash.store(rawhtml)
        return place_holder

    def unescape(self, text):
        """ Return unescaped text given text with an inline placeholder.

        Each placeholder whose id is found in the stash of the 'inline'
        tree processor is replaced by the serialized stashed value; if the
        value cannot be serialized, it is substituted as a backslash
        followed by the value itself.  If there is no 'inline' tree
        processor, `text` is returned untouched.

        """
        try:
            stash = self.markdown.treeprocessors['inline'].stashed_nodes
        except KeyError:  # pragma: no cover
            return text

        def get_stash(m):
            key = m.group(1)
            value = stash.get(key)
            if value is not None:
                try:
                    return self.markdown.serializer(value)
                except Exception:
                    # Best-effort fallback for values the serializer
                    # rejects.  `Exception` (rather than a bare except)
                    # lets KeyboardInterrupt/SystemExit propagate.
                    return r'\%s' % value

        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)
357
358
359
class LinkPattern(Pattern):
    """ Return a link element from the given match. """

    def handleMatch(self, m):
        """ Build an `<a>` element from an inline link match.

        group(2) is the link text, group(9) the destination (optionally
        wrapped in angle brackets) and group(13) the optional title.

        """
        el = util.etree.Element("a")
        el.text = m.group(2)
        title = m.group(13)
        href = m.group(9)

        if href:
            # Only strip a complete <...> wrapper.  Checking just the first
            # character would chop the last character off a destination
            # that merely starts with "<" (e.g. "<foo" -> "fo").
            if href[0] == "<" and href[-1] == ">":
                href = href[1:-1]
            el.set("href", self.sanitize_url(self.unescape(href.strip())))
        else:
            el.set("href", "")

        if title:
            title = dequote(self.unescape(title))
            el.set("title", title)
        return el

    def sanitize_url(self, url):
        """
        Sanitize a url against xss attacks in "safe_mode".

        Rather than specifically blacklisting `javascript:alert("XSS")` and all
        its aliases (see <http://ha.ckers.org/xss.html>), we whitelist known
        safe url formats. Most urls contain a network location, however some
        are known not to (i.e.: mailto links). Script urls do not contain a
        location. Additionally, for `javascript:...`, the scheme would be
        "javascript" but some aliases will appear to `urlparse()` to have no
        scheme. On top of that relative links (i.e.: "foo/bar.html") have no
        scheme. Therefore we must check "path", "parameters", "query" and
        "fragment" for any literal colons. We don't check "scheme" for colons
        because it *should* never have any and "netloc" must allow the form:
        `username:password@host:port`.

        Returns the url unchanged when safe mode is off, the re-assembled
        url when it passes every check, and an empty string otherwise.

        """
        if not self.markdown.safeMode:
            # Return immediately bypassing parsing.
            return url

        try:
            scheme, netloc, path, params, query, fragment = url = urlparse(url)
        except ValueError:  # pragma: no cover
            # Bad url - so bad it couldn't be parsed.
            return ''

        locless_schemes = ['', 'mailto', 'news']
        allowed_schemes = locless_schemes + ['http', 'https', 'ftp', 'ftps']
        if scheme not in allowed_schemes:
            # Not a known (allowed) scheme. Not safe.
            return ''

        if netloc == '' and scheme not in locless_schemes:  # pragma: no cover
            # This should not happen. Treat as suspect.
            return ''

        for part in url[2:]:
            if ":" in part:
                # A colon in "path", "parameters", "query"
                # or "fragment" is suspect.
                return ''

        # Url passes all tests. Return url as-is.
        return urlunparse(url)
424
425
426
class ImagePattern(LinkPattern):
    """ Return a img element from the given match. """

    def handleMatch(self, m):
        """ Build an `<img>` element from an inline image match.

        group(9) holds the source followed by an optional title; group(2)
        holds the alt text, from which attribute definitions are extracted
        when the Markdown instance has `enable_attributes` set.

        """
        img = util.etree.Element("img")
        parts = m.group(9).split()

        if not parts:
            img.set('src', "")
        else:
            source = parts[0]
            # Drop a complete <...> wrapper around the source.
            if source.startswith("<") and source.endswith(">"):
                source = source[1:-1]
            img.set('src', self.sanitize_url(self.unescape(source)))

        # Everything after the first whitespace-separated part is the title.
        title_words = parts[1:]
        if title_words:
            img.set('title', dequote(self.unescape(" ".join(title_words))))

        alt = m.group(2)
        if self.markdown.enable_attributes:
            alt = handleAttributes(alt, img)

        img.set('alt', self.unescape(alt))
        return img
448
449
450
class ReferencePattern(LinkPattern):
    """ Match to a stored reference and return link element. """

    # An optional single space followed by a newline.
    NEWLINE_CLEANUP_RE = re.compile(r'[ ]?\n', re.MULTILINE)

    def handleMatch(self, m):
        """ Resolve the reference id and return the element for it.

        The id is group(9), lower-cased.  When the match has no such group
        or the id is empty, the lower-cased link text (group(2)) is used
        instead.  Returns None for ids that are not defined in the Markdown
        instance's `references`.

        """
        try:
            ref_id = m.group(9).lower()
        except IndexError:
            ref_id = None

        link_text = m.group(2)
        if not ref_id:
            # if we got something like "[Google][]" or "[Google]"
            # we'll use "google" as the id
            ref_id = link_text.lower()

        # Clean up linebreaks in id
        ref_id = self.NEWLINE_CLEANUP_RE.sub(' ', ref_id)

        references = self.markdown.references
        if ref_id not in references:  # ignore undefined refs
            return None

        href, title = references[ref_id]
        return self.makeTag(href, title, link_text)

    def makeTag(self, href, title, text):
        """ Return an `<a>` element for the resolved reference.

        The href is passed through `sanitize_url`; the title attribute is
        only set when `title` is truthy.

        """
        link = util.etree.Element('a')
        link.set('href', self.sanitize_url(href))
        if title:
            link.set('title', title)
        link.text = text
        return link
483
484
485
class ImageReferencePattern(ReferencePattern):
    """ Match to a stored reference and return img element. """

    def makeTag(self, href, title, text):
        """ Return an `<img>` element for the resolved reference.

        `href` becomes the sanitized `src`, `title` (when truthy) the
        title attribute and `text` the alt text.  Attribute definitions are
        extracted from `text` first when the Markdown instance has
        `enable_attributes` set.

        """
        img = util.etree.Element("img")
        img.set("src", self.sanitize_url(href))
        if title:
            img.set("title", title)

        alt = text
        if self.markdown.enable_attributes:
            alt = handleAttributes(alt, img)

        img.set("alt", self.unescape(alt))
        return img
498
499
500
class AutolinkPattern(Pattern):
    """ Return a link Element given an autolink (`<http://example/com>`). """

    def handleMatch(self, m):
        """ Build an `<a>` element whose href and text are the matched url.

        The href is the unescaped url; the visible text is the url wrapped
        in an AtomicString.

        """
        url = m.group(2)
        link = util.etree.Element("a")
        link.set('href', self.unescape(url))
        link.text = util.AtomicString(url)
        return link
507
508
509
class AutomailPattern(Pattern):
    """
    Return a mailto link Element given an automail link (`<me@example.com>`).
    """

    def handleMatch(self, m):
        """ Build an `<a>` element with an entity-encoded mailto link.

        A leading "mailto:" is removed from the matched address.  The link
        text encodes every character as a named entity where one is defined
        and as a numeric entity otherwise; the href encodes every character
        of "mailto:" plus the address as a numeric entity.  In both cases
        `util.AMP_SUBSTITUTE` is used in place of the ampersand.

        """
        link = util.etree.Element('a')

        prefix = "mailto:"
        address = self.unescape(m.group(2))
        if address.startswith(prefix):
            address = address[len(prefix):]

        def codepoint2name(code):
            """Return entity definition by code, or the code if not defined."""
            entity_name = entities.codepoint2name.get(code)
            if not entity_name:
                return "%s#%d;" % (util.AMP_SUBSTITUTE, code)
            return "%s%s;" % (util.AMP_SUBSTITUTE, entity_name)

        encoded_text = ''.join(codepoint2name(ord(ch)) for ch in address)
        link.text = util.AtomicString(encoded_text)

        target = "mailto:" + address
        encoded_href = "".join(
            [util.AMP_SUBSTITUTE + ('#%d;' % ord(ch)) for ch in target]
        )
        link.set('href', encoded_href)
        return link
535