LinkPattern.handleMatch() - Code Metrics - Inspection of "pythonx/markdown_parser.py" - MikeCoder/markdown-preview.vim - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( 32cfa8...ec62d3 )

by Dongxin

created 2017-08-15 03:07 UTC

LinkPattern.handleMatch() A

↳ Parent: LinkPattern

Complexity

Conditions

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Importance

Changes	1
Bugs	0	Features	0

Metric	Value
cc	4
c	1
b	0
f	0
dl	0
loc	17
rs	9.2

"""
INLINE PATTERNS
=============================================================================

Inline patterns such as *emphasis* are handled by means of auxiliary
objects, one per pattern.  Pattern objects must be instances of classes
that extend markdown.Pattern.  Each pattern object uses a single regular
expression and needs to support the following methods:

    pattern.getCompiledRegExp() # returns a regular expression

    pattern.handleMatch(m) # takes a match object and returns
                           # an ElementTree element or just plain text

All of python markdown's built-in patterns subclass from Pattern,
but you can add additional patterns that don't.

Also note that all the regular expressions used by inline must
capture the whole block.  For this reason, they all start with
'^(.*?)' and end with '(.*)$'.  For the built-in expressions,
Pattern takes care of adding the "^(.*?)" and "(.*)$".

Finally, the order in which regular expressions are applied is very
important - e.g. if we first replace http://.../ links with <a> tags
and _then_ try to replace inline html, we would end up with a mess.
So, we apply the expressions in the following order:

* escape and backticks have to go before everything else, so
  that we can preempt any markdown patterns by escaping them.

* then we handle auto-links (must be done before inline html)

* then we handle inline HTML.  At this point we will simply
  replace all inline HTML strings with a placeholder and add
  the actual HTML to a hash.

* then inline images (must be done before links)

* then bracketed links, first regular then reference-style

* finally we apply strong and emphasis
"""

from __future__ import absolute_import
from __future__ import unicode_literals
from . import util
from . import odict
import re
try:  # pragma: no cover
    from urllib.parse import urlparse, urlunparse
except ImportError:  # pragma: no cover
    from urlparse import urlparse, urlunparse
try:  # pragma: no cover
    from html import entities
except ImportError:  # pragma: no cover
    import htmlentitydefs as entities


def build_inlinepatterns(md_instance, **kwargs):
    """
    Assemble the default inline patterns for a Markdown instance.

    Returns an ``odict.OrderedDict`` that maps a pattern name to its
    pattern object.  The entries are inserted in a fixed sequence, which
    is kept exactly as-is here because the order in which inline
    expressions are applied matters.  Two settings of `md_instance` are
    consulted: ``safeMode`` (the "html" pattern is left out when it equals
    'escape') and ``smart_emphasis`` (selects the underscore-emphasis
    expression).
    """
    md = md_instance
    patterns = odict.OrderedDict()

    entries = [
        ("backtick", BacktickPattern(BACKTICK_RE)),
        ("escape", EscapePattern(ESCAPE_RE, md)),
        ("reference", ReferencePattern(REFERENCE_RE, md)),
        ("link", LinkPattern(LINK_RE, md)),
        ("image_link", ImagePattern(IMAGE_LINK_RE, md)),
        ("image_reference", ImageReferencePattern(IMAGE_REFERENCE_RE, md)),
        ("short_reference", ReferencePattern(SHORT_REF_RE, md)),
        ("autolink", AutolinkPattern(AUTOLINK_RE, md)),
        ("automail", AutomailPattern(AUTOMAIL_RE, md)),
        ("linebreak", SubstituteTagPattern(LINE_BREAK_RE, 'br')),
    ]

    # Raw inline html is only registered when safe mode is not 'escape'.
    if md.safeMode != 'escape':
        entries.append(("html", HtmlPattern(HTML_RE, md)))

    entries.extend([
        ("entity", HtmlPattern(ENTITY_RE, md)),
        ("not_strong", SimpleTextPattern(NOT_STRONG_RE)),
        ("em_strong", DoubleTagPattern(EM_STRONG_RE, 'strong,em')),
        ("strong_em", DoubleTagPattern(STRONG_EM_RE, 'em,strong')),
        ("strong", SimpleTagPattern(STRONG_RE, 'strong')),
        ("emphasis", SimpleTagPattern(EMPHASIS_RE, 'em')),
    ])

    # Underscore emphasis comes in a "smart" and a plain flavour.
    underscore_re = SMART_EMPHASIS_RE if md.smart_emphasis else EMPHASIS_2_RE
    entries.append(("emphasis2", SimpleTagPattern(underscore_re, 'em')))

    for name, pattern in entries:
        patterns[name] = pattern
    return patterns


"""
The actual regular expressions for patterns
-----------------------------------------------------------------------------
"""

# Any run (possibly empty) of characters that are neither '[' nor ']'.
NOBRACKET = r'[^\]\[]*'
# Bracketed text "[...]".  The six-fold repetitions on either side allow the
# text itself to contain bracket pairs nested up to six levels deep.  The
# expression contributes seven capturing groups: one for the whole bracketed
# text plus one per nesting level.
BRK = (
    r'\[(' +
    (NOBRACKET + r'(\[')*6 +
    (NOBRACKET + r'\])*')*6 +
    NOBRACKET + r')\]'
)
# Negative lookbehind: the match must not be directly preceded by '!'.
NOIMG = r'(?<!\!)'

# NOTE: Pattern.__init__ prepends "^(.*?)" to every expression below, so in
# the compiled pattern the first group written here is group 2.  The
# backreferences (\2, \3, \12) are numbered with that extra group in mind.

# `e=f()` or ``e=f("`")``
# The first alternative matches a run of doubled backslashes that sits
# directly in front of one or more backticks.
BACKTICK_RE = r'(?:(?<!\\)((?:\\{2})+)(?=`+)|(?<!\\)(`+)(.+?)(?<!`)\3(?!`))'

# \<  (a backslash followed by any single character)
ESCAPE_RE = r'\\(.)'

# *emphasis*
EMPHASIS_RE = r'(\*)([^\*]+)\2'

# **strong** or __strong__
STRONG_RE = r'(\*{2}|_{2})(.+?)\2'

# ***strongem*** or ***em*strong**  (also with underscores)
EM_STRONG_RE = r'(\*|_)\2{2}(.+?)\2(.*?)\2{2}'

# ***strong**em*  (also with underscores)
STRONG_EM_RE = r'(\*|_)\2{2}(.+?)\2{2}(.*?)\2'

# _smart_emphasis_  (underscores must not touch word characters outside)
SMART_EMPHASIS_RE = r'(?<!\w)(_)(?!_)(.+?)(?<!_)\2(?!\w)'

# _emphasis_
EMPHASIS_2_RE = r'(_)(.+?)\2'

# [text](url) or [text](<url>) or [text](url "title")
# In the compiled pattern group 9 is the url and group 13 the title text
# (these are the groups LinkPattern.handleMatch reads).
LINK_RE = NOIMG + BRK + \
    r'''\(\s*(<.*?>|((?:(?:\(.*?\))|[^\(\)]))*?)\s*((['"])(.*?)\12\s*)?\)'''

# ![alttxt](http://x.com/) or ![alttxt](<http://x.com/>)
# Group 9 of the compiled pattern holds the url together with any quoted
# title; ImagePattern.handleMatch splits the two on whitespace.
IMAGE_LINK_RE = r'\!' + BRK + r'\s*\(\s*(<.*?>|([^"\)\s]+\s*"[^"]*"|[^\)\s]*))\s*\)'

# [Google][3]
REFERENCE_RE = NOIMG + BRK + r'\s?\[([^\]]*)\]'

# [Google]
SHORT_REF_RE = NOIMG + r'\[([^\]]+)\]'

# ![alt text][2]
IMAGE_REFERENCE_RE = r'\!' + BRK + r'\s?\[([^\]]*)\]'

# stand-alone * or _  (surrounded by a space or the start/end of the text)
NOT_STRONG_RE = r'((^| )(\*|_)( |$))'

# <http://www.123.com>  (ftp, ftps, http or https, case-insensitive)
AUTOLINK_RE = r'<((?:[Ff]|[Hh][Tt])[Tt][Pp][Ss]?://[^>]*)>'

# <user@example.com>
AUTOMAIL_RE = r'<([^> \!]*@[^> ]*)>'

# <...>  (a tag, or an html comment <!-- ... -->)
HTML_RE = r'(\<([a-zA-Z/][^\>]*?|\!--.*?--)\>)'

# &amp;
ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)'

# two spaces at end of line
LINE_BREAK_RE = r'  \n'


def dequote(string):
    """
    Remove one pair of matching quotes from around a string.

    The string is returned without its first and last character when it
    is at least two characters long and both ends are the same quote
    character (either ``"`` or ``'``).  Anything else is returned
    unchanged; in particular a string that consists of a single quote
    character is not treated as a quoted empty string.
    """
    # The length check keeps a lone quote character from being counted as
    # both the opening and the closing quote.
    if len(string) >= 2 and string[0] == string[-1] and string[0] in ('"', "'"):
        return string[1:-1]
    return string


ATTR_RE = re.compile(r"\{@([^\}]*)=([^\}]*)}")  # {@id=123}


def handleAttributes(text, parent):
    """
    Set values of an element based on attribute definitions ({@id=123}).

    Every ``{@name=value}`` definition found in `text` is applied to
    `parent` through ``parent.set(name, value)``, with newlines in the
    value turned into spaces.

    Returns `text` with all attribute definitions removed.
    """
    def attributeCallback(match):
        parent.set(match.group(1), match.group(2).replace('\n', ' '))
        # Return the replacement explicitly instead of relying on how
        # re.sub treats a callback that returns None.
        return ''
    return ATTR_RE.sub(attributeCallback, text)


"""
The pattern classes
-----------------------------------------------------------------------------
"""


class Pattern(object):
    """Common base class for the inline pattern classes of this module. """

    def __init__(self, pattern, markdown_instance=None):
        """
        Set up an inline pattern.

        Keyword arguments:

        * pattern: A regular expression (as a string) matching the pattern
        * markdown_instance: Optional Markdown instance; when truthy it is
          kept as ``self.markdown``

        """
        self.pattern = pattern
        # Wrap the expression so a match spans the whole text: the first
        # group is what precedes the pattern, the last what follows it.
        wrapped = "^(.*?)%s(.*)$" % pattern
        self.compiled_re = re.compile(wrapped, re.DOTALL | re.UNICODE)

        # Api for Markdown to pass safe_mode into instance
        self.safe_mode = False
        if markdown_instance:
            self.markdown = markdown_instance

    def getCompiledRegExp(self):
        """ Hand back the regular expression compiled in `__init__`. """
        return self.compiled_re

    def handleMatch(self, m):
        """Build the result for a match of this pattern.

        The base implementation does nothing; subclasses should override
        this method.

        Keyword arguments:

        * m: A re match object containing a match of the pattern.

        """
        pass  # pragma: no cover

    def type(self):
        """ Name of the concrete class, used to tell pattern types apart """
        return self.__class__.__name__

    def unescape(self, text):
        """ Replace inline placeholders in `text` by their stashed text. """
        try:
            stashed = self.markdown.treeprocessors['inline'].stashed_nodes
        except KeyError:  # pragma: no cover
            return text

        def collect_text(node):  # pragma: no cover
            ' Stand-in for Element.itertext on older python versions '
            node_tag = node.tag
            if node_tag is not None and \
                    not isinstance(node_tag, util.string_type):
                return
            if node.text:
                yield node.text
            for child in node:
                for piece in collect_text(child):
                    yield piece
                if child.tail:
                    yield child.tail

        def substitute(match):
            key = match.group(1)
            if key not in stashed:
                # Nothing is stashed under this placeholder.
                return None
            stored = stashed.get(key)
            if isinstance(stored, util.string_type):
                return stored
            # An etree Element - only its text content is used
            return ''.join(collect_text(stored))

        return util.INLINE_PLACEHOLDER_RE.sub(substitute, text)


class SimpleTextPattern(Pattern):
    """ Inline pattern whose result is the plain text held in group(2). """
    def handleMatch(self, m):
        matched_text = m.group(2)
        return matched_text


class EscapePattern(Pattern):
    """ Encode a backslash-escaped character as an STX/ETX-wrapped code. """

    def handleMatch(self, m):
        escaped = m.group(2)
        if escaped not in self.markdown.ESCAPED_CHARS:
            # Characters not listed in ESCAPED_CHARS yield None.
            return None
        # The character's ordinal, wrapped between util.STX and util.ETX.
        return '%s%s%s' % (util.STX, ord(escaped), util.ETX)


class SimpleTagPattern(Pattern):
    """
    Wrap the text of group(3) of a Pattern in a new element whose
    name is given by `tag`.

    """
    def __init__(self, pattern, tag):
        self.tag = tag
        Pattern.__init__(self, pattern)

    def handleMatch(self, m):
        node = util.etree.Element(self.tag)
        node.text = m.group(3)
        return node


class SubstituteTagPattern(SimpleTagPattern):
    """ Produce an empty element named `tag`; the match text is not used. """
    def handleMatch(self, m):
        empty_node = util.etree.Element(self.tag)
        return empty_node


class BacktickPattern(Pattern):
    """ Turn backtick-delimited text into a `<code>` element. """
    def __init__(self, pattern):
        Pattern.__init__(self, pattern)
        self.tag = 'code'
        # Placeholder for a backslash: its ordinal between STX and ETX.
        self.ESCAPED_BSLASH = '%s%s%s' % (util.STX, ord('\\'), util.ETX)

    def handleMatch(self, m):
        code_text = m.group(4)
        if not code_text:
            # No code text was captured: group(2) is returned with every
            # doubled backslash swapped for the placeholder.
            return m.group(2).replace('\\\\', self.ESCAPED_BSLASH)
        code_el = util.etree.Element(self.tag)
        code_el.text = util.AtomicString(code_text.strip())
        return code_el


class DoubleTagPattern(SimpleTagPattern):
    """Build two nested elements from a comma separated `tag` pair.

    `tag` has the form "outer,inner"; the inner element receives the text
    of group(3).  Useful for strong emphasis etc.

    """
    def handleMatch(self, m):
        outer_tag, inner_tag = self.tag.split(",")
        outer = util.etree.Element(outer_tag)
        inner = util.etree.SubElement(outer, inner_tag)
        inner.text = m.group(3)
        # With five groups in total, group(4) is text that follows the
        # inner element while still inside the outer one.
        has_trailing_text = len(m.groups()) == 5
        if has_trailing_text:
            inner.tail = m.group(4)
        return outer


class HtmlPattern(Pattern):
    """ Store raw inline html and return a placeholder. """
    def handleMatch(self, m):
        """
        Stash the raw html captured in group(2) and return its placeholder.

        Inline placeholders inside the html are expanded first by
        `unescape`; the value returned is whatever
        ``self.markdown.htmlStash.store`` hands back.
        """
        rawhtml = self.unescape(m.group(2))
        place_holder = self.markdown.htmlStash.store(rawhtml)
        return place_holder

    def unescape(self, text):
        """
        Return `text` with inline placeholders replaced by serialized nodes.

        Each placeholder is looked up in the inline tree processor's
        stash.  A stashed value is passed to ``self.markdown.serializer``;
        if that raises, the value is inserted as-is with a backslash in
        front.  Placeholders without a stashed value are removed.  When no
        'inline' tree processor is registered, `text` is returned
        unchanged.
        """
        try:
            stash = self.markdown.treeprocessors['inline'].stashed_nodes
        except KeyError:  # pragma: no cover
            return text

        def get_stash(m):
            key = m.group(1)
            value = stash.get(key)
            if value is None:
                # Nothing stashed: the placeholder is replaced by nothing.
                return ''
            try:
                return self.markdown.serializer(value)
            except Exception:
                # Only ordinary errors are treated as "could not be
                # serialized"; KeyboardInterrupt and SystemExit propagate.
                return r'\%s' % value

        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)


class LinkPattern(Pattern):
    """ Return a link element from the given match. """
    def handleMatch(self, m):
        """
        Build an `a` element from an inline link match.

        group(2) becomes the link text, group(9) the href and group(13)
        the optional title.  The href is unescaped and passed through
        `sanitize_url`; a missing or empty href results in ``href=""``.
        """
        el = util.etree.Element("a")
        el.text = m.group(2)
        title = m.group(13)
        href = m.group(9)

        if href:
            # Only strip angle brackets when the url is really wrapped in a
            # pair of them (same check as ImagePattern); a url that merely
            # starts with "<" would otherwise lose its last character.
            if href[0] == "<" and href[-1] == ">":
                href = href[1:-1]
            el.set("href", self.sanitize_url(self.unescape(href.strip())))
        else:
            el.set("href", "")

        if title:
            title = dequote(self.unescape(title))
            el.set("title", title)
        return el

    def sanitize_url(self, url):
        """
        Sanitize a url against xss attacks in "safe_mode".

        Rather than specifically blacklisting `javascript:alert("XSS")` and all
        its aliases (see <http://ha.ckers.org/xss.html>), we whitelist known
        safe url formats. Most urls contain a network location, however some
        are known not to (i.e.: mailto links). Script urls do not contain a
        location. Additionally, for `javascript:...`, the scheme would be
        "javascript" but some aliases will appear to `urlparse()` to have no
        scheme. On top of that relative links (i.e.: "foo/bar.html") have no
        scheme. Therefore we must check "path", "parameters", "query" and
        "fragment" for any literal colons. We don't check "scheme" for colons
        because it *should* never have any and "netloc" must allow the form:
        `username:password@host:port`.

        Returns the url unchanged when ``self.markdown.safeMode`` is falsy,
        an empty string when the url is rejected, and otherwise the url
        as reassembled by `urlunparse()`.

        """
        if not self.markdown.safeMode:
            # Return immediately bypassing parsing.
            return url

        try:
            parsed = urlparse(url)
        except ValueError:  # pragma: no cover
            # Bad url - so bad it couldn't be parsed.
            return ''
        scheme, netloc = parsed[0], parsed[1]

        locless_schemes = ['', 'mailto', 'news']
        allowed_schemes = locless_schemes + ['http', 'https', 'ftp', 'ftps']
        if scheme not in allowed_schemes:
            # Not a known (allowed) scheme. Not safe.
            return ''

        if netloc == '' and scheme not in locless_schemes:  # pragma: no cover
            # This should not happen. Treat as suspect.
            return ''

        # parsed[2:] is (path, params, query, fragment).
        if any(":" in part for part in parsed[2:]):
            # A colon in "path", "parameters", "query"
            # or "fragment" is suspect.
            return ''

        # Url passes all tests. Return url as-is.
        return urlunparse(parsed)


class ImagePattern(LinkPattern):
    """ Build an `img` element out of an inline image match. """
    def handleMatch(self, m):
        img = util.etree.Element("img")

        # group(9) carries the url, optionally followed by a title; the
        # first whitespace separated piece is the url.
        pieces = m.group(9).split()
        if not pieces:
            img.set('src', "")
        else:
            source = pieces[0]
            if source[0] == "<" and source[-1] == ">":
                source = source[1:-1]
            img.set('src', self.sanitize_url(self.unescape(source)))

        title_words = pieces[1:]
        if title_words:
            img.set('title', dequote(self.unescape(" ".join(title_words))))

        # With attributes enabled, {@name=value} definitions are moved out
        # of the alt text and onto the element.
        alt_source = m.group(2)
        if self.markdown.enable_attributes:
            alt_source = handleAttributes(alt_source, img)

        img.set('alt', self.unescape(alt_source))
        return img


class ReferencePattern(LinkPattern):
    """ Resolve a reference-style link against the stored references. """

    NEWLINE_CLEANUP_RE = re.compile(r'[ ]?\n', re.MULTILINE)

    def handleMatch(self, m):
        # The explicit id lives in group(9); expressions without that
        # group raise IndexError here.
        try:
            ref_id = m.group(9).lower()
        except IndexError:
            ref_id = None
        if not ref_id:
            # "[Google][]" or "[Google]": fall back to the link text,
            # giving "google" as the id
            ref_id = m.group(2).lower()

        # Line breaks inside the id collapse to a single space
        ref_id = self.NEWLINE_CLEANUP_RE.sub(' ', ref_id)

        references = self.markdown.references
        if ref_id not in references:  # undefined refs are ignored
            return None
        href, title = references[ref_id]
        return self.makeTag(href, title, m.group(2))

    def makeTag(self, href, title, text):
        """ Create the `a` element for a resolved reference. """
        link = util.etree.Element('a')
        link.text = text

        link.set('href', self.sanitize_url(href))
        if title:
            link.set('title', title)
        return link


class ImageReferencePattern(ReferencePattern):
    """ Resolve a reference-style image and build its `img` element. """
    def makeTag(self, href, title, text):
        img = util.etree.Element("img")
        img.set("src", self.sanitize_url(href))
        if title:
            img.set("title", title)

        # With attributes enabled, {@name=value} definitions are taken out
        # of the alt text and set on the element.
        alt_text = text
        if self.markdown.enable_attributes:
            alt_text = handleAttributes(alt_text, img)

        img.set("alt", self.unescape(alt_text))
        return img


class AutolinkPattern(Pattern):
    """ Build an `a` element for an autolink (`<http://example/com>`). """
    def handleMatch(self, m):
        url = m.group(2)
        anchor = util.etree.Element("a")
        anchor.set('href', self.unescape(url))
        # The visible text is the url itself, wrapped in AtomicString.
        anchor.text = util.AtomicString(url)
        return anchor


class AutomailPattern(Pattern):
    """
    Build a mailto `a` element for an automail link
    (`<user@example.com>`).
    """
    def handleMatch(self, m):
        anchor = util.etree.Element('a')
        address = self.unescape(m.group(2))
        prefix = "mailto:"
        if address.startswith(prefix):
            address = address[len(prefix):]

        def as_entity(codepoint):
            """Named entity for the code point, numeric one if undefined."""
            name = entities.codepoint2name.get(codepoint)
            if not name:
                return "%s#%d;" % (util.AMP_SUBSTITUTE, codepoint)
            return "%s%s;" % (util.AMP_SUBSTITUTE, name)

        # The visible text spells every character of the address as an
        # entity reference.
        anchor.text = util.AtomicString(
            ''.join([as_entity(ord(ch)) for ch in address])
        )

        # The href uses numeric references only, "mailto:" included.
        target = prefix + address
        encoded = [util.AMP_SUBSTITUTE + '#%d;' % ord(ch) for ch in target]
        anchor.set('href', "".join(encoded))
        return anchor


1			"""
2			INLINE PATTERNS
3			=============================================================================
4
5			Inline patterns such as emphasis are handled by means of auxiliary
6			objects, one per pattern. Pattern objects must be instances of classes
7			that extend markdown.Pattern. Each pattern object uses a single regular
8			expression and needs support the following methods:
9
10			pattern.getCompiledRegExp() # returns a regular expression
11
12			pattern.handleMatch(m) # takes a match object and returns
13			# an ElementTree element or just plain text
14
15			All of python markdown's built-in patterns subclass from Pattern,
16			but you can add additional patterns that don't.
17
18			Also note that all the regular expressions used by inline must
19			capture the whole block. For this reason, they all start with
20			'^(.)' and end with '(.)!'. In case with built-in expression
21			Pattern takes care of adding the "^(.)" and "(.)!".
22
23			Finally, the order in which regular expressions are applied is very
24			important - e.g. if we first replace http://.../ links with <a> tags
25			and _then_ try to replace inline html, we would end up with a mess.
26			So, we apply the expressions in the following order:
27
28			* escape and backticks have to go before everything else, so
29			that we can preempt any markdown patterns by escaping them.
30
31			* then we handle auto-links (must be done before inline html)
32
33			* then we handle inline HTML. At this point we will simply
34			replace all inline HTML strings with a placeholder and add
35			the actual HTML to a hash.
36
37			* then inline images (must be done before links)
38
39			* then bracketed links, first regular then reference-style
40
41			* finally we apply strong and emphasis
42			"""
43
44			from __future__ import absolute_import
45			from __future__ import unicode_literals
46			from . import util
47			from . import odict
48			import re
49			try: # pragma: no cover
50			from urllib.parse import urlparse, urlunparse
51			except ImportError: # pragma: no cover
52			from urlparse import urlparse, urlunparse
53			try: # pragma: no cover
54			from html import entities
55			except ImportError: # pragma: no cover
56			import htmlentitydefs as entities
57
58
59			def build_inlinepatterns(md_instance, **kwargs):
60			""" Build the default set of inline patterns for Markdown. """
61			inlinePatterns = odict.OrderedDict()
62			inlinePatterns["backtick"] = BacktickPattern(BACKTICK_RE)
63			inlinePatterns["escape"] = EscapePattern(ESCAPE_RE, md_instance)
64			inlinePatterns["reference"] = ReferencePattern(REFERENCE_RE, md_instance)
65			inlinePatterns["link"] = LinkPattern(LINK_RE, md_instance)
66			inlinePatterns["image_link"] = ImagePattern(IMAGE_LINK_RE, md_instance)
67			inlinePatterns["image_reference"] = ImageReferencePattern(
68			IMAGE_REFERENCE_RE, md_instance
69			)
70			inlinePatterns["short_reference"] = ReferencePattern(
71			SHORT_REF_RE, md_instance
72			)
73			inlinePatterns["autolink"] = AutolinkPattern(AUTOLINK_RE, md_instance)
74			inlinePatterns["automail"] = AutomailPattern(AUTOMAIL_RE, md_instance)
75			inlinePatterns["linebreak"] = SubstituteTagPattern(LINE_BREAK_RE, 'br')
76			if md_instance.safeMode != 'escape':
77			inlinePatterns["html"] = HtmlPattern(HTML_RE, md_instance)
78			inlinePatterns["entity"] = HtmlPattern(ENTITY_RE, md_instance)
79			inlinePatterns["not_strong"] = SimpleTextPattern(NOT_STRONG_RE)
80			inlinePatterns["em_strong"] = DoubleTagPattern(EM_STRONG_RE, 'strong,em')
81			inlinePatterns["strong_em"] = DoubleTagPattern(STRONG_EM_RE, 'em,strong')
82			inlinePatterns["strong"] = SimpleTagPattern(STRONG_RE, 'strong')
83			inlinePatterns["emphasis"] = SimpleTagPattern(EMPHASIS_RE, 'em')
84			if md_instance.smart_emphasis:
85			inlinePatterns["emphasis2"] = SimpleTagPattern(SMART_EMPHASIS_RE, 'em')
86			else:
87			inlinePatterns["emphasis2"] = SimpleTagPattern(EMPHASIS_2_RE, 'em')
88			return inlinePatterns
89
90
91			"""
92			The actual regular expressions for patterns
93			-----------------------------------------------------------------------------
94			"""
95
96			NOBRACKET = r'[^\]\[]*'
97			BRK = (
98			r'\[(' +
99			(NOBRACKET + r'(\[')*6 +
100			(NOBRACKET + r'\])')6 +
101			NOBRACKET + r')\]'
102			)
103			NOIMG = r'(?<!\!)'
104
105			# `e=f()` or ``e=f("`")``
106			BACKTICK_RE = r'(?:(?<!\\)((?:\\{2})+)(?=`+)\|(?<!\\)(`+)(.+?)(?<!`)\3(?!`))'
107
108			# \<
109			ESCAPE_RE = r'\\(.)'
110
111			# emphasis
112			EMPHASIS_RE = r'(\)([^\]+)\2'
113
114			# strong
115			STRONG_RE = r'(\*{2}\|_{2})(.+?)\2'
116
117			# *strongem* or **emstrong**
118			EM_STRONG_RE = r'(\\|_)\2{2}(.+?)\2(.?)\2{2}'
119
120			# *strongem*
121			STRONG_EM_RE = r'(\\|_)\2{2}(.+?)\2{2}(.?)\2'
122
123			# _smart_emphasis_
124			SMART_EMPHASIS_RE = r'(?<!\w)(_)(?!_)(.+?)(?<!_)\2(?!\w)'
125
126			# _emphasis_
127			EMPHASIS_2_RE = r'(_)(.+?)\2'
128
129			# [text](url) or [text](<url>) or [text](url "title")
130			LINK_RE = NOIMG + BRK + \
131			r'''\(\s(<.?>\|((?:(?:\(.?\))\|[^\(\)]))?)\s((['"])(.?)\12\s*)?\)'''
132
133			# ![alttxt](http://x.com/) or ![alttxt](<http://x.com/>)
134			IMAGE_LINK_RE = r'\!' + BRK + r'\s\(\s(<.?>\|([^"\)\s]+\s"[^"]"\|[^\)\s]))\s*\)'
135
136			# [Google][3]
137			REFERENCE_RE = NOIMG + BRK + r'\s?\[([^\]]*)\]'
138
139			# [Google]
140			SHORT_REF_RE = NOIMG + r'\[([^\]]+)\]'
141
142			# ![alt text][2]
143			IMAGE_REFERENCE_RE = r'\!' + BRK + r'\s?\[([^\]]*)\]'
144
145			# stand-alone * or _
146			NOT_STRONG_RE = r'((^\| )(\*\|_)( \|$))'
147
148			# <http://www.123.com>
149			AUTOLINK_RE = r'<((?:[Ff]\|[Hh][Tt])[Tt][Pp][Ss]?://[^>]*)>'
150
151			# <[email protected]>
152			AUTOMAIL_RE = r'<([^> \!]@[^> ])>'
153
154			# <...>
155			HTML_RE = r'(\<([a-zA-Z/][^\>]?\|\!--.?--)\>)'
156
157			# &
158			ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)'
159
160			# two spaces at end of line
161			LINE_BREAK_RE = r' \n'
162
163
164			def dequote(string):
165			"""Remove quotes from around a string."""
166			if ((string.startswith('"') and string.endswith('"')) or
167			(string.startswith("'") and string.endswith("'"))):
168			return string[1:-1]
169			else:
170			return string
171
172
173			ATTR_RE = re.compile(r"\{@([^\}])=([^\}])}") # {@id=123}
174
175
176			def handleAttributes(text, parent):
177			"""Set values of an element based on attribute definitions ({@id=123})."""
178			def attributeCallback(match):
179			parent.set(match.group(1), match.group(2).replace('\n', ' '))
180			return ATTR_RE.sub(attributeCallback, text)
181
182
183			"""
184			The pattern classes
185			-----------------------------------------------------------------------------
186			"""
187
188
189			class Pattern(object):
190			"""Base class that inline patterns subclass. """
191
192			def __init__(self, pattern, markdown_instance=None):
193			"""
194			Create an instant of an inline pattern.
195
196			Keyword arguments:
197
198			* pattern: A regular expression that matches a pattern
199
200			"""
201			self.pattern = pattern
202			self.compiled_re = re.compile("^(.?)%s(.)$" % pattern,
203			re.DOTALL \| re.UNICODE)
204
205			# Api for Markdown to pass safe_mode into instance
206			self.safe_mode = False
207			if markdown_instance:
208			self.markdown = markdown_instance
209
210			def getCompiledRegExp(self):
211			""" Return a compiled regular expression. """
212			return self.compiled_re
213
214			def handleMatch(self, m):
215			"""Return a ElementTree element from the given match.
216
217			Subclasses should override this method.
218
219			Keyword arguments:
220
221			* m: A re match object containing a match of the pattern.
222
223			"""
224			pass # pragma: no cover
225
226			def type(self):
227			""" Return class name, to define pattern type """
228			return self.__class__.__name__
229
230			def unescape(self, text):
231			""" Return unescaped text given text with an inline placeholder. """
232			try:
233			stash = self.markdown.treeprocessors['inline'].stashed_nodes
234			except KeyError: # pragma: no cover
235			return text
236
237			def itertext(el): # pragma: no cover
238			' Reimplement Element.itertext for older python versions '
239			tag = el.tag
240			if not isinstance(tag, util.string_type) and tag is not None:
241			return
242			if el.text:
243			yield el.text
244			for e in el:
245			for s in itertext(e):
246			yield s
247			if e.tail:
248			yield e.tail
249
250			def get_stash(m):
251			id = m.group(1)
252			if id in stash:
253			value = stash.get(id)
254			if isinstance(value, util.string_type):
255			return value
256			else:
257			# An etree Element - return text content only
258			return ''.join(itertext(value))
259			return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)
260
261
262			class SimpleTextPattern(Pattern):
263			""" Return a simple text of group(2) of a Pattern. """
264			def handleMatch(self, m):
265			return m.group(2)
266
267
268			class EscapePattern(Pattern):
269			""" Return an escaped character. """
270
271			def handleMatch(self, m):
272			char = m.group(2)
273			if char in self.markdown.ESCAPED_CHARS:
274			return '%s%s%s' % (util.STX, ord(char), util.ETX)
275			else:
276			return None
277
278
279			class SimpleTagPattern(Pattern):
280			"""
281			Return element of type `tag` with a text attribute of group(3)
282			of a Pattern.
283
284			"""
285			def __init__(self, pattern, tag):
286			Pattern.__init__(self, pattern)
287			self.tag = tag
288
289			def handleMatch(self, m):
290			el = util.etree.Element(self.tag)
291			el.text = m.group(3)
292			return el
293
294
295			class SubstituteTagPattern(SimpleTagPattern):
296			""" Return an element of type `tag` with no children. """
297			def handleMatch(self, m):
298			return util.etree.Element(self.tag)
299
300
301			class BacktickPattern(Pattern):
302			""" Return a `<code>` element containing the matching text. """
303			def __init__(self, pattern):
304			Pattern.__init__(self, pattern)
305			self.ESCAPED_BSLASH = '%s%s%s' % (util.STX, ord('\\'), util.ETX)
306			self.tag = 'code'
307
308			def handleMatch(self, m):
309			if m.group(4):
310			el = util.etree.Element(self.tag)
311			el.text = util.AtomicString(m.group(4).strip())
312			return el
313			else:
314			return m.group(2).replace('\\\\', self.ESCAPED_BSLASH)
315
316
317			class DoubleTagPattern(SimpleTagPattern):
318			"""Return a ElementTree element nested in tag2 nested in tag1.
319
320			Useful for strong emphasis etc.
321
322			"""
323			def handleMatch(self, m):
324			tag1, tag2 = self.tag.split(",")
325			el1 = util.etree.Element(tag1)
326			el2 = util.etree.SubElement(el1, tag2)
327			el2.text = m.group(3)
328			if len(m.groups()) == 5:
329			el2.tail = m.group(4)
330			return el1
331
332
333			class HtmlPattern(Pattern):
334			""" Store raw inline html and return a placeholder. """
335			def handleMatch(self, m):
336			rawhtml = self.unescape(m.group(2))
337			place_holder = self.markdown.htmlStash.store(rawhtml)
338			return place_holder
339
340			def unescape(self, text):
341			""" Return unescaped text given text with an inline placeholder. """
342			try:
343			stash = self.markdown.treeprocessors['inline'].stashed_nodes
344			except KeyError: # pragma: no cover
345			return text
346
347			def get_stash(m):
348			id = m.group(1)
349			value = stash.get(id)
350			if value is not None:
351			try:
352			return self.markdown.serializer(value)
353			except:
354			return r'\%s' % value
355
356			return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)
357
358
359			class LinkPattern(Pattern):
360			""" Return a link element from the given match. """
361			def handleMatch(self, m):
362			el = util.etree.Element("a")
363			el.text = m.group(2)
364			title = m.group(13)
365			href = m.group(9)
366
367			if href:
368			if href[0] == "<":
369			href = href[1:-1]
370			el.set("href", self.sanitize_url(self.unescape(href.strip())))
371			else:
372			el.set("href", "")
373
374			if title:
375			title = dequote(self.unescape(title))
376			el.set("title", title)
377			return el
378
379			def sanitize_url(self, url):
380			"""
381			Sanitize a url against xss attacks in "safe_mode".
382
383			Rather than specifically blacklisting `javascript:alert("XSS")` and all
384			its aliases (see <http://ha.ckers.org/xss.html>), we whitelist known
385			safe url formats. Most urls contain a network location, however some
386			are known not to (i.e.: mailto links). Script urls do not contain a
387			location. Additionally, for `javascript:...`, the scheme would be
388			"javascript" but some aliases will appear to `urlparse()` to have no
389			scheme. On top of that relative links (i.e.: "foo/bar.html") have no
390			scheme. Therefore we must check "path", "parameters", "query" and
391			"fragment" for any literal colons. We don't check "scheme" for colons
392			because it should never have any and "netloc" must allow the form:
393			`username:password@host:port`.
394
395			"""
396			if not self.markdown.safeMode:
397			# Return immediately bipassing parsing.
398			return url
399
400			try:
401			scheme, netloc, path, params, query, fragment = url = urlparse(url)
402			except ValueError: # pragma: no cover
403			# Bad url - so bad it couldn't be parsed.
404			return ''
405
406			locless_schemes = ['', 'mailto', 'news']
407			allowed_schemes = locless_schemes + ['http', 'https', 'ftp', 'ftps']
408			if scheme not in allowed_schemes:
409			# Not a known (allowed) scheme. Not safe.
410			return ''
411
412			if netloc == '' and scheme not in locless_schemes: # pragma: no cover
413			# This should not happen. Treat as suspect.
414			return ''
415
416			for part in url[2:]:
417			if ":" in part:
418			# A colon in "path", "parameters", "query"
419			# or "fragment" is suspect.
420			return ''
421
422			# Url passes all tests. Return url as-is.
423			return urlunparse(url)
424
425
426			class ImagePattern(LinkPattern):
427			""" Return a img element from the given match. """
428			def handleMatch(self, m):
429			el = util.etree.Element("img")
430			src_parts = m.group(9).split()
431			if src_parts:
432			src = src_parts[0]
433			if src[0] == "<" and src[-1] == ">":
434			src = src[1:-1]
435			el.set('src', self.sanitize_url(self.unescape(src)))
436			else:
437			el.set('src', "")
438			if len(src_parts) > 1:
439			el.set('title', dequote(self.unescape(" ".join(src_parts[1:]))))
440
441			if self.markdown.enable_attributes:
442			truealt = handleAttributes(m.group(2), el)
443			else:
444			truealt = m.group(2)
445
446			el.set('alt', self.unescape(truealt))
447			return el
448
449
450			class ReferencePattern(LinkPattern):
451			""" Match to a stored reference and return link element. """
452
453			NEWLINE_CLEANUP_RE = re.compile(r'[ ]?\n', re.MULTILINE)
454
455			def handleMatch(self, m):
456			try:
457			id = m.group(9).lower()
458			except IndexError:
459			id = None
460			if not id:
461			# if we got something like "[Google][]" or "[Google]"
462			# we'll use "google" as the id
463			id = m.group(2).lower()
464
465			# Clean up linebreaks in id
466			id = self.NEWLINE_CLEANUP_RE.sub(' ', id)
467			if id not in self.markdown.references: # ignore undefined refs
468			return None
469			href, title = self.markdown.references[id]
470
471			text = m.group(2)
472			return self.makeTag(href, title, text)
473
474			def makeTag(self, href, title, text):
475			el = util.etree.Element('a')
476
477			el.set('href', self.sanitize_url(href))
478			if title:
479			el.set('title', title)
480
481			el.text = text
482			return el
483
484
485			class ImageReferencePattern(ReferencePattern):
486			""" Match to a stored reference and return img element. """
487			def makeTag(self, href, title, text):
488			el = util.etree.Element("img")
489			el.set("src", self.sanitize_url(href))
490			if title:
491			el.set("title", title)
492
493			if self.markdown.enable_attributes:
494			text = handleAttributes(text, el)
495
496			el.set("alt", self.unescape(text))
497			return el
498
499
500			class AutolinkPattern(Pattern):
501			""" Return a link Element given an autolink (`<http://example/com>`). """
502			def handleMatch(self, m):
503			el = util.etree.Element("a")
504			el.set('href', self.unescape(m.group(2)))
505			el.text = util.AtomicString(m.group(2))
506			return el
507
508
509			class AutomailPattern(Pattern):
510			"""
511			Return a mailto link Element given an automail link (`<[email protected]>`).
512			"""
513			def handleMatch(self, m):
514			el = util.etree.Element('a')
515			email = self.unescape(m.group(2))
516			if email.startswith("mailto:"):
517			email = email[len("mailto:"):]
518
519			def codepoint2name(code):
520			"""Return entity definition by code, or the code if not defined."""
521			entity = entities.codepoint2name.get(code)
522			if entity:
523			return "%s%s;" % (util.AMP_SUBSTITUTE, entity)
524			else:
525			return "%s#%d;" % (util.AMP_SUBSTITUTE, code)
526
527			letters = [codepoint2name(ord(letter)) for letter in email]
528			el.text = util.AtomicString(''.join(letters))
529
530			mailto = "mailto:" + email
531			mailto = "".join([util.AMP_SUBSTITUTE + '#%d;' %
532			ord(letter) for letter in mailto])
533			el.set('href', mailto)
534			return el
535

MikeCoder / markdown-preview.vim

Push — master ( 32cfa8...ec62d3 )

LinkPattern.handleMatch() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like