#!/usr/bin/env python
# Copyright (c) 2012 Trent Mick.
# Copyright (c) 2007-2008 ActiveState Corp.
# License: MIT (http://www.opensource.org/licenses/mit-license.php)

from __future__ import generators

r"""A fast and complete Python implementation of Markdown.

[from http://daringfireball.net/projects/markdown/]
> Markdown is a text-to-HTML filter; it translates an easy-to-read /
> easy-to-write structured text format into HTML. Markdown's text
> format is most similar to that of plain text email, and supports
> features such as headers, *emphasis*, code blocks, blockquotes, and
> links.
>
> Markdown's syntax is designed not as a generic markup language, but
> specifically to serve as a front-end to (X)HTML. You can use span-level
> HTML tags anywhere in a Markdown document, and you can use block-level
> HTML tags (like <div> and <table>) as well.

Module usage:

    >>> import markdown2
    >>> markdown2.markdown("*boo!*")  # or use `html = markdown_path(PATH)`
    u'<p><em>boo!</em></p>\n'

    >>> markdowner = Markdown()
    >>> markdowner.convert("*boo!*")
    u'<p><em>boo!</em></p>\n'
    >>> markdowner.convert("**boom!**")
    u'<p><strong>boom!</strong></p>\n'

This implementation of Markdown implements the full "core" syntax plus a
number of extras (e.g., code syntax coloring, footnotes) as described on
<https://github.com/trentm/python-markdown2/wiki/Extras>.
"""

cmdln_desc = """A fast and complete Python implementation of Markdown, a
text-to-HTML conversion tool for web writers.

Supported extra syntax options (see -x|--extras option below and
see <https://github.com/trentm/python-markdown2/wiki/Extras> for details):

* code-friendly: Disable _ and __ for em and strong.
* cuddled-lists: Allow lists to be cuddled to the preceding paragraph.
* fenced-code-blocks: Allows a code block to not have to be indented
  by fencing it with '```' on a line before and after. Based on
  <http://github.github.com/github-flavored-markdown/> with support for
  syntax highlighting.
* footnotes: Support footnotes as in use on daringfireball.net and
  implemented in other Markdown processors (though not in Markdown.pl v1.0.1).
* header-ids: Adds "id" attributes to headers. The id value is a slug of
  the header text.
* highlightjs-lang: Allows specifying the language which is used for syntax
  highlighting when using fenced-code-blocks and highlightjs.
* html-classes: Takes a dict mapping html tag names (lowercase) to a
  string to use for a "class" tag attribute. Currently only supports "img",
  "table", "pre" and "code" tags. Add an issue if you require this for other
  tags.
* link-patterns: Auto-link given regex patterns in text (e.g. bug number
  references, revision number references).
* markdown-in-html: Allow the use of `markdown="1"` in a block HTML tag to
  have markdown processing be done on its contents. Similar to
  <http://michelf.com/projects/php-markdown/extra/#markdown-attr> but with
  some limitations.
* metadata: Extract metadata from a leading '---'-fenced block.
  See <https://github.com/trentm/python-markdown2/issues/77> for details.
* nofollow: Add `rel="nofollow"` to all `<a>` tags with an href. See
  <http://en.wikipedia.org/wiki/Nofollow>.
* numbering: Support for generic counters. A non-standard extension to
  allow sequential numbering of figures, tables, equations, exhibits, etc.
* pyshell: Treats unindented Python interactive shell sessions as <code>
  blocks.
* smarty-pants: Replaces ' and " with curly quotation marks or curly
  apostrophes. Replaces --, ---, ..., and . . . with en dashes, em dashes,
  and ellipses.
* spoiler: A special kind of blockquote commonly hidden behind a
  click on SO. Syntax per <http://meta.stackexchange.com/a/72878>.
* tag-friendly: Requires atx style headers to have a space between the # and
  the header text. Useful for applications that require twitter style tags to
  pass through the parser.
* tables: Tables using the same format as GFM
  <https://help.github.com/articles/github-flavored-markdown#tables> and
  PHP-Markdown Extra <https://michelf.ca/projects/php-markdown/extra/#table>.
* toc: The returned HTML string gets a new "toc_html" attribute which is
  a Table of Contents for the document. (experimental)
* use-file-vars: Look for an Emacs-style markdown-extras file variable to turn
  on Extras.
* wiki-tables: Google Code Wiki-style tables. See
  <http://code.google.com/p/support/wiki/WikiSyntax#Tables>.
* xml: Passes one-liner processing instructions and namespaced XML tags.
"""

# Dev Notes:
# - Python's regex syntax doesn't have '\z', so I'm using '\Z'. I'm
#   not yet sure if there are implications with this. Compare 'pydoc sre'
#   and 'perldoc perlre'.

__version_info__ = (2, 3, 5)
__version__ = '.'.join(map(str, __version_info__))
__author__ = "Trent Mick"

import sys
import re
import logging
from hashlib import sha256
import optparse
from random import random, randint
import codecs
try:
    from urllib import quote_plus
except ImportError:
    from urllib.parse import quote_plus


# ---- Python version compat

if sys.version_info[:2] < (2, 4):
    def reversed(sequence):
        for i in sequence[::-1]:
            yield i

# Use `bytes` for byte strings and `unicode` for unicode strings (str in Py3).
if sys.version_info[0] <= 2:
    py3 = False
    try:
        bytes
    except NameError:
        bytes = str
    base_string_type = basestring
elif sys.version_info[0] >= 3:
    py3 = True
    unicode = str
    base_string_type = str

# ---- globals

DEBUG = False
log = logging.getLogger("markdown")

DEFAULT_TAB_WIDTH = 4


# Encode the random number's digits: on Python 3, `bytes(n)` would create a
# buffer of n zero bytes rather than the digits.
SECRET_SALT = str(randint(0, 1000000)).encode("utf-8")
# MD5 function was previously used for this; the "md5" prefix was kept for
# backwards compatibility.
def _hash_text(s):
    return 'md5-' + sha256(SECRET_SALT + s.encode("utf-8")).hexdigest()[32:]

# Table of hash values for escaped characters:
g_escape_table = dict([(ch, _hash_text(ch))
                       for ch in '\\`*_{}[]()>#+-.!'])


# ---- exceptions
class MarkdownError(Exception):
    pass


# ---- public api

def markdown_path(path, encoding="utf-8",
                  html4tags=False, tab_width=DEFAULT_TAB_WIDTH,
                  safe_mode=None, extras=None, link_patterns=None,
                  footnote_title=None, footnote_return_symbol=None,
                  use_file_vars=False):
    fp = codecs.open(path, 'r', encoding)
    text = fp.read()
    fp.close()
    return Markdown(html4tags=html4tags, tab_width=tab_width,
                    safe_mode=safe_mode, extras=extras,
                    link_patterns=link_patterns,
                    footnote_title=footnote_title,
                    footnote_return_symbol=footnote_return_symbol,
                    use_file_vars=use_file_vars).convert(text)


def markdown(text, html4tags=False, tab_width=DEFAULT_TAB_WIDTH,
             safe_mode=None, extras=None, link_patterns=None,
             footnote_title=None, footnote_return_symbol=None,
             use_file_vars=False):
    return Markdown(html4tags=html4tags, tab_width=tab_width,
                    safe_mode=safe_mode, extras=extras,
                    link_patterns=link_patterns,
                    footnote_title=footnote_title,
                    footnote_return_symbol=footnote_return_symbol,
                    use_file_vars=use_file_vars).convert(text)


class Markdown(object):
    # The dict of "extras" to enable in processing -- a mapping of
    # extra name to argument for the extra. Most extras do not have an
    # argument, in which case the value is None.
    #
    # This can be set via (a) subclassing and (b) the constructor
    # "extras" argument.
    extras = None

    urls = None
    titles = None
    html_blocks = None
    html_spans = None
    html_removed_text = "[HTML_REMOVED]"  # for compat with markdown.py

    # Used to track when we're inside an ordered or unordered list
    # (see _ProcessListItems() for details):
    list_level = 0

    _ws_only_line_re = re.compile(r"^[ \t]+$", re.M)

    def __init__(self, html4tags=False, tab_width=4, safe_mode=None,
                 extras=None, link_patterns=None,
                 footnote_title=None, footnote_return_symbol=None,
                 use_file_vars=False):
        if html4tags:
            self.empty_element_suffix = ">"
        else:
            self.empty_element_suffix = " />"
        self.tab_width = tab_width

        # For compatibility with earlier markdown2.py and with
        # markdown.py's safe_mode being a boolean,
        #   safe_mode == True -> "replace"
        if safe_mode is True:
            self.safe_mode = "replace"
        else:
            self.safe_mode = safe_mode

        # Massaging and building the "extras" info.
        if self.extras is None:
            self.extras = {}
        elif not isinstance(self.extras, dict):
            self.extras = dict([(e, None) for e in self.extras])
        if extras:
            if not isinstance(extras, dict):
                extras = dict([(e, None) for e in extras])
            self.extras.update(extras)
        assert isinstance(self.extras, dict)
        if "toc" in self.extras and "header-ids" not in self.extras:
            self.extras["header-ids"] = None  # "toc" implies "header-ids"
        self._instance_extras = self.extras.copy()

        self.link_patterns = link_patterns
        self.footnote_title = footnote_title
        self.footnote_return_symbol = footnote_return_symbol
        self.use_file_vars = use_file_vars
        self._outdent_re = re.compile(r'^(\t|[ ]{1,%d})' % tab_width, re.M)

        self._escape_table = g_escape_table.copy()
        if "smarty-pants" in self.extras:
            self._escape_table['"'] = _hash_text('"')
            self._escape_table["'"] = _hash_text("'")

    def reset(self):
        self.urls = {}
        self.titles = {}
        self.html_blocks = {}
        self.html_spans = {}
        self.list_level = 0
        self.extras = self._instance_extras.copy()
        if "footnotes" in self.extras:
            self.footnotes = {}
            self.footnote_ids = []
        if "header-ids" in self.extras:
            self._count_from_header_id = {}  # no `defaultdict` in Python 2.4
        if "metadata" in self.extras:
            self.metadata = {}

    # Per <https://developer.mozilla.org/en-US/docs/HTML/Element/a> "rel"
    # should only be used in <a> tags with an "href" attribute.
    _a_nofollow = re.compile(r"""
        <(a)
        (
            [^>]*
            href=   # href is required
            ['"]?   # HTML5 attribute values do not have to be quoted
            [^#'"]  # We don't want to match href values that start with # (like footnotes)
        )
        """,
        re.IGNORECASE | re.VERBOSE
    )

    # Opens the linked document in a new window or tab. As with _a_nofollow,
    # this should only be used in <a> tags with an "href" attribute.
    _a_blank = _a_nofollow

    def convert(self, text):
        """Convert the given text."""
        # Main function. The order in which other subs are called here is
        # essential. Link and image substitutions need to happen before
        # _EscapeSpecialChars(), so that any *'s or _'s in the <a>
        # and <img> tags get encoded.

        # Clear the global hashes. If we don't clear these, you get conflicts
        # from other articles when generating a page which contains more than
        # one article (e.g. an index page that shows the N most recent
        # articles):
        self.reset()

        if not isinstance(text, unicode):
            # TODO: perhaps shouldn't presume UTF-8 for string input?
            text = unicode(text, 'utf-8')

        if self.use_file_vars:
            # Look for emacs-style file variable hints.
            emacs_vars = self._get_emacs_vars(text)
            if "markdown-extras" in emacs_vars:
                splitter = re.compile("[ ,]+")
                for e in splitter.split(emacs_vars["markdown-extras"]):
                    if '=' in e:
                        ename, earg = e.split('=', 1)
                        try:
                            earg = int(earg)
                        except ValueError:
                            pass
                    else:
                        ename, earg = e, None
                    self.extras[ename] = earg

        # Standardize line endings:
        text = text.replace("\r\n", "\n")
        text = text.replace("\r", "\n")

        # Make sure $text ends with a couple of newlines:
        text += "\n\n"

        # Convert all tabs to spaces.
        text = self._detab(text)

        # Strip any lines consisting only of spaces and tabs.
        # This makes subsequent regexen easier to write, because we can
        # match consecutive blank lines with /\n+/ instead of something
        # contorted like /[ \t]*\n+/ .
        text = self._ws_only_line_re.sub("", text)

        # Strip metadata from the head and extract it.
        if "metadata" in self.extras:
            text = self._extract_metadata(text)

        text = self.preprocess(text)

        if "fenced-code-blocks" in self.extras and not self.safe_mode:
            text = self._do_fenced_code_blocks(text)

        if self.safe_mode:
            text = self._hash_html_spans(text)

        # Turn block-level HTML blocks into hash entries
        text = self._hash_html_blocks(text, raw=True)

        if "fenced-code-blocks" in self.extras and self.safe_mode:
            text = self._do_fenced_code_blocks(text)

        # Because numbering references aren't links (yet?), we can do
        # everything associated with counters before we get started.
        if "numbering" in self.extras:
            text = self._do_numbering(text)

        # Strip link definitions, store in hashes.
        if "footnotes" in self.extras:
            # Must do footnotes first because an unlucky footnote defn
            # looks like a link defn:
            #   [^4]: this "looks like a link defn"
            text = self._strip_footnote_definitions(text)
        text = self._strip_link_definitions(text)

        text = self._run_block_gamut(text)

        if "footnotes" in self.extras:
            text = self._add_footnotes(text)

        text = self.postprocess(text)

        text = self._unescape_special_chars(text)

        if self.safe_mode:
            text = self._unhash_html_spans(text)

        if "nofollow" in self.extras:
            text = self._a_nofollow.sub(r'<\1 rel="nofollow"\2', text)

        if "target-blank-links" in self.extras:
            text = self._a_blank.sub(r'<\1 target="_blank"\2', text)

        text += "\n"

        rv = UnicodeWithAttrs(text)
        if "toc" in self.extras:
            rv._toc = self._toc
        if "metadata" in self.extras:
            rv.metadata = self.metadata
        return rv

    def postprocess(self, text):
        """A hook for subclasses to do some postprocessing of the html, if
        desired. This is called before unescaping of special chars and
        unhashing of raw HTML spans.
        """
        return text

    def preprocess(self, text):
        """A hook for subclasses to do some preprocessing of the Markdown, if
        desired. This is called after basic formatting of the text, but prior
        to any extras, safe mode, etc. processing.
        """
        return text

    # The content is treated as metadata if it starts with optional
    # '---'-fenced `key: value` pairs. E.g. (indented for presentation):
    #   ---
    #   foo: bar
    #   another-var: blah blah
    #   ---
    #   # header
    # or:
    #   foo: bar
    #   another-var: blah blah
    #
    #   # header
    _meta_data_pattern = re.compile(r'^(?:---[\ \t]*\n)?(.*:\s+>\n\s+[\S\s]+?)(?=\n\w+\s*:\s*\w+\n|\Z)|([\S\w]+\s*:(?! >)[ \t]*.*\n?)(?:---[\ \t]*\n)?', re.MULTILINE)
    _key_val_pat = re.compile(r"[\S\w]+\s*:(?! >)[ \t]*.*\n?", re.MULTILINE)
    # This pattern allows the value of a `key: >` entry to
    # continue over multiple (indented) lines.
    _key_val_block_pat = re.compile(
        r"(.*:\s+>\n\s+[\S\s]+?)(?=\n\w+\s*:\s*\w+\n|\Z)", re.MULTILINE)
    _meta_data_fence_pattern = re.compile(r'^---[\ \t]*\n', re.MULTILINE)
    _meta_data_newline = re.compile(r"^\n", re.MULTILINE)

    def _extract_metadata(self, text):
        if text.startswith("---"):
            fence_splits = re.split(self._meta_data_fence_pattern, text, maxsplit=2)
            metadata_content = fence_splits[1]
            match = re.findall(self._meta_data_pattern, metadata_content)
            if not match:
                return text
            tail = fence_splits[2]
        else:
            metadata_split = re.split(self._meta_data_newline, text, maxsplit=1)
            metadata_content = metadata_split[0]
            match = re.findall(self._meta_data_pattern, metadata_content)
            if not match:
                return text
            tail = metadata_split[1]

        kv = re.findall(self._key_val_pat, metadata_content)
        kvm = re.findall(self._key_val_block_pat, metadata_content)
        kvm = [item.replace(": >\n", ":", 1) for item in kvm]

        for item in kv + kvm:
            k, v = item.split(":", 1)
            self.metadata[k.strip()] = v.strip()

        return tail
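
    # Illustrative example (assumed input, not executed): with the
    # "metadata" extra, converting
    #
    #     ---
    #     title: Hello
    #     author: me
    #     ---
    #     # header
    #
    # leaves only "# header" to be rendered and sets
    # self.metadata == {'title': 'Hello', 'author': 'me'}.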

    _emacs_oneliner_vars_pat = re.compile(r"-\*-\s*([^\r\n]*?)\s*-\*-", re.UNICODE)
    # This regular expression is intended to match blocks like this:
    #    PREFIX Local Variables: SUFFIX
    #    PREFIX mode: Tcl SUFFIX
    #    PREFIX End: SUFFIX
    # Some notes:
    # - "[ \t]" is used instead of "\s" to specifically exclude newlines
    # - "(\r\n|\n|\r)" is used instead of "$" because the sre engine does
    #   not like anything other than Unix-style line terminators.
    _emacs_local_vars_pat = re.compile(r"""^
        (?P<prefix>(?:[^\r\n|\n|\r])*?)
        [\ \t]*Local\ Variables:[\ \t]*
        (?P<suffix>.*?)(?:\r\n|\n|\r)
        (?P<content>.*?\1End:)
        """, re.IGNORECASE | re.MULTILINE | re.DOTALL | re.VERBOSE)

    def _get_emacs_vars(self, text):
        """Return a dictionary of emacs-style local variables.

        Parsing is done loosely according to this spec (and according to
        some in-practice deviations from this):
        http://www.gnu.org/software/emacs/manual/html_node/emacs/Specifying-File-Variables.html#Specifying-File-Variables
        """
        emacs_vars = {}
        SIZE = pow(2, 13)  # 8kB

        # Search near the start for a '-*-'-style one-liner of variables.
        head = text[:SIZE]
        if "-*-" in head:
            match = self._emacs_oneliner_vars_pat.search(head)
            if match:
                emacs_vars_str = match.group(1)
                assert '\n' not in emacs_vars_str
                emacs_var_strs = [s.strip() for s in emacs_vars_str.split(';')
                                  if s.strip()]
                if len(emacs_var_strs) == 1 and ':' not in emacs_var_strs[0]:
                    # While not in the spec, this form is allowed by emacs:
                    #   -*- Tcl -*-
                    # where the implied "variable" is "mode". This form
                    # is only allowed if there are no other variables.
                    emacs_vars["mode"] = emacs_var_strs[0].strip()
                else:
                    for emacs_var_str in emacs_var_strs:
                        try:
                            variable, value = emacs_var_str.strip().split(':', 1)
                        except ValueError:
                            log.debug("emacs variables error: malformed -*- "
                                      "line: %r", emacs_var_str)
                            continue
                        # Lowercase the variable name because Emacs allows "Mode"
                        # or "mode" or "MoDe", etc.
                        emacs_vars[variable.lower()] = value.strip()

        tail = text[-SIZE:]
        if "Local Variables" in tail:
            match = self._emacs_local_vars_pat.search(tail)
            if match:
                prefix = match.group("prefix")
                suffix = match.group("suffix")
                lines = match.group("content").splitlines(0)
                # print "prefix=%r, suffix=%r, content=%r, lines: %s"\
                #      % (prefix, suffix, match.group("content"), lines)

                # Validate the Local Variables block: proper prefix and suffix
                # usage.
                for i, line in enumerate(lines):
                    if not line.startswith(prefix):
                        log.debug("emacs variables error: line '%s' "
                                  "does not use proper prefix '%s'"
                                  % (line, prefix))
                        return {}
                    # Don't validate suffix on last line. Emacs doesn't care,
                    # neither should we.
                    if i != len(lines)-1 and not line.endswith(suffix):
                        log.debug("emacs variables error: line '%s' "
                                  "does not use proper suffix '%s'"
                                  % (line, suffix))
                        return {}

                # Parse out one emacs var per line.
                continued_for = None
                for line in lines[:-1]:  # no var on the last line ("PREFIX End:")
                    if prefix: line = line[len(prefix):]  # strip prefix
                    if suffix: line = line[:-len(suffix)]  # strip suffix
                    line = line.strip()
                    if continued_for:
                        variable = continued_for
                        if line.endswith('\\'):
                            line = line[:-1].rstrip()
                        else:
                            continued_for = None
                        emacs_vars[variable] += ' ' + line
                    else:
                        try:
                            variable, value = line.split(':', 1)
                        except ValueError:
                            log.debug("local variables error: missing colon "
                                      "in local variables entry: '%s'" % line)
                            continue
                        # Do NOT lowercase the variable name, because Emacs only
                        # allows "mode" (and not "Mode", "MoDe", etc.) in this block.
                        value = value.strip()
                        if value.endswith('\\'):
                            value = value[:-1].rstrip()
                            continued_for = variable
                        else:
                            continued_for = None
                        emacs_vars[variable] = value

        # Unquote values.
        for var, val in list(emacs_vars.items()):
            if len(val) > 1 and (val.startswith('"') and val.endswith('"')
                                 or val.startswith("'") and val.endswith("'")):
                emacs_vars[var] = val[1:-1]

        return emacs_vars
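
    # Illustrative file-variable forms recognized above (assumed examples):
    #
    #     -*- markdown-extras: wiki-tables, code-friendly -*-
    #
    # near the top of the document, or a trailing block such as
    #
    #     <!-- Local Variables: -->
    #     <!-- markdown-extras: footnotes -->
    #     <!-- End: -->
    #
    # Both yield {"markdown-extras": ...}, which convert() consults when
    # the "use-file-vars" option is on.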

    def _detab_line(self, line):
        r"""Recursively convert tabs to spaces in a single line.

        Called from _detab()."""
        if '\t' not in line:
            return line
        chunk1, chunk2 = line.split('\t', 1)
        chunk1 += (' ' * (self.tab_width - len(chunk1) % self.tab_width))
        output = chunk1 + chunk2
        return self._detab_line(output)

    def _detab(self, text):
        r"""Iterate text line by line and convert tabs to spaces.

            >>> m = Markdown()
            >>> m._detab("\tfoo")
            '    foo'
            >>> m._detab(" \tfoo")
            '    foo'
            >>> m._detab("\t foo")
            '     foo'
            >>> m._detab(" foo")
            ' foo'
            >>> m._detab(" foo\n\tbar\tblam")
            ' foo\n    bar blam'
        """
        if '\t' not in text:
            return text
        output = []
        for line in text.splitlines():
            output.append(self._detab_line(line))
        return '\n'.join(output)

    # I broke out the html5 tags here and added them to _block_tags_a and
    # _block_tags_b. This way html5 tags are easy to keep track of.
    _html5tags = '|article|aside|header|hgroup|footer|nav|section|figure|figcaption'

    _block_tags_a = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del'
    _block_tags_a += _html5tags

    _strict_tag_block_re = re.compile(r"""
        (                       # save in \1
            ^                   # start of line  (with re.M)
            <(%s)               # start tag = \2
            \b                  # word break
            (.*\n)*?            # any number of lines, minimally matching
            </\2>               # the matching end tag
            [ \t]*              # trailing spaces/tabs
            (?=\n+|\Z)          # followed by a newline or end of document
        )
        """ % _block_tags_a,
        re.X | re.M)

    _block_tags_b = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math'
    _block_tags_b += _html5tags

    _liberal_tag_block_re = re.compile(r"""
        (                       # save in \1
            ^                   # start of line  (with re.M)
            <(%s)               # start tag = \2
            \b                  # word break
            (.*\n)*?            # any number of lines, minimally matching
            .*</\2>             # the matching end tag
            [ \t]*              # trailing spaces/tabs
            (?=\n+|\Z)          # followed by a newline or end of document
        )
        """ % _block_tags_b,
        re.X | re.M)

    _html_markdown_attr_re = re.compile(
        r'''\s+markdown=("1"|'1')''')

    def _hash_html_block_sub(self, match, raw=False):
        html = match.group(1)
        if raw and self.safe_mode:
            html = self._sanitize_html(html)
        elif 'markdown-in-html' in self.extras and 'markdown=' in html:
            first_line = html.split('\n', 1)[0]
            m = self._html_markdown_attr_re.search(first_line)
            if m:
                lines = html.split('\n')
                middle = '\n'.join(lines[1:-1])
                last_line = lines[-1]
                first_line = first_line[:m.start()] + first_line[m.end():]
                f_key = _hash_text(first_line)
                self.html_blocks[f_key] = first_line
                l_key = _hash_text(last_line)
                self.html_blocks[l_key] = last_line
                return ''.join(["\n\n", f_key,
                                "\n\n", middle, "\n\n",
                                l_key, "\n\n"])
        key = _hash_text(html)
        self.html_blocks[key] = html
        return "\n\n" + key + "\n\n"

    def _hash_html_blocks(self, text, raw=False):
        """Hashify HTML blocks

        We only want to do this for block-level HTML tags, such as headers,
        lists, and tables. That's because we still want to wrap <p>s around
        "paragraphs" that are wrapped in non-block-level tags, such as anchors,
        phrase emphasis, and spans. The list of tags we're looking for is
        hard-coded.

        @param raw {boolean} indicates if these are raw HTML blocks in
            the original source. It makes a difference in "safe" mode.
        """
        if '<' not in text:
            return text

        # Pass `raw` value into our calls to self._hash_html_block_sub.
        hash_html_block_sub = _curry(self._hash_html_block_sub, raw=raw)

        # First, look for nested blocks, e.g.:
        #   <div>
        #       <div>
        #       tags for inner block must be indented.
        #       </div>
        #   </div>
        #
        # The outermost tags must start at the left margin for this to match, and
        # the inner nested divs must be indented.
        # We need to do this before the next, more liberal match, because the next
        # match will start at the first `<div>` and stop at the first `</div>`.
        text = self._strict_tag_block_re.sub(hash_html_block_sub, text)

        # Now match more liberally, simply from `\n<tag>` to `</tag>\n`
        text = self._liberal_tag_block_re.sub(hash_html_block_sub, text)

        # Special case just for <hr />. It was easier to make a special
        # case than to make the other regex more complicated.
        if "<hr" in text:
            _hr_tag_re = _hr_tag_re_from_tab_width(self.tab_width)
            text = _hr_tag_re.sub(hash_html_block_sub, text)

        # Special case for standalone HTML comments:
        if "<!--" in text:
            start = 0
            while True:
                # Delimiters for next comment block.
                try:
                    start_idx = text.index("<!--", start)
                except ValueError:
                    break
                try:
                    end_idx = text.index("-->", start_idx) + 3
                except ValueError:
                    break

                # Start position for next comment block search.
                start = end_idx

                # Validate whitespace before comment.
                if start_idx:
                    # - Up to `tab_width - 1` spaces before start_idx.
                    for i in range(self.tab_width - 1):
                        if text[start_idx - 1] != ' ':
                            break
                        start_idx -= 1
                        if start_idx == 0:
                            break
                    # - Must be preceded by 2 newlines or hit the start of
                    #   the document.
                    if start_idx == 0:
                        pass
                    elif start_idx == 1 and text[0] == '\n':
                        start_idx = 0  # to match minute detail of Markdown.pl regex
                    elif text[start_idx-2:start_idx] == '\n\n':
                        pass
                    else:
                        break

                # Validate whitespace after comment.
                # - Any number of spaces and tabs.
                while end_idx < len(text):
                    if text[end_idx] not in ' \t':
                        break
                    end_idx += 1
                # - Must be followed by 2 newlines or hit end of text.
                if text[end_idx:end_idx+2] not in ('', '\n', '\n\n'):
                    continue

                # Escape and hash (must match `_hash_html_block_sub`).
                html = text[start_idx:end_idx]
                if raw and self.safe_mode:
                    html = self._sanitize_html(html)
                key = _hash_text(html)
                self.html_blocks[key] = html
                text = text[:start_idx] + "\n\n" + key + "\n\n" + text[end_idx:]

        if "xml" in self.extras:
            # Treat XML processing instructions and namespaced one-liner
            # tags as if they were block HTML tags. E.g., if standalone
            # (i.e. are their own paragraph), the following do not get
            # wrapped in a <p> tag:
            #    <?foo bar?>
            #
            #    <xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="chapter_1.md"/>
            _xml_oneliner_re = _xml_oneliner_re_from_tab_width(self.tab_width)
            text = _xml_oneliner_re.sub(hash_html_block_sub, text)

        return text
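
    # Round-trip sketch (illustrative): a block like "<div>x</div>" is
    # replaced in the running text by "\n\n" + key + "\n\n", where
    # key = _hash_text(html), and remembered in self.html_blocks[key];
    # later stages look keys up there and swap the original HTML back in,
    # so hashed blocks pass through the Markdown transforms untouched.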

    def _strip_link_definitions(self, text):
        # Strips link definitions from text, stores the URLs and titles in
        # hash references.
        less_than_tab = self.tab_width - 1

        # Link defs are in the form:
        #   [id]: url "optional title"
        _link_def_re = re.compile(r"""
            ^[ ]{0,%d}\[(.+)\]: # id = \1
              [ \t]*
              \n?               # maybe *one* newline
              [ \t]*
            <?(.+?)>?           # url = \2
              [ \t]*
            (?:
                \n?             # maybe one newline
                [ \t]*
                (?<=\s)         # lookbehind for whitespace
                ['"(]
                ([^\n]*)        # title = \3
                ['")]
                [ \t]*
            )?  # title is optional
            (?:\n+|\Z)
            """ % less_than_tab, re.X | re.M | re.U)
        return _link_def_re.sub(self._extract_link_def_sub, text)

    def _extract_link_def_sub(self, match):
        id, url, title = match.groups()
        key = id.lower()  # Link IDs are case-insensitive
        self.urls[key] = self._encode_amps_and_angles(url)
        if title:
            self.titles[key] = title
        return ""

    def _do_numbering(self, text):
        '''We handle the special extension for generic numbering of
        tables, figures, etc.
        '''
        # First pass to define all the references
        self.regex_defns = re.compile(r'''
            \[\#(\w+)\s*  # the counter.  Open square plus hash plus a word \1
            ([^@]*)\s*    # Some optional characters that aren't an @. \2
            @(\w+)        # the id.  Should this be normed? \3
            ([^\]]*)\]    # The rest of the text up to the terminating ] \4
            ''', re.VERBOSE)
        self.regex_subs = re.compile(r"\[@(\w+)\s*\]")  # [@ref_id]
        counters = {}
        references = {}
        replacements = []
        definition_html = '<figcaption class="{}" id="counter-ref-{}">{}{}{}</figcaption>'
        reference_html = '<a class="{}" href="#counter-ref-{}">{}</a>'
        for match in self.regex_defns.finditer(text):
            # We must have four match groups, otherwise this isn't a numbering reference.
            if len(match.groups()) != 4:
                continue
            counter = match.group(1)
            text_before = match.group(2)
            ref_id = match.group(3)
            text_after = match.group(4)
            number = counters.get(counter, 1)
            references[ref_id] = (number, counter)
            replacements.append((match.start(0),
                                 definition_html.format(counter,
                                                        ref_id,
                                                        text_before,
                                                        number,
                                                        text_after),
                                 match.end(0)))
            counters[counter] = number + 1
        for repl in reversed(replacements):
            text = text[:repl[0]] + repl[1] + text[repl[2]:]

        # Second pass to replace the references with the right
        # value of the counter.
        # Fwiw, it's vaguely annoying to have to turn the iterator into
        # a list and then reverse it but I can't think of a better thing to do.
        for match in reversed(list(self.regex_subs.finditer(text))):
            number, counter = references.get(match.group(1), (None, None))
            if number is not None:
                repl = reference_html.format(counter,
                                             match.group(1),
                                             number)
            else:
                repl = reference_html.format(match.group(1),
                                             'countererror',
                                             '?' + match.group(1) + '?')
            if "smarty-pants" in self.extras:
                repl = repl.replace('"', self._escape_table['"'])

            text = text[:match.start()] + repl + text[match.end():]
        return text
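
    # Illustrative numbering syntax (assumed example):
    #
    #     [#figure My caption @fig1]  ->  <figcaption class="figure"
    #                                       id="counter-ref-fig1">My caption 1</figcaption>
    #     ... see [@fig1] ...         ->  <a class="figure"
    #                                       href="#counter-ref-fig1">1</a>
    #
    # Each counter name ("figure", "table", ...) numbers independently,
    # starting at 1.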

    def _extract_footnote_def_sub(self, match):
        id, text = match.groups()
        text = _dedent(text, skip_first_line=not text.startswith('\n')).strip()
        normed_id = re.sub(r'\W', '-', id)
        # Ensure footnote text ends with a couple newlines (for some
        # block gamut matches).
        self.footnotes[normed_id] = text + "\n\n"
        return ""

    def _strip_footnote_definitions(self, text):
        """A footnote definition looks like this:

            [^note-id]: Text of the note.

                May include one or more indented paragraphs.

        Where,
        - The 'note-id' can be pretty much anything, though typically it
          is the number of the footnote.
        - The first paragraph may start on the next line, like so:

            [^note-id]:
                Text of the note.
        """
        less_than_tab = self.tab_width - 1
        footnote_def_re = re.compile(r'''
            ^[ ]{0,%d}\[\^(.+)\]:   # id = \1
            [ \t]*
            (                       # footnote text = \2
              # First line need not start with the spaces.
              (?:\s*.*\n+)
              (?:
                (?:[ ]{%d} | \t)    # Subsequent lines must be indented.
                .*\n+
              )*
            )
            # Lookahead for non-space at line-start, or end of doc.
            (?:(?=^[ ]{0,%d}\S)|\Z)
            ''' % (less_than_tab, self.tab_width, self.tab_width),
            re.X | re.M)
        return footnote_def_re.sub(self._extract_footnote_def_sub, text)

    _hr_re = re.compile(r'^[ ]{0,3}([-_*][ ]{0,2}){3,}$', re.M)

    def _run_block_gamut(self, text):
        # These are all the transformations that form block-level
        # tags like paragraphs, headers, and list items.

        if "fenced-code-blocks" in self.extras:
            text = self._do_fenced_code_blocks(text)

        text = self._do_headers(text)

        # Do Horizontal Rules:
        # On the number of spaces in horizontal rules: The spec is fuzzy: "If
        # you wish, you may use spaces between the hyphens or asterisks."
        # Markdown.pl 1.0.1's hr regexes limit the number of spaces between the
        # hr chars to one or two. We'll reproduce that limit here.
        hr = "\n<hr" + self.empty_element_suffix + "\n"
        text = re.sub(self._hr_re, hr, text)

        text = self._do_lists(text)

        if "pyshell" in self.extras:
            text = self._prepare_pyshell_blocks(text)
        if "wiki-tables" in self.extras:
            text = self._do_wiki_tables(text)
        if "tables" in self.extras:
            text = self._do_tables(text)

        text = self._do_code_blocks(text)

        text = self._do_block_quotes(text)

        # We already ran _HashHTMLBlocks() before, in Markdown(), but that
        # was to escape raw HTML in the original Markdown source. This time,
        # we're escaping the markup we've just created, so that we don't wrap
        # <p> tags around block-level tags.
        text = self._hash_html_blocks(text)

        text = self._form_paragraphs(text)

        return text

    def _pyshell_block_sub(self, match):
        lines = match.group(0).splitlines(0)
        _dedentlines(lines)
        indent = ' ' * self.tab_width
        s = ('\n'  # separate from possible cuddled paragraph
             + indent + ('\n'+indent).join(lines)
             + '\n\n')
        return s

    def _prepare_pyshell_blocks(self, text):
        """Ensure that Python interactive shell sessions are put in
        code blocks -- even if not properly indented.
        """
        if ">>>" not in text:
            return text

        less_than_tab = self.tab_width - 1
        _pyshell_block_re = re.compile(r"""
            ^([ ]{0,%d})>>>[ ].*\n  # first line
            ^(\1.*\S+.*\n)*         # any number of subsequent lines
            ^\n                     # ends with a blank line
            """ % less_than_tab, re.M | re.X)

        return _pyshell_block_re.sub(self._pyshell_block_sub, text)

    def _table_sub(self, match):
        trim_space_re = r'^[ \t\n]+|[ \t\n]+$'
        trim_bar_re = r'^\||\|$'
        split_bar_re = r'^\||(?<!\\)\|'
        escape_bar_re = r'\\\|'

        head, underline, body = match.groups()

        # Determine aligns for columns.
        cols = [re.sub(escape_bar_re, '|', cell.strip()) for cell in re.split(split_bar_re, re.sub(trim_bar_re, "", re.sub(trim_space_re, "", underline)))]
        align_from_col_idx = {}
        for col_idx, col in enumerate(cols):
            if col[0] == ':' and col[-1] == ':':
                align_from_col_idx[col_idx] = ' align="center"'
            elif col[0] == ':':
                align_from_col_idx[col_idx] = ' align="left"'
            elif col[-1] == ':':
                align_from_col_idx[col_idx] = ' align="right"'

        # thead
        hlines = ['<table%s>' % self._html_class_str_from_tag('table'), '<thead>', '<tr>']
        cols = [re.sub(escape_bar_re, '|', cell.strip()) for cell in re.split(split_bar_re, re.sub(trim_bar_re, "", re.sub(trim_space_re, "", head)))]
        for col_idx, col in enumerate(cols):
            hlines.append('  <th%s>%s</th>' % (
                align_from_col_idx.get(col_idx, ''),
                self._run_span_gamut(col)
            ))
        hlines.append('</tr>')
        hlines.append('</thead>')

        # tbody
        hlines.append('<tbody>')
        for line in body.strip('\n').split('\n'):
            hlines.append('<tr>')
            cols = [re.sub(escape_bar_re, '|', cell.strip()) for cell in re.split(split_bar_re, re.sub(trim_bar_re, "", re.sub(trim_space_re, "", line)))]
            for col_idx, col in enumerate(cols):
                hlines.append('  <td%s>%s</td>' % (
                    align_from_col_idx.get(col_idx, ''),
                    self._run_span_gamut(col)
                ))
            hlines.append('</tr>')
        hlines.append('</tbody>')
        hlines.append('</table>')

        return '\n'.join(hlines) + '\n'

    def _do_tables(self, text):
        """Copying PHP-Markdown and GFM table syntax. Some regex borrowed from
        https://github.com/michelf/php-markdown/blob/lib/Michelf/Markdown.php#L2538
        """
        less_than_tab = self.tab_width - 1
        table_re = re.compile(r'''
            (?:(?<=\n\n)|\A\n?)             # leading blank line

            ^[ ]{0,%d}                      # allowed whitespace
            (.*[|].*) \n                    # $1: header row (at least one pipe)

            ^[ ]{0,%d}                      # allowed whitespace
            (                               # $2: underline row
                # underline row with leading bar
                (?: \|\ *:?-+:?\ * )+ \|? \n
                |
                # or, underline row without leading bar
                (?: \ *:?-+:?\ *\| )+ (?: \ *:?-+:?\ * )? \n
            )

            (                               # $3: data rows
                (?:
                    ^[ ]{0,%d}(?!\ )        # ensure line begins with 0 to less_than_tab spaces
                    .*\|.* \n
                )+
            )
            ''' % (less_than_tab, less_than_tab, less_than_tab), re.M | re.X)
        return table_re.sub(self._table_sub, text)
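
    # Illustrative table input handled above (assumed example):
    #
    #     | Item  | Price |
    #     | :---- | ----: |
    #     | apple |     2 |
    #
    # The ":" markers in the underline row set align="left"/"right" on the
    # generated <th>/<td> cells, and ":---:" gives align="center".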

    def _wiki_table_sub(self, match):
        ttext = match.group(0).strip()
        # print 'wiki table: %r' % match.group(0)
        rows = []
        for line in ttext.splitlines(0):
            line = line.strip()[2:-2].strip()
            row = [c.strip() for c in re.split(r'(?<!\\)\|\|', line)]
            rows.append(row)
        # pprint(rows)
        hlines = ['<table%s>' % self._html_class_str_from_tag('table'), '<tbody>']
        for row in rows:
            hrow = ['<tr>']
            for cell in row:
                hrow.append('<td>')
                hrow.append(self._run_span_gamut(cell))
                hrow.append('</td>')
            hrow.append('</tr>')
            hlines.append(''.join(hrow))
        hlines += ['</tbody>', '</table>']
        return '\n'.join(hlines) + '\n'

    def _do_wiki_tables(self, text):
        # Optimization.
        if "||" not in text:
            return text

        less_than_tab = self.tab_width - 1
        wiki_table_re = re.compile(r'''
            (?:(?<=\n\n)|\A\n?)             # leading blank line
            ^([ ]{0,%d})\|\|.+?\|\|[ ]*\n   # first line
            (^\1\|\|.+?\|\|\n)*             # any number of subsequent lines
            ''' % less_than_tab, re.M | re.X)
        return wiki_table_re.sub(self._wiki_table_sub, text)
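
    # Illustrative wiki-table input (assumed example):
    #
    #     || cell 1 || cell 2 ||
    #     || cell 3 || cell 4 ||
    #
    # Rows are "||"-delimited with leading and trailing bars and become a
    # <table> of <td>s with no header row; a backslash before "||" keeps it
    # from being treated as a cell separator (see the split regex above).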

    def _run_span_gamut(self, text):
        # These are all the transformations that occur *within* block-level
        # tags like paragraphs, headers, and list items.

        text = self._do_code_spans(text)

        text = self._escape_special_chars(text)

        # Process anchor and image tags.
        text = self._do_links(text)

        # Make links out of things like `<http://example.com/>`
        # Must come after _do_links(), because you can use < and >
        # delimiters in inline links like [this](<url>).
        text = self._do_auto_links(text)

        if "link-patterns" in self.extras:
            text = self._do_link_patterns(text)

        text = self._encode_amps_and_angles(text)

        if "strike" in self.extras:
            text = self._do_strike(text)

        text = self._do_italics_and_bold(text)

        if "smarty-pants" in self.extras:
            text = self._do_smart_punctuation(text)

        # Do hard breaks:
        if "break-on-newline" in self.extras:
            text = re.sub(r" *\n", "<br%s\n" % self.empty_element_suffix, text)
        else:
            text = re.sub(r" {2,}\n", " <br%s\n" % self.empty_element_suffix, text)

        return text

    # "Sorta" because auto-links are identified as "tag" tokens.
    _sorta_html_tokenize_re = re.compile(r"""
        (
            # tag
            </?
            (?:\w+)                                     # tag name
            (?:\s+(?:[\w-]+:)?[\w-]+=(?:".*?"|'.*?'))*  # attributes
            \s*/?>
            |
            # auto-link (e.g., <http://www.activestate.com/>)
            <\w+[^>]*>
            |
            <!--.*?-->      # comment
            |
            <\?.*?\?>       # processing instruction
        )
        """, re.X)

    def _escape_special_chars(self, text):
        # Python markdown note: the HTML tokenization here differs from
        # that in Markdown.pl, hence the behaviour for subtle cases can
        # differ (I believe the tokenizer here does a better job because
        # it isn't susceptible to unmatched '<' and '>' in HTML tags).
        # Note, however, that '>' is not allowed in an auto-link URL
        # here.
        escaped = []
        is_html_markup = False
        for token in self._sorta_html_tokenize_re.split(text):
            if is_html_markup:
                # Within tags/HTML-comments/auto-links, encode * and _
                # so they don't conflict with their use in Markdown for
                # italics and strong. We're replacing each such
                # character with its corresponding hash value (despite
                # the "md5-" prefix, _hash_text() now uses SHA-256);
                # this is likely overkill, but it should prevent us from
                # colliding with the escape values by accident.
                escaped.append(token.replace('*', self._escape_table['*'])
                                    .replace('_', self._escape_table['_']))
            else:
                escaped.append(self._encode_backslash_escapes(token))
            is_html_markup = not is_html_markup
        return ''.join(escaped)

    def _hash_html_spans(self, text):
        # Used for safe_mode.

        def _is_auto_link(s):
            if ':' in s and self._auto_link_re.match(s):
                return True
            elif '@' in s and self._auto_email_link_re.match(s):
                return True
            return False

        tokens = []
        is_html_markup = False
        for token in self._sorta_html_tokenize_re.split(text):
            if is_html_markup and not _is_auto_link(token):
                sanitized = self._sanitize_html(token)
                key = _hash_text(sanitized)
                self.html_spans[key] = sanitized
                tokens.append(key)
            else:
                tokens.append(token)
            is_html_markup = not is_html_markup
        return ''.join(tokens)

    def _unhash_html_spans(self, text):
        for key, sanitized in list(self.html_spans.items()):
            text = text.replace(key, sanitized)
        return text

    def _sanitize_html(self, s):
        if self.safe_mode == "replace":
            return self.html_removed_text
        elif self.safe_mode == "escape":
            replacements = [
                ('&', '&amp;'),
                ('<', '&lt;'),
                ('>', '&gt;'),
            ]
            for before, after in replacements:
                s = s.replace(before, after)
            return s
        else:
            raise MarkdownError("invalid value for 'safe_mode': %r (must be "
                                "'escape' or 'replace')" % self.safe_mode)

    _inline_link_title = re.compile(r'''
        (                   # \1
            [ \t]+
            (['"])          # quote char = \2
            (?P<title>.*?)
            \2
        )?                  # title is optional
        \)$
        ''', re.X | re.S)
    _tail_of_reference_link_re = re.compile(r'''
        # Match tail of: [text][id]
        [ ]?            # one optional space
        (?:\n[ ]*)?     # one optional newline followed by spaces
        \[
            (?P<id>.*?)
        \]
        ''', re.X | re.S)

    _whitespace = re.compile(r'\s*')

    _strip_anglebrackets = re.compile(r'<(.*)>.*')

    def _find_non_whitespace(self, text, start):
        """Returns the index of the first non-whitespace character in text
        after (and including) start
        """
        match = self._whitespace.match(text, start)
        return match.end()

    def _find_balanced(self, text, start, open_c, close_c):
        """Returns the index where the open_c and close_c characters balance
        out - the same number of open_c and close_c are encountered - or the
        end of string if it's reached before the balance point is found.
        """
        i = start
        l = len(text)
        count = 1
        while count > 0 and i < l:
            if text[i] == open_c:
                count += 1
            elif text[i] == close_c:
                count -= 1
            i += 1
        return i
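
    # Worked example (computed by hand): for text = "(url (x))rest",
    # _find_balanced(text, 1, "(", ")") starts with count = 1 (the "(" at
    # index 0 is already open) and returns 9 -- the index just past the
    # ")" that balances it.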
|
1255
|
|
|
|
|
1256
|
|
|
def _extract_url_and_title(self, text, start): |
|
1257
|
|
|
"""Extracts the url and (optional) title from the tail of a link""" |
|
1258
|
|
|
# text[start] equals the opening parenthesis |
|
1259
|
|
|
idx = self._find_non_whitespace(text, start+1) |
|
1260
|
|
|
if idx == len(text): |
|
1261
|
|
|
return None, None, None |
|
1262
|
|
|
end_idx = idx |
|
1263
|
|
|
has_anglebrackets = text[idx] == "<" |
|
1264
|
|
|
if has_anglebrackets: |
|
1265
|
|
|
end_idx = self._find_balanced(text, end_idx+1, "<", ">") |
|
1266
|
|
|
end_idx = self._find_balanced(text, end_idx, "(", ")") |
|
1267
|
|
|
match = self._inline_link_title.search(text, idx, end_idx) |
|
1268
|
|
|
if not match: |
|
1269
|
|
|
return None, None, None |
|
1270
|
|
|
url, title = text[idx:match.start()], match.group("title") |
|
1271
|
|
|
if has_anglebrackets: |
|
1272
|
|
|
url = self._strip_anglebrackets.sub(r'\1', url) |
|
1273
|
|
|
return url, title, end_idx |
|
1274
|
|
|
|
|
1275
|
|
|

    _safe_protocols = re.compile(r'(https?|ftp):', re.I)

    def _do_links(self, text):
        """Turn Markdown link shortcuts into XHTML <a> and <img> tags.

        This is a combination of Markdown.pl's _DoAnchors() and
        _DoImages(). They are done together because that simplified the
        approach. It was necessary to use a different approach than
        Markdown.pl because of the lack of atomic matching support in
        Python's regex engine used in $g_nested_brackets.
        """
        MAX_LINK_TEXT_SENTINEL = 3000  # markdown2 issue 24

        # `anchor_allowed_pos` is used to support img links inside
        # anchors, but not anchors inside anchors. An anchor's start
        # pos must be `>= anchor_allowed_pos`.
        anchor_allowed_pos = 0

        curr_pos = 0
        while True:  # Handle the next link.
            # The next '[' is the start of:
            # - an inline anchor:   [text](url "title")
            # - a reference anchor: [text][id]
            # - an inline img:      ![text](url "title")
            # - a reference img:    ![text][id]
            # - a footnote ref:     [^id]
            #   (Only if 'footnotes' extra enabled)
            # - a footnote defn:    [^id]: ...
            #   (Only if 'footnotes' extra enabled) These have already
            #   been stripped in _strip_footnote_definitions() so no
            #   need to watch for them.
            # - a link definition:  [id]: url "title"
            #   These have already been stripped in
            #   _strip_link_definitions() so no need to watch for them.
            # - not markup:         [...anything else...
            try:
                start_idx = text.index('[', curr_pos)
            except ValueError:
                break
            text_length = len(text)

            # Find the matching closing ']'.
            # Markdown.pl allows *matching* brackets in link text so we
            # will here too. Markdown.pl *doesn't* currently allow
            # matching brackets in img alt text -- we'll differ in that
            # regard.
            bracket_depth = 0
            for p in range(start_idx+1, min(start_idx+MAX_LINK_TEXT_SENTINEL,
                                            text_length)):
                ch = text[p]
                if ch == ']':
                    bracket_depth -= 1
                    if bracket_depth < 0:
                        break
                elif ch == '[':
                    bracket_depth += 1
            else:
                # Closing bracket not found within sentinel length.
                # This isn't markup.
                curr_pos = start_idx + 1
                continue
            link_text = text[start_idx+1:p]

            # Possibly a footnote ref?
            if "footnotes" in self.extras and link_text.startswith("^"):
                normed_id = re.sub(r'\W', '-', link_text[1:])
                if normed_id in self.footnotes:
                    self.footnote_ids.append(normed_id)
                    result = '<sup class="footnote-ref" id="fnref-%s">' \
                             '<a href="#fn-%s">%s</a></sup>' \
                             % (normed_id, normed_id, len(self.footnote_ids))
                    text = text[:start_idx] + result + text[p+1:]
                else:
                    # This id isn't defined, leave the markup alone.
                    curr_pos = p+1
                continue

            # Now determine what this is by the remainder.
            p += 1
            if p == text_length:
                return text

            # Inline anchor or img?
            if text[p] == '(':  # attempt at perf improvement
                url, title, url_end_idx = self._extract_url_and_title(text, p)
                if url is not None:
                    # Handle an inline anchor or img.
                    is_img = start_idx > 0 and text[start_idx-1] == "!"
                    if is_img:
                        start_idx -= 1

                    # We've got to encode these to avoid conflicting
                    # with italics/bold.
                    url = url.replace('*', self._escape_table['*']) \
                             .replace('_', self._escape_table['_'])
                    if title:
                        title_str = ' title="%s"' % (
                            _xml_escape_attr(title)
                                .replace('*', self._escape_table['*'])
                                .replace('_', self._escape_table['_']))
                    else:
                        title_str = ''
                    if is_img:
                        img_class_str = self._html_class_str_from_tag("img")
                        result = '<img src="%s" alt="%s"%s%s%s' \
                            % (_html_escape_url(url, safe_mode=self.safe_mode),
                               _xml_escape_attr(link_text),
                               title_str,
                               img_class_str,
                               self.empty_element_suffix)
                        if "smarty-pants" in self.extras:
                            result = result.replace('"', self._escape_table['"'])
                        curr_pos = start_idx + len(result)
                        text = text[:start_idx] + result + text[url_end_idx:]
                    elif start_idx >= anchor_allowed_pos:
                        if self.safe_mode and not self._safe_protocols.match(url):
                            result_head = '<a href="#"%s>' % (title_str)
                        else:
                            result_head = '<a href="%s"%s>' % (_html_escape_url(url, safe_mode=self.safe_mode), title_str)
                        result = '%s%s</a>' % (result_head, _xml_escape_attr(link_text))
                        if "smarty-pants" in self.extras:
                            result = result.replace('"', self._escape_table['"'])
                        # <img> allowed from curr_pos on, <a> from
                        # anchor_allowed_pos on.
                        curr_pos = start_idx + len(result_head)
                        anchor_allowed_pos = start_idx + len(result)
                        text = text[:start_idx] + result + text[url_end_idx:]
                    else:
                        # Anchor not allowed here.
                        curr_pos = start_idx + 1
                    continue

            # Reference anchor or img?
            else:
                match = self._tail_of_reference_link_re.match(text, p)
                if match:
                    # Handle a reference-style anchor or img.
                    is_img = start_idx > 0 and text[start_idx-1] == "!"
                    if is_img:
                        start_idx -= 1
                    link_id = match.group("id").lower()
                    if not link_id:
                        link_id = link_text.lower()  # for links like [this][]
                    if link_id in self.urls:
                        url = self.urls[link_id]
                        # We've got to encode these to avoid conflicting
                        # with italics/bold.
                        url = url.replace('*', self._escape_table['*']) \
                                 .replace('_', self._escape_table['_'])
                        title = self.titles.get(link_id)
                        if title:
                            title = _xml_escape_attr(title) \
                                .replace('*', self._escape_table['*']) \
                                .replace('_', self._escape_table['_'])
                            title_str = ' title="%s"' % title
                        else:
                            title_str = ''
                        if is_img:
                            img_class_str = self._html_class_str_from_tag("img")
                            result = '<img src="%s" alt="%s"%s%s%s' \
                                % (_html_escape_url(url, safe_mode=self.safe_mode),
                                   _xml_escape_attr(link_text),
                                   title_str,
                                   img_class_str,
                                   self.empty_element_suffix)
                            if "smarty-pants" in self.extras:
                                result = result.replace('"', self._escape_table['"'])
                            curr_pos = start_idx + len(result)
                            text = text[:start_idx] + result + text[match.end():]
                        elif start_idx >= anchor_allowed_pos:
                            if self.safe_mode and not self._safe_protocols.match(url):
                                result_head = '<a href="#"%s>' % (title_str)
                            else:
                                result_head = '<a href="%s"%s>' % (_html_escape_url(url, safe_mode=self.safe_mode), title_str)
                            result = '%s%s</a>' % (result_head, link_text)
                            if "smarty-pants" in self.extras:
                                result = result.replace('"', self._escape_table['"'])
                            # <img> allowed from curr_pos on, <a> from
                            # anchor_allowed_pos on.
                            curr_pos = start_idx + len(result_head)
                            anchor_allowed_pos = start_idx + len(result)
                            text = text[:start_idx] + result + text[match.end():]
                        else:
                            # Anchor not allowed here.
                            curr_pos = start_idx + 1
                    else:
                        # This id isn't defined, leave the markup alone.
                        curr_pos = match.end()
                    continue

            # Otherwise, it isn't markup.
            curr_pos = start_idx + 1

        return text

    def header_id_from_text(self, text, prefix, n):
        """Generate a header id attribute value from the given header
        HTML content.

        This is only called if the "header-ids" extra is enabled.
        Subclasses may override this for different header ids.

        @param text {str} The text of the header tag
        @param prefix {str} The requested prefix for header ids. This is the
            value of the "header-ids" extra key, if any. Otherwise, None.
        @param n {int} The <hN> tag number, i.e. `1` for an <h1> tag.
        @returns {str} The value for the header tag's "id" attribute. Return
            None to not have an id attribute and to exclude this header from
            the TOC (if the "toc" extra is specified).
        """
        header_id = _slugify(text)
        if prefix and isinstance(prefix, base_string_type):
            header_id = prefix + '-' + header_id
        if header_id in self._count_from_header_id:
            self._count_from_header_id[header_id] += 1
            header_id += '-%s' % self._count_from_header_id[header_id]
        else:
            self._count_from_header_id[header_id] = 1
            if 0 == len(header_id):
                header_id += '-%s' % self._count_from_header_id[header_id]

        return header_id
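    # For example, header_id_from_text("My Section", None, 2) yields
    # "my-section"; a second header with the same text yields "my-section-2",
    # a third "my-section-3", and so on.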

    _toc = None
    def _toc_add_entry(self, level, id, name):
        if self._toc is None:
            self._toc = []
        self._toc.append((level, id, self._unescape_special_chars(name)))

    _h_re_base = r'''
        (^(.+)[ \t]*\n(=+|-+)[ \t]*\n+)
        |
        (^(\#{1,6})     # \1 = string of #'s
        [ \t]%s
        (.+?)           # \2 = Header text
        [ \t]*
        (?<!\\)         # ensure not an escaped trailing '#'
        \#*             # optional closing #'s (not counted)
        \n+
        )
        '''

    _h_re = re.compile(_h_re_base % '*', re.X | re.M)
    _h_re_tag_friendly = re.compile(_h_re_base % '+', re.X | re.M)

    def _h_sub(self, match):
        if match.group(1) is not None:
            # Setext header
            n = {"=": 1, "-": 2}[match.group(3)[0]]
            header_group = match.group(2)
        else:
            # atx header
            n = len(match.group(5))
            header_group = match.group(6)

        demote_headers = self.extras.get("demote-headers")
        if demote_headers:
            n = min(n + demote_headers, 6)
        header_id_attr = ""
        if "header-ids" in self.extras:
            header_id = self.header_id_from_text(header_group,
                self.extras["header-ids"], n)
            if header_id:
                header_id_attr = ' id="%s"' % header_id
        html = self._run_span_gamut(header_group)
        if "toc" in self.extras and header_id:
            self._toc_add_entry(n, header_id, html)
        return "<h%d%s>%s</h%d>\n\n" % (n, header_id_attr, html, n)

    def _do_headers(self, text):
        # Setext-style headers:
        #     Header 1
        #     ========
        #
        #     Header 2
        #     --------

        # atx-style headers:
        #   # Header 1
        #   ## Header 2
        #   ## Header 2 with closing hashes ##
        #   ...
        #   ###### Header 6

        if 'tag-friendly' in self.extras:
            return self._h_re_tag_friendly.sub(self._h_sub, text)
        return self._h_re.sub(self._h_sub, text)

    _marker_ul_chars = '*+-'
    _marker_any = r'(?:[%s]|\d+\.)' % _marker_ul_chars
    _marker_ul = '(?:[%s])' % _marker_ul_chars
    _marker_ol = r'(?:\d+\.)'

    def _list_sub(self, match):
        lst = match.group(1)
        lst_type = match.group(3) in self._marker_ul_chars and "ul" or "ol"
        result = self._process_list_items(lst)
        if self.list_level:
            return "<%s>\n%s</%s>\n" % (lst_type, result, lst_type)
        else:
            return "<%s>\n%s</%s>\n\n" % (lst_type, result, lst_type)

    def _do_lists(self, text):
        # Form HTML ordered (numbered) and unordered (bulleted) lists.

        # Iterate over each *non-overlapping* list match.
        pos = 0
        while True:
            # Find the *first* hit for either list style (ul or ol). We
            # match ul and ol separately to avoid adjacent lists of different
            # types running into each other (see issue #16).
            hits = []
            for marker_pat in (self._marker_ul, self._marker_ol):
                less_than_tab = self.tab_width - 1
                whole_list = r'''
                    (                   # \1 = whole list
                      (                 # \2
                        [ ]{0,%d}
                        (%s)            # \3 = first list item marker
                        [ \t]+
                        (?!\ *\3\ )     # '- - - ...' isn't a list. See 'not_quite_a_list' test case.
                      )
                      (?:.+?)
                      (                 # \4
                          \Z
                        |
                          \n{2,}
                          (?=\S)
                          (?!           # Negative lookahead for another list item marker
                            [ \t]*
                            %s[ \t]+
                          )
                      )
                    )
                ''' % (less_than_tab, marker_pat, marker_pat)
                if self.list_level:  # sub-list
                    list_re = re.compile("^"+whole_list, re.X | re.M | re.S)
                else:
                    list_re = re.compile(r"(?:(?<=\n\n)|\A\n?)"+whole_list,
                                         re.X | re.M | re.S)
                match = list_re.search(text, pos)
                if match:
                    hits.append((match.start(), match))
            if not hits:
                break
            hits.sort()
            match = hits[0][1]
            start, end = match.span()
            middle = self._list_sub(match)
            text = text[:start] + middle + text[end:]
            pos = start + len(middle)  # start pos for next attempted match

        return text

    _list_item_re = re.compile(r'''
        (\n)?                   # leading line = \1
        (^[ \t]*)               # leading whitespace = \2
        (?P<marker>%s) [ \t]+   # list marker = \3
        ((?:.+?)                # list item text = \4
        (\n{1,2}))              # eols = \5
        (?= \n* (\Z | \2 (?P<next_marker>%s) [ \t]+))
        ''' % (_marker_any, _marker_any),
        re.M | re.X | re.S)

    _task_list_item_re = re.compile(r'''
        (\[[\ x]\])[ \t]+       # tasklist marker = \1
        (.*)                    # list item text = \2
        ''', re.M | re.X | re.S)

    _task_list_warpper_str = r'<input type="checkbox" class="task-list-item-checkbox" %sdisabled> %s'

    def _task_list_item_sub(self, match):
        marker = match.group(1)
        item_text = match.group(2)
        if marker == '[x]':
            return self._task_list_warpper_str % ('checked ', item_text)
        elif marker == '[ ]':
            return self._task_list_warpper_str % ('', item_text)
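    # With the "task_list" extra enabled, an item body like '[x] write docs'
    # is rewritten to:
    #   <input type="checkbox" class="task-list-item-checkbox" checked disabled> write docs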

    _last_li_endswith_two_eols = False
    def _list_item_sub(self, match):
        item = match.group(4)
        leading_line = match.group(1)
        if leading_line or "\n\n" in item or self._last_li_endswith_two_eols:
            item = self._run_block_gamut(self._outdent(item))
        else:
            # Recursion for sub-lists:
            item = self._do_lists(self._outdent(item))
            if item.endswith('\n'):
                item = item[:-1]
            item = self._run_span_gamut(item)
        self._last_li_endswith_two_eols = (len(match.group(5)) == 2)

        if "task_list" in self.extras:
            item = self._task_list_item_re.sub(self._task_list_item_sub, item)

        return "<li>%s</li>\n" % item

    def _process_list_items(self, list_str):
        # Process the contents of a single ordered or unordered list,
        # splitting it into individual list items.

        # The $g_list_level global keeps track of when we're inside a list.
        # Each time we enter a list, we increment it; when we leave a list,
        # we decrement. If it's zero, we're not in a list anymore.
        #
        # We do this because when we're not inside a list, we want to treat
        # something like this:
        #
        #       I recommend upgrading to version
        #       8. Oops, now this line is treated
        #       as a sub-list.
        #
        # As a single paragraph, despite the fact that the second line starts
        # with a digit-period-space sequence.
        #
        # Whereas when we're inside a list (or sub-list), that line will be
        # treated as the start of a sub-list. What a kludge, huh? This is
        # an aspect of Markdown's syntax that's hard to parse perfectly
        # without resorting to mind-reading. Perhaps the solution is to
        # change the syntax rules such that sub-lists must start with a
        # starting cardinal number; e.g. "1." or "a.".
        self.list_level += 1
        self._last_li_endswith_two_eols = False
        list_str = list_str.rstrip('\n') + '\n'
        list_str = self._list_item_re.sub(self._list_item_sub, list_str)
        self.list_level -= 1
        return list_str

    def _get_pygments_lexer(self, lexer_name):
        try:
            from pygments import lexers, util
        except ImportError:
            return None
        try:
            return lexers.get_lexer_by_name(lexer_name)
        except util.ClassNotFound:
            return None

    def _color_with_pygments(self, codeblock, lexer, **formatter_opts):
        import pygments
        import pygments.formatters

        class HtmlCodeFormatter(pygments.formatters.HtmlFormatter):
            def _wrap_code(self, inner):
                """A function for use in a Pygments Formatter which
                wraps in <code> tags.
                """
                yield 0, "<code>"
                for tup in inner:
                    yield tup
                yield 0, "</code>"

            def wrap(self, source, outfile):
                """Return the source with a code, pre, and div."""
                return self._wrap_div(self._wrap_pre(self._wrap_code(source)))

        formatter_opts.setdefault("cssclass", "codehilite")
        formatter = HtmlCodeFormatter(**formatter_opts)
        return pygments.highlight(codeblock, lexer, formatter)

    def _code_block_sub(self, match, is_fenced_code_block=False):
        lexer_name = None
        if is_fenced_code_block:
            lexer_name = match.group(1)
            if lexer_name:
                formatter_opts = self.extras['fenced-code-blocks'] or {}
            codeblock = match.group(2)
            codeblock = codeblock[:-1]  # drop one trailing newline
        else:
            codeblock = match.group(1)
            codeblock = self._outdent(codeblock)
            codeblock = self._detab(codeblock)
            codeblock = codeblock.lstrip('\n')  # trim leading newlines
            codeblock = codeblock.rstrip()      # trim trailing whitespace

        # Note: "code-color" extra is DEPRECATED.
        if "code-color" in self.extras and codeblock.startswith(":::"):
            lexer_name, rest = codeblock.split('\n', 1)
            lexer_name = lexer_name[3:].strip()
            codeblock = rest.lstrip("\n")  # Remove lexer declaration line.
            formatter_opts = self.extras['code-color'] or {}

        # Use pygments only if not using the highlightjs-lang extra
        if lexer_name and "highlightjs-lang" not in self.extras:
            def unhash_code(codeblock):
                for key, sanitized in list(self.html_spans.items()):
                    codeblock = codeblock.replace(key, sanitized)
                replacements = [
                    ("&amp;", "&"),
                    ("&lt;", "<"),
                    ("&gt;", ">")
                ]
                for old, new in replacements:
                    codeblock = codeblock.replace(old, new)
                return codeblock
            lexer = self._get_pygments_lexer(lexer_name)
            if lexer:
                codeblock = unhash_code(codeblock)
                colored = self._color_with_pygments(codeblock, lexer,
                                                    **formatter_opts)
                return "\n\n%s\n\n" % colored

        codeblock = self._encode_code(codeblock)
        pre_class_str = self._html_class_str_from_tag("pre")

        if "highlightjs-lang" in self.extras and lexer_name:
            code_class_str = ' class="%s"' % lexer_name
        else:
            code_class_str = self._html_class_str_from_tag("code")

        return "\n\n<pre%s><code%s>%s\n</code></pre>\n\n" % (
            pre_class_str, code_class_str, codeblock)

    def _html_class_str_from_tag(self, tag):
        """Get the appropriate ' class="..."' string (note the leading
        space), if any, for the given tag.
        """
        if "html-classes" not in self.extras:
            return ""
        try:
            html_classes_from_tag = self.extras["html-classes"]
        except TypeError:
            return ""
        else:
            if tag in html_classes_from_tag:
                return ' class="%s"' % html_classes_from_tag[tag]
        return ""

    def _do_code_blocks(self, text):
        """Process Markdown `<pre><code>` blocks."""
        code_block_re = re.compile(r'''
            (?:\n\n|\A\n?)
            (               # $1 = the code block -- one or more lines, starting with a space/tab
              (?:
                (?:[ ]{%d} | \t)    # Lines must start with a tab or a tab-width of spaces
                .*\n+
              )+
            )
            ((?=^[ ]{0,%d}\S)|\Z)   # Lookahead for non-space at line-start, or end of doc
            # Lookahead to make sure this block isn't already in a code block.
            # Needed when syntax highlighting is being used.
            (?![^<]*\</code\>)
            ''' % (self.tab_width, self.tab_width),
            re.M | re.X)
        return code_block_re.sub(self._code_block_sub, text)

    _fenced_code_block_re = re.compile(r'''
        (?:\n+|\A\n?)
        ^```\s*?([\w+-]+)?\s*?\n    # opening fence, $1 = optional lang
        (.*?)                       # $2 = code block content
        ^```[ \t]*\n                # closing fence
        ''', re.M | re.X | re.S)

    def _fenced_code_block_sub(self, match):
        return self._code_block_sub(match, is_fenced_code_block=True)

    def _do_fenced_code_blocks(self, text):
        """Process ```-fenced unindented code blocks ('fenced-code-blocks' extra)."""
        return self._fenced_code_block_re.sub(self._fenced_code_block_sub, text)

    # Rules for a code span:
    # - backslash escapes are not interpreted in a code span
    # - to include a backtick, or a run of backticks, the delimiters must
    #   be a longer run of backticks
    # - cannot start or end a code span with a backtick; pad with a
    #   space and that space will be removed in the emitted HTML
    # See `test/tm-cases/escapes.text` for a number of edge-case
    # examples.
    _code_span_re = re.compile(r'''
        (?<!\\)
        (`+)        # \1 = Opening run of `
        (?!`)       # See Note A test/tm-cases/escapes.text
        (.+?)       # \2 = The code block
        (?<!`)
        \1          # Matching closer
        (?!`)
        ''', re.X | re.S)

    def _code_span_sub(self, match):
        c = match.group(2).strip(" \t")
        c = self._encode_code(c)
        return "<code>%s</code>" % c

    def _do_code_spans(self, text):
        #   * Backtick quotes are used for <code></code> spans.
        #
        #   * You can use multiple backticks as the delimiters if you want to
        #     include literal backticks in the code span. So, this input:
        #
        #         Just type ``foo `bar` baz`` at the prompt.
        #
        #     Will translate to:
        #
        #         <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
        #
        #     There's no arbitrary limit to the number of backticks you
        #     can use as delimiters. If you need three consecutive backticks
        #     in your code, use four for delimiters, etc.
        #
        #   * You can use spaces to get literal backticks at the edges:
        #
        #         ... type `` `bar` `` ...
        #
        #     Turns to:
        #
        #         ... type <code>`bar`</code> ...
        return self._code_span_re.sub(self._code_span_sub, text)

    def _encode_code(self, text):
        """Encode/escape certain characters inside Markdown code runs.
        The point is that in code, these characters are literals,
        and lose their special Markdown meanings.
        """
        replacements = [
            # Encode all ampersands; HTML entities are not
            # entities within a Markdown code span.
            ('&', '&amp;'),
            # Do the angle bracket song and dance:
            ('<', '&lt;'),
            ('>', '&gt;'),
        ]
        for before, after in replacements:
            text = text.replace(before, after)
        hashed = _hash_text(text)
        self._escape_table[text] = hashed
        return hashed

    _strike_re = re.compile(r"~~(?=\S)(.+?)(?<=\S)~~", re.S)
    def _do_strike(self, text):
        text = self._strike_re.sub(r"<strike>\1</strike>", text)
        return text

    _strong_re = re.compile(r"(\*\*|__)(?=\S)(.+?[*_]*)(?<=\S)\1", re.S)
    _em_re = re.compile(r"(\*|_)(?=\S)(.+?)(?<=\S)\1", re.S)
    _code_friendly_strong_re = re.compile(r"\*\*(?=\S)(.+?[*_]*)(?<=\S)\*\*", re.S)
    _code_friendly_em_re = re.compile(r"\*(?=\S)(.+?)(?<=\S)\*", re.S)
    def _do_italics_and_bold(self, text):
        # <strong> must go first:
        if "code-friendly" in self.extras:
            text = self._code_friendly_strong_re.sub(r"<strong>\1</strong>", text)
            text = self._code_friendly_em_re.sub(r"<em>\1</em>", text)
        else:
            text = self._strong_re.sub(r"<strong>\2</strong>", text)
            text = self._em_re.sub(r"<em>\2</em>", text)
        return text

    # "smarty-pants" extra: Very liberal in interpreting a single prime as an
    # apostrophe; e.g. ignores the fact that "round", "bout", "twer", and
    # "twixt" can be written without an initial apostrophe. This is fine because
    # using scare quotes (single quotation marks) is rare.
    _apostrophe_year_re = re.compile(r"'(\d\d)(?=(\s|,|;|\.|\?|!|$))")
    _contractions = ["tis", "twas", "twer", "neath", "o", "n",
        "round", "bout", "twixt", "nuff", "fraid", "sup"]
    def _do_smart_contractions(self, text):
        text = self._apostrophe_year_re.sub(r"&#8217;\1", text)
        for c in self._contractions:
            text = text.replace("'%s" % c, "&#8217;%s" % c)
            text = text.replace("'%s" % c.capitalize(),
                "&#8217;%s" % c.capitalize())
        return text

    # Substitute double-quotes before single-quotes.
    _opening_single_quote_re = re.compile(r"(?<!\S)'(?=\S)")
    _opening_double_quote_re = re.compile(r'(?<!\S)"(?=\S)')
    _closing_single_quote_re = re.compile(r"(?<=\S)'")
    _closing_double_quote_re = re.compile(r'(?<=\S)"(?=(\s|,|;|\.|\?|!|$))')
    def _do_smart_punctuation(self, text):
        """Fancifies 'single quotes', "double quotes", and apostrophes.
        Converts --, ---, and ... into en dashes, em dashes, and ellipses.

        Inspiration is: <http://daringfireball.net/projects/smartypants/>
        See "test/tm-cases/smarty_pants.text" for a full discussion of the
        support here and
        <http://code.google.com/p/python-markdown2/issues/detail?id=42> for a
        discussion of some diversion from the original SmartyPants.
        """
        if "'" in text:  # guard for perf
            text = self._do_smart_contractions(text)
            text = self._opening_single_quote_re.sub("&#8216;", text)
            text = self._closing_single_quote_re.sub("&#8217;", text)

        if '"' in text:  # guard for perf
            text = self._opening_double_quote_re.sub("&#8220;", text)
            text = self._closing_double_quote_re.sub("&#8221;", text)

        text = text.replace("---", "&#8212;")
        text = text.replace("--", "&#8211;")
        text = text.replace("...", "&#8230;")
        text = text.replace(" . . . ", "&#8230;")
        text = text.replace(". . .", "&#8230;")
        return text
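    # A worked example (illustrative): 'He said "hello" -- twice...'
    # becomes 'He said &#8220;hello&#8221; &#8211; twice&#8230;'.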

    _block_quote_base = r'''
        (                           # Wrap whole match in \1
            (
                ^[ \t]*>%s[ \t]?    # '>' at the start of a line
                    .+\n            # rest of the first line
                (.+\n)*             # subsequent consecutive lines
                \n*                 # blanks
            )+
        )
    '''
    _block_quote_re = re.compile(_block_quote_base % '', re.M | re.X)
    _block_quote_re_spoiler = re.compile(_block_quote_base % '[ \t]*?!?', re.M | re.X)
    _bq_one_level_re = re.compile('^[ \t]*>[ \t]?', re.M)
    _bq_one_level_re_spoiler = re.compile('^[ \t]*>[ \t]*?![ \t]?', re.M)
    _bq_all_lines_spoilers = re.compile(r'\A(?:^[ \t]*>[ \t]*?!.*[\n\r]*)+\Z', re.M)
    _html_pre_block_re = re.compile(r'(\s*<pre>.+?</pre>)', re.S)
    def _dedent_two_spaces_sub(self, match):
        return re.sub(r'(?m)^  ', '', match.group(1))

    def _block_quote_sub(self, match):
        bq = match.group(1)
        is_spoiler = 'spoiler' in self.extras and self._bq_all_lines_spoilers.match(bq)
        # trim one level of quoting
        if is_spoiler:
            bq = self._bq_one_level_re_spoiler.sub('', bq)
        else:
            bq = self._bq_one_level_re.sub('', bq)
        # trim whitespace-only lines
        bq = self._ws_only_line_re.sub('', bq)
        bq = self._run_block_gamut(bq)          # recurse

        bq = re.sub('(?m)^', '  ', bq)
        # These leading spaces screw with <pre> content, so we need to fix that:
        bq = self._html_pre_block_re.sub(self._dedent_two_spaces_sub, bq)

        if is_spoiler:
            return '<blockquote class="spoiler">\n%s\n</blockquote>\n\n' % bq
        else:
            return '<blockquote>\n%s\n</blockquote>\n\n' % bq

    def _do_block_quotes(self, text):
        if '>' not in text:
            return text
        if 'spoiler' in self.extras:
            return self._block_quote_re_spoiler.sub(self._block_quote_sub, text)
        else:
            return self._block_quote_re.sub(self._block_quote_sub, text)

    def _form_paragraphs(self, text):
        # Strip leading and trailing lines:
        text = text.strip('\n')

        # Wrap <p> tags.
        grafs = []
        for i, graf in enumerate(re.split(r"\n{2,}", text)):
            if graf in self.html_blocks:
                # Unhashify HTML blocks
                grafs.append(self.html_blocks[graf])
            else:
                cuddled_list = None
                if "cuddled-lists" in self.extras:
                    # Need to put back trailing '\n' for `_list_item_re`
                    # match at the end of the paragraph.
                    li = self._list_item_re.search(graf + '\n')
                    # Two of the same list marker in this paragraph: a likely
                    # candidate for a list cuddled to preceding paragraph
                    # text (issue 33). Note the `[-1]` is a quick way to
                    # consider numeric bullets (e.g. "1." and "2.") to be
                    # equal.
                    if (li and len(li.group(2)) <= 3 and li.group("next_marker")
                            and li.group("marker")[-1] == li.group("next_marker")[-1]):
                        start = li.start()
                        cuddled_list = self._do_lists(graf[start:]).rstrip("\n")
                        assert cuddled_list.startswith("<ul>") or cuddled_list.startswith("<ol>")
                        graf = graf[:start]

                # Wrap <p> tags.
                graf = self._run_span_gamut(graf)
                grafs.append("<p>" + graf.lstrip(" \t") + "</p>")

                if cuddled_list:
                    grafs.append(cuddled_list)

        return "\n\n".join(grafs)

    def _add_footnotes(self, text):
        if self.footnotes:
            footer = [
                '<div class="footnotes">',
                '<hr' + self.empty_element_suffix,
                '<ol>',
            ]

            if not self.footnote_title:
                self.footnote_title = "Jump back to footnote %d in the text."
            if not self.footnote_return_symbol:
                self.footnote_return_symbol = "&#8617;"

            for i, id in enumerate(self.footnote_ids):
                if i != 0:
                    footer.append('')
                footer.append('<li id="fn-%s">' % id)
                footer.append(self._run_block_gamut(self.footnotes[id]))
                try:
                    backlink = ('<a href="#fnref-%s" ' +
                            'class="footnoteBackLink" ' +
                            'title="' + self.footnote_title + '">' +
                            self.footnote_return_symbol +
                            '</a>') % (id, i+1)
                except TypeError:
                    log.debug("Footnote error. `footnote_title` "
                              "must include parameter. Using defaults.")
                    backlink = ('<a href="#fnref-%s" '
                        'class="footnoteBackLink" '
                        'title="Jump back to footnote %d in the text.">'
                        '&#8617;</a>' % (id, i+1))

                if footer[-1].endswith("</p>"):
                    footer[-1] = footer[-1][:-len("</p>")] \
                        + '&#160;' + backlink + "</p>"
                else:
                    footer.append("\n<p>%s</p>" % backlink)
                footer.append('</li>')
            footer.append('</ol>')
            footer.append('</div>')
            return text + '\n\n' + '\n'.join(footer)
        else:
            return text

    # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
    #   http://bumppo.net/projects/amputator/
    _ampersand_re = re.compile(r'&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)')
    _naked_lt_re = re.compile(r'<(?![a-z/?\$!])', re.I)
    _naked_gt_re = re.compile(r'''(?<![a-z0-9?!/'"-])>''', re.I)

    def _encode_amps_and_angles(self, text):
        # Smart processing for ampersands and angle brackets that need
        # to be encoded.
        text = self._ampersand_re.sub('&amp;', text)

        # Encode naked <'s
        text = self._naked_lt_re.sub('&lt;', text)

        # Encode naked >'s
        # Note: Other markdown implementations (e.g. Markdown.pl, PHP
        # Markdown) don't do this.
        text = self._naked_gt_re.sub('&gt;', text)
        return text

    def _encode_backslash_escapes(self, text):
        for ch, escape in list(self._escape_table.items()):
            text = text.replace("\\"+ch, escape)
        return text

    _auto_link_re = re.compile(r'<((https?|ftp):[^\'">\s]+)>', re.I)
    def _auto_link_sub(self, match):
        g1 = match.group(1)
        return '<a href="%s">%s</a>' % (g1, g1)

    _auto_email_link_re = re.compile(r"""
          <
           (?:mailto:)?
          (
              [-.\w]+
              \@
              [-\w]+(\.[-\w]+)*\.[a-z]+
          )
          >
        """, re.I | re.X | re.U)
    def _auto_email_link_sub(self, match):
        return self._encode_email_address(
            self._unescape_special_chars(match.group(1)))

    def _do_auto_links(self, text):
        text = self._auto_link_re.sub(self._auto_link_sub, text)
        text = self._auto_email_link_re.sub(self._auto_email_link_sub, text)
        return text

    def _encode_email_address(self, addr):
        #  Input: an email address, e.g. "foo@example.com"
        #
        #  Output: the email address as a mailto link, with each character
        #      of the address encoded as either a decimal or hex entity, in
        #      the hopes of foiling most address harvesting spam bots. E.g.
        #      (shown here with the entities decoded for readability):
        #
        #    <a href="mailto:foo@e
        #       xample.com">foo
        #       @example.com</a>
        #
        #  Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
        #  mailing list: <http://tinyurl.com/yu7ue>
        chars = [_xml_encode_email_char_at_random(ch)
                 for ch in "mailto:" + addr]
        # Strip the mailto: from the visible part.
        addr = '<a href="%s">%s</a>' \
               % (''.join(chars), ''.join(chars[7:]))
        return addr

    def _do_link_patterns(self, text):
        """Caveat emptor: there isn't much guarding against link
        patterns being formed inside other standard Markdown links, e.g.
        inside a [link def][like this].

        Dev Notes: *Could* consider prefixing regexes with a negative
        lookbehind assertion to attempt to guard against this.
        """
        link_from_hash = {}
        for regex, repl in self.link_patterns:
            replacements = []
            for match in regex.finditer(text):
                if hasattr(repl, "__call__"):
                    href = repl(match)
                else:
                    href = match.expand(repl)
                replacements.append((match.span(), href))
            for (start, end), href in reversed(replacements):
                escaped_href = (
                    href.replace('"', '&quot;')  # b/c of attr quote
                        # To avoid markdown <em> and <strong>:
                        .replace('*', self._escape_table['*'])
                        .replace('_', self._escape_table['_']))
                link = '<a href="%s">%s</a>' % (escaped_href, text[start:end])
                hash = _hash_text(link)
                link_from_hash[hash] = link
                text = text[:start] + hash + text[end:]
        for hash, link in list(link_from_hash.items()):
            text = text.replace(hash, link)
        return text
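    # `self.link_patterns` is a list of (compiled regex, repl) pairs, where
    # `repl` is either a template string expanded with match.expand() or a
    # callable taking the match. An illustrative pair (hypothetical URL):
    #   (re.compile(r'\bissue #?(\d+)\b', re.I), r'http://example.com/issues/\1')
    # would auto-link "issue 42" to http://example.com/issues/42.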

    def _unescape_special_chars(self, text):
        # Swap back in all the special characters we've hidden.
        for ch, hash in list(self._escape_table.items()):
            text = text.replace(hash, ch)
        return text

    def _outdent(self, text):
        # Remove one level of line-leading tabs or spaces
        return self._outdent_re.sub('', text)


class MarkdownWithExtras(Markdown):
    """A markdowner class that enables most extras:

    - footnotes
    - code-color (only has effect if 'pygments' Python module on path)

    These are not included:
    - pyshell (specific to Python-related documenting)
    - code-friendly (because it *disables* part of the syntax)
    - link-patterns (because you need to specify some actual
      link-patterns anyway)
    """
    extras = ["footnotes", "code-color"]


# ---- internal support functions

class UnicodeWithAttrs(unicode):
    """A subclass of unicode used for the return value of conversion to
    possibly attach some attributes. E.g. the "toc_html" attribute when
    the "toc" extra is used.
    """
    metadata = None
    _toc = None
    def toc_html(self):
        """Return the HTML for the current TOC.

        This expects the `_toc` attribute to have been set on this instance.
        """
        if self._toc is None:
            return None

        def indent():
            return '  ' * (len(h_stack) - 1)
        lines = []
        h_stack = [0]   # stack of header-level numbers
        for level, id, name in self._toc:
            if level > h_stack[-1]:
                lines.append("%s<ul>" % indent())
                h_stack.append(level)
            elif level == h_stack[-1]:
                lines[-1] += "</li>"
            else:
                while level < h_stack[-1]:
                    h_stack.pop()
                    if not lines[-1].endswith("</li>"):
                        lines[-1] += "</li>"
                    lines.append("%s</ul></li>" % indent())
            lines.append('%s<li><a href="#%s">%s</a>' % (
                indent(), id, name))
        while len(h_stack) > 1:
            h_stack.pop()
            if not lines[-1].endswith("</li>"):
                lines[-1] += "</li>"
            lines.append("%s</ul>" % indent())
        return '\n'.join(lines) + '\n'
    toc_html = property(toc_html)

## {{{ http://code.activestate.com/recipes/577257/ (r1)
_slugify_strip_re = re.compile(r'[^\w\s-]')
_slugify_hyphenate_re = re.compile(r'[-\s]+')
def _slugify(value):
    """
    Normalizes string, converts to lowercase, removes non-alpha characters,
    and converts spaces to hyphens.

    From Django's "django/template/defaultfilters.py".
    """
    import unicodedata
    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode()
    value = _slugify_strip_re.sub('', value).strip().lower()
    return _slugify_hyphenate_re.sub('-', value)
## end of http://code.activestate.com/recipes/577257/ }}}
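# For example: _slugify(u"Joel is a slug!") -> u'joel-is-a-slug'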

# From http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52549
def _curry(*args, **kwargs):
    function, args = args[0], args[1:]
    def result(*rest, **kwrest):
        combined = kwargs.copy()
        combined.update(kwrest)
        return function(*args + rest, **combined)
    return result


# Recipe: regex_from_encoded_pattern (1.0)
def _regex_from_encoded_pattern(s):
    """'foo'    -> re.compile(re.escape('foo'))
       '/foo/'  -> re.compile('foo')
       '/foo/i' -> re.compile('foo', re.I)
    """
    if s.startswith('/') and s.rfind('/') != 0:
        # Parse it: /PATTERN/FLAGS
        idx = s.rfind('/')
        pattern, flags_str = s[1:idx], s[idx+1:]
        flag_from_char = {
            "i": re.IGNORECASE,
            "l": re.LOCALE,
            "s": re.DOTALL,
            "m": re.MULTILINE,
            "u": re.UNICODE,
        }
        flags = 0
        for char in flags_str:
            try:
                flags |= flag_from_char[char]
            except KeyError:
                raise ValueError("unsupported regex flag: '%s' in '%s' "
                                 "(must be one of '%s')"
                                 % (char, s, ''.join(list(flag_from_char.keys()))))
        return re.compile(s[1:idx], flags)
    else:  # not an encoded regex
        return re.compile(re.escape(s))


# Recipe: dedent (0.1.2)
def _dedentlines(lines, tabsize=8, skip_first_line=False):
    """_dedentlines(lines, tabsize=8, skip_first_line=False) -> dedented lines

        "lines" is a list of lines to dedent.
        "tabsize" is the tab width to use for indent width calculations.
        "skip_first_line" is a boolean indicating if the first line should
            be skipped for calculating the indent width and for dedenting.
            This is sometimes useful for docstrings and similar.

    Same as dedent() except operates on a sequence of lines. Note: the
    lines list is modified **in-place**.
    """
    DEBUG = False
    if DEBUG:
        print("dedent: dedent(..., tabsize=%d, skip_first_line=%r)"\
              % (tabsize, skip_first_line))
    margin = None
    for i, line in enumerate(lines):
        if i == 0 and skip_first_line: continue
        indent = 0
        for ch in line:
            if ch == ' ':
                indent += 1
            elif ch == '\t':
                indent += tabsize - (indent % tabsize)
            elif ch in '\r\n':
                continue  # skip all-whitespace lines
            else:
                break
        else:
            continue  # skip all-whitespace lines
        if DEBUG: print("dedent: indent=%d: %r" % (indent, line))
        if margin is None:
            margin = indent
        else:
            margin = min(margin, indent)
    if DEBUG: print("dedent: margin=%r" % margin)

    if margin is not None and margin > 0:
        for i, line in enumerate(lines):
            if i == 0 and skip_first_line: continue
            removed = 0
            for j, ch in enumerate(line):
                if ch == ' ':
                    removed += 1
                elif ch == '\t':
                    removed += tabsize - (removed % tabsize)
                elif ch in '\r\n':
                    if DEBUG: print("dedent: %r: EOL -> strip up to EOL" % line)
                    lines[i] = lines[i][j:]
                    break
                else:
                    raise ValueError("unexpected non-whitespace char %r in "
                                     "line %r while removing %d-space margin"
                                     % (ch, line, margin))
                if DEBUG:
                    print("dedent: %r: %r -> removed %d/%d"\
                          % (line, ch, removed, margin))
                if removed == margin:
                    lines[i] = lines[i][j+1:]
                    break
                elif removed > margin:
                    lines[i] = ' '*(removed-margin) + lines[i][j+1:]
                    break
            else:
                if removed:
                    lines[i] = lines[i][removed:]
    return lines


def _dedent(text, tabsize=8, skip_first_line=False):
    """_dedent(text, tabsize=8, skip_first_line=False) -> dedented text

        "text" is the text to dedent.
        "tabsize" is the tab width to use for indent width calculations.
        "skip_first_line" is a boolean indicating if the first line should
            be skipped for calculating the indent width and for dedenting.
            This is sometimes useful for docstrings and similar.

    textwrap.dedent(s), but don't expand tabs to spaces
    """
    lines = text.splitlines(1)
    _dedentlines(lines, tabsize=tabsize, skip_first_line=skip_first_line)
    return ''.join(lines)
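# For example: _dedent("  foo\n    bar\n") returns "foo\n  bar\n" -- the
# common two-space margin is removed from every line.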

class _memoized(object):
    """Decorator that caches a function's return value each time it is called.
    If called later with the same arguments, the cached value is returned, and
    not re-evaluated.

    http://wiki.python.org/moin/PythonDecoratorLibrary
    """
    def __init__(self, func):
        self.func = func
        self.cache = {}

    def __call__(self, *args):
        try:
            return self.cache[args]
        except KeyError:
            self.cache[args] = value = self.func(*args)
            return value
        except TypeError:
            # uncachable -- for instance, passing a list as an argument.
            # Better to not cache than to blow up entirely.
            return self.func(*args)

    def __repr__(self):
        """Return the function's docstring."""
        return self.func.__doc__


def _xml_oneliner_re_from_tab_width(tab_width):
    """Standalone XML processing instruction regex."""
    return re.compile(r"""
        (?:
            (?<=\n\n)       # Starting after a blank line
            |               # or
            \A\n?           # the beginning of the doc
        )
        (                           # save in $1
            [ ]{0,%d}
            (?:
                <\?\w+\b\s+.*?\?>   # XML processing instruction
                |
                <\w+:\w+\b\s+.*?/>  # namespaced single tag
            )
            [ \t]*
            (?=\n{2,}|\Z)           # followed by a blank line or end of document
        )
        """ % (tab_width - 1), re.X)
_xml_oneliner_re_from_tab_width = _memoized(_xml_oneliner_re_from_tab_width)


def _hr_tag_re_from_tab_width(tab_width):
    return re.compile(r"""
        (?:
            (?<=\n\n)       # Starting after a blank line
            |               # or
            \A\n?           # the beginning of the doc
        )
        (                       # save in \1
            [ ]{0,%d}
            <(hr)               # start tag = \2
            \b                  # word break
            ([^<>])*?           #
            /?>                 # the matching end tag
            [ \t]*
            (?=\n{2,}|\Z)       # followed by a blank line or end of document
        )
        """ % (tab_width - 1), re.X)
_hr_tag_re_from_tab_width = _memoized(_hr_tag_re_from_tab_width)


def _xml_escape_attr(attr, skip_single_quote=True):
    """Escape the given string for use in an HTML/XML tag attribute.

    By default this doesn't bother with escaping `'` to `&#39;`, presuming that
    the tag attribute is surrounded by double quotes.
    """
    escaped = (attr
        .replace('&', '&amp;')
        .replace('"', '&quot;')
        .replace('<', '&lt;')
        .replace('>', '&gt;'))
    if not skip_single_quote:
        escaped = escaped.replace("'", "&#39;")
    return escaped
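# For example: _xml_escape_attr('say "a<b" & run')
# returns 'say &quot;a&lt;b&quot; &amp; run'.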

def _xml_encode_email_char_at_random(ch):
    r = random()
    # Roughly 10% raw, 45% hex, 45% dec.
    # '@' *must* be encoded. I [John Gruber] insist.
    # Issue 26: '_' must be encoded.
    if r > 0.9 and ch not in "@_":
        return ch
    elif r < 0.45:
        # The [1:] is to drop leading '0': 0x63 -> x63
        return '&#%s;' % hex(ord(ch))[1:]
    else:
        return '&#%s;' % ord(ch)


def _html_escape_url(attr, safe_mode=False):
    """Replace special characters that are potentially malicious in url string."""
    escaped = (attr
        .replace('"', '&quot;')
        .replace('<', '&lt;')
        .replace('>', '&gt;'))
    if safe_mode:
        escaped = escaped.replace('+', ' ')
        escaped = escaped.replace("'", "&#39;")
    return escaped
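# For example: _html_escape_url('/search?q="foo"') returns
# '/search?q=&quot;foo&quot;'. Note that '&' is intentionally not escaped
# by this helper.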

# ---- mainline

class _NoReflowFormatter(optparse.IndentedHelpFormatter):
    """An optparse formatter that does NOT reflow the description."""
    def format_description(self, description):
        return description or ""


def _test():
    import doctest
    doctest.testmod()


def main(argv=None):
    if argv is None:
        argv = sys.argv
    if not logging.root.handlers:
        logging.basicConfig()

    usage = "usage: %prog [PATHS...]"
    version = "%prog "+__version__
    parser = optparse.OptionParser(prog="markdown2", usage=usage,
        version=version, description=cmdln_desc,
        formatter=_NoReflowFormatter())
    parser.add_option("-v", "--verbose", dest="log_level",
                      action="store_const", const=logging.DEBUG,
                      help="more verbose output")
    parser.add_option("--encoding",
                      help="specify encoding of text content")
    parser.add_option("--html4tags", action="store_true", default=False,
                      help="use HTML 4 style for empty element tags")
    parser.add_option("-s", "--safe", metavar="MODE", dest="safe_mode",
                      help="sanitize literal HTML: 'escape' escapes "
                           "HTML meta chars, 'replace' replaces with an "
                           "[HTML_REMOVED] note")
    parser.add_option("-x", "--extras", action="append",
                      help="Turn on specific extra features (not part of "
                           "the core Markdown spec). See above.")
    parser.add_option("--use-file-vars",
                      help="Look for and use Emacs-style 'markdown-extras' "
                           "file var to turn on extras. See "
                           "<https://github.com/trentm/python-markdown2/wiki/Extras>")
    parser.add_option("--link-patterns-file",
                      help="path to a link pattern file")
    parser.add_option("--self-test", action="store_true",
                      help="run internal self-tests (some doctests)")
    parser.add_option("--compare", action="store_true",
                      help="run against Markdown.pl as well (for testing)")
    parser.set_defaults(log_level=logging.INFO, compare=False,
                        encoding="utf-8", safe_mode=None, use_file_vars=False)
    opts, paths = parser.parse_args()
    log.setLevel(opts.log_level)

    if opts.self_test:
        return _test()
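    # e.g. `-x footnotes,toc=3 -x wiki-tables` yields
    # extras == {"footnotes": None, "toc": 3, "wiki-tables": None}
    # (an `=VALUE` part is coerced to an int when possible).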
    if opts.extras:
        extras = {}
        for s in opts.extras:
            splitter = re.compile("[,;: ]+")
            for e in splitter.split(s):
                if '=' in e:
                    ename, earg = e.split('=', 1)
                    try:
                        earg = int(earg)
                    except ValueError:
                        pass
                else:
                    ename, earg = e, None
                extras[ename] = earg
    else:
        extras = None
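    # Link-patterns file format (as parsed below): one "<pattern> <href>"
    # pair per line, split on the last run of whitespace; blank lines and
    # lines starting with '#' are skipped.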
    if opts.link_patterns_file:
        link_patterns = []
        f = open(opts.link_patterns_file)
        try:
            for i, line in enumerate(f.readlines()):
                if not line.strip(): continue
                if line.lstrip().startswith("#"): continue
                try:
                    pat, href = line.rstrip().rsplit(None, 1)
                except ValueError:
                    raise MarkdownError("%s:%d: invalid link pattern line: %r"
                                        % (opts.link_patterns_file, i+1, line))
                link_patterns.append(
                    (_regex_from_encoded_pattern(pat), href))
        finally:
            f.close()
    else:
        link_patterns = None

    from os.path import join, dirname, abspath, exists
    markdown_pl = join(dirname(dirname(abspath(__file__))), "test",
                       "Markdown.pl")
    if not paths:
        paths = ['-']
    for path in paths:
        if path == '-':
            text = sys.stdin.read()
        else:
            fp = codecs.open(path, 'r', opts.encoding)
            text = fp.read()
            fp.close()
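        # In --compare mode, also pipe the input through the reference
        # Markdown.pl (expected in the project's test/ directory) and
        # print its output for side-by-side comparison.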
        if opts.compare:
            from subprocess import Popen, PIPE
            print("==== Markdown.pl ====")
            p = Popen('perl %s' % markdown_pl, shell=True, stdin=PIPE, stdout=PIPE, close_fds=True)
            p.stdin.write(text.encode('utf-8'))
            p.stdin.close()
            perl_html = p.stdout.read().decode('utf-8')
            if py3:
                sys.stdout.write(perl_html)
            else:
                sys.stdout.write(perl_html.encode(
                    sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))
            print("==== markdown2.py ====")
        html = markdown(text,
            html4tags=opts.html4tags,
            safe_mode=opts.safe_mode,
            extras=extras, link_patterns=link_patterns,
            use_file_vars=opts.use_file_vars)
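        # On Python 2, stdout is a byte stream: encode, falling back to XML
        # character references for characters the terminal can't represent.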
        if py3:
            sys.stdout.write(html)
        else:
            sys.stdout.write(html.encode(
                sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))
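        # With the "toc" extra, the string returned by markdown() also
        # carries the rendered table of contents as a `toc_html` attribute.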
        if extras and "toc" in extras:
            log.debug("toc_html: " +
                str(html.toc_html.encode(sys.stdout.encoding or "utf-8", 'xmlcharrefreplace')))
        if opts.compare:
            test_dir = join(dirname(dirname(abspath(__file__))), "test")
            if exists(join(test_dir, "test_markdown2.py")):
                sys.path.insert(0, test_dir)
                from test_markdown2 import norm_html_from_html
                norm_html = norm_html_from_html(html)
                norm_perl_html = norm_html_from_html(perl_html)
            else:
                norm_html = html
                norm_perl_html = perl_html
            print("==== match? %r ====" % (norm_perl_html == norm_html))


if __name__ == "__main__":
    sys.exit(main(sys.argv))