#!/usr/bin/env python
# Copyright (c) 2012 Trent Mick.
# Copyright (c) 2007-2008 ActiveState Corp.
# License: MIT (http://www.opensource.org/licenses/mit-license.php)

from __future__ import generators

r"""A fast and complete Python implementation of Markdown.

[from http://daringfireball.net/projects/markdown/]
> Markdown is a text-to-HTML filter; it translates an easy-to-read /
> easy-to-write structured text format into HTML. Markdown's text
> format is most similar to that of plain text email, and supports
> features such as headers, *emphasis*, code blocks, blockquotes, and
> links.
>
> Markdown's syntax is designed not as a generic markup language, but
> specifically to serve as a front-end to (X)HTML. You can use span-level
> HTML tags anywhere in a Markdown document, and you can use block level
> HTML tags (like <div> and <table> as well).

Module usage:

    >>> import markdown2
    >>> markdown2.markdown("*boo!*")  # or use `html = markdown_path(PATH)`
    u'<p><em>boo!</em></p>\n'

    >>> markdowner = Markdown()
    >>> markdowner.convert("*boo!*")
    u'<p><em>boo!</em></p>\n'
    >>> markdowner.convert("**boom!**")
    u'<p><strong>boom!</strong></p>\n'

This implementation of Markdown implements the full "core" syntax plus a
number of extras (e.g., code syntax coloring, footnotes) as described on
<https://github.com/trentm/python-markdown2/wiki/Extras>.
"""

cmdln_desc = """A fast and complete Python implementation of Markdown, a
text-to-HTML conversion tool for web writers.

Supported extra syntax options (see -x|--extras option below and
see <https://github.com/trentm/python-markdown2/wiki/Extras> for details):

* code-friendly: Disable _ and __ for em and strong.
* cuddled-lists: Allow lists to be cuddled to the preceding paragraph.
* fenced-code-blocks: Allows a code block to not have to be indented
  by fencing it with '```' on a line before and after. Based on
  <http://github.github.com/github-flavored-markdown/> with support for
  syntax highlighting.
* footnotes: Support footnotes as in use on daringfireball.net and
  implemented in other Markdown processors (though not in Markdown.pl v1.0.1).
* header-ids: Adds "id" attributes to headers. The id value is a slug of
  the header text.
* highlightjs-lang: Allows specifying the language which is used for syntax
  highlighting when using fenced-code-blocks and highlightjs.
* html-classes: Takes a dict mapping html tag names (lowercase) to a
  string to use for a "class" tag attribute. Currently only supports "img",
  "table", "pre" and "code" tags. Add an issue if you require this for other
  tags.
* link-patterns: Auto-link given regex patterns in text (e.g. bug number
  references, revision number references).
* markdown-in-html: Allow the use of `markdown="1"` in a block HTML tag to
  have markdown processing be done on its contents. Similar to
  <http://michelf.com/projects/php-markdown/extra/#markdown-attr> but with
  some limitations.
* metadata: Extract metadata from a leading '---'-fenced block.
  See <https://github.com/trentm/python-markdown2/issues/77> for details.
* nofollow: Add `rel="nofollow"` to all `<a>` tags with an href. See
  <http://en.wikipedia.org/wiki/Nofollow>.
* numbering: Support for generic counters. A non-standard extension to
  allow sequential numbering of figures, tables, equations, exhibits etc.
* pyshell: Treats unindented Python interactive shell sessions as <code>
  blocks.
* smarty-pants: Replaces ' and " with curly quotation marks or curly
  apostrophes. Replaces --, ---, ..., and . . . with en dashes, em dashes,
  and ellipses.
* spoiler: A special kind of blockquote commonly hidden behind a
  click on SO. Syntax per <http://meta.stackexchange.com/a/72878>.
* tag-friendly: Requires atx style headers to have a space between the # and
  the header text. Useful for applications that require twitter style tags to
  pass through the parser.
* tables: Tables using the same format as GFM
  <https://help.github.com/articles/github-flavored-markdown#tables> and
  PHP-Markdown Extra <https://michelf.ca/projects/php-markdown/extra/#table>.
* toc: The returned HTML string gets a new "toc_html" attribute which is
  a Table of Contents for the document. (experimental)
* use-file-vars: Look for an Emacs-style markdown-extras file variable to turn
  on Extras.
* wiki-tables: Google Code Wiki-style tables. See
  <http://code.google.com/p/support/wiki/WikiSyntax#Tables>.
* xml: Passes one-liner processing instructions and namespaced XML tags.
"""

# Dev Notes:
# - Python's regex syntax doesn't have '\z', so I'm using '\Z'. I'm
#   not yet sure if there are implications with this. Compare 'pydoc sre'
#   and 'perldoc perlre'.

__version_info__ = (2, 3, 5)
__version__ = '.'.join(map(str, __version_info__))
__author__ = "Trent Mick"

import sys
import re
import logging
from hashlib import sha256
import optparse
from random import random, randint
import codecs
try:
    from urllib import quote_plus
except ImportError:
    from urllib.parse import quote_plus


# ---- Python version compat

if sys.version_info[:2] < (2, 4):
    def reversed(sequence):
        for i in sequence[::-1]:
            yield i

# Use `bytes` for byte strings and `unicode` for unicode strings (str in Py3).
if sys.version_info[0] <= 2:
    py3 = False
    try:
        bytes
    except NameError:
        bytes = str
    base_string_type = basestring
elif sys.version_info[0] >= 3:
    py3 = True
    unicode = str
    base_string_type = str

# ---- globals

DEBUG = False
log = logging.getLogger("markdown")

DEFAULT_TAB_WIDTH = 4


SECRET_SALT = bytes(randint(0, 1000000))
# MD5 function was previously used for this; the "md5" prefix was kept for
# backwards compatibility.
def _hash_text(s):
    return 'md5-' + sha256(SECRET_SALT + s.encode("utf-8")).hexdigest()[32:]

# Table of hash values for escaped characters:
g_escape_table = dict([(ch, _hash_text(ch))
                       for ch in '\\`*_{}[]()>#+-.!'])
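
# For illustration: each escaped character maps to a placeholder key that is
# vanishingly unlikely to occur in user text, roughly:
#
#   g_escape_table['*']  ->  'md5-' + 32 hex chars (value varies per process,
#                            since SECRET_SALT is random)
#
# Despite the historical "md5-" prefix, the digest is the second half of a
# salted SHA-256 hexdigest, per _hash_text() above.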


# ---- exceptions
class MarkdownError(Exception):
    pass


# ---- public api

def markdown_path(path, encoding="utf-8",
                  html4tags=False, tab_width=DEFAULT_TAB_WIDTH,
                  safe_mode=None, extras=None, link_patterns=None,
                  footnote_title=None, footnote_return_symbol=None,
                  use_file_vars=False):
    fp = codecs.open(path, 'r', encoding)
    text = fp.read()
    fp.close()
    return Markdown(html4tags=html4tags, tab_width=tab_width,
                    safe_mode=safe_mode, extras=extras,
                    link_patterns=link_patterns,
                    footnote_title=footnote_title,
                    footnote_return_symbol=footnote_return_symbol,
                    use_file_vars=use_file_vars).convert(text)


def markdown(text, html4tags=False, tab_width=DEFAULT_TAB_WIDTH,
             safe_mode=None, extras=None, link_patterns=None,
             footnote_title=None, footnote_return_symbol=None,
             use_file_vars=False):
    return Markdown(html4tags=html4tags, tab_width=tab_width,
                    safe_mode=safe_mode, extras=extras,
                    link_patterns=link_patterns,
                    footnote_title=footnote_title,
                    footnote_return_symbol=footnote_return_symbol,
                    use_file_vars=use_file_vars).convert(text)
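
# A minimal usage sketch of the two public functions above (the last call
# assumes a hypothetical local file "doc.md"):
#
#   import markdown2
#   html = markdown2.markdown("*boo!*")            # u'<p><em>boo!</em></p>\n'
#   html = markdown2.markdown("*boo!*", extras=["code-friendly"])
#   html = markdown2.markdown_path("doc.md", safe_mode="escape")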


class Markdown(object):
    # The dict of "extras" to enable in processing -- a mapping of
    # extra name to argument for the extra. Most extras do not have an
    # argument, in which case the value is None.
    #
    # This can be set via (a) subclassing and (b) the constructor
    # "extras" argument.
    extras = None

    urls = None
    titles = None
    html_blocks = None
    html_spans = None
    html_removed_text = "[HTML_REMOVED]"  # for compat with markdown.py

    # Used to track when we're inside an ordered or unordered list
    # (see _ProcessListItems() for details):
    list_level = 0

    _ws_only_line_re = re.compile(r"^[ \t]+$", re.M)

    def __init__(self, html4tags=False, tab_width=4, safe_mode=None,
                 extras=None, link_patterns=None,
                 footnote_title=None, footnote_return_symbol=None,
                 use_file_vars=False):
        if html4tags:
            self.empty_element_suffix = ">"
        else:
            self.empty_element_suffix = " />"
        self.tab_width = tab_width

        # For compatibility with earlier markdown2.py and with
        # markdown.py's safe_mode being a boolean,
        #   safe_mode == True -> "replace"
        if safe_mode is True:
            self.safe_mode = "replace"
        else:
            self.safe_mode = safe_mode

        # Massaging and building the "extras" info.
        if self.extras is None:
            self.extras = {}
        elif not isinstance(self.extras, dict):
            self.extras = dict([(e, None) for e in self.extras])
        if extras:
            if not isinstance(extras, dict):
                extras = dict([(e, None) for e in extras])
            self.extras.update(extras)
        assert isinstance(self.extras, dict)
        if "toc" in self.extras and "header-ids" not in self.extras:
            self.extras["header-ids"] = None  # "toc" implies "header-ids"
        self._instance_extras = self.extras.copy()

        self.link_patterns = link_patterns
        self.footnote_title = footnote_title
        self.footnote_return_symbol = footnote_return_symbol
        self.use_file_vars = use_file_vars
        self._outdent_re = re.compile(r'^(\t|[ ]{1,%d})' % tab_width, re.M)

        self._escape_table = g_escape_table.copy()
        if "smarty-pants" in self.extras:
            self._escape_table['"'] = _hash_text('"')
            self._escape_table["'"] = _hash_text("'")

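    # A sketch of the equivalent forms the "extras" argument accepts: any
    # non-dict iterable of names is normalized (above) to a dict mapping each
    # name to None, so these two constructions are interchangeable:
    #
    #   Markdown(extras=["footnotes", "wiki-tables"])
    #   Markdown(extras={"footnotes": None, "wiki-tables": None})
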
    def reset(self):
        self.urls = {}
        self.titles = {}
        self.html_blocks = {}
        self.html_spans = {}
        self.list_level = 0
        self.extras = self._instance_extras.copy()
        if "footnotes" in self.extras:
            self.footnotes = {}
            self.footnote_ids = []
        if "header-ids" in self.extras:
            self._count_from_header_id = {}  # no `defaultdict` in Python 2.4
        if "metadata" in self.extras:
            self.metadata = {}

    # Per <https://developer.mozilla.org/en-US/docs/HTML/Element/a> "rel"
    # should only be used in <a> tags with an "href" attribute.
    _a_nofollow = re.compile(r"""
        <(a)
        (
            [^>]*
            href=   # href is required
            ['"]?   # HTML5 attribute values do not have to be quoted
            [^#'"]  # We don't want to match href values that start with # (like footnotes)
        )
        """,
        re.IGNORECASE | re.VERBOSE
    )

    # Opens the linked document in a new window or tab; this, too,
    # should only be used in <a> tags with an "href" attribute.
    # Shares the same regex as _a_nofollow.
    _a_blank = _a_nofollow

    def convert(self, text):
        """Convert the given text."""
        # Main function. The order in which other subs are called here is
        # essential. Link and image substitutions need to happen before
        # _EscapeSpecialChars(), so that any *'s or _'s in the <a>
        # and <img> tags get encoded.

        # Clear the global hashes. If we don't clear these, you get conflicts
        # from other articles when generating a page which contains more than
        # one article (e.g. an index page that shows the N most recent
        # articles):
        self.reset()

        if not isinstance(text, unicode):
            # TODO: perhaps shouldn't presume UTF-8 for string input?
            text = unicode(text, 'utf-8')

        if self.use_file_vars:
            # Look for emacs-style file variable hints.
            emacs_vars = self._get_emacs_vars(text)
            if "markdown-extras" in emacs_vars:
                splitter = re.compile("[ ,]+")
                for e in splitter.split(emacs_vars["markdown-extras"]):
                    if '=' in e:
                        ename, earg = e.split('=', 1)
                        try:
                            earg = int(earg)
                        except ValueError:
                            pass
                    else:
                        ename, earg = e, None
                    self.extras[ename] = earg

        # Standardize line endings:
        text = text.replace("\r\n", "\n")
        text = text.replace("\r", "\n")

        # Make sure $text ends with a couple of newlines:
        text += "\n\n"

        # Convert all tabs to spaces.
        text = self._detab(text)

        # Strip any lines consisting only of spaces and tabs.
        # This makes subsequent regexen easier to write, because we can
        # match consecutive blank lines with /\n+/ instead of something
        # contorted like /[ \t]*\n+/ .
        text = self._ws_only_line_re.sub("", text)

        # Strip metadata from the head and extract it.
        if "metadata" in self.extras:
            text = self._extract_metadata(text)

        text = self.preprocess(text)

        if "fenced-code-blocks" in self.extras and not self.safe_mode:
            text = self._do_fenced_code_blocks(text)

        if self.safe_mode:
            text = self._hash_html_spans(text)

        # Turn block-level HTML blocks into hash entries
        text = self._hash_html_blocks(text, raw=True)

        if "fenced-code-blocks" in self.extras and self.safe_mode:
            text = self._do_fenced_code_blocks(text)

        # Because numbering references aren't links (yet?), we can do
        # everything associated with counters before we get started.
        if "numbering" in self.extras:
            text = self._do_numbering(text)

        # Strip link definitions, store in hashes.
        if "footnotes" in self.extras:
            # Must do footnotes first because an unlucky footnote defn
            # looks like a link defn:
            #   [^4]: this "looks like a link defn"
            text = self._strip_footnote_definitions(text)
        text = self._strip_link_definitions(text)

        text = self._run_block_gamut(text)

        if "footnotes" in self.extras:
            text = self._add_footnotes(text)

        text = self.postprocess(text)

        text = self._unescape_special_chars(text)

        if self.safe_mode:
            text = self._unhash_html_spans(text)

        if "nofollow" in self.extras:
            text = self._a_nofollow.sub(r'<\1 rel="nofollow"\2', text)

        if "target-blank-links" in self.extras:
            text = self._a_blank.sub(r'<\1 target="_blank"\2', text)

        text += "\n"

        rv = UnicodeWithAttrs(text)
        if "toc" in self.extras:
            rv._toc = self._toc
        if "metadata" in self.extras:
            rv.metadata = self.metadata
        return rv

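    # A sketch of what convert() returns: a unicode subclass
    # (UnicodeWithAttrs, defined elsewhere in this module), so extra results
    # can ride along as attributes when the matching extras are enabled, e.g.:
    #
    #   html = Markdown(extras=["metadata"]).convert(text)
    #   html           # the rendered HTML
    #   html.metadata  # dict from a leading '---'-fenced block
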
    def postprocess(self, text):
        """A hook for subclasses to do some postprocessing of the html, if
        desired. This is called before unescaping of special chars and
        unhashing of raw HTML spans.
        """
        return text

    def preprocess(self, text):
        """A hook for subclasses to do some preprocessing of the Markdown, if
        desired. This is called after basic formatting of the text, but prior
        to any extras, safe mode, etc. processing.
        """
        return text

    # The content is treated as metadata if it starts with optional
    # '---'-fenced `key: value` pairs. E.g. (indented for presentation):
    #   ---
    #   foo: bar
    #   another-var: blah blah
    #   ---
    #   # header
    # or:
    #   foo: bar
    #   another-var: blah blah
    #
    #   # header
    _meta_data_pattern = re.compile(r'^(?:---[\ \t]*\n)?(.*:\s+>\n\s+[\S\s]+?)(?=\n\w+\s*:\s*\w+\n|\Z)|([\S\w]+\s*:(?! >)[ \t]*.*\n?)(?:---[\ \t]*\n)?', re.MULTILINE)
    _key_val_pat = re.compile("[\S\w]+\s*:(?! >)[ \t]*.*\n?", re.MULTILINE)
    # This allows the "key: >" form, where the value
    # continues over multiple indented lines.
    _key_val_block_pat = re.compile(
        "(.*:\s+>\n\s+[\S\s]+?)(?=\n\w+\s*:\s*\w+\n|\Z)", re.MULTILINE)
    _meta_data_fence_pattern = re.compile(r'^---[\ \t]*\n', re.MULTILINE)
    _meta_data_newline = re.compile("^\n", re.MULTILINE)

    def _extract_metadata(self, text):
        if text.startswith("---"):
            fence_splits = re.split(self._meta_data_fence_pattern, text, maxsplit=2)
            metadata_content = fence_splits[1]
            match = re.findall(self._meta_data_pattern, metadata_content)
            if not match:
                return text
            tail = fence_splits[2]
        else:
            metadata_split = re.split(self._meta_data_newline, text, maxsplit=1)
            metadata_content = metadata_split[0]
            match = re.findall(self._meta_data_pattern, metadata_content)
            if not match:
                return text
            tail = metadata_split[1]

        kv = re.findall(self._key_val_pat, metadata_content)
        kvm = re.findall(self._key_val_block_pat, metadata_content)
        kvm = [item.replace(": >\n", ":", 1) for item in kvm]

        for item in kv + kvm:
            k, v = item.split(":", 1)
            self.metadata[k.strip()] = v.strip()

        return tail

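    # A sketch of the front matter _extract_metadata() handles. Input like:
    #
    #   ---
    #   title: Some Document
    #   author: Someone
    #   ---
    #   # First header
    #
    # leaves self.metadata == {'title': 'Some Document', 'author': 'Someone'}
    # and returns the text after the closing fence. The same key/value parsing
    # applies without fences when the pairs are separated from the body by a
    # blank line.
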
    _emacs_oneliner_vars_pat = re.compile(r"-\*-\s*([^\r\n]*?)\s*-\*-", re.UNICODE)
    # This regular expression is intended to match blocks like this:
    #    PREFIX Local Variables: SUFFIX
    #    PREFIX mode: Tcl SUFFIX
    #    PREFIX End: SUFFIX
    # Some notes:
    # - "[ \t]" is used instead of "\s" to specifically exclude newlines
    # - "(\r\n|\n|\r)" is used instead of "$" because the sre engine does
    #   not like anything other than Unix-style line terminators.
    _emacs_local_vars_pat = re.compile(r"""^
        (?P<prefix>(?:[^\r\n|\n|\r])*?)
        [\ \t]*Local\ Variables:[\ \t]*
        (?P<suffix>.*?)(?:\r\n|\n|\r)
        (?P<content>.*?\1End:)
        """, re.IGNORECASE | re.MULTILINE | re.DOTALL | re.VERBOSE)

    def _get_emacs_vars(self, text):
        """Return a dictionary of emacs-style local variables.

        Parsing is done loosely according to this spec (and according to
        some in-practice deviations from this):
        http://www.gnu.org/software/emacs/manual/html_node/emacs/Specifying-File-Variables.html#Specifying-File-Variables
        """
        emacs_vars = {}
        SIZE = pow(2, 13)  # 8kB

        # Search near the start for a '-*-'-style one-liner of variables.
        head = text[:SIZE]
        if "-*-" in head:
            match = self._emacs_oneliner_vars_pat.search(head)
            if match:
                emacs_vars_str = match.group(1)
                assert '\n' not in emacs_vars_str
                emacs_var_strs = [s.strip() for s in emacs_vars_str.split(';')
                                  if s.strip()]
                if len(emacs_var_strs) == 1 and ':' not in emacs_var_strs[0]:
                    # While not in the spec, this form is allowed by emacs:
                    #   -*- Tcl -*-
                    # where the implied "variable" is "mode". This form
                    # is only allowed if there are no other variables.
                    emacs_vars["mode"] = emacs_var_strs[0].strip()
                else:
                    for emacs_var_str in emacs_var_strs:
                        try:
                            variable, value = emacs_var_str.strip().split(':', 1)
                        except ValueError:
                            log.debug("emacs variables error: malformed -*- "
                                      "line: %r", emacs_var_str)
                            continue
                        # Lowercase the variable name because Emacs allows "Mode"
                        # or "mode" or "MoDe", etc.
                        emacs_vars[variable.lower()] = value.strip()

        tail = text[-SIZE:]
        if "Local Variables" in tail:
            match = self._emacs_local_vars_pat.search(tail)
            if match:
                prefix = match.group("prefix")
                suffix = match.group("suffix")
                lines = match.group("content").splitlines(0)
                # print "prefix=%r, suffix=%r, content=%r, lines: %s"\
                #      % (prefix, suffix, match.group("content"), lines)

                # Validate the Local Variables block: proper prefix and suffix
                # usage.
                for i, line in enumerate(lines):
                    if not line.startswith(prefix):
                        log.debug("emacs variables error: line '%s' "
                                  "does not use proper prefix '%s'"
                                  % (line, prefix))
                        return {}
                    # Don't validate suffix on last line. Emacs doesn't care,
                    # neither should we.
                    if i != len(lines)-1 and not line.endswith(suffix):
                        log.debug("emacs variables error: line '%s' "
                                  "does not use proper suffix '%s'"
                                  % (line, suffix))
                        return {}

                # Parse out one emacs var per line.
                continued_for = None
                for line in lines[:-1]:  # no var on the last line ("PREFIX End:")
                    if prefix: line = line[len(prefix):]  # strip prefix
                    if suffix: line = line[:-len(suffix)]  # strip suffix
                    line = line.strip()
                    if continued_for:
                        variable = continued_for
                        if line.endswith('\\'):
                            line = line[:-1].rstrip()
                        else:
                            continued_for = None
                        emacs_vars[variable] += ' ' + line
                    else:
                        try:
                            variable, value = line.split(':', 1)
                        except ValueError:
                            log.debug("local variables error: missing colon "
                                      "in local variables entry: '%s'" % line)
                            continue
                        # Do NOT lowercase the variable name, because Emacs only
                        # allows "mode" (and not "Mode", "MoDe", etc.) in this block.
                        value = value.strip()
                        if value.endswith('\\'):
                            value = value[:-1].rstrip()
                            continued_for = variable
                        else:
                            continued_for = None
                        emacs_vars[variable] = value

        # Unquote values (both double- and single-quoted).
        for var, val in list(emacs_vars.items()):
            if len(val) > 1 and (val.startswith('"') and val.endswith('"')
               or val.startswith("'") and val.endswith("'")):
                emacs_vars[var] = val[1:-1]

        return emacs_vars

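    # A sketch of the two Emacs forms recognized above -- a '-*-' one-liner
    # near the top of the file:
    #
    #   -*- markdown-extras: wiki-tables,footnotes -*-
    #
    # or a "Local Variables" block near the end, where PREFIX/SUFFIX are
    # whatever wraps each line (e.g. HTML comment markers):
    #
    #   <!-- Local Variables: -->
    #   <!-- markdown-extras: wiki-tables,footnotes -->
    #   <!-- End: -->
    #
    # With use_file_vars=True, convert() splits the "markdown-extras" value
    # on spaces/commas and enables each named extra.
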
    def _detab_line(self, line):
        r"""Recursively convert tabs to spaces in a single line.

        Called from _detab()."""
        if '\t' not in line:
            return line
        chunk1, chunk2 = line.split('\t', 1)
        chunk1 += (' ' * (self.tab_width - len(chunk1) % self.tab_width))
        output = chunk1 + chunk2
        return self._detab_line(output)

    def _detab(self, text):
        r"""Iterate text line by line and convert tabs to spaces.

            >>> m = Markdown()
            >>> m._detab("\tfoo")
            '    foo'
            >>> m._detab("  \tfoo")
            '    foo'
            >>> m._detab("\t  foo")
            '      foo'
            >>> m._detab("  foo")
            '  foo'
            >>> m._detab("  foo\n\tbar\tblam")
            '  foo\n    bar blam'
        """
        if '\t' not in text:
            return text
        output = []
        for line in text.splitlines():
            output.append(self._detab_line(line))
        return '\n'.join(output)

    # I broke out the html5 tags here and add them to _block_tags_a and
    # _block_tags_b. This way html5 tags are easy to keep track of.
    _html5tags = '|article|aside|header|hgroup|footer|nav|section|figure|figcaption'

    _block_tags_a = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del'
    _block_tags_a += _html5tags

    _strict_tag_block_re = re.compile(r"""
        (                       # save in \1
            ^                   # start of line  (with re.M)
            <(%s)               # start tag = \2
            \b                  # word break
            (.*\n)*?            # any number of lines, minimally matching
            </\2>               # the matching end tag
            [ \t]*              # trailing spaces/tabs
            (?=\n+|\Z)          # followed by a newline or end of document
        )
        """ % _block_tags_a,
        re.X | re.M)

    _block_tags_b = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math'
    _block_tags_b += _html5tags

    _liberal_tag_block_re = re.compile(r"""
        (                       # save in \1
            ^                   # start of line  (with re.M)
            <(%s)               # start tag = \2
            \b                  # word break
            (.*\n)*?            # any number of lines, minimally matching
            .*</\2>             # the matching end tag
            [ \t]*              # trailing spaces/tabs
            (?=\n+|\Z)          # followed by a newline or end of document
        )
        """ % _block_tags_b,
        re.X | re.M)

    _html_markdown_attr_re = re.compile(
        r'''\s+markdown=("1"|'1')''')
    def _hash_html_block_sub(self, match, raw=False):
        html = match.group(1)
        if raw and self.safe_mode:
            html = self._sanitize_html(html)
        elif 'markdown-in-html' in self.extras and 'markdown=' in html:
            first_line = html.split('\n', 1)[0]
            m = self._html_markdown_attr_re.search(first_line)
            if m:
                lines = html.split('\n')
                middle = '\n'.join(lines[1:-1])
                last_line = lines[-1]
                first_line = first_line[:m.start()] + first_line[m.end():]
                f_key = _hash_text(first_line)
                self.html_blocks[f_key] = first_line
                l_key = _hash_text(last_line)
                self.html_blocks[l_key] = last_line
                return ''.join(["\n\n", f_key,
                                "\n\n", middle, "\n\n",
                                l_key, "\n\n"])
        key = _hash_text(html)
        self.html_blocks[key] = html
        return "\n\n" + key + "\n\n"

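    # A sketch of the 'markdown-in-html' branch above. Given:
    #
    #   <div markdown="1">
    #   **bold** text
    #   </div>
    #
    # the opening line (minus its markdown="1" attribute) and the closing line
    # are hashed out, while the middle is left in the text so normal Markdown
    # processing still applies to the div's contents.
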
    def _hash_html_blocks(self, text, raw=False):
        """Hashify HTML blocks

        We only want to do this for block-level HTML tags, such as headers,
        lists, and tables. That's because we still want to wrap <p>s around
        "paragraphs" that are wrapped in non-block-level tags, such as anchors,
        phrase emphasis, and spans. The list of tags we're looking for is
        hard-coded.

        @param raw {boolean} indicates if these are raw HTML blocks in
            the original source. It makes a difference in "safe" mode.
        """
        if '<' not in text:
            return text

        # Pass `raw` value into our calls to self._hash_html_block_sub.
        hash_html_block_sub = _curry(self._hash_html_block_sub, raw=raw)

        # First, look for nested blocks, e.g.:
        #   <div>
        #       <div>
        #       tags for inner block must be indented.
        #       </div>
        #   </div>
        #
        # The outermost tags must start at the left margin for this to match, and
        # the inner nested divs must be indented.
        # We need to do this before the next, more liberal match, because the next
        # match will start at the first `<div>` and stop at the first `</div>`.
        text = self._strict_tag_block_re.sub(hash_html_block_sub, text)

        # Now match more liberally, simply from `\n<tag>` to `</tag>\n`
        text = self._liberal_tag_block_re.sub(hash_html_block_sub, text)

        # Special case just for <hr />. It was easier to make a special
        # case than to make the other regex more complicated.
        if "<hr" in text:
            _hr_tag_re = _hr_tag_re_from_tab_width(self.tab_width)
            text = _hr_tag_re.sub(hash_html_block_sub, text)

        # Special case for standalone HTML comments:
        if "<!--" in text:
            start = 0
            while True:
                # Delimiters for next comment block.
                try:
                    start_idx = text.index("<!--", start)
                except ValueError:
                    break
                try:
                    end_idx = text.index("-->", start_idx) + 3
                except ValueError:
                    break

                # Start position for next comment block search.
                start = end_idx

                # Validate whitespace before comment.
                if start_idx:
                    # - Up to `tab_width - 1` spaces before start_idx.
                    for i in range(self.tab_width - 1):
                        if text[start_idx - 1] != ' ':
                            break
                        start_idx -= 1
                        if start_idx == 0:
                            break
                    # - Must be preceded by 2 newlines or hit the start of
                    #   the document.
                    if start_idx == 0:
                        pass
                    elif start_idx == 1 and text[0] == '\n':
                        start_idx = 0  # to match minute detail of Markdown.pl regex
                    elif text[start_idx-2:start_idx] == '\n\n':
                        pass
                    else:
                        break

                # Validate whitespace after comment.
                # - Any number of spaces and tabs.
                while end_idx < len(text):
                    if text[end_idx] not in ' \t':
                        break
                    end_idx += 1
                # - Must be followed by 2 newlines or hit end of text.
                if text[end_idx:end_idx+2] not in ('', '\n', '\n\n'):
                    continue

                # Escape and hash (must match `_hash_html_block_sub`).
                html = text[start_idx:end_idx]
                if raw and self.safe_mode:
                    html = self._sanitize_html(html)
                key = _hash_text(html)
                self.html_blocks[key] = html
                text = text[:start_idx] + "\n\n" + key + "\n\n" + text[end_idx:]

        if "xml" in self.extras:
            # Treat XML processing instructions and namespaced one-liner
            # tags as if they were block HTML tags. E.g., if standalone
            # (i.e. are their own paragraph), the following do not get
            # wrapped in a <p> tag:
            #    <?foo bar?>
            #
            #    <xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="chapter_1.md"/>
            _xml_oneliner_re = _xml_oneliner_re_from_tab_width(self.tab_width)
            text = _xml_oneliner_re.sub(hash_html_block_sub, text)

        return text

    def _strip_link_definitions(self, text):
        # Strips link definitions from text, stores the URLs and titles in
        # hash references.
        less_than_tab = self.tab_width - 1

        # Link defs are in the form:
        #   [id]: url "optional title"
        _link_def_re = re.compile(r"""
            ^[ ]{0,%d}\[(.+)\]: # id = \1
              [ \t]*
              \n?               # maybe *one* newline
              [ \t]*
            <?(.+?)>?           # url = \2
              [ \t]*
            (?:
                \n?             # maybe one newline
                [ \t]*
                (?<=\s)         # lookbehind for whitespace
                ['"(]
                ([^\n]*)        # title = \3
                ['")]
                [ \t]*
            )?  # title is optional
            (?:\n+|\Z)
            """ % less_than_tab, re.X | re.M | re.U)
        return _link_def_re.sub(self._extract_link_def_sub, text)

    def _extract_link_def_sub(self, match):
        id, url, title = match.groups()
        key = id.lower()  # Link IDs are case-insensitive
        self.urls[key] = self._encode_amps_and_angles(url)
        if title:
            self.titles[key] = title
        return ""

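    # A sketch of the link-definition form stripped above:
    #
    #   [id]: http://example.com/ "Optional Title Here"
    #
    # stores self.urls['id'] (with amps/angles encoded) and
    # self.titles['id'], and removes the definition line from the text.
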
    def _do_numbering(self, text):
        '''We handle the special extension for generic numbering for
        tables, figures etc.
        '''
        # First pass to define all the references
        self.regex_defns = re.compile(r'''
            \[\#(\w+)\s* # the counter.  Open square plus hash plus a word \1
            ([^@]*)\s*   # Some optional characters, that aren't an @. \2
            @(\w+)       # the id.  Should this be normed? \3
            ([^\]]*)\]   # The rest of the text up to the terminating ] \4
            ''', re.VERBOSE)
        self.regex_subs = re.compile(r"\[@(\w+)\s*\]")  # [@ref_id]
        counters = {}
        references = {}
        replacements = []
        definition_html = '<figcaption class="{}" id="counter-ref-{}">{}{}{}</figcaption>'
        reference_html = '<a class="{}" href="#counter-ref-{}">{}</a>'
        for match in self.regex_defns.finditer(text):
            # We must have four match groups otherwise this isn't a numbering reference
            if len(match.groups()) != 4:
                continue
            counter = match.group(1)
            text_before = match.group(2)
            ref_id = match.group(3)
            text_after = match.group(4)
            number = counters.get(counter, 1)
            references[ref_id] = (number, counter)
            replacements.append((match.start(0),
                                 definition_html.format(counter,
                                                        ref_id,
                                                        text_before,
                                                        number,
                                                        text_after),
                                 match.end(0)))
            counters[counter] = number + 1
        for repl in reversed(replacements):
            text = text[:repl[0]] + repl[1] + text[repl[2]:]

        # Second pass to replace the references with the right
        # value of the counter.
        # Fwiw, it's vaguely annoying to have to turn the iterator into
        # a list and then reverse it but I can't think of a better thing to do.
        for match in reversed(list(self.regex_subs.finditer(text))):
            number, counter = references.get(match.group(1), (None, None))
            if number is not None:
                repl = reference_html.format(counter,
                                             match.group(1),
                                             number)
            else:
                repl = reference_html.format(match.group(1),
                                             'countererror',
                                             '?' + match.group(1) + '?')
            if "smarty-pants" in self.extras:
                repl = repl.replace('"', self._escape_table['"'])

            text = text[:match.start()] + repl + text[match.end():]
        return text

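    # A sketch of the 'numbering' syntax handled above -- a definition and a
    # later reference (the counter name "figure" and id "fig1" are arbitrary
    # words):
    #
    #   [#figure The first figure @fig1]
    #   ... as shown in [@fig1] ...
    #
    # The definition becomes a <figcaption class="figure"
    # id="counter-ref-fig1"> carrying the running number; each [@fig1]
    # becomes <a class="figure" href="#counter-ref-fig1"> showing that number.
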
    def _extract_footnote_def_sub(self, match):
        id, text = match.groups()
        text = _dedent(text, skip_first_line=not text.startswith('\n')).strip()
        normed_id = re.sub(r'\W', '-', id)
        # Ensure footnote text ends with a couple newlines (for some
        # block gamut matches).
        self.footnotes[normed_id] = text + "\n\n"
        return ""

    def _strip_footnote_definitions(self, text):
        """A footnote definition looks like this:

            [^note-id]: Text of the note.

                May include one or more indented paragraphs.

        Where,
        - The 'note-id' can be pretty much anything, though typically it
          is the number of the footnote.
        - The first paragraph may start on the next line, like so:

            [^note-id]:
                Text of the note.
        """
        less_than_tab = self.tab_width - 1
        footnote_def_re = re.compile(r'''
            ^[ ]{0,%d}\[\^(.+)\]:   # id = \1
            [ \t]*
            (                       # footnote text = \2
              # First line need not start with the spaces.
              (?:\s*.*\n+)
              (?:
                (?:[ ]{%d} | \t)  # Subsequent lines must be indented.
                .*\n+
              )*
            )
            # Lookahead for non-space at line-start, or end of doc.
            (?:(?=^[ ]{0,%d}\S)|\Z)
            ''' % (less_than_tab, self.tab_width, self.tab_width),
            re.X | re.M)
        return footnote_def_re.sub(self._extract_footnote_def_sub, text)

    _hr_re = re.compile(r'^[ ]{0,3}([-_*][ ]{0,2}){3,}$', re.M)

    def _run_block_gamut(self, text):
        # These are all the transformations that form block-level
        # tags like paragraphs, headers, and list items.

        if "fenced-code-blocks" in self.extras:
            text = self._do_fenced_code_blocks(text)

        text = self._do_headers(text)

        # Do Horizontal Rules:
        # On the number of spaces in horizontal rules: The spec is fuzzy: "If
        # you wish, you may use spaces between the hyphens or asterisks."
        # Markdown.pl 1.0.1's hr regexes limit the number of spaces between the
        # hr chars to one or two. We'll reproduce that limit here.
        hr = "\n<hr"+self.empty_element_suffix+"\n"
        text = re.sub(self._hr_re, hr, text)

        text = self._do_lists(text)

        if "pyshell" in self.extras:
            text = self._prepare_pyshell_blocks(text)
        if "wiki-tables" in self.extras:
            text = self._do_wiki_tables(text)
        if "tables" in self.extras:
            text = self._do_tables(text)

        text = self._do_code_blocks(text)

        text = self._do_block_quotes(text)

        # We already ran _HashHTMLBlocks() before, in Markdown(), but that
        # was to escape raw HTML in the original Markdown source. This time,
        # we're escaping the markup we've just created, so that we don't wrap
        # <p> tags around block-level tags.
        text = self._hash_html_blocks(text)

        text = self._form_paragraphs(text)

        return text

    def _pyshell_block_sub(self, match):
        lines = match.group(0).splitlines(0)
        _dedentlines(lines)
        indent = ' ' * self.tab_width
        s = ('\n'  # separate from possible cuddled paragraph
             + indent + ('\n'+indent).join(lines)
             + '\n\n')
        return s

    def _prepare_pyshell_blocks(self, text):
        """Ensure that Python interactive shell sessions are put in
        code blocks -- even if not properly indented.
        """
        if ">>>" not in text:
            return text

        less_than_tab = self.tab_width - 1
        _pyshell_block_re = re.compile(r"""
            ^([ ]{0,%d})>>>[ ].*\n  # first line
            ^(\1.*\S+.*\n)*         # any number of subsequent lines
            ^\n                     # ends with a blank line
            """ % less_than_tab, re.M | re.X)

        return _pyshell_block_re.sub(self._pyshell_block_sub, text)

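    # A sketch of what the 'pyshell' extra matches -- an unindented
    # interactive session terminated by a blank line:
    #
    #   >>> 1 + 1
    #   2
    #
    # The match is re-indented by tab_width spaces so the regular code-block
    # processing picks it up.
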
    def _table_sub(self, match):
        trim_space_re = '^[ \t\n]+|[ \t\n]+$'
        trim_bar_re = '^\||\|$'
        split_bar_re = '^\||(?<!\\\\)\|'
        escape_bar_re = '\\\\\|'

        head, underline, body = match.groups()

        # Determine aligns for columns.
        cols = [re.sub(escape_bar_re, '|', cell.strip()) for cell in re.split(split_bar_re, re.sub(trim_bar_re, "", re.sub(trim_space_re, "", underline)))]
        align_from_col_idx = {}
        for col_idx, col in enumerate(cols):
            if col[0] == ':' and col[-1] == ':':
                align_from_col_idx[col_idx] = ' align="center"'
            elif col[0] == ':':
                align_from_col_idx[col_idx] = ' align="left"'
            elif col[-1] == ':':
                align_from_col_idx[col_idx] = ' align="right"'

        # thead
        hlines = ['<table%s>' % self._html_class_str_from_tag('table'), '<thead>', '<tr>']
        cols = [re.sub(escape_bar_re, '|', cell.strip()) for cell in re.split(split_bar_re, re.sub(trim_bar_re, "", re.sub(trim_space_re, "", head)))]
        for col_idx, col in enumerate(cols):
            hlines.append('  <th%s>%s</th>' % (
                align_from_col_idx.get(col_idx, ''),
                self._run_span_gamut(col)
            ))
        hlines.append('</tr>')
        hlines.append('</thead>')

        # tbody
        hlines.append('<tbody>')
        for line in body.strip('\n').split('\n'):
            hlines.append('<tr>')
            cols = [re.sub(escape_bar_re, '|', cell.strip()) for cell in re.split(split_bar_re, re.sub(trim_bar_re, "", re.sub(trim_space_re, "", line)))]
            for col_idx, col in enumerate(cols):
                hlines.append('  <td%s>%s</td>' % (
                    align_from_col_idx.get(col_idx, ''),
                    self._run_span_gamut(col)
                ))
            hlines.append('</tr>')
        hlines.append('</tbody>')
        hlines.append('</table>')

        return '\n'.join(hlines) + '\n'

    def _do_tables(self, text):
        """Copying PHP-Markdown and GFM table syntax. Some regex borrowed from
        https://github.com/michelf/php-markdown/blob/lib/Michelf/Markdown.php#L2538
        """
        less_than_tab = self.tab_width - 1
        table_re = re.compile(r'''
                (?:(?<=\n\n)|\A\n?)             # leading blank line

                ^[ ]{0,%d}                      # allowed whitespace
                (.*[|].*)  \n                   # $1: header row (at least one pipe)

                ^[ ]{0,%d}                      # allowed whitespace
                (                               # $2: underline row
                    # underline row with leading bar
                    (?:  \|\ *:?-+:?\ *  )+  \|?  \n
                    |
                    # or, underline row without leading bar
                    (?:  \ *:?-+:?\ *\|  )+  (?:  \ *:?-+:?\ *  )?  \n
                )

                (                               # $3: data rows
                    (?:
                        ^[ ]{0,%d}(?!\ )         # ensure line begins with 0 to less_than_tab spaces
                        .*\|.*  \n
                    )+
                )
            ''' % (less_than_tab, less_than_tab, less_than_tab), re.M | re.X)
        return table_re.sub(self._table_sub, text)

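    # A sketch of the table syntax matched above (GFM / PHP-Markdown Extra
    # style; colons in the underline row set column alignment):
    #
    #   | Item | Price |
    #   | :--- | ----: |
    #   | Tea  |  3.00 |
    #
    # _table_sub() renders this as <table><thead>...</thead><tbody>...</tbody>
    # with align="left"/"right"/"center" on the corresponding cells.
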
    def _wiki_table_sub(self, match):
        ttext = match.group(0).strip()
        # print 'wiki table: %r' % match.group(0)
        rows = []
        for line in ttext.splitlines(0):
            line = line.strip()[2:-2].strip()
            row = [c.strip() for c in re.split(r'(?<!\\)\|\|', line)]
            rows.append(row)
        # pprint(rows)
        hlines = ['<table%s>' % self._html_class_str_from_tag('table'), '<tbody>']
        for row in rows:
            hrow = ['<tr>']
            for cell in row:
                hrow.append('<td>')
                hrow.append(self._run_span_gamut(cell))
                hrow.append('</td>')
            hrow.append('</tr>')
            hlines.append(''.join(hrow))
        hlines += ['</tbody>', '</table>']
        return '\n'.join(hlines) + '\n'

    def _do_wiki_tables(self, text):
        # Optimization.
        if "||" not in text:
            return text

        less_than_tab = self.tab_width - 1
        wiki_table_re = re.compile(r'''
            (?:(?<=\n\n)|\A\n?)            # leading blank line
            ^([ ]{0,%d})\|\|.+?\|\|[ ]*\n  # first line
            (^\1\|\|.+?\|\|\n)*            # any number of subsequent lines
            ''' % less_than_tab, re.M | re.X)
        return wiki_table_re.sub(self._wiki_table_sub, text)

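    # A sketch of the Google Code wiki-table syntax matched above (cells
    # delimited by '||', one row per line):
    #
    #   || cell 1 || cell 2 ||
    #   || cell 3 || cell 4 ||
    #
    # Each row becomes a <tr> of plain <td> cells; this format has no header
    # row or alignment handling.
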
    def _run_span_gamut(self, text):
        # These are all the transformations that occur *within* block-level
        # tags like paragraphs, headers, and list items.

        text = self._do_code_spans(text)

        text = self._escape_special_chars(text)

        # Process anchor and image tags.
        text = self._do_links(text)

        # Make links out of things like `<http://example.com/>`
        # Must come after _do_links(), because you can use < and >
        # delimiters in inline links like [this](<url>).
        text = self._do_auto_links(text)

        if "link-patterns" in self.extras:
            text = self._do_link_patterns(text)

        text = self._encode_amps_and_angles(text)

        if "strike" in self.extras:
            text = self._do_strike(text)

        text = self._do_italics_and_bold(text)

        if "smarty-pants" in self.extras:
            text = self._do_smart_punctuation(text)

        # Do hard breaks:
        if "break-on-newline" in self.extras:
            text = re.sub(r" *\n", "<br%s\n" % self.empty_element_suffix, text)
        else:
            text = re.sub(r" {2,}\n", " <br%s\n" % self.empty_element_suffix, text)

        return text

    # "Sorta" because auto-links are identified as "tag" tokens.
    _sorta_html_tokenize_re = re.compile(r"""
        (
            # tag
            </?
            (?:\w+)                                     # tag name
            (?:\s+(?:[\w-]+:)?[\w-]+=(?:".*?"|'.*?'))*  # attributes
            \s*/?>
            |
            # auto-link (e.g., <http://www.activestate.com/>)
            <\w+[^>]*>
            |
            <!--.*?-->      # comment
            |
            <\?.*?\?>       # processing instruction
        )
        """, re.X)

    def _escape_special_chars(self, text):
        # Python markdown note: the HTML tokenization here differs from
        # that in Markdown.pl, hence the behaviour for subtle cases can
        # differ (I believe the tokenizer here does a better job because
        # it isn't susceptible to unmatched '<' and '>' in HTML tags).
        # Note, however, that '>' is not allowed in an auto-link URL
        # here.
        escaped = []
        is_html_markup = False
        for token in self._sorta_html_tokenize_re.split(text):
            if is_html_markup:
                # Within tags/HTML-comments/auto-links, encode * and _
                # so they don't conflict with their use in Markdown for
                # italics and strong. We're replacing each such
                # character with its corresponding MD5 checksum value;
                # this is likely overkill, but it should prevent us from
                # colliding with the escape values by accident.
                escaped.append(token.replace('*', self._escape_table['*'])
                                    .replace('_', self._escape_table['_']))
            else:
                escaped.append(self._encode_backslash_escapes(token))
            is_html_markup = not is_html_markup
        return ''.join(escaped)

    def _hash_html_spans(self, text):
        # Used for safe_mode.

        def _is_auto_link(s):
            if ':' in s and self._auto_link_re.match(s):
                return True
            elif '@' in s and self._auto_email_link_re.match(s):
                return True
            return False

        tokens = []
        is_html_markup = False
        for token in self._sorta_html_tokenize_re.split(text):
            if is_html_markup and not _is_auto_link(token):
                sanitized = self._sanitize_html(token)
                key = _hash_text(sanitized)
                self.html_spans[key] = sanitized
                tokens.append(key)
            else:
                tokens.append(token)
            is_html_markup = not is_html_markup
        return ''.join(tokens)

    def _unhash_html_spans(self, text):
        for key, sanitized in list(self.html_spans.items()):
            text = text.replace(key, sanitized)
        return text

    def _sanitize_html(self, s):
        if self.safe_mode == "replace":
            return self.html_removed_text
        elif self.safe_mode == "escape":
            replacements = [
                ('&', '&amp;'),
                ('<', '&lt;'),
                ('>', '&gt;'),
            ]
            for before, after in replacements:
                s = s.replace(before, after)
            return s
        else:
            raise MarkdownError("invalid value for 'safe_mode': %r (must be "
                                "'escape' or 'replace')" % self.safe_mode)

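    # A sketch of the two safe_mode behaviours implemented above:
    #
    #   markdown("<em>x</em>", safe_mode="escape")   # raw HTML entity-encoded
    #   markdown("<em>x</em>", safe_mode="replace")  # raw HTML replaced by
    #                                                # "[HTML_REMOVED]"
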
    _inline_link_title = re.compile(r'''
        (                   # \1
            [ \t]+
            (['"])          # quote char = \2
            (?P<title>.*?)
            \2
        )?                  # title is optional
        \)$
        ''', re.X | re.S)
    _tail_of_reference_link_re = re.compile(r'''
        # Match tail of: [text][id]
        [ ]?          # one optional space
        (?:\n[ ]*)?   # one optional newline followed by spaces
        \[
        (?P<id>.*?)
        \]
        ''', re.X | re.S)

    _whitespace = re.compile(r'\s*')

    _strip_anglebrackets = re.compile(r'<(.*)>.*')

    def _find_non_whitespace(self, text, start):
        """Returns the index of the first non-whitespace character in text
        after (and including) start.
        """
        match = self._whitespace.match(text, start)
        return match.end()

    def _find_balanced(self, text, start, open_c, close_c):
        """Returns the index where the open_c and close_c characters balance
        out - the same number of open_c and close_c are encountered - or the
        end of string if it's reached before the balance point is found.
        """
        i = start
        l = len(text)
        count = 1
        while count > 0 and i < l:
            if text[i] == open_c:
                count += 1
            elif text[i] == close_c:
                count -= 1
            i += 1
        return i

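    # A quick worked example of _find_balanced(): called just past an opening
    # paren (count starts at 1), it returns the index one past the paren that
    # balances it:
    #
    #   >>> m = Markdown()
    #   >>> m._find_balanced("(foo (bar) baz) tail", 1, "(", ")")
    #   15
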
    def _extract_url_and_title(self, text, start):
        """Extracts the url and (optional) title from the tail of a link"""
        # text[start] equals the opening parenthesis
        idx = self._find_non_whitespace(text, start+1)
        if idx == len(text):
            return None, None, None
        end_idx = idx
        has_anglebrackets = text[idx] == "<"
        if has_anglebrackets:
            end_idx = self._find_balanced(text, end_idx+1, "<", ">")
        end_idx = self._find_balanced(text, end_idx, "(", ")")
        match = self._inline_link_title.search(text, idx, end_idx)
        if not match:
            return None, None, None
        url, title = text[idx:match.start()], match.group("title")
        if has_anglebrackets:
            url = self._strip_anglebrackets.sub(r'\1', url)
        return url, title, end_idx

1275
|
|
|
    _safe_protocols = re.compile(r'(https?|ftp):', re.I)

    def _do_links(self, text):
        """Turn Markdown link shortcuts into XHTML <a> and <img> tags.

        This is a combination of Markdown.pl's _DoAnchors() and
        _DoImages(). They are done together because that simplified the
        approach. It was necessary to use a different approach than
        Markdown.pl because of the lack of atomic matching support in
        Python's regex engine used in $g_nested_brackets.
        """
        MAX_LINK_TEXT_SENTINEL = 3000  # markdown2 issue 24

        # `anchor_allowed_pos` is used to support img links inside
        # anchors, but not anchors inside anchors. An anchor's start
        # pos must be `>= anchor_allowed_pos`.
        anchor_allowed_pos = 0

        curr_pos = 0
        while True:  # Handle the next link.
            # The next '[' is the start of:
            # - an inline anchor:   [text](url "title")
            # - a reference anchor: [text][id]
            # - an inline img:      ![text](url "title")
            # - a reference img:    ![text][id]
            # - a footnote ref:     [^id]
            #   (Only if 'footnotes' extra enabled)
            # - a footnote defn:    [^id]: ...
            #   (Only if 'footnotes' extra enabled) These have already
            #   been stripped in _strip_footnote_definitions() so no
            #   need to watch for them.
            # - a link definition:  [id]: url "title"
            #   These have already been stripped in
            #   _strip_link_definitions() so no need to watch for them.
            # - not markup:         [...anything else...
            try:
                start_idx = text.index('[', curr_pos)
            except ValueError:
                break
            text_length = len(text)

            # Find the matching closing ']'.
            # Markdown.pl allows *matching* brackets in link text so we
            # will here too. Markdown.pl *doesn't* currently allow
            # matching brackets in img alt text -- we'll differ in that
            # regard.
            bracket_depth = 0
            for p in range(start_idx+1, min(start_idx+MAX_LINK_TEXT_SENTINEL,
                                            text_length)):
                ch = text[p]
                if ch == ']':
                    bracket_depth -= 1
                    if bracket_depth < 0:
                        break
                elif ch == '[':
                    bracket_depth += 1
            else:
                # Closing bracket not found within sentinel length.
                # This isn't markup.
                curr_pos = start_idx + 1
                continue
            link_text = text[start_idx+1:p]

            # Possibly a footnote ref?
            if "footnotes" in self.extras and link_text.startswith("^"):
                normed_id = re.sub(r'\W', '-', link_text[1:])
                if normed_id in self.footnotes:
                    self.footnote_ids.append(normed_id)
                    result = '<sup class="footnote-ref" id="fnref-%s">' \
                             '<a href="#fn-%s">%s</a></sup>' \
                             % (normed_id, normed_id, len(self.footnote_ids))
                    text = text[:start_idx] + result + text[p+1:]
                else:
                    # This id isn't defined, leave the markup alone.
                    curr_pos = p+1
                continue

            # Now determine what this is by the remainder.
            p += 1
            if p == text_length:
                return text

            # Inline anchor or img?
            if text[p] == '(':  # attempt at perf improvement
                url, title, url_end_idx = self._extract_url_and_title(text, p)
                if url is not None:
                    # Handle an inline anchor or img.
                    is_img = start_idx > 0 and text[start_idx-1] == "!"
                    if is_img:
                        start_idx -= 1

                    # We've got to encode these to avoid conflicting
                    # with italics/bold.
                    url = url.replace('*', self._escape_table['*']) \
                             .replace('_', self._escape_table['_'])
                    if title:
                        title_str = ' title="%s"' % (
                            _xml_escape_attr(title)
                                .replace('*', self._escape_table['*'])
                                .replace('_', self._escape_table['_']))
                    else:
                        title_str = ''
                    if is_img:
                        img_class_str = self._html_class_str_from_tag("img")
                        result = '<img src="%s" alt="%s"%s%s%s' \
                            % (_html_escape_url(url, safe_mode=self.safe_mode),
                               _xml_escape_attr(link_text),
                               title_str,
                               img_class_str,
                               self.empty_element_suffix)
                        if "smarty-pants" in self.extras:
                            result = result.replace('"', self._escape_table['"'])
                        curr_pos = start_idx + len(result)
                        text = text[:start_idx] + result + text[url_end_idx:]
                    elif start_idx >= anchor_allowed_pos:
                        if self.safe_mode and not self._safe_protocols.match(url):
                            result_head = '<a href="#"%s>' % (title_str)
                        else:
                            result_head = '<a href="%s"%s>' % (_html_escape_url(url, safe_mode=self.safe_mode), title_str)
                        result = '%s%s</a>' % (result_head, _xml_escape_attr(link_text))
                        if "smarty-pants" in self.extras:
                            result = result.replace('"', self._escape_table['"'])
                        # <img> allowed from curr_pos on, <a> from
                        # anchor_allowed_pos on.
                        curr_pos = start_idx + len(result_head)
                        anchor_allowed_pos = start_idx + len(result)
                        text = text[:start_idx] + result + text[url_end_idx:]
                    else:
                        # Anchor not allowed here.
                        curr_pos = start_idx + 1
                    continue

            # Reference anchor or img?
            else:
                match = self._tail_of_reference_link_re.match(text, p)
                if match:
                    # Handle a reference-style anchor or img.
                    is_img = start_idx > 0 and text[start_idx-1] == "!"
                    if is_img:
                        start_idx -= 1
                    link_id = match.group("id").lower()
                    if not link_id:
                        link_id = link_text.lower()  # for links like [this][]
                    if link_id in self.urls:
                        url = self.urls[link_id]
                        # We've got to encode these to avoid conflicting
                        # with italics/bold.
                        url = url.replace('*', self._escape_table['*']) \
                                 .replace('_', self._escape_table['_'])
                        title = self.titles.get(link_id)
                        if title:
                            title = _xml_escape_attr(title) \
                                .replace('*', self._escape_table['*']) \
                                .replace('_', self._escape_table['_'])
                            title_str = ' title="%s"' % title
                        else:
                            title_str = ''
                        if is_img:
                            img_class_str = self._html_class_str_from_tag("img")
                            result = '<img src="%s" alt="%s"%s%s%s' \
                                % (_html_escape_url(url, safe_mode=self.safe_mode),
                                   _xml_escape_attr(link_text),
                                   title_str,
                                   img_class_str,
                                   self.empty_element_suffix)
                            if "smarty-pants" in self.extras:
                                result = result.replace('"', self._escape_table['"'])
                            curr_pos = start_idx + len(result)
                            text = text[:start_idx] + result + text[match.end():]
                        elif start_idx >= anchor_allowed_pos:
                            if self.safe_mode and not self._safe_protocols.match(url):
                                result_head = '<a href="#"%s>' % (title_str)
                            else:
                                result_head = '<a href="%s"%s>' % (_html_escape_url(url, safe_mode=self.safe_mode), title_str)
                            result = '%s%s</a>' % (result_head, link_text)
                            if "smarty-pants" in self.extras:
                                result = result.replace('"', self._escape_table['"'])
                            # <img> allowed from curr_pos on, <a> from
                            # anchor_allowed_pos on.
                            curr_pos = start_idx + len(result_head)
                            anchor_allowed_pos = start_idx + len(result)
                            text = text[:start_idx] + result + text[match.end():]
                        else:
                            # Anchor not allowed here.
                            curr_pos = start_idx + 1
                    else:
                        # This id isn't defined, leave the markup alone.
                        curr_pos = match.end()
                    continue

            # Otherwise, it isn't markup.
            curr_pos = start_idx + 1

        return text

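    # A few illustrative conversions from _do_links (not part of the
    # original source; output shown for default options):
    #
    #   [text](/url "title")   -> <a href="/url" title="title">text</a>
    #   ![alt](/img.png)       -> <img src="/img.png" alt="alt" />
    #   [text][id]             -> <a href="...">text</a>, with the URL and
    #                             title looked up in self.urls/self.titles
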
    def header_id_from_text(self, text, prefix, n):
        """Generate a header id attribute value from the given header
        HTML content.

        This is only called if the "header-ids" extra is enabled.
        Subclasses may override this for different header ids.

        @param text {str} The text of the header tag
        @param prefix {str} The requested prefix for header ids. This is the
            value of the "header-ids" extra key, if any. Otherwise, None.
        @param n {int} The <hN> tag number, i.e. `1` for an <h1> tag.
        @returns {str} The value for the header tag's "id" attribute. Return
            None to not have an id attribute and to exclude this header from
            the TOC (if the "toc" extra is specified).
        """
        header_id = _slugify(text)
        if prefix and isinstance(prefix, base_string_type):
            header_id = prefix + '-' + header_id
        if header_id in self._count_from_header_id:
            self._count_from_header_id[header_id] += 1
            header_id += '-%s' % self._count_from_header_id[header_id]
        else:
            self._count_from_header_id[header_id] = 1
            if 0 == len(header_id):
                header_id += '-%s' % self._count_from_header_id[header_id]

        return header_id

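    # Illustrative sketch (not in the original source): with the
    # "header-ids" extra on, repeated header text gets de-duplicated ids:
    #
    #   "My Header" -> "my-header"
    #   "My Header" -> "my-header-2"    (second occurrence)
    #   "My Header" -> "my-header-3"    (third occurrence)
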
    _toc = None
    def _toc_add_entry(self, level, id, name):
        if self._toc is None:
            self._toc = []
        self._toc.append((level, id, self._unescape_special_chars(name)))

    _h_re_base = r'''
        (^(.+)[ \t]*\n(=+|-+)[ \t]*\n+)
        |
        (^(\#{1,6})     # \1 = string of #'s
        [ \t]%s
        (.+?)           # \2 = Header text
        [ \t]*
        (?<!\\)         # ensure not an escaped trailing '#'
        \#*             # optional closing #'s (not counted)
        \n+
        )
        '''

    _h_re = re.compile(_h_re_base % '*', re.X | re.M)
    _h_re_tag_friendly = re.compile(_h_re_base % '+', re.X | re.M)

    def _h_sub(self, match):
        if match.group(1) is not None:
            # Setext header
            n = {"=": 1, "-": 2}[match.group(3)[0]]
            header_group = match.group(2)
        else:
            # atx header
            n = len(match.group(5))
            header_group = match.group(6)

        demote_headers = self.extras.get("demote-headers")
        if demote_headers:
            n = min(n + demote_headers, 6)
        header_id_attr = ""
        # Guard: `header_id` must be bound even when the "header-ids" extra
        # is off, since the "toc" check below refers to it.
        header_id = None
        if "header-ids" in self.extras:
            header_id = self.header_id_from_text(header_group,
                self.extras["header-ids"], n)
            if header_id:
                header_id_attr = ' id="%s"' % header_id
        html = self._run_span_gamut(header_group)
        if "toc" in self.extras and header_id:
            self._toc_add_entry(n, header_id, html)
        return "<h%d%s>%s</h%d>\n\n" % (n, header_id_attr, html, n)

    def _do_headers(self, text):
        # Setext-style headers:
        #     Header 1
        #     ========
        #
        #     Header 2
        #     --------

        # atx-style headers:
        #   # Header 1
        #   ## Header 2
        #   ## Header 2 with closing hashes ##
        #   ...
        #   ###### Header 6

        if 'tag-friendly' in self.extras:
            return self._h_re_tag_friendly.sub(self._h_sub, text)
        return self._h_re.sub(self._h_sub, text)

    _marker_ul_chars = '*+-'
    _marker_any = r'(?:[%s]|\d+\.)' % _marker_ul_chars
    _marker_ul = '(?:[%s])' % _marker_ul_chars
    _marker_ol = r'(?:\d+\.)'

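    # Illustrative conversions (not in the original source):
    #
    #   "Header 1\n========\n"  -> <h1>Header 1</h1>
    #   "## Header 2\n"         -> <h2>Header 2</h2>
    #   With the "header-ids" extra: <h2 id="header-2">Header 2</h2>
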
    def _list_sub(self, match):
        lst = match.group(1)
        lst_type = match.group(3) in self._marker_ul_chars and "ul" or "ol"
        result = self._process_list_items(lst)
        if self.list_level:
            return "<%s>\n%s</%s>\n" % (lst_type, result, lst_type)
        else:
            return "<%s>\n%s</%s>\n\n" % (lst_type, result, lst_type)

    def _do_lists(self, text):
        # Form HTML ordered (numbered) and unordered (bulleted) lists.

        # Iterate over each *non-overlapping* list match.
        pos = 0
        while True:
            # Find the *first* hit for either list style (ul or ol). We
            # match ul and ol separately to avoid adjacent lists of different
            # types running into each other (see issue #16).
            hits = []
            for marker_pat in (self._marker_ul, self._marker_ol):
                less_than_tab = self.tab_width - 1
                whole_list = r'''
                    (                   # \1 = whole list
                      (                 # \2
                        [ ]{0,%d}
                        (%s)            # \3 = first list item marker
                        [ \t]+
                        (?!\ *\3\ )     # '- - - ...' isn't a list. See 'not_quite_a_list' test case.
                      )
                      (?:.+?)
                      (                 # \4
                          \Z
                        |
                          \n{2,}
                          (?=\S)
                          (?!           # Negative lookahead for another list item marker
                            [ \t]*
                            %s[ \t]+
                          )
                      )
                    )
                ''' % (less_than_tab, marker_pat, marker_pat)
                if self.list_level:  # sub-list
                    list_re = re.compile("^"+whole_list, re.X | re.M | re.S)
                else:
                    list_re = re.compile(r"(?:(?<=\n\n)|\A\n?)"+whole_list,
                                         re.X | re.M | re.S)
                match = list_re.search(text, pos)
                if match:
                    hits.append((match.start(), match))
            if not hits:
                break
            hits.sort()
            match = hits[0][1]
            start, end = match.span()
            middle = self._list_sub(match)
            text = text[:start] + middle + text[end:]
            pos = start + len(middle)  # start pos for next attempted match

        return text

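    # Illustrative conversion (not in the original source):
    #
    #   "- a\n- b\n"  ->  <ul>
    #                     <li>a</li>
    #                     <li>b</li>
    #                     </ul>
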
    _list_item_re = re.compile(r'''
        (\n)?                   # leading line = \1
        (^[ \t]*)               # leading whitespace = \2
        (?P<marker>%s) [ \t]+   # list marker = \3
        ((?:.+?)                # list item text = \4
         (\n{1,2}))             # eols = \5
        (?= \n* (\Z | \2 (?P<next_marker>%s) [ \t]+))
        ''' % (_marker_any, _marker_any),
        re.M | re.X | re.S)

    _task_list_item_re = re.compile(r'''
        (\[[\ x]\])[ \t]+       # tasklist marker = \1
        (.*)                    # list item text = \2
        ''', re.M | re.X | re.S)

    _task_list_wrapper_str = r'<input type="checkbox" class="task-list-item-checkbox" %sdisabled> %s'

    def _task_list_item_sub(self, match):
        marker = match.group(1)
        item_text = match.group(2)
        if marker == '[x]':
            return self._task_list_wrapper_str % ('checked ', item_text)
        elif marker == '[ ]':
            return self._task_list_wrapper_str % ('', item_text)

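    # Illustrative output (not in the original source; "task_list" extra on):
    #
    #   "[x] done" -> <input type="checkbox" class="task-list-item-checkbox"
    #                 checked disabled> done
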
    _last_li_endswith_two_eols = False
    def _list_item_sub(self, match):
        item = match.group(4)
        leading_line = match.group(1)
        if leading_line or "\n\n" in item or self._last_li_endswith_two_eols:
            item = self._run_block_gamut(self._outdent(item))
        else:
            # Recursion for sub-lists:
            item = self._do_lists(self._outdent(item))
            if item.endswith('\n'):
                item = item[:-1]
            item = self._run_span_gamut(item)
        self._last_li_endswith_two_eols = (len(match.group(5)) == 2)

        if "task_list" in self.extras:
            item = self._task_list_item_re.sub(self._task_list_item_sub, item)

        return "<li>%s</li>\n" % item

    def _process_list_items(self, list_str):
        # Process the contents of a single ordered or unordered list,
        # splitting it into individual list items.

        # The $g_list_level global keeps track of when we're inside a list.
        # Each time we enter a list, we increment it; when we leave a list,
        # we decrement. If it's zero, we're not in a list anymore.
        #
        # We do this because when we're not inside a list, we want to treat
        # something like this:
        #
        #       I recommend upgrading to version
        #       8. Oops, now this line is treated
        #       as a sub-list.
        #
        # As a single paragraph, despite the fact that the second line starts
        # with a digit-period-space sequence.
        #
        # Whereas when we're inside a list (or sub-list), that line will be
        # treated as the start of a sub-list. What a kludge, huh? This is
        # an aspect of Markdown's syntax that's hard to parse perfectly
        # without resorting to mind-reading. Perhaps the solution is to
        # change the syntax rules such that sub-lists must start with a
        # starting cardinal number; e.g. "1." or "a.".
        self.list_level += 1
        self._last_li_endswith_two_eols = False
        list_str = list_str.rstrip('\n') + '\n'
        list_str = self._list_item_re.sub(self._list_item_sub, list_str)
        self.list_level -= 1
        return list_str

    def _get_pygments_lexer(self, lexer_name):
        try:
            from pygments import lexers, util
        except ImportError:
            return None
        try:
            return lexers.get_lexer_by_name(lexer_name)
        except util.ClassNotFound:
            return None

    def _color_with_pygments(self, codeblock, lexer, **formatter_opts):
        import pygments
        import pygments.formatters

        class HtmlCodeFormatter(pygments.formatters.HtmlFormatter):
            def _wrap_code(self, inner):
                """A function for use in a Pygments Formatter which
                wraps in <code> tags.
                """
                yield 0, "<code>"
                for tup in inner:
                    yield tup
                yield 0, "</code>"

            def wrap(self, source, outfile):
                """Return the source with a code, pre, and div."""
                return self._wrap_div(self._wrap_pre(self._wrap_code(source)))

        formatter_opts.setdefault("cssclass", "codehilite")
        formatter = HtmlCodeFormatter(**formatter_opts)
        return pygments.highlight(codeblock, lexer, formatter)

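    # Rough shape of the Pygments output above (a sketch, not verbatim):
    #
    #   <div class="codehilite"><pre><code>...highlighted spans...
    #   </code></pre></div>
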
    def _code_block_sub(self, match, is_fenced_code_block=False):
        lexer_name = None
        if is_fenced_code_block:
            lexer_name = match.group(1)
            if lexer_name:
                formatter_opts = self.extras['fenced-code-blocks'] or {}
            codeblock = match.group(2)
            codeblock = codeblock[:-1]  # drop one trailing newline
        else:
            codeblock = match.group(1)
            codeblock = self._outdent(codeblock)
            codeblock = self._detab(codeblock)
            codeblock = codeblock.lstrip('\n')  # trim leading newlines
            codeblock = codeblock.rstrip()      # trim trailing whitespace

        # Note: "code-color" extra is DEPRECATED.
        if "code-color" in self.extras and codeblock.startswith(":::"):
            lexer_name, rest = codeblock.split('\n', 1)
            lexer_name = lexer_name[3:].strip()
            codeblock = rest.lstrip("\n")   # Remove lexer declaration line.
            formatter_opts = self.extras['code-color'] or {}

        # Use pygments only if not using the highlightjs-lang extra
        if lexer_name and "highlightjs-lang" not in self.extras:
            def unhash_code(codeblock):
                for key, sanitized in list(self.html_spans.items()):
                    codeblock = codeblock.replace(key, sanitized)
                replacements = [
                    ("&amp;", "&"),
                    ("&lt;", "<"),
                    ("&gt;", ">")
                ]
                for old, new in replacements:
                    codeblock = codeblock.replace(old, new)
                return codeblock
            lexer = self._get_pygments_lexer(lexer_name)
            if lexer:
                codeblock = unhash_code(codeblock)
                colored = self._color_with_pygments(codeblock, lexer,
                                                    **formatter_opts)
                return "\n\n%s\n\n" % colored

        codeblock = self._encode_code(codeblock)
        pre_class_str = self._html_class_str_from_tag("pre")

        if "highlightjs-lang" in self.extras and lexer_name:
            code_class_str = ' class="%s"' % lexer_name
        else:
            code_class_str = self._html_class_str_from_tag("code")

        return "\n\n<pre%s><code%s>%s\n</code></pre>\n\n" % (
            pre_class_str, code_class_str, codeblock)

    def _html_class_str_from_tag(self, tag):
        """Get the appropriate ' class="..."' string (note the leading
        space), if any, for the given tag.
        """
        if "html-classes" not in self.extras:
            return ""
        try:
            html_classes_from_tag = self.extras["html-classes"]
        except TypeError:
            return ""
        else:
            if tag in html_classes_from_tag:
                return ' class="%s"' % html_classes_from_tag[tag]
        return ""

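    # Illustrative use of the "html-classes" extra (not in the original
    # source): passing extras={"html-classes": {"pre": "outer", "code": "inner"}}
    # makes indented code blocks render as
    # <pre class="outer"><code class="inner">...</code></pre>.
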
    def _do_code_blocks(self, text):
        """Process Markdown `<pre><code>` blocks."""
        code_block_re = re.compile(r'''
            (?:\n\n|\A\n?)
            (               # $1 = the code block -- one or more lines, starting with a space/tab
              (?:
                (?:[ ]{%d} | \t)  # Lines must start with a tab or a tab-width of spaces
                .*\n+
              )+
            )
            ((?=^[ ]{0,%d}\S)|\Z)   # Lookahead for non-space at line-start, or end of doc
            # Lookahead to make sure this block isn't already in a code block.
            # Needed when syntax highlighting is being used.
            (?![^<]*\</code\>)
            ''' % (self.tab_width, self.tab_width),
            re.M | re.X)
        return code_block_re.sub(self._code_block_sub, text)

    _fenced_code_block_re = re.compile(r'''
        (?:\n+|\A\n?)
        ^```\s*?([\w+-]+)?\s*?\n    # opening fence, $1 = optional lang
        (.*?)                       # $2 = code block content
        ^```[ \t]*\n                # closing fence
        ''', re.M | re.X | re.S)

    def _fenced_code_block_sub(self, match):
        return self._code_block_sub(match, is_fenced_code_block=True)

    def _do_fenced_code_blocks(self, text):
        """Process ```-fenced unindented code blocks ('fenced-code-blocks' extra)."""
        return self._fenced_code_block_re.sub(self._fenced_code_block_sub, text)

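    # Illustrative sketch (not in the original source): with the
    # "fenced-code-blocks" extra, input like
    #
    #   ```python
    #   print("hi")
    #   ```
    #
    # is colored via Pygments when it is installed; otherwise it falls back
    # to a plain <pre><code>...</code></pre> block.
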
    # Rules for a code span:
    # - backslash escapes are not interpreted in a code span
    # - to include one backtick or a run of backticks, the delimiters must
    #   be a longer run of backticks
    # - cannot start or end a code span with a backtick; pad with a
    #   space and that space will be removed in the emitted HTML
    # See `test/tm-cases/escapes.text` for a number of edge-case
    # examples.
    _code_span_re = re.compile(r'''
            (?<!\\)
            (`+)        # \1 = Opening run of `
            (?!`)       # See Note A test/tm-cases/escapes.text
            (.+?)       # \2 = The code block
            (?<!`)
            \1          # Matching closer
            (?!`)
        ''', re.X | re.S)

    def _code_span_sub(self, match):
        c = match.group(2).strip(" \t")
        c = self._encode_code(c)
        return "<code>%s</code>" % c

    def _do_code_spans(self, text):
        #   *   Backtick quotes are used for <code></code> spans.
        #
        #   *   You can use multiple backticks as the delimiters if you want to
        #       include literal backticks in the code span. So, this input:
        #
        #         Just type ``foo `bar` baz`` at the prompt.
        #
        #       Will translate to:
        #
        #         <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
        #
        #       There's no arbitrary limit to the number of backticks you
        #       can use as delimiters. If you need three consecutive backticks
        #       in your code, use four for delimiters, etc.
        #
        #   *   You can use spaces to get literal backticks at the edges:
        #
        #         ... type `` `bar` `` ...
        #
        #       Turns to:
        #
        #         ... type <code>`bar`</code> ...
        return self._code_span_re.sub(self._code_span_sub, text)

    def _encode_code(self, text):
        """Encode/escape certain characters inside Markdown code runs.
        The point is that in code, these characters are literals,
        and lose their special Markdown meanings.
        """
        replacements = [
            # Encode all ampersands; HTML entities are not
            # entities within a Markdown code span.
            ('&', '&amp;'),
            # Do the angle bracket song and dance:
            ('<', '&lt;'),
            ('>', '&gt;'),
        ]
        for before, after in replacements:
            text = text.replace(before, after)
        hashed = _hash_text(text)
        self._escape_table[text] = hashed
        return hashed

    _strike_re = re.compile(r"~~(?=\S)(.+?)(?<=\S)~~", re.S)
    def _do_strike(self, text):
        text = self._strike_re.sub(r"<strike>\1</strike>", text)
        return text

    _strong_re = re.compile(r"(\*\*|__)(?=\S)(.+?[*_]*)(?<=\S)\1", re.S)
    _em_re = re.compile(r"(\*|_)(?=\S)(.+?)(?<=\S)\1", re.S)
    _code_friendly_strong_re = re.compile(r"\*\*(?=\S)(.+?[*_]*)(?<=\S)\*\*", re.S)
    _code_friendly_em_re = re.compile(r"\*(?=\S)(.+?)(?<=\S)\*", re.S)
    def _do_italics_and_bold(self, text):
        # <strong> must go first:
        if "code-friendly" in self.extras:
            text = self._code_friendly_strong_re.sub(r"<strong>\1</strong>", text)
            text = self._code_friendly_em_re.sub(r"<em>\1</em>", text)
        else:
            text = self._strong_re.sub(r"<strong>\2</strong>", text)
            text = self._em_re.sub(r"<em>\2</em>", text)
        return text

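    # Illustrative conversions (not in the original source):
    #
    #   *foo*    -> <em>foo</em>            _foo_    -> <em>foo</em>
    #   **bar**  -> <strong>bar</strong>    __bar__  -> <strong>bar</strong>
    #
    # With the "code-friendly" extra, the underscore forms are left alone.
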
# "smarty-pants" extra: Very liberal in interpreting a single prime as an |
1922
|
|
|
# apostrophe; e.g. ignores the fact that "round", "bout", "twer", and |
1923
|
|
|
# "twixt" can be written without an initial apostrophe. This is fine because |
1924
|
|
|
# using scare quotes (single quotation marks) is rare. |
1925
|
|
|
_apostrophe_year_re = re.compile(r"'(\d\d)(?=(\s|,|;|\.|\?|!|$))") |
1926
|
|
|
_contractions = ["tis", "twas", "twer", "neath", "o", "n", |
1927
|
|
|
"round", "bout", "twixt", "nuff", "fraid", "sup"] |
1928
|
|
|
def _do_smart_contractions(self, text): |
1929
|
|
|
text = self._apostrophe_year_re.sub(r"’\1", text) |
1930
|
|
|
for c in self._contractions: |
1931
|
|
|
text = text.replace("'%s" % c, "’%s" % c) |
1932
|
|
|
text = text.replace("'%s" % c.capitalize(), |
1933
|
|
|
"’%s" % c.capitalize()) |
1934
|
|
|
return text |
1935
|
|
|
|
1936
|
|
|
# Substitute double-quotes before single-quotes. |
1937
|
|
|
_opening_single_quote_re = re.compile(r"(?<!\S)'(?=\S)") |
1938
|
|
|
_opening_double_quote_re = re.compile(r'(?<!\S)"(?=\S)') |
1939
|
|
|
_closing_single_quote_re = re.compile(r"(?<=\S)'") |
1940
|
|
|
_closing_double_quote_re = re.compile(r'(?<=\S)"(?=(\s|,|;|\.|\?|!|$))') |
1941
|
|
|
def _do_smart_punctuation(self, text): |
1942
|
|
|
"""Fancifies 'single quotes', "double quotes", and apostrophes. |
1943
|
|
|
Converts --, ---, and ... into en dashes, em dashes, and ellipses. |
1944
|
|
|
|
1945
|
|
|
Inspiration is: <http://daringfireball.net/projects/smartypants/> |
1946
|
|
|
See "test/tm-cases/smarty_pants.text" for a full discussion of the |
1947
|
|
|
support here and |
1948
|
|
|
<http://code.google.com/p/python-markdown2/issues/detail?id=42> for a |
1949
|
|
|
discussion of some diversion from the original SmartyPants. |
1950
|
|
|
""" |
1951
|
|
|
if "'" in text: # guard for perf |
1952
|
|
|
text = self._do_smart_contractions(text) |
1953
|
|
|
text = self._opening_single_quote_re.sub("‘", text) |
1954
|
|
|
text = self._closing_single_quote_re.sub("’", text) |
1955
|
|
|
|
1956
|
|
|
if '"' in text: # guard for perf |
1957
|
|
|
text = self._opening_double_quote_re.sub("“", text) |
1958
|
|
|
text = self._closing_double_quote_re.sub("”", text) |
1959
|
|
|
|
1960
|
|
|
text = text.replace("---", "—") |
1961
|
|
|
text = text.replace("--", "–") |
1962
|
|
|
text = text.replace("...", "…") |
1963
|
|
|
text = text.replace(" . . . ", "…") |
1964
|
|
|
text = text.replace(". . .", "…") |
1965
|
|
|
return text |
1966
|
|
|
|
1967
|
|
|
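    # Illustrative conversion (not in the original source):
    #
    #   "He said 'hi' -- bye..."
    #     -> "He said &#8216;hi&#8217; &#8211; bye&#8230;"
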
    _block_quote_base = r'''
        (                           # Wrap whole match in \1
          (
            ^[ \t]*>%s[ \t]?        # '>' at the start of a line
              .+\n                  # rest of the first line
            (.+\n)*                 # subsequent consecutive lines
            \n*                     # blanks
          )+
        )
    '''
    _block_quote_re = re.compile(_block_quote_base % '', re.M | re.X)
    _block_quote_re_spoiler = re.compile(_block_quote_base % '[ \t]*?!?', re.M | re.X)
    _bq_one_level_re = re.compile('^[ \t]*>[ \t]?', re.M)
    _bq_one_level_re_spoiler = re.compile('^[ \t]*>[ \t]*?![ \t]?', re.M)
    _bq_all_lines_spoilers = re.compile(r'\A(?:^[ \t]*>[ \t]*?!.*[\n\r]*)+\Z', re.M)
    _html_pre_block_re = re.compile(r'(\s*<pre>.+?</pre>)', re.S)
    def _dedent_two_spaces_sub(self, match):
        return re.sub(r'(?m)^  ', '', match.group(1))

    def _block_quote_sub(self, match):
        bq = match.group(1)
        is_spoiler = 'spoiler' in self.extras and self._bq_all_lines_spoilers.match(bq)
        # trim one level of quoting
        if is_spoiler:
            bq = self._bq_one_level_re_spoiler.sub('', bq)
        else:
            bq = self._bq_one_level_re.sub('', bq)
        # trim whitespace-only lines
        bq = self._ws_only_line_re.sub('', bq)
        bq = self._run_block_gamut(bq)  # recurse

        bq = re.sub('(?m)^', '  ', bq)
        # These leading spaces screw with <pre> content, so we need to fix that:
        bq = self._html_pre_block_re.sub(self._dedent_two_spaces_sub, bq)

        if is_spoiler:
            return '<blockquote class="spoiler">\n%s\n</blockquote>\n\n' % bq
        else:
            return '<blockquote>\n%s\n</blockquote>\n\n' % bq

    def _do_block_quotes(self, text):
        if '>' not in text:
            return text
        if 'spoiler' in self.extras:
            return self._block_quote_re_spoiler.sub(self._block_quote_sub, text)
        else:
            return self._block_quote_re.sub(self._block_quote_sub, text)

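    # Illustrative conversions (not in the original source; output roughly):
    #
    #   "> quoted"   -> <blockquote>
    #                     <p>quoted</p>
    #                   </blockquote>
    #   ">! secret"  -> <blockquote class="spoiler">...</blockquote>
    #                   (only with the "spoiler" extra enabled)
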
    def _form_paragraphs(self, text):
        # Strip leading and trailing lines:
        text = text.strip('\n')

        # Wrap <p> tags.
        grafs = []
        for i, graf in enumerate(re.split(r"\n{2,}", text)):
            if graf in self.html_blocks:
                # Unhashify HTML blocks
                grafs.append(self.html_blocks[graf])
            else:
                cuddled_list = None
                if "cuddled-lists" in self.extras:
                    # Need to put back trailing '\n' for `_list_item_re`
                    # match at the end of the paragraph.
                    li = self._list_item_re.search(graf + '\n')
                    # Two of the same list marker in this paragraph: a likely
                    # candidate for a list cuddled to preceding paragraph
                    # text (issue 33). Note the `[-1]` is a quick way to
                    # consider numeric bullets (e.g. "1." and "2.") to be
                    # equal.
                    if (li and len(li.group(2)) <= 3 and li.group("next_marker")
                            and li.group("marker")[-1] == li.group("next_marker")[-1]):
                        start = li.start()
                        cuddled_list = self._do_lists(graf[start:]).rstrip("\n")
                        assert cuddled_list.startswith("<ul>") or cuddled_list.startswith("<ol>")
                        graf = graf[:start]

                # Wrap <p> tags.
                graf = self._run_span_gamut(graf)
                grafs.append("<p>" + graf.lstrip(" \t") + "</p>")

                if cuddled_list:
                    grafs.append(cuddled_list)

        return "\n\n".join(grafs)

    def _add_footnotes(self, text):
        if self.footnotes:
            footer = [
                '<div class="footnotes">',
                '<hr' + self.empty_element_suffix,
                '<ol>',
            ]

            if not self.footnote_title:
                self.footnote_title = "Jump back to footnote %d in the text."
            if not self.footnote_return_symbol:
                self.footnote_return_symbol = "&#8617;"

            for i, id in enumerate(self.footnote_ids):
                if i != 0:
                    footer.append('')
                footer.append('<li id="fn-%s">' % id)
                footer.append(self._run_block_gamut(self.footnotes[id]))
                try:
                    backlink = ('<a href="#fnref-%s" ' +
                            'class="footnoteBackLink" ' +
                            'title="' + self.footnote_title + '">' +
                            self.footnote_return_symbol +
                            '</a>') % (id, i+1)
                except TypeError:
                    log.debug("Footnote error. `footnote_title` "
                              "must include parameter. Using defaults.")
                    backlink = ('<a href="#fnref-%s" '
                        'class="footnoteBackLink" '
                        'title="Jump back to footnote %d in the text.">'
                        '&#8617;</a>' % (id, i+1))

                if footer[-1].endswith("</p>"):
                    footer[-1] = footer[-1][:-len("</p>")] \
                        + '&#160;' + backlink + "</p>"
                else:
                    footer.append("\n<p>%s</p>" % backlink)
                footer.append('</li>')
            footer.append('</ol>')
            footer.append('</div>')
            return text + '\n\n' + '\n'.join(footer)
        else:
            return text

    # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
    #   http://bumppo.net/projects/amputator/
    _ampersand_re = re.compile(r'&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)')
    _naked_lt_re = re.compile(r'<(?![a-z/?\$!])', re.I)
    _naked_gt_re = re.compile(r'''(?<![a-z0-9?!/'"-])>''', re.I)

    def _encode_amps_and_angles(self, text):
        # Smart processing for ampersands and angle brackets that need
        # to be encoded.
        text = self._ampersand_re.sub('&amp;', text)

        # Encode naked <'s
        text = self._naked_lt_re.sub('&lt;', text)

        # Encode naked >'s
        # Note: Other markdown implementations (e.g. Markdown.pl, PHP
        # Markdown) don't do this.
        text = self._naked_gt_re.sub('&gt;', text)
        return text

    def _encode_backslash_escapes(self, text):
        for ch, escape in list(self._escape_table.items()):
            text = text.replace("\\"+ch, escape)
        return text

    _auto_link_re = re.compile(r'<((https?|ftp):[^\'">\s]+)>', re.I)
    def _auto_link_sub(self, match):
        g1 = match.group(1)
        return '<a href="%s">%s</a>' % (g1, g1)

    _auto_email_link_re = re.compile(r"""
          <
           (?:mailto:)?
          (
              [-.\w]+
              \@
              [-\w]+(\.[-\w]+)*\.[a-z]+
          )
          >
        """, re.I | re.X | re.U)
    def _auto_email_link_sub(self, match):
        return self._encode_email_address(
            self._unescape_special_chars(match.group(1)))

    def _do_auto_links(self, text):
        text = self._auto_link_re.sub(self._auto_link_sub, text)
        text = self._auto_email_link_re.sub(self._auto_email_link_sub, text)
        return text

    def _encode_email_address(self, addr):
        #  Input: an email address, e.g. "foo@example.com"
        #
        #  Output: the email address as a mailto link, with each character
        #      of the address encoded as either a decimal or hex entity, in
        #      the hopes of foiling most address harvesting spam bots. E.g.:
        #
        #    <a href="&#109;&#97;&#105;&#108;&#116;&#111;:&#102;&#111;&#111;&#64;&#101;
        #       &#120;&#97;&#109;&#112;&#108;&#101;.&#99;&#111;&#109;">&#102;&#111;&#111;
        #       &#64;&#101;&#120;&#97;&#109;&#112;&#108;&#101;.&#99;&#111;&#109;</a>
        #
        #  Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
        #  mailing list: <http://tinyurl.com/yu7ue>
        chars = [_xml_encode_email_char_at_random(ch)
                 for ch in "mailto:" + addr]
        # Strip the mailto: from the visible part.
        addr = '<a href="%s">%s</a>' \
               % (''.join(chars), ''.join(chars[7:]))
        return addr

    def _do_link_patterns(self, text):
        """Caveat emptor: there isn't much guarding against link
        patterns being formed inside other standard Markdown links, e.g.
        inside a [link def][like this].

        Dev Notes: *Could* consider prefixing regexes with a negative
        lookbehind assertion to attempt to guard against this.
        """
        link_from_hash = {}
        for regex, repl in self.link_patterns:
            replacements = []
            for match in regex.finditer(text):
                if hasattr(repl, "__call__"):
                    href = repl(match)
                else:
                    href = match.expand(repl)
                replacements.append((match.span(), href))
            for (start, end), href in reversed(replacements):
                escaped_href = (
                    href.replace('"', '&quot;')  # b/c of attr quote
                        # To avoid markdown <em> and <strong>:
                        .replace('*', self._escape_table['*'])
                        .replace('_', self._escape_table['_']))
                link = '<a href="%s">%s</a>' % (escaped_href, text[start:end])
                hash = _hash_text(link)
                link_from_hash[hash] = link
                text = text[:start] + hash + text[end:]
        for hash, link in list(link_from_hash.items()):
            text = text.replace(hash, link)
        return text

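    # Illustrative use of the "link-patterns" extra (not in the original
    # source; the URL is a made-up example):
    #
    #   link_patterns = [
    #       (re.compile(r'\bissue #?(\d+)\b', re.I),
    #        r'https://example.com/issues/\1'),
    #   ]
    #   Markdown(extras=["link-patterns"], link_patterns=link_patterns)
    #
    # would turn "issue 42" into
    # <a href="https://example.com/issues/42">issue 42</a>.
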
    def _unescape_special_chars(self, text):
        # Swap back in all the special characters we've hidden.
        for ch, hash in list(self._escape_table.items()):
            text = text.replace(hash, ch)
        return text

    def _outdent(self, text):
        # Remove one level of line-leading tabs or spaces
        return self._outdent_re.sub('', text)


class MarkdownWithExtras(Markdown):
    """A markdowner class that enables most extras:

    - footnotes
    - code-color (only has effect if 'pygments' Python module on path)

    These are not included:
    - pyshell (specific to Python-related documenting)
    - code-friendly (because it *disables* part of the syntax)
    - link-patterns (because you need to specify some actual
      link-patterns anyway)
    """
    extras = ["footnotes", "code-color"]


# ---- internal support functions

class UnicodeWithAttrs(unicode):
    """A subclass of unicode used for the return value of conversion to
    possibly attach some attributes. E.g. the "toc_html" attribute when
    the "toc" extra is used.
    """
    metadata = None
    _toc = None
    def toc_html(self):
        """Return the HTML for the current TOC.

        This expects the `_toc` attribute to have been set on this instance.
        """
        if self._toc is None:
            return None

        def indent():
            return '  ' * (len(h_stack) - 1)
        lines = []
        h_stack = [0]  # stack of header-level numbers
        for level, id, name in self._toc:
            if level > h_stack[-1]:
                lines.append("%s<ul>" % indent())
                h_stack.append(level)
            elif level == h_stack[-1]:
                lines[-1] += "</li>"
            else:
                while level < h_stack[-1]:
                    h_stack.pop()
                    if not lines[-1].endswith("</li>"):
                        lines[-1] += "</li>"
                    lines.append("%s</ul></li>" % indent())
            lines.append('%s<li><a href="#%s">%s</a>' % (
                indent(), id, name))
        while len(h_stack) > 1:
            h_stack.pop()
            if not lines[-1].endswith("</li>"):
                lines[-1] += "</li>"
            lines.append("%s</ul>" % indent())
        return '\n'.join(lines) + '\n'
    toc_html = property(toc_html)

## {{{ http://code.activestate.com/recipes/577257/ (r1)
_slugify_strip_re = re.compile(r'[^\w\s-]')
_slugify_hyphenate_re = re.compile(r'[-\s]+')
def _slugify(value):
    """
    Normalizes string, converts to lowercase, removes non-alpha characters,
    and converts spaces to hyphens.

    From Django's "django/template/defaultfilters.py".
    """
    import unicodedata
    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode()
    value = _slugify_strip_re.sub('', value).strip().lower()
    return _slugify_hyphenate_re.sub('-', value)
## end of http://code.activestate.com/recipes/577257/ }}}

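# Illustrative doctest-style example for _slugify (not in the original
# source):
#
#   >>> _slugify(u"Hello, World!")
#   'hello-world'
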
# From http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52549
def _curry(*args, **kwargs):
    function, args = args[0], args[1:]
    def result(*rest, **kwrest):
        combined = kwargs.copy()
        combined.update(kwrest)
        return function(*args + rest, **combined)
    return result


# Recipe: regex_from_encoded_pattern (1.0)
def _regex_from_encoded_pattern(s):
    """'foo'    -> re.compile(re.escape('foo'))
       '/foo/'  -> re.compile('foo')
       '/foo/i' -> re.compile('foo', re.I)
    """
    if s.startswith('/') and s.rfind('/') != 0:
        # Parse it: /PATTERN/FLAGS
        idx = s.rfind('/')
        pattern, flags_str = s[1:idx], s[idx+1:]
        flag_from_char = {
            "i": re.IGNORECASE,
            "l": re.LOCALE,
            "s": re.DOTALL,
            "m": re.MULTILINE,
            "u": re.UNICODE,
        }
        flags = 0
        for char in flags_str:
            try:
                flags |= flag_from_char[char]
            except KeyError:
                raise ValueError("unsupported regex flag: '%s' in '%s' "
                                 "(must be one of '%s')"
                                 % (char, s, ''.join(list(flag_from_char.keys()))))
        return re.compile(s[1:idx], flags)
    else:  # not an encoded regex
        return re.compile(re.escape(s))


# Recipe: dedent (0.1.2)
def _dedentlines(lines, tabsize=8, skip_first_line=False):
    """_dedentlines(lines, tabsize=8, skip_first_line=False) -> dedented lines

        "lines" is a list of lines to dedent.
        "tabsize" is the tab width to use for indent width calculations.
        "skip_first_line" is a boolean indicating if the first line should
            be skipped for calculating the indent width and for dedenting.
            This is sometimes useful for docstrings and similar.

    Same as dedent() except operates on a sequence of lines. Note: the
    lines list is modified **in-place**.
    """
    DEBUG = False
    if DEBUG:
        print("dedent: dedent(..., tabsize=%d, skip_first_line=%r)"\
              % (tabsize, skip_first_line))
    margin = None
    for i, line in enumerate(lines):
        if i == 0 and skip_first_line: continue
        indent = 0
        for ch in line:
            if ch == ' ':
                indent += 1
            elif ch == '\t':
                indent += tabsize - (indent % tabsize)
            elif ch in '\r\n':
                continue  # skip all-whitespace lines
            else:
                break
        else:
            continue  # skip all-whitespace lines
        if DEBUG: print("dedent: indent=%d: %r" % (indent, line))
        if margin is None:
            margin = indent
        else:
            margin = min(margin, indent)
    if DEBUG: print("dedent: margin=%r" % margin)

    if margin is not None and margin > 0:
        for i, line in enumerate(lines):
            if i == 0 and skip_first_line: continue
            removed = 0
            for j, ch in enumerate(line):
                if ch == ' ':
                    removed += 1
                elif ch == '\t':
                    removed += tabsize - (removed % tabsize)
                elif ch in '\r\n':
                    if DEBUG: print("dedent: %r: EOL -> strip up to EOL" % line)
                    lines[i] = lines[i][j:]
                    break
                else:
                    raise ValueError("unexpected non-whitespace char %r in "
                                     "line %r while removing %d-space margin"
                                     % (ch, line, margin))
                if DEBUG:
                    print("dedent: %r: %r -> removed %d/%d"\
                          % (line, ch, removed, margin))
                if removed == margin:
                    lines[i] = lines[i][j+1:]
                    break
                elif removed > margin:
                    lines[i] = ' '*(removed-margin) + lines[i][j+1:]
                    break
            else:
                if removed:
                    lines[i] = lines[i][removed:]
    return lines


def _dedent(text, tabsize=8, skip_first_line=False):
    """_dedent(text, tabsize=8, skip_first_line=False) -> dedented text

        "text" is the text to dedent.
        "tabsize" is the tab width to use for indent width calculations.
        "skip_first_line" is a boolean indicating if the first line should
            be skipped for calculating the indent width and for dedenting.
            This is sometimes useful for docstrings and similar.

    textwrap.dedent(s), but don't expand tabs to spaces
    """
    lines = text.splitlines(1)
    _dedentlines(lines, tabsize=tabsize, skip_first_line=skip_first_line)
    return ''.join(lines)

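# Illustrative example for _dedent (not in the original source):
#
#   >>> _dedent("    foo\n      bar\n")
#   'foo\n  bar\n'
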
class _memoized(object):
    """Decorator that caches a function's return value each time it is called.
    If called later with the same arguments, the cached value is returned, and
    not re-evaluated.

    http://wiki.python.org/moin/PythonDecoratorLibrary
    """
    def __init__(self, func):
        self.func = func
        self.cache = {}

    def __call__(self, *args):
        try:
            return self.cache[args]
        except KeyError:
            self.cache[args] = value = self.func(*args)
            return value
        except TypeError:
            # uncachable -- for instance, passing a list as an argument.
            # Better to not cache than to blow up entirely.
            return self.func(*args)

    def __repr__(self):
        """Return the function's docstring."""
        return self.func.__doc__


def _xml_oneliner_re_from_tab_width(tab_width):
    """Standalone XML processing instruction regex."""
    return re.compile(r"""
        (?:
            (?<=\n\n)       # Starting after a blank line
            |               # or
            \A\n?           # the beginning of the doc
        )
        (                           # save in $1
            [ ]{0,%d}
            (?:
                <\?\w+\b\s+.*?\?>   # XML processing instruction
                |
                <\w+:\w+\b\s+.*?/>  # namespaced single tag
            )
            [ \t]*
            (?=\n{2,}|\Z)           # followed by a blank line or end of document
        )
        """ % (tab_width - 1), re.X)
_xml_oneliner_re_from_tab_width = _memoized(_xml_oneliner_re_from_tab_width)


def _hr_tag_re_from_tab_width(tab_width):
    return re.compile(r"""
        (?:
            (?<=\n\n)       # Starting after a blank line
            |               # or
            \A\n?           # the beginning of the doc
        )
        (                       # save in \1
            [ ]{0,%d}
            <(hr)               # start tag = \2
            \b                  # word break
            ([^<>])*?
            /?>                 # the matching end tag
            [ \t]*
            (?=\n{2,}|\Z)       # followed by a blank line or end of document
        )
        """ % (tab_width - 1), re.X)
_hr_tag_re_from_tab_width = _memoized(_hr_tag_re_from_tab_width)


def _xml_escape_attr(attr, skip_single_quote=True):
    """Escape the given string for use in an HTML/XML tag attribute.

    By default this doesn't bother with escaping `'` to `&#39;`, presuming that
    the tag attribute is surrounded by double quotes.
    """
    escaped = (attr
        .replace('&', '&amp;')
        .replace('"', '&quot;')
        .replace('<', '&lt;')
        .replace('>', '&gt;'))
    if not skip_single_quote:
        escaped = escaped.replace("'", "&#39;")
    return escaped


def _xml_encode_email_char_at_random(ch):
    r = random()
    # Roughly 10% raw, 45% hex, 45% dec.
    # '@' *must* be encoded. I [John Gruber] insist.
    # Issue 26: '_' must be encoded.
    if r > 0.9 and ch not in "@_":
        return ch
    elif r < 0.45:
        # The [1:] is to drop leading '0': 0x63 -> x63
        return '&#%s;' % hex(ord(ch))[1:]
    else:
        return '&#%s;' % ord(ch)


def _html_escape_url(attr, safe_mode=False):
    """Replace special characters that are potentially malicious in url string."""
    escaped = (attr
        .replace('"', '&quot;')
        .replace('<', '&lt;')
        .replace('>', '&gt;'))
    if safe_mode:
        escaped = escaped.replace('+', ' ')
        escaped = escaped.replace("'", "&#39;")
    return escaped


# ---- mainline

class _NoReflowFormatter(optparse.IndentedHelpFormatter):
    """An optparse formatter that does NOT reflow the description."""
    def format_description(self, description):
        return description or ""


def _test():
    import doctest
    doctest.testmod()


def main(argv=None):
    if argv is None:
        argv = sys.argv
    if not logging.root.handlers:
        logging.basicConfig()

    usage = "usage: %prog [PATHS...]"
    version = "%prog "+__version__
    parser = optparse.OptionParser(prog="markdown2", usage=usage,
        version=version, description=cmdln_desc,
        formatter=_NoReflowFormatter())
    parser.add_option("-v", "--verbose", dest="log_level",
                      action="store_const", const=logging.DEBUG,
                      help="more verbose output")
    parser.add_option("--encoding",
                      help="specify encoding of text content")
    parser.add_option("--html4tags", action="store_true", default=False,
                      help="use HTML 4 style for empty element tags")
    parser.add_option("-s", "--safe", metavar="MODE", dest="safe_mode",
                      help="sanitize literal HTML: 'escape' escapes "
                           "HTML meta chars, 'replace' replaces with an "
                           "[HTML_REMOVED] note")
    parser.add_option("-x", "--extras", action="append",
                      help="Turn on specific extra features (not part of "
                           "the core Markdown spec). See above.")
    parser.add_option("--use-file-vars",
                      help="Look for and use Emacs-style 'markdown-extras' "
                           "file var to turn on extras. See "
                           "<https://github.com/trentm/python-markdown2/wiki/Extras>")
    parser.add_option("--link-patterns-file",
                      help="path to a link pattern file")
    parser.add_option("--self-test", action="store_true",
                      help="run internal self-tests (some doctests)")
    parser.add_option("--compare", action="store_true",
                      help="run against Markdown.pl as well (for testing)")
    parser.set_defaults(log_level=logging.INFO, compare=False,
                        encoding="utf-8", safe_mode=None, use_file_vars=False)
    opts, paths = parser.parse_args()
    log.setLevel(opts.log_level)

    if opts.self_test:
        return _test()

    if opts.extras:
        extras = {}
        for s in opts.extras:
            splitter = re.compile("[,;: ]+")
            for e in splitter.split(s):
                if '=' in e:
                    ename, earg = e.split('=', 1)
                    try:
                        earg = int(earg)
                    except ValueError:
                        pass
                else:
                    ename, earg = e, None
                extras[ename] = earg
    else:
        extras = None

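    # Illustrative examples of the parsing above (not in the original source):
    #   -x footnotes,cuddled-lists    -> {"footnotes": None, "cuddled-lists": None}
    #   -x header-ids=prefix          -> {"header-ids": "prefix"}
    #   -x demote-headers=2           -> {"demote-headers": 2}   (int-converted)
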
    if opts.link_patterns_file:
        link_patterns = []
        f = open(opts.link_patterns_file)
        try:
            for i, line in enumerate(f.readlines()):
                if not line.strip(): continue
                if line.lstrip().startswith("#"): continue
                try:
                    pat, href = line.rstrip().rsplit(None, 1)
                except ValueError:
                    raise MarkdownError("%s:%d: invalid link pattern line: %r"
                                        % (opts.link_patterns_file, i+1, line))
                link_patterns.append(
                    (_regex_from_encoded_pattern(pat), href))
        finally:
            f.close()
    else:
        link_patterns = None

    from os.path import join, dirname, abspath, exists
    markdown_pl = join(dirname(dirname(abspath(__file__))), "test",
                       "Markdown.pl")
    if not paths:
        paths = ['-']
    for path in paths:
        if path == '-':
            text = sys.stdin.read()
        else:
            fp = codecs.open(path, 'r', opts.encoding)
            text = fp.read()
            fp.close()
        if opts.compare:
            from subprocess import Popen, PIPE
            print("==== Markdown.pl ====")
            p = Popen('perl %s' % markdown_pl, shell=True, stdin=PIPE, stdout=PIPE, close_fds=True)
            p.stdin.write(text.encode('utf-8'))
            p.stdin.close()
            perl_html = p.stdout.read().decode('utf-8')
            if py3:
                sys.stdout.write(perl_html)
            else:
                sys.stdout.write(perl_html.encode(
                    sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))
            print("==== markdown2.py ====")
        html = markdown(text,
            html4tags=opts.html4tags,
            safe_mode=opts.safe_mode,
            extras=extras, link_patterns=link_patterns,
            use_file_vars=opts.use_file_vars)
        if py3:
            sys.stdout.write(html)
        else:
            sys.stdout.write(html.encode(
                sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))
        if extras and "toc" in extras:
            log.debug("toc_html: " +
                str(html.toc_html.encode(sys.stdout.encoding or "utf-8", 'xmlcharrefreplace')))
        if opts.compare:
            test_dir = join(dirname(dirname(abspath(__file__))), "test")
            if exists(join(test_dir, "test_markdown2.py")):
                sys.path.insert(0, test_dir)
                from test_markdown2 import norm_html_from_html
                norm_html = norm_html_from_html(html)
                norm_perl_html = norm_html_from_html(perl_html)
            else:
                norm_html = html
                norm_perl_html = perl_html
            print("==== match? %r ====" % (norm_perl_html == norm_html))


if __name__ == "__main__":
    sys.exit(main(sys.argv))