|
1
|
|
|
from __future__ import absolute_import, unicode_literals |
|
2
|
|
|
|
|
3
|
|
|
import re |
|
4
|
|
|
import sys |
|
5
|
|
|
from CommonMark import common |
|
6
|
|
|
from CommonMark.common import normalize_uri, unescape_string |
|
7
|
|
|
from CommonMark.node import Node |
|
8
|
|
|
|
|
9
|
|
|
# Select an HTML-entity unescape implementation for the running interpreter:
#   * Python 3.4+ : html.parser.HTMLParser().unescape
#   * Python 3.0-3.3 and Python 2 : the bundled entitytrans fallback
if sys.version_info >= (3, 0):
    if sys.version_info >= (3, 4):
        import html.parser
        # NOTE(review): HTMLParser.unescape is deprecated and removed in
        # Python 3.9; html.unescape() is the modern equivalent — confirm
        # before bumping supported versions.
        HTMLunescape = html.parser.HTMLParser().unescape
    else:
        from .entitytrans import _unescape
        HTMLunescape = _unescape
else:
    from CommonMark import entitytrans
    HTMLunescape = entitytrans._unescape
|
19
|
|
|
|
|
20
|
|
|
# Some regexps used in inline parser:

# A backslash followed by any escapable character.
ESCAPED_CHAR = '\\\\' + common.ESCAPABLE

# Unicode general punctuation plus ASCII punctuation, anchored at start.
rePunctuation = re.compile(
    r'^[\u2000-\u206F\u2E00-\u2E7F\\' + "'" + '!"#\$%&\(\)'
    r'\*\+,\-\.\/:;<=>\?@\[\]\^_`\{\|\}~]')

# Link title in double quotes, single quotes, or parentheses.
reLinkTitle = re.compile(
    '^(?:"(' + ESCAPED_CHAR + '|[^"\\x00])*"' +
    '|' +
    '\'(' + ESCAPED_CHAR + '|[^\'\\x00])*\'' +
    '|' +
    '\\((' + ESCAPED_CHAR + '|[^)\\x00])*\\))')

# Link destination enclosed in pointy brackets: <...>.
reLinkDestinationBraces = re.compile(
    '^(?:[<](?:[^ <>\\t\\n\\\\\\x00]' + '|' + ESCAPED_CHAR + '|' +
    '\\\\)*[>])')

# A single escapable character, anchored at start.
reEscapable = re.compile('^' + common.ESCAPABLE)
# An HTML entity reference, anchored at start.
reEntityHere = re.compile('^' + common.ENTITY, re.IGNORECASE)
# Runs of backticks (anywhere / anchored) for code spans.
reTicks = re.compile(r'`+')
reTicksHere = re.compile(r'^`+')
# Smart-punctuation source patterns: '...' and runs of 2+ hyphens.
reEllipses = re.compile(r'\.\.\.')
reDash = re.compile(r'--+')
# <user@host> style email autolink (HTML5 email grammar).
reEmailAutolink = re.compile(
    r"^<([a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+@[a-zA-Z0-9]"
    r"(?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?"
    r"(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)>")
# <scheme:...> absolute-URI autolink.
reAutolink = re.compile(
    r'^<[A-Za-z][A-Za-z0-9.+-]{1,31}:[^<>\x00-\x20]*>',
    re.IGNORECASE)
# Spaces with at most one embedded newline.
reSpnl = re.compile(r'^ *(?:\n *)?')
# NOTE(review): the doubled '^^' anchor below is redundant (matches the
# same as a single '^') — presumably a typo; behavior is unaffected.
reWhitespaceChar = re.compile(r'^^[ \t\n\x0b\x0c\x0d]')
reWhitespace = re.compile(r'[ \t\n\x0b\x0c\x0d]+')
reUnicodeWhitespaceChar = re.compile(r'^\s')
# Trailing / leading run of spaces (for hard-break handling).
reFinalSpace = re.compile(r' *$')
reInitialSpace = re.compile(r'^ *')
reSpaceAtEndOfLine = re.compile(r'^ *(?:\n|$)')
# A bracketed link label of at most 1000 characters.
reLinkLabel = re.compile('^\\[(?:[^\\\\\\[\\]]|' + ESCAPED_CHAR +
                         '|\\\\){0,1000}\\]')
# Matches a string of non-special characters.
reMain = re.compile(r'^[^\n`\[\]\\!<&*_\'"]+', re.MULTILINE)
|
62
|
|
|
|
|
63
|
|
|
|
|
64
|
|
|
def normalizeReference(s):
    """Normalize a reference label.

    Collapses every internal whitespace run to a single space, strips
    leading/trailing whitespace, and upper-cases the result so lookups
    are case-insensitive.
    """
    collapsed = re.sub(r'\s+', ' ', s.strip())
    return collapsed.upper()
|
71
|
|
|
|
|
72
|
|
|
|
|
73
|
|
|
def text(s):
    """Return a fresh 'text' inline node whose literal is ``s``."""
    literal_node = Node('text', None)
    literal_node.literal = s
    return literal_node
|
77
|
|
|
|
|
78
|
|
|
|
|
79
|
|
|
def smart_dashes(chars):
    """Convert a run of hyphens into em/en dashes (smart punctuation).

    Rules (matching the reference implementation):
      * length divisible by 3 -> all em dashes
      * else divisible by 2   -> all en dashes
      * else remainder 2 mod 3 -> em dashes, then one en dash
      * otherwise             -> em dashes, then two en dashes

    Bug fix: the counts were computed with true division ``/``, which
    yields floats on Python 3 and makes ``'\u2014' * em_count`` raise
    TypeError; floor division ``//`` is required.
    """
    en_count = 0
    em_count = 0
    length = len(chars)
    if length % 3 == 0:
        # If divisible by 3, use all em dashes
        em_count = length // 3
    elif length % 2 == 0:
        # If divisible by 2, use all en dashes
        en_count = length // 2
    elif length % 3 == 2:
        # If 2 extra dashes, use en dash for last 2; em dashes for rest
        en_count = 1
        em_count = (length - 2) // 3
    else:
        # Use en dashes for last 4 hyphens; em dashes for rest
        en_count = 2
        em_count = (length - 4) // 3
    return ('\u2014' * em_count) + ('\u2013' * en_count)
|
98
|
|
|
|
|
99
|
|
|
|
|
100
|
|
|
class InlineParser(object):
    """INLINE PARSER

    These are methods of an InlineParser class, defined below.
    An InlineParser keeps track of a subject (a string to be
    parsed) and a position in that subject.
    """

    def __init__(self, options=None):
        """Create an inline parser.

        options -- optional dict of parser options (e.g. {'smart': True}).
                   Defaults to an empty dict; using a ``None`` sentinel
                   avoids the shared-mutable-default pitfall.
        """
        self.subject = ''      # the string currently being parsed
        self.brackets = None   # stack (linked dicts) of '['/'![' openers
        # Fix: initialize the delimiter stack here as well; previously it
        # was only created in parseInlines, so calling handleDelim or
        # processEmphasis on a fresh parser raised AttributeError.
        self.delimiters = None
        self.pos = 0           # current position within subject
        self.refmap = {}       # link reference definitions
        self.options = {} if options is None else options
|
114
|
|
|
|
|
115
|
|
|
def match(self, regexString): |
|
116
|
|
|
""" |
|
117
|
|
|
If regexString matches at current position in the subject, advance |
|
118
|
|
|
position in subject and return the match; otherwise return None. |
|
119
|
|
|
""" |
|
120
|
|
|
match = re.search(regexString, self.subject[self.pos:]) |
|
121
|
|
|
if match is None: |
|
122
|
|
|
return None |
|
123
|
|
|
else: |
|
124
|
|
|
self.pos += match.end() |
|
125
|
|
|
return match.group() |
|
126
|
|
|
|
|
127
|
|
|
def peek(self): |
|
128
|
|
|
""" Returns the character at the current subject position, or None if |
|
129
|
|
|
there are no more characters.""" |
|
130
|
|
|
if self.pos < len(self.subject): |
|
131
|
|
|
return self.subject[self.pos] |
|
132
|
|
|
else: |
|
133
|
|
|
return None |
|
134
|
|
|
|
|
135
|
|
|
    def spnl(self):
        """ Parse zero or more space characters, including at
        most one newline.

        Always returns True (the match may be empty), so it can be used
        inside chained conditions.
        """
        self.match(reSpnl)
        return True
|
140
|
|
|
|
|
141
|
|
|
# All of the parsers below try to match something at the current position |
|
142
|
|
|
# in the subject. If they succeed in matching anything, they |
|
143
|
|
|
# push an inline matched, advancing the subject. |
|
144
|
|
|
|
|
145
|
|
|
def parseBackticks(self, block): |
|
146
|
|
|
""" Attempt to parse backticks, adding either a backtick code span or a |
|
147
|
|
|
literal sequence of backticks to the 'inlines' list.""" |
|
148
|
|
|
ticks = self.match(reTicksHere) |
|
149
|
|
|
if ticks is None: |
|
150
|
|
|
return False |
|
151
|
|
|
after_open_ticks = self.pos |
|
152
|
|
|
matched = self.match(reTicks) |
|
153
|
|
|
while matched is not None: |
|
154
|
|
|
if (matched == ticks): |
|
155
|
|
|
node = Node('code', None) |
|
156
|
|
|
c = self.subject[after_open_ticks:self.pos - len(ticks)] |
|
157
|
|
|
c = c.strip() |
|
158
|
|
|
c = re.sub(reWhitespace, ' ', c) |
|
159
|
|
|
node.literal = c |
|
160
|
|
|
block.append_child(node) |
|
161
|
|
|
return True |
|
162
|
|
|
matched = self.match(reTicks) |
|
163
|
|
|
# If we got here, we didn't match a closing backtick sequence. |
|
164
|
|
|
self.pos = after_open_ticks |
|
165
|
|
|
block.append_child(text(ticks)) |
|
166
|
|
|
return True |
|
167
|
|
|
|
|
168
|
|
|
def parseBackslash(self, block): |
|
169
|
|
|
""" |
|
170
|
|
|
Parse a backslash-escaped special character, adding either the |
|
171
|
|
|
escaped character, a hard line break (if the backslash is followed |
|
172
|
|
|
by a newline), or a literal backslash to the block's children. |
|
173
|
|
|
Assumes current character is a backslash. |
|
174
|
|
|
""" |
|
175
|
|
|
subj = self.subject |
|
176
|
|
|
self.pos += 1 |
|
177
|
|
|
|
|
178
|
|
|
try: |
|
179
|
|
|
subjchar = subj[self.pos] |
|
180
|
|
|
except IndexError: |
|
181
|
|
|
subjchar = None |
|
182
|
|
|
|
|
183
|
|
|
if self.peek() == '\n': |
|
184
|
|
|
self.pos += 1 |
|
185
|
|
|
node = Node('linebreak', None) |
|
186
|
|
|
block.append_child(node) |
|
187
|
|
|
elif subjchar and re.match(reEscapable, subjchar): |
|
188
|
|
|
block.append_child(text(subjchar)) |
|
189
|
|
|
self.pos += 1 |
|
190
|
|
|
else: |
|
191
|
|
|
block.append_child(text('\\')) |
|
192
|
|
|
|
|
193
|
|
|
return True |
|
194
|
|
|
|
|
195
|
|
|
def parseAutolink(self, block): |
|
196
|
|
|
"""Attempt to parse an autolink (URL or email in pointy brackets).""" |
|
197
|
|
|
m = self.match(reEmailAutolink) |
|
198
|
|
|
|
|
199
|
|
|
if m: |
|
200
|
|
|
# email |
|
201
|
|
|
dest = m[1:-1] |
|
202
|
|
|
node = Node('link', None) |
|
203
|
|
|
node.destination = normalize_uri('mailto:' + dest) |
|
204
|
|
|
node.title = '' |
|
205
|
|
|
node.append_child(text(dest)) |
|
206
|
|
|
block.append_child(node) |
|
207
|
|
|
return True |
|
208
|
|
|
else: |
|
209
|
|
|
m = self.match(reAutolink) |
|
210
|
|
|
if m: |
|
211
|
|
|
# link |
|
212
|
|
|
dest = m[1:-1] |
|
213
|
|
|
node = Node('link', None) |
|
214
|
|
|
node.destination = normalize_uri(dest) |
|
215
|
|
|
node.title = '' |
|
216
|
|
|
node.append_child(text(dest)) |
|
217
|
|
|
block.append_child(node) |
|
218
|
|
|
return True |
|
219
|
|
|
|
|
220
|
|
|
return False |
|
221
|
|
|
|
|
222
|
|
|
def parseHtmlTag(self, block): |
|
223
|
|
|
"""Attempt to parse a raw HTML tag.""" |
|
224
|
|
|
m = self.match(common.reHtmlTag) |
|
225
|
|
|
if m is None: |
|
226
|
|
|
return False |
|
227
|
|
|
else: |
|
228
|
|
|
node = Node('html_inline', None) |
|
229
|
|
|
node.literal = m |
|
230
|
|
|
block.append_child(node) |
|
231
|
|
|
return True |
|
232
|
|
|
|
|
233
|
|
|
    def scanDelims(self, c):
        """
        Scan a sequence of characters == c, and return information about
        the number of delimiters and whether they are positioned such that
        they can open and/or close emphasis or strong emphasis. A utility
        function for strong/emph parsing.

        Returns a dict with 'numdelims', 'can_open', 'can_close', or None
        when no delimiter run starts here.  The position is restored
        before returning.
        """
        numdelims = 0
        startpos = self.pos

        # Smart quotes count as a run of exactly one delimiter.
        if c == "'" or c == '"':
            numdelims += 1
            self.pos += 1
        else:
            while (self.peek() == c):
                numdelims += 1
                self.pos += 1

        if numdelims == 0:
            return None

        # Treat the start/end of the subject as a newline for the
        # flanking computation.
        c_before = '\n' if startpos == 0 else self.subject[startpos - 1]

        c_after = self.peek()
        if c_after is None:
            c_after = '\n'

        # Python 2 doesn't recognize '\xa0' as whitespace
        after_is_whitespace = re.match(reUnicodeWhitespaceChar, c_after) or \
            c_after == '\xa0'
        after_is_punctuation = re.match(rePunctuation, c_after)
        before_is_whitespace = re.match(reUnicodeWhitespaceChar, c_before) or \
            c_before == '\xa0'
        before_is_punctuation = re.match(rePunctuation, c_before)

        # Left-/right-flanking per the CommonMark delimiter-run rules.
        left_flanking = not after_is_whitespace and \
            not (after_is_punctuation and
                 not before_is_whitespace and
                 not before_is_punctuation)
        right_flanking = not before_is_whitespace and \
            not (before_is_punctuation and
                 not after_is_whitespace and
                 not after_is_punctuation)
        if c == '_':
            # '_' is stricter: may not open/close inside a word.
            can_open = left_flanking and \
                (not right_flanking or before_is_punctuation)
            can_close = right_flanking and \
                (not left_flanking or after_is_punctuation)
        elif c == "'" or c == '"':
            can_open = left_flanking and not right_flanking
            can_close = right_flanking
        else:
            can_open = left_flanking
            can_close = right_flanking

        # Restore the scan position; the caller consumes the run itself.
        self.pos = startpos
        return {
            'numdelims': numdelims,
            'can_open': can_open,
            'can_close': can_close,
        }
|
294
|
|
|
|
|
295
|
|
|
    def handleDelim(self, cc, block):
        """Handle a delimiter marker for emphasis or a quote.

        cc    -- delimiter character: '*', '_', "'" or '"'
        block -- node that receives the emitted text node

        Emits a text node for the delimiter run and pushes an entry onto
        the delimiter stack; processEmphasis later decides whether the
        run actually opens or closes anything.  Returns False when no
        delimiter run starts at the current position.
        """
        res = self.scanDelims(cc)
        if not res:
            return False
        numdelims = res.get('numdelims')
        startpos = self.pos

        # Consume the whole run.
        self.pos += numdelims
        if cc == "'":
            # Smart quote: emit right single quote now; processEmphasis
            # rewrites it to a left quote if a matching opener is found.
            contents = '\u2019'
        elif cc == '"':
            contents = '\u201C'
        else:
            contents = self.subject[startpos:self.pos]
        node = text(contents)
        block.append_child(node)

        # Add entry to stack for this opener
        # (doubly linked list of dicts; self.delimiters is the top).
        self.delimiters = {
            'cc': cc,
            'numdelims': numdelims,
            'origdelims': numdelims,
            'node': node,
            'previous': self.delimiters,
            'next': None,
            'can_open': res.get('can_open'),
            'can_close': res.get('can_close'),
        }
        if self.delimiters['previous'] is not None:
            self.delimiters['previous']['next'] = self.delimiters
        return True
|
327
|
|
|
|
|
328
|
|
|
def removeDelimiter(self, delim): |
|
329
|
|
|
if delim.get('previous') is not None: |
|
330
|
|
|
delim['previous']['next'] = delim.get('next') |
|
331
|
|
|
if delim.get('next') is None: |
|
332
|
|
|
# Top of stack |
|
333
|
|
|
self.delimiters = delim.get('previous') |
|
334
|
|
|
else: |
|
335
|
|
|
delim['next']['previous'] = delim.get('previous') |
|
336
|
|
|
|
|
337
|
|
|
@staticmethod |
|
338
|
|
|
def removeDelimitersBetween(bottom, top): |
|
339
|
|
|
if bottom.get('next') != top: |
|
340
|
|
|
bottom['next'] = top |
|
341
|
|
|
top['previous'] = bottom |
|
342
|
|
|
|
|
343
|
|
|
    def processEmphasis(self, stack_bottom):
        """Resolve emphasis/strong and smart quotes in the delimiter stack.

        Walks the delimiter stack from just above ``stack_bottom`` to the
        top, matching each potential closer against the nearest earlier
        compatible opener, wrapping the inlines between them in 'emph' or
        'strong' nodes.  Everything down to ``stack_bottom`` is removed
        from the stack when done.
        """
        # Per-delimiter-char lower bound below which we never search for
        # an opener (avoids re-scanning runs known not to match).
        openers_bottom = {
            '_': stack_bottom,
            '*': stack_bottom,
            "'": stack_bottom,
            '"': stack_bottom,
        }
        odd_match = False
        use_delims = 0

        # Find first closer above stack_bottom
        closer = self.delimiters
        while closer is not None and closer.get('previous') != stack_bottom:
            closer = closer.get('previous')

        # Move forward, looking for closers, and handling each
        while closer is not None:
            if not closer.get('can_close'):
                closer = closer.get('next')
            else:
                # found emphasis closer. now look back for first
                # matching opener:
                opener = closer.get('previous')
                opener_found = False
                closercc = closer.get('cc')
                while (opener is not None and opener != stack_bottom and
                       opener != openers_bottom[closercc]):
                    # "Rule of 3": runs whose combined length is a
                    # multiple of 3 can't match if either could be both
                    # an opener and a closer.
                    odd_match = (closer.get('can_open') or
                                 opener.get('can_close')) and \
                        (opener.get('origdelims') +
                         closer.get('origdelims')) % 3 == 0
                    if opener.get('cc') == closercc and \
                       opener.get('can_open') and \
                       not odd_match:
                        opener_found = True
                        break
                    opener = opener.get('previous')
                old_closer = closer

                if closercc == '*' or closercc == '_':
                    if not opener_found:
                        closer = closer.get('next')
                    else:
                        # Calculate actual number of delimiters used from
                        # closer: 1 => emph, 2 => strong, longer runs are
                        # consumed two (or one) at a time.
                        if closer['numdelims'] < 3 or opener['numdelims'] < 3:
                            if closer['numdelims'] <= opener['numdelims']:
                                use_delims = closer['numdelims']
                            else:
                                use_delims = opener['numdelims']
                        else:
                            if closer['numdelims'] % 2 == 0:
                                use_delims = 2
                            else:
                                use_delims = 1

                        opener_inl = opener.get('node')
                        closer_inl = closer.get('node')

                        # Remove used delimiters from stack elts and inlines
                        opener['numdelims'] -= use_delims
                        closer['numdelims'] -= use_delims
                        opener_inl.literal = opener_inl.literal[
                            :len(opener_inl.literal) - use_delims]
                        closer_inl.literal = closer_inl.literal[
                            :len(closer_inl.literal) - use_delims]

                        # Build contents for new Emph element
                        if use_delims == 1:
                            emph = Node('emph', None)
                        else:
                            emph = Node('strong', None)

                        # Move the inlines between opener and closer into
                        # the new emphasis node.
                        tmp = opener_inl.nxt
                        while tmp and tmp != closer_inl:
                            nxt = tmp.nxt
                            tmp.unlink()
                            emph.append_child(tmp)
                            tmp = nxt

                        opener_inl.insert_after(emph)

                        # Remove elts between opener and closer in delimiters
                        # stack
                        self.removeDelimitersBetween(opener, closer)

                        # If opener has 0 delims, remove it and the inline
                        if opener['numdelims'] == 0:
                            opener_inl.unlink()
                            self.removeDelimiter(opener)

                        if closer['numdelims'] == 0:
                            closer_inl.unlink()
                            tempstack = closer['next']
                            self.removeDelimiter(closer)
                            closer = tempstack

                elif closercc == "'":
                    # Smart single quote: fix up the literal(s).
                    closer['node'].literal = '\u2019'
                    if opener_found:
                        opener['node'].literal = '\u2018'
                    closer = closer['next']

                elif closercc == '"':
                    # Smart double quote: fix up the literal(s).
                    closer['node'].literal = '\u201D'
                    if opener_found:
                        opener['node'].literal = '\u201C'
                    closer = closer['next']

                if not opener_found and not odd_match:
                    # Set lower bound for future searches for openers:
                    # We don't do this with odd_match because a **
                    # that doesn't match an earlier * might turn into
                    # an opener, and the * might be matched by something
                    # else.
                    openers_bottom[closercc] = old_closer['previous']
                    if not old_closer['can_open']:
                        # We can remove a closer that can't be an opener,
                        # once we've seen there's no matching opener:
                        self.removeDelimiter(old_closer)

        # Remove all delimiters
        while self.delimiters is not None and self.delimiters != stack_bottom:
            self.removeDelimiter(self.delimiters)
|
467
|
|
|
|
|
468
|
|
|
def parseLinkTitle(self): |
|
469
|
|
|
""" |
|
470
|
|
|
Attempt to parse link title (sans quotes), returning the string |
|
471
|
|
|
or None if no match. |
|
472
|
|
|
""" |
|
473
|
|
|
title = self.match(reLinkTitle) |
|
474
|
|
|
if title is None: |
|
475
|
|
|
return None |
|
476
|
|
|
else: |
|
477
|
|
|
# chop off quotes from title and unescape: |
|
478
|
|
|
return unescape_string(title[1:-1]) |
|
479
|
|
|
|
|
480
|
|
|
    def parseLinkDestination(self):
        """
        Attempt to parse link destination, returning the string or
        None if no match.

        Note: the bare (un-bracketed) fallback path below always returns
        a string, possibly empty — callers check for emptiness.
        """
        res = self.match(reLinkDestinationBraces)
        if res is None:
            # TODO handrolled parser; res should be None or the string
            # Bare destination: scan until unbalanced ')' or whitespace,
            # tracking parenthesis nesting and skipping backslash escapes.
            savepos = self.pos
            openparens = 0
            c = self.peek()
            while c is not None:
                if c == '\\':
                    # skip the backslash and the character it escapes
                    self.pos += 1
                    if self.peek() is not None:
                        self.pos += 1
                elif c == '(':
                    self.pos += 1
                    openparens += 1
                elif c == ')':
                    if openparens < 1:
                        # unbalanced ')' terminates the destination
                        break
                    else:
                        self.pos += 1
                        openparens -= 1
                elif re.match(reWhitespaceChar, c):
                    # whitespace terminates a bare destination
                    break
                else:
                    self.pos += 1
                c = self.peek()
            res = self.subject[savepos:self.pos]
            return normalize_uri(unescape_string(res))
        else:
            # chop off surrounding <..>:
            return normalize_uri(unescape_string(res[1:-1]))
|
515
|
|
|
|
|
516
|
|
|
def parseLinkLabel(self): |
|
517
|
|
|
""" |
|
518
|
|
|
Attempt to parse a link label, returning number of |
|
519
|
|
|
characters parsed. |
|
520
|
|
|
""" |
|
521
|
|
|
m = self.match(reLinkLabel) |
|
522
|
|
|
if m is None or len(m) > 1001 or re.match(r'\[\s+\]', m): |
|
523
|
|
|
return 0 |
|
524
|
|
|
else: |
|
525
|
|
|
return len(m) |
|
526
|
|
|
|
|
527
|
|
|
def parseOpenBracket(self, block): |
|
528
|
|
|
""" |
|
529
|
|
|
Add open bracket to delimiter stack and add a text node to |
|
530
|
|
|
block's children. |
|
531
|
|
|
""" |
|
532
|
|
|
startpos = self.pos |
|
533
|
|
|
self.pos += 1 |
|
534
|
|
|
|
|
535
|
|
|
node = text('[') |
|
536
|
|
|
block.append_child(node) |
|
537
|
|
|
|
|
538
|
|
|
# Add entry to stack for this opener |
|
539
|
|
|
self.addBracket(node, startpos, False) |
|
540
|
|
|
return True |
|
541
|
|
|
|
|
542
|
|
|
def parseBang(self, block): |
|
543
|
|
|
""" |
|
544
|
|
|
If next character is [, and ! delimiter to delimiter stack and |
|
545
|
|
|
add a text node to block's children. Otherwise just add a text |
|
546
|
|
|
node. |
|
547
|
|
|
""" |
|
548
|
|
|
startpos = self.pos |
|
549
|
|
|
self.pos += 1 |
|
550
|
|
|
if self.peek() == '[': |
|
551
|
|
|
self.pos += 1 |
|
552
|
|
|
|
|
553
|
|
|
node = text('![') |
|
554
|
|
|
block.append_child(node) |
|
555
|
|
|
|
|
556
|
|
|
# Add entry to stack for this openeer |
|
557
|
|
|
self.addBracket(node, startpos + 1, True) |
|
558
|
|
|
else: |
|
559
|
|
|
block.append_child(text('!')) |
|
560
|
|
|
|
|
561
|
|
|
return True |
|
562
|
|
|
|
|
563
|
|
|
def parseCloseBracket(self, block): |
|
564
|
|
|
""" |
|
565
|
|
|
Try to match close bracket against an opening in the delimiter |
|
566
|
|
|
stack. Add either a link or image, or a plain [ character, |
|
567
|
|
|
to block's children. If there is a matching delimiter, |
|
568
|
|
|
remove it from the delimiter stack. |
|
569
|
|
|
""" |
|
570
|
|
|
title = None |
|
571
|
|
|
matched = False |
|
572
|
|
|
self.pos += 1 |
|
573
|
|
|
startpos = self.pos |
|
574
|
|
|
|
|
575
|
|
|
# get last [ or ![ |
|
576
|
|
|
opener = self.brackets |
|
577
|
|
|
|
|
578
|
|
|
if opener is None: |
|
579
|
|
|
# no matched opener, just return a literal |
|
580
|
|
|
block.append_child(text(']')) |
|
581
|
|
|
return True |
|
582
|
|
|
|
|
583
|
|
|
if not opener.get('active'): |
|
584
|
|
|
# no matched opener, just return a literal |
|
585
|
|
|
block.append_child(text(']')) |
|
586
|
|
|
# take opener off brackets stack |
|
587
|
|
|
self.removeBracket() |
|
588
|
|
|
return True |
|
589
|
|
|
|
|
590
|
|
|
# If we got here, opener is a potential opener |
|
591
|
|
|
is_image = opener.get('image') |
|
592
|
|
|
|
|
593
|
|
|
# Check to see if we have a link/image |
|
594
|
|
|
|
|
595
|
|
|
savepos = self.pos |
|
596
|
|
|
|
|
597
|
|
|
# Inline link? |
|
598
|
|
|
if self.peek() == '(': |
|
599
|
|
|
self.pos += 1 |
|
600
|
|
|
self.spnl() |
|
601
|
|
|
dest = self.parseLinkDestination() |
|
602
|
|
|
if dest is not None and self.spnl(): |
|
603
|
|
|
# make sure there's a space before the title |
|
604
|
|
|
if re.match(reWhitespaceChar, self.subject[self.pos-1]): |
|
605
|
|
|
title = self.parseLinkTitle() |
|
606
|
|
|
if self.spnl() and self.peek() == ')': |
|
607
|
|
|
self.pos += 1 |
|
608
|
|
|
matched = True |
|
609
|
|
|
else: |
|
610
|
|
|
self.pos = savepos |
|
611
|
|
|
|
|
612
|
|
|
if not matched: |
|
613
|
|
|
# Next, see if there's a link label |
|
614
|
|
|
beforelabel = self.pos |
|
615
|
|
|
n = self.parseLinkLabel() |
|
616
|
|
|
if n > 2: |
|
617
|
|
|
reflabel = self.subject[beforelabel:beforelabel + n] |
|
618
|
|
|
elif not opener.get('bracket_after'): |
|
619
|
|
|
# Empty or missing second label means to use the first |
|
620
|
|
|
# label as the reference. The reference must not |
|
621
|
|
|
# contain a bracket. If we know there's a bracket, we |
|
622
|
|
|
# don't even bother checking it. |
|
623
|
|
|
reflabel = self.subject[opener.get('index'):startpos] |
|
624
|
|
|
if n == 0: |
|
625
|
|
|
# If shortcut reference link, rewind before spaces we skipped. |
|
626
|
|
|
self.pos = savepos |
|
627
|
|
|
|
|
628
|
|
|
if reflabel: |
|
629
|
|
|
# lookup rawlabel in refmap |
|
630
|
|
|
link = self.refmap.get(normalizeReference(reflabel)) |
|
631
|
|
|
if link: |
|
632
|
|
|
dest = link['destination'] |
|
633
|
|
|
title = link['title'] |
|
634
|
|
|
matched = True |
|
635
|
|
|
|
|
636
|
|
|
if matched: |
|
637
|
|
|
node = Node('image' if is_image else 'link', None) |
|
638
|
|
|
|
|
639
|
|
|
node.destination = dest |
|
640
|
|
|
node.title = title or '' |
|
641
|
|
|
tmp = opener.get('node').nxt |
|
642
|
|
|
while tmp: |
|
643
|
|
|
nxt = tmp.nxt |
|
644
|
|
|
tmp.unlink() |
|
645
|
|
|
node.append_child(tmp) |
|
646
|
|
|
tmp = nxt |
|
647
|
|
|
block.append_child(node) |
|
648
|
|
|
self.processEmphasis(opener.get('previousDelimiter')) |
|
649
|
|
|
self.removeBracket() |
|
650
|
|
|
opener.get('node').unlink() |
|
651
|
|
|
|
|
652
|
|
|
# We remove this bracket and processEmphasis will remove |
|
653
|
|
|
# later delimiters. |
|
654
|
|
|
# Now, for a link, we also deactivate earlier link openers. |
|
655
|
|
|
# (no links in links) |
|
656
|
|
|
if not is_image: |
|
657
|
|
|
opener = self.brackets |
|
658
|
|
|
while opener is not None: |
|
659
|
|
|
if not opener.get('image'): |
|
660
|
|
|
# deactivate this opener |
|
661
|
|
|
opener['active'] = False |
|
662
|
|
|
opener = opener.get('previous') |
|
663
|
|
|
|
|
664
|
|
|
return True |
|
665
|
|
|
else: |
|
666
|
|
|
# no match |
|
667
|
|
|
# remove this opener from stack |
|
668
|
|
|
self.removeBracket() |
|
669
|
|
|
self.pos = startpos |
|
670
|
|
|
block.append_child(text(']')) |
|
671
|
|
|
return True |
|
672
|
|
|
|
|
673
|
|
|
def addBracket(self, node, index, image): |
|
674
|
|
|
if self.brackets is not None: |
|
675
|
|
|
self.brackets['bracketAfter'] = True |
|
676
|
|
|
|
|
677
|
|
|
self.brackets = { |
|
678
|
|
|
'node': node, |
|
679
|
|
|
'previous': self.brackets, |
|
680
|
|
|
'previousDelimiter': self.delimiters, |
|
681
|
|
|
'index': index, |
|
682
|
|
|
'image': image, |
|
683
|
|
|
'active': True, |
|
684
|
|
|
} |
|
685
|
|
|
|
|
686
|
|
|
    def removeBracket(self):
        """Pop the innermost '['/'![' opener off the bracket stack."""
        self.brackets = self.brackets.get('previous')
|
688
|
|
|
|
|
689
|
|
|
def parseEntity(self, block): |
|
690
|
|
|
"""Attempt to parse an entity.""" |
|
691
|
|
|
m = self.match(reEntityHere) |
|
692
|
|
|
if m: |
|
693
|
|
|
block.append_child(text(HTMLunescape(m))) |
|
694
|
|
|
return True |
|
695
|
|
|
else: |
|
696
|
|
|
return False |
|
697
|
|
|
|
|
698
|
|
|
def parseString(self, block): |
|
699
|
|
|
""" |
|
700
|
|
|
Parse a run of ordinary characters, or a single character with |
|
701
|
|
|
a special meaning in markdown, as a plain string. |
|
702
|
|
|
""" |
|
703
|
|
|
m = self.match(reMain) |
|
704
|
|
|
if m: |
|
705
|
|
|
if self.options.get('smart'): |
|
706
|
|
|
s = re.sub(reEllipses, '\u2026', m) |
|
707
|
|
|
s = re.sub(reDash, lambda x: smart_dashes(x.group()), s) |
|
708
|
|
|
block.append_child(text(s)) |
|
709
|
|
|
else: |
|
710
|
|
|
block.append_child(text(m)) |
|
711
|
|
|
return True |
|
712
|
|
|
else: |
|
713
|
|
|
return False |
|
714
|
|
|
|
|
715
|
|
|
def parseNewline(self, block): |
|
716
|
|
|
""" |
|
717
|
|
|
Parse a newline. If it was preceded by two spaces, return a hard |
|
718
|
|
|
line break; otherwise a soft line break. |
|
719
|
|
|
""" |
|
720
|
|
|
# assume we're at a \n |
|
721
|
|
|
self.pos += 1 |
|
722
|
|
|
lastc = block.last_child |
|
723
|
|
|
if lastc and lastc.t == 'text' and lastc.literal[-1] == ' ': |
|
724
|
|
|
linebreak = len(lastc.literal) >= 2 and lastc.literal[-2] == ' ' |
|
725
|
|
|
lastc.literal = re.sub(reFinalSpace, '', lastc.literal) |
|
726
|
|
|
if linebreak: |
|
727
|
|
|
node = Node('linebreak', None) |
|
728
|
|
|
else: |
|
729
|
|
|
node = Node('softbreak', None) |
|
730
|
|
|
block.append_child(node) |
|
731
|
|
|
else: |
|
732
|
|
|
block.append_child(Node('softbreak', None)) |
|
733
|
|
|
|
|
734
|
|
|
# gobble leading spaces in next line |
|
735
|
|
|
self.match(reInitialSpace) |
|
736
|
|
|
return True |
|
737
|
|
|
|
|
738
|
|
|
def parseReference(self, s, refmap): |
|
739
|
|
|
"""Attempt to parse a link reference, modifying refmap.""" |
|
740
|
|
|
self.subject = s |
|
741
|
|
|
self.pos = 0 |
|
742
|
|
|
startpos = self.pos |
|
743
|
|
|
|
|
744
|
|
|
# label: |
|
745
|
|
|
match_chars = self.parseLinkLabel() |
|
746
|
|
|
if match_chars == 0 or match_chars == 2: |
|
747
|
|
|
return 0 |
|
748
|
|
|
else: |
|
749
|
|
|
rawlabel = self.subject[:match_chars] |
|
750
|
|
|
|
|
751
|
|
|
# colon: |
|
752
|
|
|
if (self.peek() == ':'): |
|
753
|
|
|
self.pos += 1 |
|
754
|
|
|
else: |
|
755
|
|
|
self.pos = startpos |
|
756
|
|
|
return 0 |
|
757
|
|
|
|
|
758
|
|
|
# link url |
|
759
|
|
|
self.spnl() |
|
760
|
|
|
|
|
761
|
|
|
dest = self.parseLinkDestination() |
|
762
|
|
|
if (dest is None or len(dest) == 0): |
|
763
|
|
|
self.pos = startpos |
|
764
|
|
|
return 0 |
|
765
|
|
|
|
|
766
|
|
|
beforetitle = self.pos |
|
767
|
|
|
self.spnl() |
|
768
|
|
|
title = self.parseLinkTitle() |
|
769
|
|
|
if title is None: |
|
770
|
|
|
title = '' |
|
771
|
|
|
# rewind before spaces |
|
772
|
|
|
self.pos = beforetitle |
|
773
|
|
|
|
|
774
|
|
|
# make sure we're at line end: |
|
775
|
|
|
at_line_end = True |
|
776
|
|
|
if self.match(reSpaceAtEndOfLine) is None: |
|
777
|
|
|
if title == '': |
|
778
|
|
|
at_line_end = False |
|
779
|
|
|
else: |
|
780
|
|
|
# the potential title we found is not at the line end, |
|
781
|
|
|
# but it could still be a legal link reference if we |
|
782
|
|
|
# discard the title |
|
783
|
|
|
title == '' |
|
784
|
|
|
# rewind before spaces |
|
785
|
|
|
self.pos = beforetitle |
|
786
|
|
|
# and instead check if the link URL is at the line end |
|
787
|
|
|
at_line_end = self.match(reSpaceAtEndOfLine) is not None |
|
788
|
|
|
|
|
789
|
|
|
if not at_line_end: |
|
790
|
|
|
self.pos = startpos |
|
791
|
|
|
return 0 |
|
792
|
|
|
|
|
793
|
|
|
normlabel = normalizeReference(rawlabel) |
|
794
|
|
|
if refmap.get(normlabel) == '': |
|
795
|
|
|
# label must contain non-whitespace characters |
|
796
|
|
|
self.pos = startpos |
|
797
|
|
|
return 0 |
|
798
|
|
|
|
|
799
|
|
|
if refmap.get(normlabel) is None: |
|
800
|
|
|
refmap[normlabel] = { |
|
801
|
|
|
'destination': dest, |
|
802
|
|
|
'title': title |
|
803
|
|
|
} |
|
804
|
|
|
return (self.pos - startpos) |
|
805
|
|
|
|
|
806
|
|
|
    def parseInline(self, block):
        """
        Parse the next inline element in subject, advancing subject
        position.

        On success, add the result to block's children and return True.
        On failure, return False.  (Returns False only at end of
        subject: any unrecognized character is consumed as plain text.)
        """
        res = False
        c = self.peek()
        if c is None:
            return False
        # Dispatch on the first character; each sub-parser either
        # consumes input and returns truthy, or leaves the position
        # unchanged for the fallback below.
        if c == '\n':
            res = self.parseNewline(block)
        elif c == '\\':
            res = self.parseBackslash(block)
        elif c == '`':
            res = self.parseBackticks(block)
        elif c == '*' or c == '_':
            res = self.handleDelim(c, block)
        elif c == "'" or c == '"':
            # quotes are only special in smart-punctuation mode
            res = self.options.get('smart') and self.handleDelim(c, block)
        elif c == '[':
            res = self.parseOpenBracket(block)
        elif c == '!':
            res = self.parseBang(block)
        elif c == ']':
            res = self.parseCloseBracket(block)
        elif c == '<':
            res = self.parseAutolink(block) or self.parseHtmlTag(block)
        elif c == '&':
            res = self.parseEntity(block)
        else:
            res = self.parseString(block)

        if not res:
            # nothing matched: emit the character itself as literal text
            self.pos += 1
            block.append_child(text(c))

        return True
|
846
|
|
|
|
|
847
|
|
|
    def parseInlines(self, block):
        """
        Parse string content in block into inline children,
        using refmap to resolve references.

        Resets the per-block parser state (position, delimiter and
        bracket stacks) before parsing, then resolves emphasis over the
        whole block.
        """
        self.subject = block.string_content.strip()
        self.pos = 0
        self.delimiters = None
        self.brackets = None
        while (self.parseInline(block)):
            pass
        # allow raw string to be garbage collected
        block.string_content = None
        self.processEmphasis(None)

    # public entry point used by the block parser
    parse = parseInlines
|
863
|
|
|
|