1
|
|
|
from __future__ import absolute_import, unicode_literals |
2
|
|
|
|
3
|
|
|
import re |
4
|
|
|
import sys |
5
|
|
|
from CommonMark import common |
6
|
|
|
from CommonMark.common import normalize_uri, unescape_string |
7
|
|
|
from CommonMark.node import Node |
8
|
|
|
|
9
|
|
|
if sys.version_info >= (3, 0): |
10
|
|
|
if sys.version_info >= (3, 4): |
11
|
|
|
import html.parser |
12
|
|
|
HTMLunescape = html.parser.HTMLParser().unescape |
13
|
|
|
else: |
14
|
|
|
from .entitytrans import _unescape |
15
|
|
|
HTMLunescape = _unescape |
16
|
|
|
else: |
17
|
|
|
from CommonMark import entitytrans |
18
|
|
|
HTMLunescape = entitytrans._unescape |
19
|
|
|
|
20
|
|
|
# Some regexps used in inline parser:

# A backslash followed by any escapable punctuation character.
ESCAPED_CHAR = '\\\\' + common.ESCAPABLE

# ASCII punctuation plus the Unicode general/supplemental punctuation
# blocks, anchored at the start of the string.
rePunctuation = re.compile(
    r'^[\u2000-\u206F\u2E00-\u2E7F\\' + "'" + '!"#\$%&\(\)'
    r'\*\+,\-\.\/:;<=>\?@\[\]\^_`\{\|\}~]')

# A link title in double quotes, single quotes, or parentheses
# (escapes allowed, NUL bytes excluded).
reLinkTitle = re.compile(
    '^(?:"(' + ESCAPED_CHAR + '|[^"\\x00])*"' +
    '|' +
    '\'(' + ESCAPED_CHAR + '|[^\'\\x00])*\'' +
    '|' +
    '\\((' + ESCAPED_CHAR + '|[^)\\x00])*\\))')
# A link destination wrapped in pointy brackets: <...>.
reLinkDestinationBraces = re.compile(
    '^(?:[<](?:[^ <>\\t\\n\\\\\\x00]' + '|' + ESCAPED_CHAR + '|' +
    '\\\\)*[>])')

reEscapable = re.compile('^' + common.ESCAPABLE)
reEntityHere = re.compile('^' + common.ENTITY, re.IGNORECASE)
reTicks = re.compile(r'`+')
reTicksHere = re.compile(r'^`+')
reEllipses = re.compile(r'\.\.\.')
reDash = re.compile(r'--+')
# Email autolink: <address> per the spec's HTML5-derived pattern.
reEmailAutolink = re.compile(
    r"^<([a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+@[a-zA-Z0-9]"
    r"(?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?"
    r"(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)>")
# URI autolink: <scheme:...> with no spaces or control characters.
reAutolink = re.compile(
    r'^<[A-Za-z][A-Za-z0-9.+-]{1,31}:[^<>\x00-\x20]*>',
    re.IGNORECASE)
# Spaces with at most one embedded newline.
reSpnl = re.compile(r'^ *(?:\n *)?')
# NOTE(review): the doubled '^' anchor is redundant but harmless.
reWhitespaceChar = re.compile(r'^^[ \t\n\x0b\x0c\x0d]')
reWhitespace = re.compile(r'[ \t\n\x0b\x0c\x0d]+')
reUnicodeWhitespaceChar = re.compile(r'^\s')
reFinalSpace = re.compile(r' *$')
reInitialSpace = re.compile(r'^ *')
reSpaceAtEndOfLine = re.compile(r'^ *(?:\n|$)')
# A link label: up to 1000 characters between square brackets.
reLinkLabel = re.compile('^\\[(?:[^\\\\\\[\\]]|' + ESCAPED_CHAR +
                         '|\\\\){0,1000}\\]')
# Matches a string of non-special characters.
reMain = re.compile(r'^[^\n`\[\]\\!<&*_\'"]+', re.MULTILINE)
62
|
|
|
|
63
|
|
|
|
64
|
|
|
def normalizeReference(s):
    """Normalize a link reference label.

    Leading/trailing whitespace is stripped, internal whitespace runs
    collapse to a single space, and the result is upper-cased so that
    labels differing only in case or spacing compare equal.
    """
    stripped = s.strip()
    collapsed = re.sub(r'\s+', ' ', stripped)
    return collapsed.upper()
71
|
|
|
|
72
|
|
|
|
73
|
|
|
def text(s):
    """Return a new 'text' inline node whose literal content is *s*."""
    textnode = Node('text', None)
    textnode.literal = s
    return textnode
77
|
|
|
|
78
|
|
|
|
79
|
|
|
def smart_dashes(chars):
    """Render a run of hyphens as em/en dashes for smart punctuation.

    Runs divisible by 3 become all em dashes; otherwise runs divisible
    by 2 become all en dashes; remaining lengths use em dashes followed
    by one or two en dashes so the whole run is consumed.
    """
    en_count = 0
    em_count = 0
    # Use floor division (//) throughout: true division returns floats
    # on Python 3, and multiplying a string by a float raises TypeError.
    if len(chars) % 3 == 0:
        # If divisible by 3, use all em dashes
        em_count = len(chars) // 3
    elif len(chars) % 2 == 0:
        # If divisible by 2, use all en dashes
        en_count = len(chars) // 2
    elif len(chars) % 3 == 2:
        # if 2 extra dashes, use en dash for last 2;
        # em dashes for rest
        en_count = 1
        em_count = (len(chars) - 2) // 3
    else:
        # Use en dashes for last 4 hyphens; em dashes for rest
        en_count = 2
        em_count = (len(chars) - 4) // 3
    return ('\u2014' * em_count) + ('\u2013' * en_count)
98
|
|
|
|
99
|
|
|
|
100
|
|
|
class InlineParser(object):
    """INLINE PARSER

    These are methods of an InlineParser class, defined below.
    An InlineParser keeps track of a subject (a string to be
    parsed) and a position in that subject.
    """

    def __init__(self, options=None):
        """Create a parser.

        options: optional dict of parser options (e.g. {'smart': True}).
            Defaults to an empty dict; a fresh dict is used per instance
            to avoid the shared-mutable-default pitfall.
        """
        self.subject = ''      # string currently being parsed
        self.brackets = None   # linked stack of open [ / ![ openers
        # Linked stack of emphasis/quote delimiters.  Initialized here
        # (not only in parseInlines) so methods that push onto it never
        # hit an AttributeError on a fresh instance.
        self.delimiters = None
        self.pos = 0           # current offset into self.subject
        self.refmap = {}       # link reference definitions seen so far
        self.options = {} if options is None else options
114
|
|
|
|
115
|
|
|
def match(self, regexString): |
116
|
|
|
""" |
117
|
|
|
If regexString matches at current position in the subject, advance |
118
|
|
|
position in subject and return the match; otherwise return None. |
119
|
|
|
""" |
120
|
|
|
match = re.search(regexString, self.subject[self.pos:]) |
121
|
|
|
if match is None: |
122
|
|
|
return None |
123
|
|
|
else: |
124
|
|
|
self.pos += match.end() |
125
|
|
|
return match.group() |
126
|
|
|
|
127
|
|
|
def peek(self): |
128
|
|
|
""" Returns the character at the current subject position, or None if |
129
|
|
|
there are no more characters.""" |
130
|
|
|
if self.pos < len(self.subject): |
131
|
|
|
return self.subject[self.pos] |
132
|
|
|
else: |
133
|
|
|
return None |
134
|
|
|
|
135
|
|
|
def spnl(self): |
136
|
|
|
""" Parse zero or more space characters, including at |
137
|
|
|
most one newline.""" |
138
|
|
|
self.match(reSpnl) |
139
|
|
|
return True |
140
|
|
|
|
141
|
|
|
# All of the parsers below try to match something at the current position |
142
|
|
|
# in the subject. If they succeed in matching anything, they |
143
|
|
|
# push an inline matched, advancing the subject. |
144
|
|
|
|
145
|
|
|
    def parseBackticks(self, block):
        """ Attempt to parse backticks, adding either a backtick code span or a
        literal sequence of backticks to the 'inlines' list.

        Returns True whenever an opening backtick run was consumed
        (even if no matching closer was found), False otherwise.
        """
        ticks = self.match(reTicksHere)
        if ticks is None:
            return False
        after_open_ticks = self.pos
        matched = self.match(reTicks)
        while matched is not None:
            if (matched == ticks):
                # Closing run of the same length found: emit a code span
                # from the content between the two runs.
                node = Node('code', None)
                c = self.subject[after_open_ticks:self.pos - len(ticks)]
                c = c.strip()
                # Collapse internal whitespace runs to single spaces.
                c = re.sub(reWhitespace, ' ', c)
                node.literal = c
                block.append_child(node)
                return True
            matched = self.match(reTicks)
        # If we got here, we didn't match a closing backtick sequence.
        self.pos = after_open_ticks
        block.append_child(text(ticks))
        return True
167
|
|
|
|
168
|
|
|
def parseBackslash(self, block): |
169
|
|
|
""" |
170
|
|
|
Parse a backslash-escaped special character, adding either the |
171
|
|
|
escaped character, a hard line break (if the backslash is followed |
172
|
|
|
by a newline), or a literal backslash to the block's children. |
173
|
|
|
Assumes current character is a backslash. |
174
|
|
|
""" |
175
|
|
|
subj = self.subject |
176
|
|
|
self.pos += 1 |
177
|
|
|
|
178
|
|
|
try: |
179
|
|
|
subjchar = subj[self.pos] |
180
|
|
|
except IndexError: |
181
|
|
|
subjchar = None |
182
|
|
|
|
183
|
|
|
if self.peek() == '\n': |
184
|
|
|
self.pos += 1 |
185
|
|
|
node = Node('linebreak', None) |
186
|
|
|
block.append_child(node) |
187
|
|
|
elif subjchar and re.match(reEscapable, subjchar): |
188
|
|
|
block.append_child(text(subjchar)) |
189
|
|
|
self.pos += 1 |
190
|
|
|
else: |
191
|
|
|
block.append_child(text('\\')) |
192
|
|
|
|
193
|
|
|
return True |
194
|
|
|
|
195
|
|
|
def parseAutolink(self, block): |
196
|
|
|
"""Attempt to parse an autolink (URL or email in pointy brackets).""" |
197
|
|
|
m = self.match(reEmailAutolink) |
198
|
|
|
|
199
|
|
|
if m: |
200
|
|
|
# email |
201
|
|
|
dest = m[1:-1] |
202
|
|
|
node = Node('link', None) |
203
|
|
|
node.destination = normalize_uri('mailto:' + dest) |
204
|
|
|
node.title = '' |
205
|
|
|
node.append_child(text(dest)) |
206
|
|
|
block.append_child(node) |
207
|
|
|
return True |
208
|
|
|
else: |
209
|
|
|
m = self.match(reAutolink) |
210
|
|
|
if m: |
211
|
|
|
# link |
212
|
|
|
dest = m[1:-1] |
213
|
|
|
node = Node('link', None) |
214
|
|
|
node.destination = normalize_uri(dest) |
215
|
|
|
node.title = '' |
216
|
|
|
node.append_child(text(dest)) |
217
|
|
|
block.append_child(node) |
218
|
|
|
return True |
219
|
|
|
|
220
|
|
|
return False |
221
|
|
|
|
222
|
|
|
def parseHtmlTag(self, block): |
223
|
|
|
"""Attempt to parse a raw HTML tag.""" |
224
|
|
|
m = self.match(common.reHtmlTag) |
225
|
|
|
if m is None: |
226
|
|
|
return False |
227
|
|
|
else: |
228
|
|
|
node = Node('html_inline', None) |
229
|
|
|
node.literal = m |
230
|
|
|
block.append_child(node) |
231
|
|
|
return True |
232
|
|
|
|
233
|
|
|
    def scanDelims(self, c):
        """
        Scan a sequence of characters == c, and return information about
        the number of delimiters and whether they are positioned such that
        they can open and/or close emphasis or strong emphasis. A utility
        function for strong/emph parsing.

        Returns a dict with 'numdelims', 'can_open' and 'can_close', or
        None when no delimiter run starts here.  The scan position is
        restored before returning.
        """
        numdelims = 0
        startpos = self.pos

        # Quotes are always single-character delimiters; * and _ may run.
        if c == "'" or c == '"':
            numdelims += 1
            self.pos += 1
        else:
            while (self.peek() == c):
                numdelims += 1
                self.pos += 1

        if numdelims == 0:
            return None

        # Character before the run ('\n' at start of subject).
        c_before = '\n' if startpos == 0 else self.subject[startpos - 1]

        # Character after the run ('\n' at end of subject).
        c_after = self.peek()
        if c_after is None:
            c_after = '\n'

        # Python 2 doesn't recognize '\xa0' as whitespace
        after_is_whitespace = re.match(reUnicodeWhitespaceChar, c_after) or \
            c_after == '\xa0'
        after_is_punctuation = re.match(rePunctuation, c_after)
        before_is_whitespace = re.match(reUnicodeWhitespaceChar, c_before) or \
            c_before == '\xa0'
        before_is_punctuation = re.match(rePunctuation, c_before)

        # Left/right-flanking as defined by the CommonMark emphasis rules.
        left_flanking = not after_is_whitespace and \
            not (after_is_punctuation and
                 not before_is_whitespace and
                 not before_is_punctuation)
        right_flanking = not before_is_whitespace and \
            not (before_is_punctuation and
                 not after_is_whitespace and
                 not after_is_punctuation)
        if c == '_':
            # '_' has stricter intraword rules than '*'.
            can_open = left_flanking and \
                (not right_flanking or before_is_punctuation)
            can_close = right_flanking and \
                (not left_flanking or after_is_punctuation)
        elif c == "'" or c == '"':
            can_open = left_flanking and not right_flanking
            can_close = right_flanking
        else:
            can_open = left_flanking
            can_close = right_flanking

        # Restore the position: this is a lookahead, not a consumer.
        self.pos = startpos
        return {
            'numdelims': numdelims,
            'can_open': can_open,
            'can_close': can_close,
        }
294
|
|
|
|
295
|
|
|
    def handleDelim(self, cc, block):
        """Handle a delimiter marker for emphasis or a quote.

        Emits a text node for the run (smart quotes are emitted as the
        closing/opening curly character immediately) and pushes an
        entry onto the delimiter stack for processEmphasis to resolve
        later.  Returns False when no delimiter run starts here.
        """
        res = self.scanDelims(cc)
        if not res:
            return False
        numdelims = res.get('numdelims')
        startpos = self.pos

        self.pos += numdelims
        if cc == "'":
            contents = '\u2019'
        elif cc == '"':
            contents = '\u201C'
        else:
            contents = self.subject[startpos:self.pos]
        node = text(contents)
        block.append_child(node)

        # Add entry to stack for this opener
        self.delimiters = {
            'cc': cc,                       # delimiter character
            'numdelims': numdelims,         # remaining (consumable) count
            'origdelims': numdelims,        # original run length
            'node': node,                   # the emitted text node
            'previous': self.delimiters,    # doubly linked stack
            'next': None,
            'can_open': res.get('can_open'),
            'can_close': res.get('can_close'),
        }
        if self.delimiters['previous'] is not None:
            self.delimiters['previous']['next'] = self.delimiters
        return True
327
|
|
|
|
328
|
|
|
def removeDelimiter(self, delim): |
329
|
|
|
if delim.get('previous') is not None: |
330
|
|
|
delim['previous']['next'] = delim.get('next') |
331
|
|
|
if delim.get('next') is None: |
332
|
|
|
# Top of stack |
333
|
|
|
self.delimiters = delim.get('previous') |
334
|
|
|
else: |
335
|
|
|
delim['next']['previous'] = delim.get('previous') |
336
|
|
|
|
337
|
|
|
@staticmethod |
338
|
|
|
def removeDelimitersBetween(bottom, top): |
339
|
|
|
if bottom.get('next') != top: |
340
|
|
|
bottom['next'] = top |
341
|
|
|
top['previous'] = bottom |
342
|
|
|
|
343
|
|
|
    def processEmphasis(self, stack_bottom):
        """Resolve the delimiter stack above *stack_bottom* into
        emph/strong nodes and smart-quote characters.

        Walks forward over closers, pairing each with the nearest
        compatible opener, wrapping the inlines between them, and
        removing consumed delimiters.  Any delimiters left above
        stack_bottom are discarded at the end.
        """
        openers_bottom = {
            '_': stack_bottom,
            '*': stack_bottom,
            "'": stack_bottom,
            '"': stack_bottom,
        }
        odd_match = False
        use_delims = 0

        # Find first closer above stack_bottom
        closer = self.delimiters
        while closer is not None and closer.get('previous') != stack_bottom:
            closer = closer.get('previous')

        # Move forward, looking for closers, and handling each
        while closer is not None:
            if not closer.get('can_close'):
                closer = closer.get('next')
            else:
                # found emphasis closer. now look back for first
                # matching opener:
                opener = closer.get('previous')
                opener_found = False
                closercc = closer.get('cc')
                while (opener is not None and opener != stack_bottom and
                       opener != openers_bottom[closercc]):
                    # "Rule of three": runs whose lengths sum to a
                    # multiple of 3 cannot pair when either delimiter
                    # could both open and close.
                    odd_match = (closer.get('can_open') or
                                 opener.get('can_close')) and \
                        (opener.get('origdelims') +
                         closer.get('origdelims')) % 3 == 0
                    if opener.get('cc') == closercc and \
                            opener.get('can_open') and \
                            not odd_match:
                        opener_found = True
                        break
                    opener = opener.get('previous')
                old_closer = closer

                if closercc == '*' or closercc == '_':
                    if not opener_found:
                        closer = closer.get('next')
                    else:
                        # Calculate actual number of delimiters used from
                        # closer
                        if closer['numdelims'] < 3 or opener['numdelims'] < 3:
                            if closer['numdelims'] <= opener['numdelims']:
                                use_delims = closer['numdelims']
                            else:
                                use_delims = opener['numdelims']
                        else:
                            if closer['numdelims'] % 2 == 0:
                                use_delims = 2
                            else:
                                use_delims = 1

                        opener_inl = opener.get('node')
                        closer_inl = closer.get('node')

                        # Remove used delimiters from stack elts and inlines
                        opener['numdelims'] -= use_delims
                        closer['numdelims'] -= use_delims
                        opener_inl.literal = opener_inl.literal[
                            :len(opener_inl.literal) - use_delims]
                        closer_inl.literal = closer_inl.literal[
                            :len(closer_inl.literal) - use_delims]

                        # Build contents for new Emph element
                        if use_delims == 1:
                            emph = Node('emph', None)
                        else:
                            emph = Node('strong', None)

                        # Move every inline between opener and closer
                        # into the new emphasis node.
                        tmp = opener_inl.nxt
                        while tmp and tmp != closer_inl:
                            nxt = tmp.nxt
                            tmp.unlink()
                            emph.append_child(tmp)
                            tmp = nxt

                        opener_inl.insert_after(emph)

                        # Remove elts between opener and closer in delimiters
                        # stack
                        self.removeDelimitersBetween(opener, closer)

                        # If opener has 0 delims, remove it and the inline
                        if opener['numdelims'] == 0:
                            opener_inl.unlink()
                            self.removeDelimiter(opener)

                        if closer['numdelims'] == 0:
                            closer_inl.unlink()
                            tempstack = closer['next']
                            self.removeDelimiter(closer)
                            closer = tempstack

                elif closercc == "'":
                    # Smart single quote: curly close, and curly open
                    # for the matching opener if one was found.
                    closer['node'].literal = '\u2019'
                    if opener_found:
                        opener['node'].literal = '\u2018'
                    closer = closer['next']

                elif closercc == '"':
                    # Smart double quote, handled as above.
                    closer['node'].literal = '\u201D'
                    if opener_found:
                        opener['node'].literal = '\u201C'
                    closer = closer['next']

                if not opener_found and not odd_match:
                    # Set lower bound for future searches for openers:
                    # We don't do this with odd_match because a **
                    # that doesn't match an earlier * might turn into
                    # an opener, and the * might be matched by something
                    # else.
                    openers_bottom[closercc] = old_closer['previous']
                    if not old_closer['can_open']:
                        # We can remove a closer that can't be an opener,
                        # once we've seen there's no matching opener:
                        self.removeDelimiter(old_closer)

        # Remove all delimiters
        while self.delimiters is not None and self.delimiters != stack_bottom:
            self.removeDelimiter(self.delimiters)
467
|
|
|
|
468
|
|
|
def parseLinkTitle(self): |
469
|
|
|
""" |
470
|
|
|
Attempt to parse link title (sans quotes), returning the string |
471
|
|
|
or None if no match. |
472
|
|
|
""" |
473
|
|
|
title = self.match(reLinkTitle) |
474
|
|
|
if title is None: |
475
|
|
|
return None |
476
|
|
|
else: |
477
|
|
|
# chop off quotes from title and unescape: |
478
|
|
|
return unescape_string(title[1:-1]) |
479
|
|
|
|
480
|
|
|
    def parseLinkDestination(self):
        """
        Attempt to parse link destination, returning the string or
        None if no match.

        Handles both the <...> form and bare destinations with
        balanced parentheses; the result is unescaped and normalized.
        """
        res = self.match(reLinkDestinationBraces)
        if res is None:
            # TODO handrolled parser; res should be None or the string
            # No <...> form: scan a bare destination, tracking balanced
            # parentheses and honoring backslash escapes.
            savepos = self.pos
            openparens = 0
            c = self.peek()
            while c is not None:
                if c == '\\':
                    # Skip the backslash and, if present, the escaped char.
                    self.pos += 1
                    if self.peek() is not None:
                        self.pos += 1
                elif c == '(':
                    self.pos += 1
                    openparens += 1
                elif c == ')':
                    if openparens < 1:
                        # Unbalanced ')': it closes the enclosing link.
                        break
                    else:
                        self.pos += 1
                        openparens -= 1
                elif re.match(reWhitespaceChar, c):
                    # Whitespace terminates a bare destination.
                    break
                else:
                    self.pos += 1
                c = self.peek()
            res = self.subject[savepos:self.pos]
            return normalize_uri(unescape_string(res))
        else:
            # chop off surrounding <..>:
            return normalize_uri(unescape_string(res[1:-1]))
515
|
|
|
|
516
|
|
|
def parseLinkLabel(self): |
517
|
|
|
""" |
518
|
|
|
Attempt to parse a link label, returning number of |
519
|
|
|
characters parsed. |
520
|
|
|
""" |
521
|
|
|
m = self.match(reLinkLabel) |
522
|
|
|
if m is None or len(m) > 1001 or re.match(r'\[\s+\]', m): |
523
|
|
|
return 0 |
524
|
|
|
else: |
525
|
|
|
return len(m) |
526
|
|
|
|
527
|
|
|
def parseOpenBracket(self, block): |
528
|
|
|
""" |
529
|
|
|
Add open bracket to delimiter stack and add a text node to |
530
|
|
|
block's children. |
531
|
|
|
""" |
532
|
|
|
startpos = self.pos |
533
|
|
|
self.pos += 1 |
534
|
|
|
|
535
|
|
|
node = text('[') |
536
|
|
|
block.append_child(node) |
537
|
|
|
|
538
|
|
|
# Add entry to stack for this opener |
539
|
|
|
self.addBracket(node, startpos, False) |
540
|
|
|
return True |
541
|
|
|
|
542
|
|
|
def parseBang(self, block): |
543
|
|
|
""" |
544
|
|
|
If next character is [, and ! delimiter to delimiter stack and |
545
|
|
|
add a text node to block's children. Otherwise just add a text |
546
|
|
|
node. |
547
|
|
|
""" |
548
|
|
|
startpos = self.pos |
549
|
|
|
self.pos += 1 |
550
|
|
|
if self.peek() == '[': |
551
|
|
|
self.pos += 1 |
552
|
|
|
|
553
|
|
|
node = text('![') |
554
|
|
|
block.append_child(node) |
555
|
|
|
|
556
|
|
|
# Add entry to stack for this openeer |
557
|
|
|
self.addBracket(node, startpos + 1, True) |
558
|
|
|
else: |
559
|
|
|
block.append_child(text('!')) |
560
|
|
|
|
561
|
|
|
return True |
562
|
|
|
|
563
|
|
|
def parseCloseBracket(self, block): |
564
|
|
|
""" |
565
|
|
|
Try to match close bracket against an opening in the delimiter |
566
|
|
|
stack. Add either a link or image, or a plain [ character, |
567
|
|
|
to block's children. If there is a matching delimiter, |
568
|
|
|
remove it from the delimiter stack. |
569
|
|
|
""" |
570
|
|
|
title = None |
571
|
|
|
matched = False |
572
|
|
|
self.pos += 1 |
573
|
|
|
startpos = self.pos |
574
|
|
|
|
575
|
|
|
# get last [ or ![ |
576
|
|
|
opener = self.brackets |
577
|
|
|
|
578
|
|
|
if opener is None: |
579
|
|
|
# no matched opener, just return a literal |
580
|
|
|
block.append_child(text(']')) |
581
|
|
|
return True |
582
|
|
|
|
583
|
|
|
if not opener.get('active'): |
584
|
|
|
# no matched opener, just return a literal |
585
|
|
|
block.append_child(text(']')) |
586
|
|
|
# take opener off brackets stack |
587
|
|
|
self.removeBracket() |
588
|
|
|
return True |
589
|
|
|
|
590
|
|
|
# If we got here, opener is a potential opener |
591
|
|
|
is_image = opener.get('image') |
592
|
|
|
|
593
|
|
|
# Check to see if we have a link/image |
594
|
|
|
|
595
|
|
|
savepos = self.pos |
596
|
|
|
|
597
|
|
|
# Inline link? |
598
|
|
|
if self.peek() == '(': |
599
|
|
|
self.pos += 1 |
600
|
|
|
self.spnl() |
601
|
|
|
dest = self.parseLinkDestination() |
602
|
|
|
if dest is not None and self.spnl(): |
603
|
|
|
# make sure there's a space before the title |
604
|
|
|
if re.match(reWhitespaceChar, self.subject[self.pos-1]): |
605
|
|
|
title = self.parseLinkTitle() |
606
|
|
|
if self.spnl() and self.peek() == ')': |
607
|
|
|
self.pos += 1 |
608
|
|
|
matched = True |
609
|
|
|
else: |
610
|
|
|
self.pos = savepos |
611
|
|
|
|
612
|
|
|
if not matched: |
613
|
|
|
# Next, see if there's a link label |
614
|
|
|
beforelabel = self.pos |
615
|
|
|
n = self.parseLinkLabel() |
616
|
|
|
if n > 2: |
617
|
|
|
reflabel = self.subject[beforelabel:beforelabel + n] |
618
|
|
|
elif not opener.get('bracket_after'): |
619
|
|
|
# Empty or missing second label means to use the first |
620
|
|
|
# label as the reference. The reference must not |
621
|
|
|
# contain a bracket. If we know there's a bracket, we |
622
|
|
|
# don't even bother checking it. |
623
|
|
|
reflabel = self.subject[opener.get('index'):startpos] |
624
|
|
|
if n == 0: |
625
|
|
|
# If shortcut reference link, rewind before spaces we skipped. |
626
|
|
|
self.pos = savepos |
627
|
|
|
|
628
|
|
|
if reflabel: |
629
|
|
|
# lookup rawlabel in refmap |
630
|
|
|
link = self.refmap.get(normalizeReference(reflabel)) |
631
|
|
|
if link: |
632
|
|
|
dest = link['destination'] |
633
|
|
|
title = link['title'] |
634
|
|
|
matched = True |
635
|
|
|
|
636
|
|
|
if matched: |
637
|
|
|
node = Node('image' if is_image else 'link', None) |
638
|
|
|
|
639
|
|
|
node.destination = dest |
640
|
|
|
node.title = title or '' |
641
|
|
|
tmp = opener.get('node').nxt |
642
|
|
|
while tmp: |
643
|
|
|
nxt = tmp.nxt |
644
|
|
|
tmp.unlink() |
645
|
|
|
node.append_child(tmp) |
646
|
|
|
tmp = nxt |
647
|
|
|
block.append_child(node) |
648
|
|
|
self.processEmphasis(opener.get('previousDelimiter')) |
649
|
|
|
self.removeBracket() |
650
|
|
|
opener.get('node').unlink() |
651
|
|
|
|
652
|
|
|
# We remove this bracket and processEmphasis will remove |
653
|
|
|
# later delimiters. |
654
|
|
|
# Now, for a link, we also deactivate earlier link openers. |
655
|
|
|
# (no links in links) |
656
|
|
|
if not is_image: |
657
|
|
|
opener = self.brackets |
658
|
|
|
while opener is not None: |
659
|
|
|
if not opener.get('image'): |
660
|
|
|
# deactivate this opener |
661
|
|
|
opener['active'] = False |
662
|
|
|
opener = opener.get('previous') |
663
|
|
|
|
664
|
|
|
return True |
665
|
|
|
else: |
666
|
|
|
# no match |
667
|
|
|
# remove this opener from stack |
668
|
|
|
self.removeBracket() |
669
|
|
|
self.pos = startpos |
670
|
|
|
block.append_child(text(']')) |
671
|
|
|
return True |
672
|
|
|
|
673
|
|
|
def addBracket(self, node, index, image): |
674
|
|
|
if self.brackets is not None: |
675
|
|
|
self.brackets['bracketAfter'] = True |
676
|
|
|
|
677
|
|
|
self.brackets = { |
678
|
|
|
'node': node, |
679
|
|
|
'previous': self.brackets, |
680
|
|
|
'previousDelimiter': self.delimiters, |
681
|
|
|
'index': index, |
682
|
|
|
'image': image, |
683
|
|
|
'active': True, |
684
|
|
|
} |
685
|
|
|
|
686
|
|
|
def removeBracket(self): |
687
|
|
|
self.brackets = self.brackets.get('previous') |
688
|
|
|
|
689
|
|
|
def parseEntity(self, block): |
690
|
|
|
"""Attempt to parse an entity.""" |
691
|
|
|
m = self.match(reEntityHere) |
692
|
|
|
if m: |
693
|
|
|
block.append_child(text(HTMLunescape(m))) |
694
|
|
|
return True |
695
|
|
|
else: |
696
|
|
|
return False |
697
|
|
|
|
698
|
|
|
def parseString(self, block): |
699
|
|
|
""" |
700
|
|
|
Parse a run of ordinary characters, or a single character with |
701
|
|
|
a special meaning in markdown, as a plain string. |
702
|
|
|
""" |
703
|
|
|
m = self.match(reMain) |
704
|
|
|
if m: |
705
|
|
|
if self.options.get('smart'): |
706
|
|
|
s = re.sub(reEllipses, '\u2026', m) |
707
|
|
|
s = re.sub(reDash, lambda x: smart_dashes(x.group()), s) |
708
|
|
|
block.append_child(text(s)) |
709
|
|
|
else: |
710
|
|
|
block.append_child(text(m)) |
711
|
|
|
return True |
712
|
|
|
else: |
713
|
|
|
return False |
714
|
|
|
|
715
|
|
|
def parseNewline(self, block): |
716
|
|
|
""" |
717
|
|
|
Parse a newline. If it was preceded by two spaces, return a hard |
718
|
|
|
line break; otherwise a soft line break. |
719
|
|
|
""" |
720
|
|
|
# assume we're at a \n |
721
|
|
|
self.pos += 1 |
722
|
|
|
lastc = block.last_child |
723
|
|
|
if lastc and lastc.t == 'text' and lastc.literal[-1] == ' ': |
724
|
|
|
linebreak = len(lastc.literal) >= 2 and lastc.literal[-2] == ' ' |
725
|
|
|
lastc.literal = re.sub(reFinalSpace, '', lastc.literal) |
726
|
|
|
if linebreak: |
727
|
|
|
node = Node('linebreak', None) |
728
|
|
|
else: |
729
|
|
|
node = Node('softbreak', None) |
730
|
|
|
block.append_child(node) |
731
|
|
|
else: |
732
|
|
|
block.append_child(Node('softbreak', None)) |
733
|
|
|
|
734
|
|
|
# gobble leading spaces in next line |
735
|
|
|
self.match(reInitialSpace) |
736
|
|
|
return True |
737
|
|
|
|
738
|
|
|
def parseReference(self, s, refmap): |
739
|
|
|
"""Attempt to parse a link reference, modifying refmap.""" |
740
|
|
|
self.subject = s |
741
|
|
|
self.pos = 0 |
742
|
|
|
startpos = self.pos |
743
|
|
|
|
744
|
|
|
# label: |
745
|
|
|
match_chars = self.parseLinkLabel() |
746
|
|
|
if match_chars == 0 or match_chars == 2: |
747
|
|
|
return 0 |
748
|
|
|
else: |
749
|
|
|
rawlabel = self.subject[:match_chars] |
750
|
|
|
|
751
|
|
|
# colon: |
752
|
|
|
if (self.peek() == ':'): |
753
|
|
|
self.pos += 1 |
754
|
|
|
else: |
755
|
|
|
self.pos = startpos |
756
|
|
|
return 0 |
757
|
|
|
|
758
|
|
|
# link url |
759
|
|
|
self.spnl() |
760
|
|
|
|
761
|
|
|
dest = self.parseLinkDestination() |
762
|
|
|
if (dest is None or len(dest) == 0): |
763
|
|
|
self.pos = startpos |
764
|
|
|
return 0 |
765
|
|
|
|
766
|
|
|
beforetitle = self.pos |
767
|
|
|
self.spnl() |
768
|
|
|
title = self.parseLinkTitle() |
769
|
|
|
if title is None: |
770
|
|
|
title = '' |
771
|
|
|
# rewind before spaces |
772
|
|
|
self.pos = beforetitle |
773
|
|
|
|
774
|
|
|
# make sure we're at line end: |
775
|
|
|
at_line_end = True |
776
|
|
|
if self.match(reSpaceAtEndOfLine) is None: |
777
|
|
|
if title == '': |
778
|
|
|
at_line_end = False |
779
|
|
|
else: |
780
|
|
|
# the potential title we found is not at the line end, |
781
|
|
|
# but it could still be a legal link reference if we |
782
|
|
|
# discard the title |
783
|
|
|
title == '' |
784
|
|
|
# rewind before spaces |
785
|
|
|
self.pos = beforetitle |
786
|
|
|
# and instead check if the link URL is at the line end |
787
|
|
|
at_line_end = self.match(reSpaceAtEndOfLine) is not None |
788
|
|
|
|
789
|
|
|
if not at_line_end: |
790
|
|
|
self.pos = startpos |
791
|
|
|
return 0 |
792
|
|
|
|
793
|
|
|
normlabel = normalizeReference(rawlabel) |
794
|
|
|
if refmap.get(normlabel) == '': |
795
|
|
|
# label must contain non-whitespace characters |
796
|
|
|
self.pos = startpos |
797
|
|
|
return 0 |
798
|
|
|
|
799
|
|
|
if refmap.get(normlabel) is None: |
800
|
|
|
refmap[normlabel] = { |
801
|
|
|
'destination': dest, |
802
|
|
|
'title': title |
803
|
|
|
} |
804
|
|
|
return (self.pos - startpos) |
805
|
|
|
|
806
|
|
|
    def parseInline(self, block):
        """
        Parse the next inline element in subject, advancing subject
        position.

        On success, add the result to block's children and return True.
        On failure, return False.
        """
        res = False
        c = self.peek()
        if c is None:
            # End of subject: nothing left to parse.
            return False
        # Dispatch on the current character; each handler returns a
        # truthy value when it consumed input.
        if c == '\n':
            res = self.parseNewline(block)
        elif c == '\\':
            res = self.parseBackslash(block)
        elif c == '`':
            res = self.parseBackticks(block)
        elif c == '*' or c == '_':
            res = self.handleDelim(c, block)
        elif c == "'" or c == '"':
            # Quotes are only delimiters when the 'smart' option is on.
            res = self.options.get('smart') and self.handleDelim(c, block)
        elif c == '[':
            res = self.parseOpenBracket(block)
        elif c == '!':
            res = self.parseBang(block)
        elif c == ']':
            res = self.parseCloseBracket(block)
        elif c == '<':
            res = self.parseAutolink(block) or self.parseHtmlTag(block)
        elif c == '&':
            res = self.parseEntity(block)
        else:
            res = self.parseString(block)

        if not res:
            # No construct matched: emit the character as literal text.
            self.pos += 1
            block.append_child(text(c))

        return True
846
|
|
|
|
847
|
|
|
    def parseInlines(self, block):
        """
        Parse string content in block into inline children,
        using refmap to resolve references.
        """
        self.subject = block.string_content.strip()
        self.pos = 0
        self.delimiters = None
        self.brackets = None
        # Consume the whole subject, one inline at a time.
        while (self.parseInline(block)):
            pass
        # allow raw string to be garbage collected
        block.string_content = None
        # Resolve any emphasis/quote delimiters still on the stack.
        self.processEmphasis(None)

    # Public entry point used by the block parser.
    parse = parseInlines
863
|
|
|
|