"""
PRE-PROCESSORS
=============================================================================

Preprocessors work on source text before we start doing anything too
complicated.
"""
8
|
|
|
|
9
|
|
|
from __future__ import absolute_import |
10
|
|
|
from __future__ import unicode_literals |
11
|
|
|
from . import util |
12
|
|
|
from . import odict |
13
|
|
|
import re |
14
|
|
|
|
15
|
|
|
|
16
|
|
|
def build_preprocessors(md_instance, **kwargs):
    """
    Assemble the default, ordered collection of preprocessors.

    The whitespace normalizer is always registered first and the reference
    preprocessor last.  The HTML block preprocessor sits between them and is
    left out when ``md_instance.safeMode`` equals ``'escape'``.

    ``kwargs`` is accepted but not used by this function.

    Returns an ``odict.OrderedDict`` mapping each preprocessor's name to an
    instance constructed with ``md_instance``.
    """
    registry = odict.OrderedDict()

    # (name, class) pairs, listed in the order they must be registered.
    stages = [('normalize_whitespace', NormalizeWhitespace)]
    if md_instance.safeMode != 'escape':
        stages.append(("html_block", HtmlBlockPreprocessor))
    stages.append(("reference", ReferencePreprocessor))

    for name, factory in stages:
        registry[name] = factory(md_instance)
    return registry
24
|
|
|
|
25
|
|
|
|
26
|
|
|
class Preprocessor(util.Processor):
    """
    Base class for preprocessors, which run after the text has been broken
    into lines.

    A preprocessor provides a ``run`` method.  It receives the document as a
    list of lines, changes whatever it needs to, and hands back either that
    same list or a newly built one.

    Preprocessors must extend markdown.Preprocessor.

    """
    def run(self, lines):
        """
        Process the document; meant to be overridden by every subclass.

        ``lines`` is the document as a list of strings split by newlines.
        Overrides return the (possibly modified) list of lines.  This base
        implementation does nothing and returns ``None``.

        """
        return None  # pragma: no cover
45
|
|
|
|
46
|
|
|
|
47
|
|
|
class NormalizeWhitespace(Preprocessor):
    """ Normalize whitespace for consistent parsing. """

    def run(self, lines):
        """
        Return the document's lines with whitespace normalized.

        The lines are joined with newlines and then, in this order:

        * every occurrence of ``util.STX`` and ``util.ETX`` is removed
          (presumably these markers are reserved for internal use --
          confirm against ``util``);
        * ``\\r\\n`` and bare ``\\r`` line endings become ``\\n``;
        * two newlines are appended to the end of the text;
        * tabs are expanded using ``self.markdown.tab_length``;
        * a run of spaces that directly follows a newline and is directly
          followed by a newline is dropped, leaving an empty line.

        The result is split on ``\\n`` and returned as a new list.
        """
        text = '\n'.join(lines)

        for marker in (util.STX, util.ETX):
            text = text.replace(marker, "")

        # Windows endings first, so their "\r" is not converted separately.
        text = text.replace("\r\n", "\n")
        text = text.replace("\r", "\n")
        text += "\n\n"

        text = text.expandtabs(self.markdown.tab_length)

        blank_line_spaces = re.compile(r'(?<=\n) +\n')
        text = blank_line_spaces.sub('\n', text)
        return text.split('\n')
57
|
|
|
|
58
|
|
|
|
59
|
|
|
class HtmlBlockPreprocessor(Preprocessor):
    """Remove html blocks from the text and store them for later retrieval."""

    # Templates for candidate closing tags; each is filled with the opening
    # tag's name and they are tried in this order.
    right_tag_patterns = ["</%s>", "%s>"]
    # One attribute of an opening tag, in any of three spellings.  Compiled
    # with re.VERBOSE, so the layout and "#" comments inside are ignored.
    attrs_pattern = r"""
        \s+(?P<attr>[^>"'/= ]+)=(?P<q>['"])(?P<value>.*?)(?P=q)  # attr="value"
        |                                                        # OR
        \s+(?P<attr1>[^>"'/= ]+)=(?P<value1>[^> ]+)              # attr=value
        |                                                        # OR
        \s+(?P<attr2>[^>"'/= ]+)                                 # attr
        """
    # An opening tag at the very start of a block: name, then attributes.
    left_tag_pattern = r'^\<(?P<tag>[^> ]+)(?P<attrs>(%s)*)\s*\/?\>?' % \
                       attrs_pattern
    attrs_re = re.compile(attrs_pattern, re.VERBOSE)
    left_tag_re = re.compile(left_tag_pattern, re.VERBOSE)
    # When true, elements carrying a "markdown" attribute are recorded with
    # htmlStash.store_tag and their content is left in the text.
    markdown_in_raw = False

    def _get_left_tag(self, block):
        """
        Parse the opening tag at the start of ``block``.

        Returns a ``(tag, length, attrs)`` tuple: the tag name, the number
        of characters the opening tag occupies and a dict of its attributes
        (attributes without a value map to ``""``).

        When the block does not match ``left_tag_re`` the tag is whatever
        lies between the first character and the first ``>``, lower-cased,
        and ``attrs`` is empty.
        """
        m = self.left_tag_re.match(block)
        if not m:
            tag = block[1:].split(">", 1)[0].lower()
            return tag, len(tag) + 2, {}

        attrs = {}
        raw_attrs = m.group('attrs')
        if raw_attrs:
            for ma in self.attrs_re.finditer(raw_attrs):
                # Exactly one of attr / attr1 / attr2 is set per match,
                # depending on which spelling of the attribute was used.
                if ma.group('attr'):
                    attrs[ma.group('attr').strip()] = ma.group('value') or ""
                elif ma.group('attr1'):
                    attrs[ma.group('attr1').strip()] = \
                        ma.group('value1') or ""
                elif ma.group('attr2'):
                    attrs[ma.group('attr2').strip()] = ""
        return m.group('tag'), len(m.group(0)), attrs

    def _recursive_tagfind(self, ltag, rtag, start_index, block):
        """
        Find the ``rtag`` that balances an already opened ``ltag``.

        Searching starts at ``start_index``.  Every further ``ltag`` met
        before an ``rtag`` is treated as a nested element whose own closing
        tag is located recursively and skipped.

        Returns the index just past the matching ``rtag``, or ``-1`` when
        none is found.
        """
        while True:
            i = block.find(rtag, start_index)
            if i == -1:
                return -1
            j = block.find(ltag, start_index)
            # if no ltag, or rtag found before another ltag, return index
            if (j > i or j == -1):
                return i + len(rtag)
            # another ltag found before rtag, use end of ltag as starting
            # point and search again
            j = block.find('>', j)
            start_index = self._recursive_tagfind(ltag, rtag, j + 1, block)
            if start_index == -1:
                # HTML potentially malformed- ltag has no corresponding
                # rtag
                return -1

    def _get_right_tag(self, left_tag, left_index, block):
        """
        Locate the closing tag belonging to ``left_tag`` inside ``block``.

        Each template in ``right_tag_patterns`` is tried in turn, searching
        from ``left_index``.  On success the result is the closing tag with
        its leading ``<`` and trailing ``>`` stripped, plus the index just
        past it.  If no template matches, the result is the lower-cased
        tail of the stripped block and ``len(block)``.
        """
        for p in self.right_tag_patterns:
            tag = p % left_tag
            i = self._recursive_tagfind(
                "<%s" % left_tag, tag, left_index, block
            )
            # -1 means "not found"; NOTE(review): the reason for requiring
            # more than 2 rather than more than -1 is not visible here.
            if i > 2:
                return tag.lstrip("<").rstrip(">"), i
        return block.rstrip()[-left_index:-1].lower(), len(block)

    def _equal_tags(self, left_tag, right_tag):
        """
        Tell whether ``right_tag`` closes ``left_tag``.

        True when ``left_tag`` starts with ``?``, ``@`` or ``%``, when
        ``right_tag`` is ``left_tag`` prefixed with ``/``, or when both are
        ``--`` (a comment).  An empty ``left_tag`` no longer raises.
        """
        if left_tag[:1] in ('?', '@', '%'):  # handle PHP, etc.
            return True
        if ("/" + left_tag) == right_tag:
            return True
        return right_tag == "--" and left_tag == "--"

    def _is_oneliner(self, tag):
        """Return True for ``hr`` and ``hr/``."""
        return (tag in ['hr', 'hr/'])

    def _stringindex_to_listindex(self, stringindex, items):
        """
        Same effect as concatenating the strings in items,
        finding the character to which stringindex refers in that string,
        and returning the index of the item in which that character resides.

        An index up to four characters past the end of the concatenation
        yields ``len(items)``.  ``items`` itself is not modified.
        """
        # The extra entry gives positions at or just past the end a slot to
        # land in; it is added to a copy so the caller's list stays intact.
        padded = list(items)
        padded.append('dummy')
        i, count = 0, 0
        while count <= stringindex:
            count += len(padded[i])
            i += 1
        return i - 1

    def _nested_markdown_in_html(self, items):
        """
        Find and process html child elements of the given element block.

        ``items`` is modified in place and also returned.  A child element
        with a ``markdown`` attribute has its opening and closing tags cut
        away and a ``store_tag`` placeholder inserted before it; any other
        child is replaced, together with the items it spans, by a ``store``
        placeholder.
        """
        # NOTE: the list is deliberately edited while it is being iterated.
        for i, item in enumerate(items):
            if self.left_tag_re.match(item):
                remainder = ''.join(items[i:])
                left_tag, left_index, attrs = self._get_left_tag(remainder)
                right_tag, data_index = self._get_right_tag(
                    left_tag, left_index, remainder)
                right_listindex = \
                    self._stringindex_to_listindex(data_index, items[i:]) + i
                if 'markdown' in attrs:
                    items[i] = items[i][left_index:]  # remove opening tag
                    placeholder = self.markdown.htmlStash.store_tag(
                        left_tag, attrs, i + 1, right_listindex + 1)
                    items.insert(i, placeholder)
                    if len(items) - right_listindex <= 1:  # last nest, no tail
                        right_listindex -= 1
                    items[right_listindex] = items[right_listindex][
                        :-len(right_tag) - 2]  # remove closing tag
                else:  # raw html
                    if len(items) - right_listindex <= 1:  # last element
                        right_listindex -= 1
                    if right_listindex <= i:
                        right_listindex = i + 1
                    placeholder = self.markdown.htmlStash.store('\n\n'.join(
                        items[i:right_listindex]))
                    del items[i:right_listindex]
                    items.insert(i, placeholder)
        return items

    def _stash_items(self, items, left_tag, left_index, right_tag, attrs):
        """
        Stash an element that spans several blocks.

        Returns the list of blocks that stand in for the element in the
        output.  Normally that is a single placeholder for the items joined
        by blank lines.  With ``markdown_in_raw`` set and a ``markdown``
        attribute present, the outer tags are cut from ``items`` (in
        place), the tag is recorded with ``store_tag`` and the content,
        after nested elements have been processed, follows its placeholder.
        """
        stash = self.markdown.htmlStash
        if not (self.markdown_in_raw and 'markdown' in attrs):
            return [stash.store('\n\n'.join(items))]

        items[0] = items[0][left_index:]
        items[-1] = items[-1][:-len(right_tag) - 2]
        if items[-1]:  # not a newline/empty string
            right_index = len(items) + 3
        else:
            right_index = len(items) + 2
        blocks = [stash.store_tag(left_tag, attrs, 0, right_index)]
        placeholderslen = len(stash.tag_data)
        blocks.extend(self._nested_markdown_in_html(items))
        # Nested elements added entries to tag_data, so the entry created
        # above now sits ``nests`` places before the last one.
        nests = len(stash.tag_data) - placeholderslen
        stash.tag_data[-1 - nests]['right_index'] += nests - 2
        return blocks

    def run(self, lines):
        """
        Replace raw HTML blocks in the document with stash placeholders.

        The text is split into blocks at blank lines.  A block starting
        with ``<`` whose tag is block level (per ``util.isBlockLevel``), a
        comment, or starts with ``!``, ``?``, ``@`` or ``%`` is passed to
        ``self.markdown.htmlStash`` and the returned placeholder is emitted
        in its place.  An element whose closing tag is not in the same
        block is collected across the following blocks until it closes, or
        until the text ends.  ``hr`` tags and all other blocks pass through.

        Returns the resulting document as a list of lines.
        """
        text = "\n".join(lines)
        new_blocks = []
        text = text.rsplit("\n\n")
        items = []  # blocks of the element currently being collected
        left_tag = ''
        right_tag = ''
        left_index = 0
        attrs = {}
        in_tag = False  # flag

        while text:
            block = text[0]
            if block.startswith("\n"):
                block = block[1:]
            text = text[1:]

            if block.startswith("\n"):
                block = block[1:]

            if not in_tag:
                if block.startswith("<") and len(block.strip()) > 1:

                    if block[1:4] == "!--":
                        # is a comment block
                        left_tag, left_index, attrs = "--", 2, {}
                    else:
                        left_tag, left_index, attrs = self._get_left_tag(block)
                    right_tag, data_index = self._get_right_tag(left_tag,
                                                                left_index,
                                                                block)
                    # keep checking conditions below and maybe just append

                    if data_index < len(block) and \
                            (util.isBlockLevel(left_tag) or left_tag == '--'):
                        # Text follows the closing tag: push it back so it
                        # is handled as a block of its own.
                        text.insert(0, block[data_index:])
                        block = block[:data_index]

                    if not (util.isBlockLevel(left_tag) or
                            block[1] in ["!", "?", "@", "%"]):
                        new_blocks.append(block)
                        continue

                    if self._is_oneliner(left_tag):
                        new_blocks.append(block.strip())
                        continue

                    if block.rstrip().endswith(">") \
                            and self._equal_tags(left_tag, right_tag):
                        if self.markdown_in_raw and 'markdown' in attrs:
                            block = block[left_index:-len(right_tag) - 2]
                            new_blocks.append(self.markdown.htmlStash.
                                              store_tag(left_tag, attrs, 0, 2))
                            new_blocks.append(block)
                        else:
                            new_blocks.append(
                                self.markdown.htmlStash.store(block.strip()))
                        continue
                    else:
                        # if is block level tag and is not complete
                        if (not self._equal_tags(left_tag, right_tag)) and \
                           (util.isBlockLevel(left_tag) or left_tag == "--"):
                            items.append(block.strip())
                            in_tag = True
                        else:
                            new_blocks.append(
                                self.markdown.htmlStash.store(block.strip())
                            )
                        continue

                else:
                    new_blocks.append(block)

            else:
                items.append(block)

                # Need to evaluate all items so we can calculate relative to
                # the left index.
                right_tag, data_index = self._get_right_tag(
                    left_tag, left_index, ''.join(items))
                # Adjust data_index: relative to items -> relative to last
                # block
                prev_block_length = 0
                for item in items[:-1]:
                    prev_block_length += len(item)
                data_index -= prev_block_length

                if self._equal_tags(left_tag, right_tag):
                    # if find closing tag

                    if data_index < len(block):
                        # we have more text after right_tag
                        items[-1] = block[:data_index]
                        text.insert(0, block[data_index:])

                    in_tag = False
                    new_blocks.extend(self._stash_items(
                        items, left_tag, left_index, right_tag, attrs))
                    items = []

        if items:
            # The text ended while an element was still open.
            new_blocks.extend(self._stash_items(
                items, left_tag, left_index, right_tag, attrs))
            new_blocks.append('\n')

        new_text = "\n\n".join(new_blocks)
        return new_text.split("\n")
322
|
|
|
|
323
|
|
|
|
324
|
|
|
class ReferencePreprocessor(Preprocessor):
    """ Remove reference definitions from text and store for later use. """

    # A title in double quotes, single quotes or parentheses.
    TITLE = r'[ ]*(\"(.*)\"|\'(.*)\'|\((.*)\))[ ]*'
    # A definition line: "[id]: link" with an optional title after the link.
    RE = re.compile(
        r'^[ ]{0,3}\[([^\]]*)\]:\s*([^ ]*)[ ]*(%s)?$' % TITLE, re.DOTALL
    )
    # A line holding nothing but a title.
    TITLE_RE = re.compile(r'^%s$' % TITLE)

    def run(self, lines):
        """
        Collect reference definitions and return the remaining lines.

        Every line matching ``RE`` is left out of the result and recorded in
        ``self.markdown.references`` as ``id -> (link, title)``.  The id is
        stripped and lower-cased; one layer of surrounding ``<`` / ``>`` is
        removed from the link.  When the definition carries no title and
        the next line consists of a title only, that line supplies the
        title and is left out as well.  Otherwise the title is ``None``.

        A definition on the last line is handled without error, and
        ``lines`` itself is left unmodified.

        Returns a new list with the lines that were not consumed.
        """
        new_text = []
        total = len(lines)
        index = 0
        while index < total:
            line = lines[index]
            index += 1
            m = self.RE.match(line)
            if not m:
                new_text.append(line)
                continue

            ref_id = m.group(1).strip().lower()
            link = m.group(2).lstrip('<').rstrip('>')
            title = m.group(5) or m.group(6) or m.group(7)
            if not title and index < total:
                # Check next line for title
                tm = self.TITLE_RE.match(lines[index])
                if tm:
                    index += 1
                    title = tm.group(2) or tm.group(3) or tm.group(4)
            self.markdown.references[ref_id] = (link, title)

        return new_text
353
|
|
|
|