|
1
|
|
|
""" |
|
2
|
|
|
PRE-PROCESSORS |
|
3
|
|
|
============================================================================= |
|
4
|
|
|
|
|
5
|
|
|
Preprocessors work on source text before we start doing anything too |
|
6
|
|
|
complicated. |
|
7
|
|
|
""" |
|
8
|
|
|
|
|
9
|
|
|
from __future__ import absolute_import |
|
10
|
|
|
from __future__ import unicode_literals |
|
11
|
|
|
from . import util |
|
12
|
|
|
from . import odict |
|
13
|
|
|
import re |
|
14
|
|
|
|
|
15
|
|
|
|
|
16
|
|
|
def build_preprocessors(md_instance, **kwargs):
    """Return the default preprocessors as an ordered mapping.

    The mapping is an ``odict.OrderedDict`` keyed by preprocessor name, with
    entries inserted in this order:

    * ``'normalize_whitespace'`` -- always present.
    * ``'html_block'`` -- only when ``md_instance.safeMode`` is not
      ``'escape'``.
    * ``'reference'`` -- always present.

    ``md_instance`` is handed to every preprocessor constructor.  Extra
    keyword arguments are accepted and ignored.
    """
    registry = odict.OrderedDict()
    registry['normalize_whitespace'] = NormalizeWhitespace(md_instance)

    # In 'escape' safe mode the raw-HTML stashing step is left out.
    html_blocks_enabled = md_instance.safeMode != 'escape'
    if html_blocks_enabled:
        registry["html_block"] = HtmlBlockPreprocessor(md_instance)

    registry["reference"] = ReferencePreprocessor(md_instance)
    return registry
|
24
|
|
|
|
|
25
|
|
|
|
|
26
|
|
|
class Preprocessor(util.Processor):
    """
    Base class for preprocessors, which run after the text is broken
    into lines.

    A preprocessor exposes a single ``run`` method.  It receives the
    document as a list of lines, changes it as needed, and hands back either
    that same list or a newly built one.

    Every preprocessor must extend markdown.Preprocessor.

    """

    def run(self, lines):
        """
        Transform the document; subclasses are expected to override this.

        ``lines`` is the document as a list of strings split on newlines.
        An override returns the (possibly modified) list of lines.  This
        base implementation does nothing and returns ``None``.

        """
        return None  # pragma: no cover
|
45
|
|
|
|
|
46
|
|
|
|
|
47
|
|
|
class NormalizeWhitespace(Preprocessor):
    """ Normalize whitespace for consistent parsing. """

    def run(self, lines):
        """Return ``lines`` re-split after whitespace normalization.

        Steps, in order: join the lines with newlines; delete every
        ``util.STX`` and ``util.ETX`` character; convert ``\\r\\n`` and bare
        ``\\r`` to ``\\n``; append two newlines; expand tabs using
        ``self.markdown.tab_length``; blank out lines that consist only of
        spaces; split on newlines again.
        """
        text = '\n'.join(lines)

        # Drop the placeholder delimiter characters from the input.
        for marker in (util.STX, util.ETX):
            text = text.replace(marker, "")

        # "\r\n" must be handled before the bare "\r".
        for line_ending in ("\r\n", "\r"):
            text = text.replace(line_ending, "\n")
        text += "\n\n"

        text = text.expandtabs(self.markdown.tab_length)

        # A run of spaces that fills a whole line (i.e. directly follows a
        # newline) is removed, leaving an empty line.
        cleaned = re.sub(r'(?<=\n) +\n', '\n', text)
        return cleaned.split('\n')
|
57
|
|
|
|
|
58
|
|
|
|
|
59
|
|
|
class HtmlBlockPreprocessor(Preprocessor):
    """Remove html blocks from the text and store them for later retrieval.

    ``run`` splits the document on blank lines and inspects every block that
    starts with ``<``.  Blocks whose tag is block level (as judged by
    ``util.isBlockLevel``), or whose second character is one of ``! ? @ %``,
    are passed to ``self.markdown.htmlStash.store`` and the value returned by
    that call is emitted in their place.  An element that spans several
    blocks is accumulated until its closing tag is found and then stored as
    one unit.

    When ``markdown_in_raw`` is true and the opening tag carries a
    ``markdown`` attribute, the element's content is kept in the output and
    only the tag itself is recorded through ``htmlStash.store_tag``.
    """

    # Closing-tag templates tried in order by ``_get_right_tag``.
    right_tag_patterns = ["</%s>", "%s>"]
    # Compiled with re.VERBOSE, so the layout whitespace and the trailing
    # ``# ...`` remarks inside the pattern are ignored by the regex engine.
    # The newline before the closing quotes matters: it terminates the last
    # in-pattern comment when this text is embedded in ``left_tag_pattern``.
    attrs_pattern = r"""
        \s+(?P<attr>[^>"'/= ]+)=(?P<q>['"])(?P<value>.*?)(?P=q)   # attr="value"
        |                                                         # OR
        \s+(?P<attr1>[^>"'/= ]+)=(?P<value1>[^> ]+)               # attr=value
        |                                                         # OR
        \s+(?P<attr2>[^>"'/= ]+)                                  # attr
        """
    left_tag_pattern = r'^\<(?P<tag>[^> ]+)(?P<attrs>(%s)*)\s*\/?\>?' % \
                       attrs_pattern
    attrs_re = re.compile(attrs_pattern, re.VERBOSE)
    left_tag_re = re.compile(left_tag_pattern, re.VERBOSE)
    # Opt-in switch checked together with the ``markdown`` attribute.
    markdown_in_raw = False

    def _get_left_tag(self, block):
        """Parse the opening tag at the start of ``block``.

        Returns ``(tag, length, attrs)`` where ``length`` is the number of
        characters matched for the opening tag and ``attrs`` maps attribute
        names to their values (``""`` for valueless or empty attributes).

        If ``left_tag_re`` does not match, the tag is taken to be the
        lowercased text between the first character and the first ``>``,
        with no attributes.
        """
        m = self.left_tag_re.match(block)
        if not m:
            tag = block[1:].split(">", 1)[0].lower()
            return tag, len(tag) + 2, {}

        attrs = {}
        raw_attrs = m.group('attrs')
        if raw_attrs:
            for ma in self.attrs_re.finditer(raw_attrs):
                # Exactly one of the three alternatives matched; a missing
                # or empty value is recorded as the empty string.
                if ma.group('attr'):
                    attrs[ma.group('attr').strip()] = ma.group('value') or ""
                elif ma.group('attr1'):
                    attrs[ma.group('attr1').strip()] = \
                        ma.group('value1') or ""
                elif ma.group('attr2'):
                    attrs[ma.group('attr2').strip()] = ""
        return m.group('tag'), len(m.group(0)), attrs

    def _recursive_tagfind(self, ltag, rtag, start_index, block):
        """Find the end of the ``rtag`` that balances an already-open ``ltag``.

        Searching starts at ``start_index``.  Every further ``ltag`` met
        before the next ``rtag`` is treated as a nested element and skipped
        by recursing past it.  Returns the index just after the matching
        ``rtag``, or ``-1`` when none is found.
        """
        while True:
            i = block.find(rtag, start_index)
            if i == -1:
                return -1
            j = block.find(ltag, start_index)
            # if no ltag, or rtag found before another ltag, return index
            if j > i or j == -1:
                return i + len(rtag)
            # another ltag found before rtag, use end of ltag as starting
            # point and search again
            j = block.find('>', j)
            start_index = self._recursive_tagfind(ltag, rtag, j + 1, block)
            if start_index == -1:
                # HTML potentially malformed- ltag has no corresponding
                # rtag
                return -1

    def _get_right_tag(self, left_tag, left_index, block):
        """Locate the closing tag for ``left_tag`` inside ``block``.

        Each template in ``right_tag_patterns`` is tried in turn.  On
        success returns ``(right_tag, end_index)`` where ``right_tag`` is the
        closing tag stripped of its ``<`` / ``>`` and ``end_index`` points
        just past it.  If no template yields an index greater than 2, falls
        back to the lowercased tail of the block and ``len(block)``.
        """
        for p in self.right_tag_patterns:
            tag = p % left_tag
            i = self._recursive_tagfind(
                "<%s" % left_tag, tag, left_index, block
            )
            # NOTE(review): the threshold 2 is kept from the original code;
            # its rationale is not visible here.
            if i > 2:
                return tag.lstrip("<").rstrip(">"), i
        return block.rstrip()[-left_index:-1].lower(), len(block)

    def _equal_tags(self, left_tag, right_tag):
        """Return True when ``right_tag`` is accepted as closing ``left_tag``.

        Accepted pairs: any ``left_tag`` starting with ``?``, ``@`` or ``%``;
        ``right_tag`` equal to ``"/" + left_tag``; and the comment pair
        ``"--"`` / ``"--"``.

        Uses slicing rather than indexing, so an empty tag yields False
        where the previous implementation raised ``IndexError``.
        """
        if left_tag[:1] in ('?', '@', '%'):  # handle PHP, etc.
            return True
        if right_tag == "/" + left_tag:
            return True
        return left_tag == "--" and right_tag == "--"

    def _is_oneliner(self, tag):
        """Return True for the tags ``hr`` and ``hr/``."""
        return tag in ('hr', 'hr/')

    def _stringindex_to_listindex(self, stringindex, items):
        """
        Same effect as concatenating the strings in items,
        finding the character to which stringindex refers in that string,
        and returning the index of the item in which that character resides.

        An index that falls up to ``len('dummy') - 1`` characters past the
        end of the concatenation maps to ``len(items)``; anything further
        raises ``IndexError``.  ``items`` itself is not modified.
        """
        # Work on lengths only.  The trailing sentinel length stands in for
        # the 'dummy' element earlier versions appended to the caller's list.
        lengths = [len(item) for item in items]
        lengths.append(len('dummy'))
        i, count = 0, 0
        while count <= stringindex:
            count += lengths[i]
            i += 1
        return i - 1

    def _nested_markdown_in_html(self, items):
        """Find and process html child elements of the given element block.

        ``items`` is modified in place (and also returned): child elements
        with a ``markdown`` attribute have their tags stripped and a
        ``store_tag`` placeholder inserted before them; other child elements
        are replaced wholesale by a ``store`` placeholder.
        """
        # NOTE: the list is edited while being enumerated; the index
        # arithmetic below depends on that, so statement order matters.
        for i, item in enumerate(items):
            if self.left_tag_re.match(item):
                joined = ''.join(items[i:])
                left_tag, left_index, attrs = self._get_left_tag(joined)
                right_tag, data_index = self._get_right_tag(
                    left_tag, left_index, joined)
                right_listindex = \
                    self._stringindex_to_listindex(data_index, items[i:]) + i
                if 'markdown' in attrs:
                    items[i] = items[i][left_index:]  # remove opening tag
                    placeholder = self.markdown.htmlStash.store_tag(
                        left_tag, attrs, i + 1, right_listindex + 1)
                    items.insert(i, placeholder)
                    if len(items) - right_listindex <= 1:  # last nest, no tail
                        right_listindex -= 1
                    items[right_listindex] = items[right_listindex][
                        :-len(right_tag) - 2]  # remove closing tag
                else:  # raw html
                    if len(items) - right_listindex <= 1:  # last element
                        right_listindex -= 1
                    if right_listindex <= i:
                        right_listindex = i + 1
                    placeholder = self.markdown.htmlStash.store('\n\n'.join(
                        items[i:right_listindex]))
                    del items[i:right_listindex]
                    items.insert(i, placeholder)
        return items

    def _stash_pending_items(self, items, left_tag, left_index, right_tag,
                             attrs, new_blocks):
        """Emit the accumulated blocks of one multi-block element.

        Appends to ``new_blocks``.  With ``markdown_in_raw`` enabled and a
        ``markdown`` attribute present, the opening and closing tags are cut
        from ``items`` (in place), a ``store_tag`` placeholder is emitted and
        the remaining content follows after nested elements are processed.
        Otherwise the items are joined with blank lines and stored as a
        single raw block.
        """
        stash = self.markdown.htmlStash
        if self.markdown_in_raw and 'markdown' in attrs:
            items[0] = items[0][left_index:]
            items[-1] = items[-1][:-len(right_tag) - 2]
            if items[-1]:  # not a newline/empty string
                right_index = len(items) + 3
            else:
                right_index = len(items) + 2
            new_blocks.append(
                stash.store_tag(left_tag, attrs, 0, right_index))
            placeholderslen = len(stash.tag_data)
            new_blocks.extend(self._nested_markdown_in_html(items))
            # Adjust the outer tag's recorded right_index by the number of
            # nested tags that were registered while processing children.
            nests = len(stash.tag_data) - placeholderslen
            stash.tag_data[-1 - nests]['right_index'] += nests - 2
        else:
            new_blocks.append(stash.store('\n\n'.join(items)))

    def run(self, lines):
        """Replace raw html blocks in ``lines`` with stash placeholders.

        Returns a new list of lines.  See the class docstring for which
        blocks are stashed.
        """
        # rsplit (not split) is intentional: with runs of three or more
        # newlines the two produce different block boundaries.
        text = "\n".join(lines).rsplit("\n\n")
        new_blocks = []
        items = []      # blocks of an element whose closing tag is pending
        left_tag = ''
        right_tag = ''
        in_tag = False  # True while collecting a multi-block element

        while text:
            block = text.pop(0)
            # Strip at most two leading newlines.
            if block.startswith("\n"):
                block = block[1:]
            if block.startswith("\n"):
                block = block[1:]

            if not in_tag:
                if block.startswith("<") and len(block.strip()) > 1:

                    if block[1:4] == "!--":
                        # is a comment block
                        left_tag, left_index, attrs = "--", 2, {}
                    else:
                        left_tag, left_index, attrs = self._get_left_tag(block)
                    right_tag, data_index = self._get_right_tag(left_tag,
                                                                left_index,
                                                                block)
                    # keep checking conditions below and maybe just append

                    if data_index < len(block) and \
                            (util.isBlockLevel(left_tag) or left_tag == '--'):
                        # Text follows the closing tag: push it back to be
                        # handled as its own block.
                        text.insert(0, block[data_index:])
                        block = block[:data_index]

                    if not (util.isBlockLevel(left_tag) or
                            block[1] in ["!", "?", "@", "%"]):
                        # Not something this preprocessor stashes.
                        new_blocks.append(block)
                        continue

                    if self._is_oneliner(left_tag):
                        new_blocks.append(block.strip())
                        continue

                    if block.rstrip().endswith(">") \
                            and self._equal_tags(left_tag, right_tag):
                        # Complete element contained in a single block.
                        if self.markdown_in_raw and 'markdown' in attrs:
                            block = block[left_index:-len(right_tag) - 2]
                            new_blocks.append(self.markdown.htmlStash.
                                              store_tag(left_tag, attrs, 0, 2))
                            new_blocks.extend([block])
                        else:
                            new_blocks.append(
                                self.markdown.htmlStash.store(block.strip()))
                        continue
                    else:
                        # if is block level tag and is not complete
                        if (not self._equal_tags(left_tag, right_tag)) and \
                           (util.isBlockLevel(left_tag) or left_tag == "--"):
                            items.append(block.strip())
                            in_tag = True
                        else:
                            new_blocks.append(
                                self.markdown.htmlStash.store(block.strip())
                            )
                        continue

                else:
                    new_blocks.append(block)

            else:
                items.append(block)

                # Need to evaluate all items so we can calculate relative to
                # the left index.
                right_tag, data_index = self._get_right_tag(
                    left_tag, left_index, ''.join(items))
                # Adjust data_index: relative to items -> relative to last
                # block
                data_index -= sum(len(item) for item in items[:-1])

                if self._equal_tags(left_tag, right_tag):
                    # if find closing tag

                    if data_index < len(block):
                        # we have more text after right_tag
                        items[-1] = block[:data_index]
                        text.insert(0, block[data_index:])

                    in_tag = False
                    self._stash_pending_items(
                        items, left_tag, left_index, right_tag, attrs,
                        new_blocks)
                    items = []

        if items:
            # Input ended while an element was still open: flush what was
            # collected so far.
            self._stash_pending_items(
                items, left_tag, left_index, right_tag, attrs, new_blocks)
            new_blocks.append('\n')

        new_text = "\n\n".join(new_blocks)
        return new_text.split("\n")
|
322
|
|
|
|
|
323
|
|
|
|
|
324
|
|
|
class ReferencePreprocessor(Preprocessor):
    """ Remove reference definitions from text and store for later use. """

    # Optional title: "..." or '...' or (...), padded by spaces.
    TITLE = r'[ ]*(\"(.*)\"|\'(.*)\'|\((.*)\))[ ]*'
    # ``[id]: link`` indented by at most three spaces, with an optional
    # title on the same line.
    RE = re.compile(
        r'^[ ]{0,3}\[([^\]]*)\]:\s*([^ ]*)[ ]*(%s)?$' % TITLE, re.DOTALL
    )
    # A line consisting of nothing but a title.
    TITLE_RE = re.compile(r'^%s$' % TITLE)

    def run(self, lines):
        """Strip reference definitions out of ``lines``.

        Every line matching ``RE`` is removed and recorded in
        ``self.markdown.references`` as ``references[id] = (link, title)``,
        where ``id`` is stripped and lowercased, ``link`` has leading ``<``
        and trailing ``>`` removed, and ``title`` is ``None`` when absent.
        If the definition line carries no title and the following line
        matches ``TITLE_RE``, that line supplies the title and is removed as
        well.

        Returns a new list holding the remaining lines in order.  The input
        list is read by index and left untouched.  A title-less definition
        on the very last line is handled without error.
        """
        new_text = []
        total = len(lines)
        index = 0
        while index < total:
            line = lines[index]
            index += 1

            m = self.RE.match(line)
            if not m:
                new_text.append(line)
                continue

            ref_id = m.group(1).strip().lower()
            link = m.group(2).lstrip('<').rstrip('>')
            # Groups 5-7 hold the title text for the three quoting styles.
            title = m.group(5) or m.group(6) or m.group(7)
            if not title and index < total:
                # Check next line for title
                tm = self.TITLE_RE.match(lines[index])
                if tm:
                    index += 1  # consume the title line
                    title = tm.group(2) or tm.group(3) or tm.group(4)
            self.markdown.references[ref_id] = (link, title)

        return new_text
|
353
|
|
|
|