| Metric           | Value  |
|------------------|--------|
| Total Complexity | 327    |
| Total Lines      | 2014   |
| Duplicated Lines | 3.53 % |
| Changes          | 0      |
Duplicate code is one of the most pungent code smells. A commonly used rule of thumb is to restructure code once it is duplicated in three or more places.

Common duplication problems, and their corresponding solutions, are flagged inline in the listing below.

Complex classes like `Markdown` often do a lot of different things. To break such a class down, we need to identify a cohesive component within the class. A common approach to finding such a component is to look for fields and methods that share the same prefixes or suffixes.

Once you have determined which fields belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often the faster option; a minimal sketch of the Extract Class move follows.
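As an illustration of that refactoring, here is a hedged sketch on hypothetical classes; the field names are borrowed from the listing below, but none of this code is from the project itself:

```python
# Before: one class mixes conversion state with footnote bookkeeping.
# The shared "footnote_" prefix hints at a cohesive component.
class Converter(object):
    def __init__(self):
        self.footnote_title = None
        self.footnote_return_symbol = None
        self.footnote_ids = []

# After Extract Class: the prefixed fields move into their own class.
class FootnoteOptions(object):
    def __init__(self, title=None, return_symbol=None):
        self.title = title
        self.return_symbol = return_symbol
        self.ids = []

class ConverterRefactored(object):
    def __init__(self):
        self.footnotes = FootnoteOptions()
```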
```python
#!/usr/bin/env python

# ... (module-level code omitted in this report) ...

class Markdown(object):
    # The dict of "extras" to enable in processing -- a mapping of
    # extra name to argument for the extra. Most extras do not have an
    # argument, in which case the value is None.
    #
    # This can be set via (a) subclassing and (b) the constructor
    # "extras" argument.
    extras = None

    urls = None
    titles = None
    html_blocks = None
    html_spans = None
    html_removed_text = "[HTML_REMOVED]"  # for compat with markdown.py

    # Used to track when we're inside an ordered or unordered list
    # (see _ProcessListItems() for details):
    list_level = 0

    _ws_only_line_re = re.compile(r"^[ \t]+$", re.M)

    def __init__(self, html4tags=False, tab_width=4, safe_mode=None,
                 extras=None, link_patterns=None,
                 footnote_title=None, footnote_return_symbol=None,
                 use_file_vars=False):
        if html4tags:
            self.empty_element_suffix = ">"
        else:
            self.empty_element_suffix = " />"
        self.tab_width = tab_width

        # For compatibility with earlier markdown2.py and with
        # markdown.py's safe_mode being a boolean,
        #   safe_mode == True -> "replace"
        if safe_mode is True:
            self.safe_mode = "replace"
        else:
            self.safe_mode = safe_mode

        # Massaging and building the "extras" info.
        if self.extras is None:
            self.extras = {}
        elif not isinstance(self.extras, dict):
            self.extras = dict([(e, None) for e in self.extras])
        if extras:
            if not isinstance(extras, dict):
                extras = dict([(e, None) for e in extras])
            self.extras.update(extras)
        assert isinstance(self.extras, dict)
        if "toc" in self.extras and "header-ids" not in self.extras:
            self.extras["header-ids"] = None  # "toc" implies "header-ids"
        self._instance_extras = self.extras.copy()

        self.link_patterns = link_patterns
        self.footnote_title = footnote_title
        self.footnote_return_symbol = footnote_return_symbol
        self.use_file_vars = use_file_vars
        self._outdent_re = re.compile(r'^(\t|[ ]{1,%d})' % tab_width, re.M)

        self._escape_table = g_escape_table.copy()
        if "smarty-pants" in self.extras:
            self._escape_table['"'] = _hash_text('"')
            self._escape_table["'"] = _hash_text("'")

    def reset(self):
        self.urls = {}
        self.titles = {}
        self.html_blocks = {}
        self.html_spans = {}
        self.list_level = 0
        self.extras = self._instance_extras.copy()
        if "footnotes" in self.extras:
            self.footnotes = {}
            self.footnote_ids = []
        if "header-ids" in self.extras:
            self._count_from_header_id = {}  # no `defaultdict` in Python 2.4
        if "metadata" in self.extras:
            self.metadata = {}

    # Per <https://developer.mozilla.org/en-US/docs/HTML/Element/a> "rel"
    # should only be used in <a> tags with an "href" attribute.
    _a_nofollow = re.compile(r"""
        <(a)
        (
            [^>]*
            href=   # href is required
            ['"]?   # HTML5 attribute values do not have to be quoted
            [^#'"]  # We don't want to match href values that start with # (like footnotes)
        )
        """,
        re.IGNORECASE | re.VERBOSE
    )

    # Opens the linked document in a new window or tab; should only be
    # used in <a> tags with an "href" attribute, same as _a_nofollow.
    _a_blank = _a_nofollow

    def convert(self, text):
        """Convert the given text."""
        # Main function. The order in which other subs are called here is
        # essential. Link and image substitutions need to happen before
        # _EscapeSpecialChars(), so that any *'s or _'s in the <a>
        # and <img> tags get encoded.

        # Clear the global hashes. If we don't clear these, you get conflicts
        # from other articles when generating a page which contains more than
        # one article (e.g. an index page that shows the N most recent
        # articles):
        self.reset()

        if not isinstance(text, unicode):
            # TODO: perhaps shouldn't presume UTF-8 for string input?
            text = unicode(text, 'utf-8')

        if self.use_file_vars:
            # Look for emacs-style file variable hints.
            emacs_vars = self._get_emacs_vars(text)
            if "markdown-extras" in emacs_vars:
                splitter = re.compile("[ ,]+")
                for e in splitter.split(emacs_vars["markdown-extras"]):
                    if '=' in e:
                        ename, earg = e.split('=', 1)
                        try:
                            earg = int(earg)
                        except ValueError:
                            pass
                    else:
                        ename, earg = e, None
                    self.extras[ename] = earg

        # Standardize line endings:
        text = text.replace("\r\n", "\n")
        text = text.replace("\r", "\n")

        # Make sure $text ends with a couple of newlines:
        text += "\n\n"

        # Convert all tabs to spaces.
        text = self._detab(text)

        # Strip any lines consisting only of spaces and tabs.
        # This makes subsequent regexen easier to write, because we can
        # match consecutive blank lines with /\n+/ instead of something
        # contorted like /[ \t]*\n+/ .
        text = self._ws_only_line_re.sub("", text)

        # Strip metadata from the head and extract it.
        if "metadata" in self.extras:
            text = self._extract_metadata(text)

        text = self.preprocess(text)

        if "fenced-code-blocks" in self.extras and not self.safe_mode:
            text = self._do_fenced_code_blocks(text)

        if self.safe_mode:
            text = self._hash_html_spans(text)

        # Turn block-level HTML blocks into hash entries.
        text = self._hash_html_blocks(text, raw=True)

        if "fenced-code-blocks" in self.extras and self.safe_mode:
            text = self._do_fenced_code_blocks(text)

        # Because numbering references aren't links (yet?), we can do
        # everything associated with counters before we get started.
        if "numbering" in self.extras:
            text = self._do_numbering(text)

        # Strip link definitions, store in hashes.
        if "footnotes" in self.extras:
            # Must do footnotes first because an unlucky footnote defn
            # looks like a link defn:
            #   [^4]: this "looks like a link defn"
            text = self._strip_footnote_definitions(text)
        text = self._strip_link_definitions(text)

        text = self._run_block_gamut(text)

        if "footnotes" in self.extras:
            text = self._add_footnotes(text)

        text = self.postprocess(text)

        text = self._unescape_special_chars(text)

        if self.safe_mode:
            text = self._unhash_html_spans(text)

        if "nofollow" in self.extras:
            text = self._a_nofollow.sub(r'<\1 rel="nofollow"\2', text)

        if "target-blank-links" in self.extras:
            text = self._a_blank.sub(r'<\1 target="_blank"\2', text)

        text += "\n"

        rv = UnicodeWithAttrs(text)
        if "toc" in self.extras:
            rv._toc = self._toc
        if "metadata" in self.extras:
            rv.metadata = self.metadata
        return rv
```
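For orientation, here is a minimal usage sketch of the pipeline above. It assumes this class ships as the usual `markdown2` module; the module-level helpers it relies on (`g_escape_table`, `UnicodeWithAttrs`, and so on) are outside this listing:

```python
# Minimal usage sketch of Markdown.convert(), assuming the standard
# markdown2 module entry points around this class.
import markdown2

md = markdown2.Markdown(extras=["toc", "footnotes"])
html = md.convert("# Title\n\nBody text.[^1]\n\n[^1]: A footnote.")
print(html)  # the rendered HTML; it carries ._toc and .metadata
             # when those extras are enabled, per the tail of convert()
```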
```python
    def postprocess(self, text):
        """A hook for subclasses to do some postprocessing of the html, if
        desired. This is called before unescaping of special chars and
        unhashing of raw HTML spans.
        """
        return text

    def preprocess(self, text):
        """A hook for subclasses to do some preprocessing of the Markdown, if
        desired. This is called after basic formatting of the text, but prior
        to any extras, safe mode, etc. processing.
        """
        return text
```
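These two hooks are the documented subclassing points. A small illustrative sketch follows; the specific transformations chosen here are hypothetical, not part of markdown2:

```python
# Hypothetical subclass using the documented pre/post hooks.
class SiteMarkdown(Markdown):
    def preprocess(self, text):
        # Runs on the Markdown source, before extras/safe-mode handling.
        return text.replace("{{year}}", "2024")

    def postprocess(self, html):
        # Runs on the generated HTML, before unescaping/unhashing.
        return html.replace("<table>", '<table class="striped">')
```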
```python
    # Is metadata if the content starts with optional '---'-fenced `key: value`
    # pairs. E.g. (indented for presentation):
    #   ---
    #   foo: bar
    #   another-var: blah blah
    #   ---
    #   # header
    # or:
    #   foo: bar
    #   another-var: blah blah
    #
    #   # header
    _meta_data_pattern = re.compile(r'^(?:---[\ \t]*\n)?(.*:\s+>\n\s+[\S\s]+?)(?=\n\w+\s*:\s*\w+\n|\Z)|([\S\w]+\s*:(?! >)[ \t]*.*\n?)(?:---[\ \t]*\n)?', re.MULTILINE)
    _key_val_pat = re.compile("[\S\w]+\s*:(?! >)[ \t]*.*\n?", re.MULTILINE)
    # This allows
    #   key: >
    #       value
    # to continue over multiple lines.
    _key_val_block_pat = re.compile(
        "(.*:\s+>\n\s+[\S\s]+?)(?=\n\w+\s*:\s*\w+\n|\Z)", re.MULTILINE)
    _meta_data_fence_pattern = re.compile(r'^---[\ \t]*\n', re.MULTILINE)
    _meta_data_newline = re.compile("^\n", re.MULTILINE)

    def _extract_metadata(self, text):
        if text.startswith("---"):
            fence_splits = re.split(self._meta_data_fence_pattern, text, maxsplit=2)
            metadata_content = fence_splits[1]
            match = re.findall(self._meta_data_pattern, metadata_content)
            if not match:
                return text
            tail = fence_splits[2]
        else:
            metadata_split = re.split(self._meta_data_newline, text, maxsplit=1)
            metadata_content = metadata_split[0]
            match = re.findall(self._meta_data_pattern, metadata_content)
            if not match:
                return text
            tail = metadata_split[1]

        kv = re.findall(self._key_val_pat, metadata_content)
        kvm = re.findall(self._key_val_block_pat, metadata_content)
        kvm = [item.replace(": >\n", ":", 1) for item in kvm]

        for item in kv + kvm:
            k, v = item.split(":", 1)
            self.metadata[k.strip()] = v.strip()

        return tail

    _emacs_oneliner_vars_pat = re.compile(r"-\*-\s*([^\r\n]*?)\s*-\*-", re.UNICODE)
    # This regular expression is intended to match blocks like this:
    #   PREFIX Local Variables: SUFFIX
    #   PREFIX mode: Tcl SUFFIX
    #   PREFIX End: SUFFIX
    # Some notes:
    # - "[ \t]" is used instead of "\s" to specifically exclude newlines
    # - "(\r\n|\n|\r)" is used instead of "$" because the sre engine does
    #   not like anything other than Unix-style line terminators.
    _emacs_local_vars_pat = re.compile(r"""^
        (?P<prefix>(?:[^\r\n|\n|\r])*?)
        [\ \t]*Local\ Variables:[\ \t]*
        (?P<suffix>.*?)(?:\r\n|\n|\r)
        (?P<content>.*?\1End:)
        """, re.IGNORECASE | re.MULTILINE | re.DOTALL | re.VERBOSE)

    def _get_emacs_vars(self, text):
        """Return a dictionary of emacs-style local variables.

        Parsing is done loosely according to this spec (and according to
        some in-practice deviations from this):
        http://www.gnu.org/software/emacs/manual/html_node/emacs/Specifying-File-Variables.html#Specifying-File-Variables
        """
        emacs_vars = {}
        SIZE = pow(2, 13)  # 8kB

        # Search near the start for a '-*-'-style one-liner of variables.
        head = text[:SIZE]
        if "-*-" in head:
            match = self._emacs_oneliner_vars_pat.search(head)
            if match:
                emacs_vars_str = match.group(1)
                assert '\n' not in emacs_vars_str
                emacs_var_strs = [s.strip() for s in emacs_vars_str.split(';')
                                  if s.strip()]
                if len(emacs_var_strs) == 1 and ':' not in emacs_var_strs[0]:
                    # While not in the spec, this form is allowed by emacs:
                    #   -*- Tcl -*-
                    # where the implied "variable" is "mode". This form
                    # is only allowed if there are no other variables.
                    emacs_vars["mode"] = emacs_var_strs[0].strip()
                else:
                    for emacs_var_str in emacs_var_strs:
                        try:
                            variable, value = emacs_var_str.strip().split(':', 1)
                        except ValueError:
                            log.debug("emacs variables error: malformed -*- "
                                      "line: %r", emacs_var_str)
                            continue
                        # Lowercase the variable name because Emacs allows "Mode"
                        # or "mode" or "MoDe", etc.
                        emacs_vars[variable.lower()] = value.strip()

        tail = text[-SIZE:]
        if "Local Variables" in tail:
            match = self._emacs_local_vars_pat.search(tail)
            if match:
                prefix = match.group("prefix")
                suffix = match.group("suffix")
                lines = match.group("content").splitlines(0)
                # print "prefix=%r, suffix=%r, content=%r, lines: %s"\
                #      % (prefix, suffix, match.group("content"), lines)

                # Validate the Local Variables block: proper prefix and suffix
                # usage.
                for i, line in enumerate(lines):
                    if not line.startswith(prefix):
                        log.debug("emacs variables error: line '%s' "
                                  "does not use proper prefix '%s'"
                                  % (line, prefix))
                        return {}
                    # Don't validate suffix on last line. Emacs doesn't care,
                    # neither should we.
                    if i != len(lines)-1 and not line.endswith(suffix):
                        log.debug("emacs variables error: line '%s' "
                                  "does not use proper suffix '%s'"
                                  % (line, suffix))
                        return {}

                # Parse out one emacs var per line.
                continued_for = None
                for line in lines[:-1]:  # no var on the last line ("PREFIX End:")
                    if prefix: line = line[len(prefix):]  # strip prefix
                    if suffix: line = line[:-len(suffix)]  # strip suffix
                    line = line.strip()
                    if continued_for:
                        variable = continued_for
                        if line.endswith('\\'):
                            line = line[:-1].rstrip()
                        else:
                            continued_for = None
                        emacs_vars[variable] += ' ' + line
                    else:
                        try:
                            variable, value = line.split(':', 1)
                        except ValueError:
                            log.debug("local variables error: missing colon "
                                      "in local variables entry: '%s'" % line)
                            continue
                        # Do NOT lowercase the variable name, because Emacs only
                        # allows "mode" (and not "Mode", "MoDe", etc.) in this block.
                        value = value.strip()
                        if value.endswith('\\'):
                            value = value[:-1].rstrip()
                            continued_for = variable
                        else:
                            continued_for = None
                            emacs_vars[variable] = value

        # Unquote values.
        for var, val in list(emacs_vars.items()):
            if len(val) > 1 and (val.startswith('"') and val.endswith('"')
                                 or val.startswith("'") and val.endswith("'")):
                emacs_vars[var] = val[1:-1]

        return emacs_vars
```
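Given the parsing above, the `use_file_vars` constructor flag lets a document enable extras for itself. A hedged sketch of inputs this recognizes (the surrounding document text is illustrative):

```python
# Example inputs handled by _get_emacs_vars() and the use_file_vars
# branch of convert(); document contents here are illustrative only.
md = Markdown(use_file_vars=True)

# One-liner form near the top of the file:
md.convert("<!-- -*- markdown-extras: wiki-tables, code-friendly -*- -->\n# Doc\n")

# "Local Variables" block form near the end of the file:
md.convert("# Doc\n\nLocal Variables:\nmarkdown-extras: footnotes\nEnd:\n")
```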
```python
    def _detab_line(self, line):
        r"""Recursively convert tabs to spaces in a single line.

        Called from _detab()."""
        if '\t' not in line:
            return line
        chunk1, chunk2 = line.split('\t', 1)
        chunk1 += (' ' * (self.tab_width - len(chunk1) % self.tab_width))
        output = chunk1 + chunk2
        return self._detab_line(output)

    def _detab(self, text):
        r"""Iterate text line by line and convert tabs to spaces.

            >>> m = Markdown()
            >>> m._detab("\tfoo")
            '    foo'
            >>> m._detab("  \tfoo")
            '    foo'
            >>> m._detab("\t  foo")
            '      foo'
            >>> m._detab("  foo")
            '  foo'
            >>> m._detab("  foo\n\tbar\tblam")
            '  foo\n    bar blam'
        """
        if '\t' not in text:
            return text
        output = []
        for line in text.splitlines():
            output.append(self._detab_line(line))
        return '\n'.join(output)

    # I broke out the html5 tags here and add them to _block_tags_a and
    # _block_tags_b. This way html5 tags are easy to keep track of.
    _html5tags = '|article|aside|header|hgroup|footer|nav|section|figure|figcaption'

    _block_tags_a = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del'
    _block_tags_a += _html5tags

    _strict_tag_block_re = re.compile(r"""
        (                       # save in \1
            ^                   # start of line (with re.M)
            <(%s)               # start tag = \2
            \b                  # word break
            (.*\n)*?            # any number of lines, minimally matching
            </\2>               # the matching end tag
            [ \t]*              # trailing spaces/tabs
            (?=\n+|\Z)          # followed by a newline or end of document
        )
        """ % _block_tags_a,
        re.X | re.M)

    _block_tags_b = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math'
    _block_tags_b += _html5tags

    _liberal_tag_block_re = re.compile(r"""
        (                       # save in \1
            ^                   # start of line (with re.M)
            <(%s)               # start tag = \2
            \b                  # word break
            (.*\n)*?            # any number of lines, minimally matching
            .*</\2>             # the matching end tag
            [ \t]*              # trailing spaces/tabs
            (?=\n+|\Z)          # followed by a newline or end of document
        )
        """ % _block_tags_b,
        re.X | re.M)

    _html_markdown_attr_re = re.compile(
        r'''\s+markdown=("1"|'1')''')
    def _hash_html_block_sub(self, match, raw=False):
        html = match.group(1)
        if raw and self.safe_mode:
            html = self._sanitize_html(html)
        elif 'markdown-in-html' in self.extras and 'markdown=' in html:
            first_line = html.split('\n', 1)[0]
            m = self._html_markdown_attr_re.search(first_line)
            if m:
                lines = html.split('\n')
                middle = '\n'.join(lines[1:-1])
                last_line = lines[-1]
                first_line = first_line[:m.start()] + first_line[m.end():]
                f_key = _hash_text(first_line)
                self.html_blocks[f_key] = first_line
                l_key = _hash_text(last_line)
                self.html_blocks[l_key] = last_line
                return ''.join(["\n\n", f_key,
                                "\n\n", middle, "\n\n",
                                l_key, "\n\n"])
        key = _hash_text(html)
        self.html_blocks[key] = html
        return "\n\n" + key + "\n\n"

    def _hash_html_blocks(self, text, raw=False):
        """Hashify HTML blocks

        We only want to do this for block-level HTML tags, such as headers,
        lists, and tables. That's because we still want to wrap <p>s around
        "paragraphs" that are wrapped in non-block-level tags, such as anchors,
        phrase emphasis, and spans. The list of tags we're looking for is
        hard-coded.

        @param raw {boolean} indicates if these are raw HTML blocks in
            the original source. It makes a difference in "safe" mode.
        """
        if '<' not in text:
            return text

        # Pass `raw` value into our calls to self._hash_html_block_sub.
        hash_html_block_sub = _curry(self._hash_html_block_sub, raw=raw)

        # First, look for nested blocks, e.g.:
        #   <div>
        #       <div>
        #       tags for inner block must be indented.
        #       </div>
        #   </div>
        #
        # The outermost tags must start at the left margin for this to match,
        # and the inner nested divs must be indented.
        # We need to do this before the next, more liberal match, because the
        # next match will start at the first `<div>` and stop at the first
        # `</div>`.
        text = self._strict_tag_block_re.sub(hash_html_block_sub, text)

        # Now match more liberally, simply from `\n<tag>` to `</tag>\n`.
        text = self._liberal_tag_block_re.sub(hash_html_block_sub, text)

        # Special case just for <hr />. It was easier to make a special
        # case than to make the other regex more complicated.
        if "<hr" in text:
            _hr_tag_re = _hr_tag_re_from_tab_width(self.tab_width)
            text = _hr_tag_re.sub(hash_html_block_sub, text)

        # Special case for standalone HTML comments:
        if "<!--" in text:
            start = 0
            while True:
                # Delimiters for next comment block.
                try:
                    start_idx = text.index("<!--", start)
                except ValueError:
                    break
                try:
                    end_idx = text.index("-->", start_idx) + 3
                except ValueError:
                    break

                # Start position for next comment block search.
                start = end_idx

                # Validate whitespace before comment.
                if start_idx:
                    # - Up to `tab_width - 1` spaces before start_idx.
                    for i in range(self.tab_width - 1):
                        if text[start_idx - 1] != ' ':
                            break
                        start_idx -= 1
                        if start_idx == 0:
                            break
                    # - Must be preceded by 2 newlines or hit the start of
                    #   the document.
                    if start_idx == 0:
                        pass
                    elif start_idx == 1 and text[0] == '\n':
                        start_idx = 0  # to match minute detail of Markdown.pl regex
                    elif text[start_idx-2:start_idx] == '\n\n':
                        pass
                    else:
                        break

                # Validate whitespace after comment.
                # - Any number of spaces and tabs.
                while end_idx < len(text):
                    if text[end_idx] not in ' \t':
                        break
                    end_idx += 1
                # - Must be followed by 2 newlines or hit end of text.
                if text[end_idx:end_idx+2] not in ('', '\n', '\n\n'):
                    continue

                # Escape and hash (must match `_hash_html_block_sub`).
                html = text[start_idx:end_idx]
                if raw and self.safe_mode:
                    html = self._sanitize_html(html)
                key = _hash_text(html)
                self.html_blocks[key] = html
                text = text[:start_idx] + "\n\n" + key + "\n\n" + text[end_idx:]

        if "xml" in self.extras:
            # Treat XML processing instructions and namespaced one-liner
            # tags as if they were block HTML tags. E.g., if standalone
            # (i.e. are their own paragraph), the following do not get
            # wrapped in a <p> tag:
            #    <?foo bar?>
            #
            #    <xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="chapter_1.md"/>
            _xml_oneliner_re = _xml_oneliner_re_from_tab_width(self.tab_width)
            text = _xml_oneliner_re.sub(hash_html_block_sub, text)

        return text

    def _strip_link_definitions(self, text):
        # Strips link definitions from text, stores the URLs and titles in
        # hash references.
        less_than_tab = self.tab_width - 1

        # Link defs are in the form:
        #   [id]: url "optional title"
        _link_def_re = re.compile(r"""
            ^[ ]{0,%d}\[(.+)\]: # id = \1
              [ \t]*
              \n?               # maybe *one* newline
              [ \t]*
            <?(.+?)>?           # url = \2
              [ \t]*
            (?:
                \n?             # maybe one newline
                [ \t]*
                (?<=\s)         # lookbehind for whitespace
                ['"(]
                ([^\n]*)        # title = \3
                ['")]
                [ \t]*
            )?  # title is optional
            (?:\n+|\Z)
            """ % less_than_tab, re.X | re.M | re.U)
        return _link_def_re.sub(self._extract_link_def_sub, text)

    def _extract_link_def_sub(self, match):
        id, url, title = match.groups()
        key = id.lower()  # Link IDs are case-insensitive
        self.urls[key] = self._encode_amps_and_angles(url)
        if title:
            self.titles[key] = title
        return ""

    def _do_numbering(self, text):
        '''We handle the special extension for generic numbering for
        tables, figures etc.
        '''
        # First pass to define all the references.
        self.regex_defns = re.compile(r'''
            \[\#(\w+)\s* # the counter. Open square plus hash plus a word \1
            ([^@]*)\s*   # Some optional characters, that aren't an @. \2
            @(\w+)       # the id. Should this be normed? \3
            ([^\]]*)\]   # The rest of the text up to the terminating ] \4
            ''', re.VERBOSE)
        self.regex_subs = re.compile(r"\[@(\w+)\s*\]")  # [@ref_id]
        counters = {}
        references = {}
        replacements = []
        definition_html = '<figcaption class="{}" id="counter-ref-{}">{}{}{}</figcaption>'
        reference_html = '<a class="{}" href="#counter-ref-{}">{}</a>'
        for match in self.regex_defns.finditer(text):
            # We must have four match groups otherwise this isn't a numbering reference.
            if len(match.groups()) != 4:
                continue
            counter = match.group(1)
            text_before = match.group(2)
            ref_id = match.group(3)
            text_after = match.group(4)
            number = counters.get(counter, 1)
            references[ref_id] = (number, counter)
            replacements.append((match.start(0),
                                 definition_html.format(counter,
                                                        ref_id,
                                                        text_before,
                                                        number,
                                                        text_after),
                                 match.end(0)))
            counters[counter] = number + 1
        for repl in reversed(replacements):
            text = text[:repl[0]] + repl[1] + text[repl[2]:]

        # Second pass to replace the references with the right
        # value of the counter.
        # FWIW, it's vaguely annoying to have to turn the iterator into
        # a list and then reverse it, but I can't think of a better way.
        for match in reversed(list(self.regex_subs.finditer(text))):
            number, counter = references.get(match.group(1), (None, None))
            if number is not None:
                repl = reference_html.format(counter,
                                             match.group(1),
                                             number)
            else:
                repl = reference_html.format(match.group(1),
                                             'countererror',
                                             '?' + match.group(1) + '?')
            if "smarty-pants" in self.extras:
                repl = repl.replace('"', self._escape_table['"'])

            text = text[:match.start()] + repl + text[match.end():]
        return text
```
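Concretely, the syntax these two passes implement (inferred from `regex_defns` and `regex_subs` above; the sample input is illustrative) looks roughly like this:

```python
# Illustration of the "numbering" extra: a definition of the form
# "[#counter text @id text]" and a reference "[@id]".
md = Markdown(extras=["numbering"])
src = "[#figure Figure @flow: data flow]\n\nAs shown in [@flow].\n"
html = md.convert(src)
# The definition renders roughly as
#   <figcaption class="figure" id="counter-ref-flow">Figure 1: data flow</figcaption>
# and the reference as
#   <a class="figure" href="#counter-ref-flow">1</a>
```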
```python
    def _extract_footnote_def_sub(self, match):
        id, text = match.groups()
        text = _dedent(text, skip_first_line=not text.startswith('\n')).strip()
        normed_id = re.sub(r'\W', '-', id)
        # Ensure footnote text ends with a couple of newlines (for some
        # block gamut matches).
        self.footnotes[normed_id] = text + "\n\n"
        return ""

    def _strip_footnote_definitions(self, text):
        """A footnote definition looks like this:

            [^note-id]: Text of the note.

                May include one or more indented paragraphs.

        Where,
        - The 'note-id' can be pretty much anything, though typically it
          is the number of the footnote.
        - The first paragraph may start on the next line, like so:

            [^note-id]:
                Text of the note.
        """
        less_than_tab = self.tab_width - 1
        footnote_def_re = re.compile(r'''
            ^[ ]{0,%d}\[\^(.+)\]:   # id = \1
            [ \t]*
            (                       # footnote text = \2
              # First line need not start with the spaces.
              (?:\s*.*\n+)
              (?:
                (?:[ ]{%d} | \t)    # Subsequent lines must be indented.
                .*\n+
              )*
            )
            # Lookahead for non-space at line-start, or end of doc.
            (?:(?=^[ ]{0,%d}\S)|\Z)
            ''' % (less_than_tab, self.tab_width, self.tab_width),
            re.X | re.M)
        return footnote_def_re.sub(self._extract_footnote_def_sub, text)

    _hr_re = re.compile(r'^[ ]{0,3}([-_*][ ]{0,2}){3,}$', re.M)

    def _run_block_gamut(self, text):
        # These are all the transformations that form block-level
        # tags like paragraphs, headers, and list items.

        if "fenced-code-blocks" in self.extras:
            text = self._do_fenced_code_blocks(text)

        text = self._do_headers(text)

        # Do Horizontal Rules:
        # On the number of spaces in horizontal rules: The spec is fuzzy: "If
        # you wish, you may use spaces between the hyphens or asterisks."
        # Markdown.pl 1.0.1's hr regexes limit the number of spaces between the
        # hr chars to one or two. We'll reproduce that limit here.
        hr = "\n<hr"+self.empty_element_suffix+"\n"
        text = re.sub(self._hr_re, hr, text)

        text = self._do_lists(text)

        if "pyshell" in self.extras:
            text = self._prepare_pyshell_blocks(text)
        if "wiki-tables" in self.extras:
            text = self._do_wiki_tables(text)
        if "tables" in self.extras:
            text = self._do_tables(text)

        text = self._do_code_blocks(text)

        text = self._do_block_quotes(text)

        # We already ran _HashHTMLBlocks() before, in Markdown(), but that
        # was to escape raw HTML in the original Markdown source. This time,
        # we're escaping the markup we've just created, so that we don't wrap
        # <p> tags around block-level tags.
        text = self._hash_html_blocks(text)

        text = self._form_paragraphs(text)

        return text

    def _pyshell_block_sub(self, match):
        lines = match.group(0).splitlines(0)
        _dedentlines(lines)
        indent = ' ' * self.tab_width
        s = ('\n'  # separate from possible cuddled paragraph
             + indent + ('\n'+indent).join(lines)
             + '\n\n')
        return s

    def _prepare_pyshell_blocks(self, text):
        """Ensure that Python interactive shell sessions are put in
        code blocks -- even if not properly indented.
        """
        if ">>>" not in text:
            return text

        less_than_tab = self.tab_width - 1
        _pyshell_block_re = re.compile(r"""
            ^([ ]{0,%d})>>>[ ].*\n  # first line
            ^(\1.*\S+.*\n)*         # any number of subsequent lines
            ^\n                     # ends with a blank line
            """ % less_than_tab, re.M | re.X)

        return _pyshell_block_re.sub(self._pyshell_block_sub, text)

    def _table_sub(self, match):
        trim_space_re = '^[ \t\n]+|[ \t\n]+$'
        trim_bar_re = '^\||\|$'
        split_bar_re = '^\||(?<!\\\\)\|'
        escape_bar_re = '\\\\\|'

        head, underline, body = match.groups()

        # Determine aligns for columns.
        cols = [re.sub(escape_bar_re, '|', cell.strip()) for cell in re.split(split_bar_re, re.sub(trim_bar_re, "", re.sub(trim_space_re, "", underline)))]
        align_from_col_idx = {}
        for col_idx, col in enumerate(cols):
            if col[0] == ':' and col[-1] == ':':
                align_from_col_idx[col_idx] = ' align="center"'
            elif col[0] == ':':
                align_from_col_idx[col_idx] = ' align="left"'
            elif col[-1] == ':':
                align_from_col_idx[col_idx] = ' align="right"'

        # thead
        hlines = ['<table%s>' % self._html_class_str_from_tag('table'), '<thead>', '<tr>']
        cols = [re.sub(escape_bar_re, '|', cell.strip()) for cell in re.split(split_bar_re, re.sub(trim_bar_re, "", re.sub(trim_space_re, "", head)))]
        for col_idx, col in enumerate(cols):
            hlines.append('  <th%s>%s</th>' % (
                align_from_col_idx.get(col_idx, ''),
                self._run_span_gamut(col)
            ))
        hlines.append('</tr>')
        hlines.append('</thead>')

        # tbody
        hlines.append('<tbody>')
        for line in body.strip('\n').split('\n'):
            hlines.append('<tr>')
            cols = [re.sub(escape_bar_re, '|', cell.strip()) for cell in re.split(split_bar_re, re.sub(trim_bar_re, "", re.sub(trim_space_re, "", line)))]
            for col_idx, col in enumerate(cols):
                hlines.append('  <td%s>%s</td>' % (
                    align_from_col_idx.get(col_idx, ''),
                    self._run_span_gamut(col)
                ))
            hlines.append('</tr>')
        hlines.append('</tbody>')
        hlines.append('</table>')

        return '\n'.join(hlines) + '\n'

    def _do_tables(self, text):
        """Copying PHP-Markdown and GFM table syntax. Some regex borrowed from
        https://github.com/michelf/php-markdown/blob/lib/Michelf/Markdown.php#L2538
        """
        less_than_tab = self.tab_width - 1
        table_re = re.compile(r'''
                (?:(?<=\n\n)|\A\n?)         # leading blank line

                ^[ ]{0,%d}                  # allowed whitespace
                (.*[|].*)  \n               # $1: header row (at least one pipe)

                ^[ ]{0,%d}                  # allowed whitespace
                (                           # $2: underline row
                    # underline row with leading bar
                    (?:  \|\ *:?-+:?\ *  )+  \|?  \n
                    |
                    # or, underline row without leading bar
                    (?:  \ *:?-+:?\ *\|  )+  (?:  \ *:?-+:?\ *  )?  \n
                )

                (                           # $3: data rows
                    (?:
                        ^[ ]{0,%d}(?!\ )    # ensure line begins with 0 to less_than_tab spaces
                        .*\|.*  \n
                    )+
                )
            ''' % (less_than_tab, less_than_tab, less_than_tab), re.M | re.X)
        return table_re.sub(self._table_sub, text)
```
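An input sketch for the table support above; per `_table_sub()`, the `:` markers in the underline row drive cell alignment:

```python
# Example of the GFM-style syntax matched by table_re above.
table_md = (
    "| Name  | Count |\n"
    "|:------|------:|\n"
    "| alpha |     3 |\n"
)
html = Markdown(extras=["tables"]).convert(table_md)
# Produces <th align="left">Name</th> and <th align="right">Count</th>
# header cells, with the same alignment applied to the <td> cells.
```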
```python
    def _wiki_table_sub(self, match):
        ttext = match.group(0).strip()
        # print 'wiki table: %r' % match.group(0)
        rows = []
        for line in ttext.splitlines(0):
            line = line.strip()[2:-2].strip()
            row = [c.strip() for c in re.split(r'(?<!\\)\|\|', line)]
            rows.append(row)
        # pprint(rows)
        hlines = ['<table%s>' % self._html_class_str_from_tag('table'), '<tbody>']
        for row in rows:
            hrow = ['<tr>']
            for cell in row:
                hrow.append('<td>')
                hrow.append(self._run_span_gamut(cell))
                hrow.append('</td>')
            hrow.append('</tr>')
            hlines.append(''.join(hrow))
        hlines += ['</tbody>', '</table>']
        return '\n'.join(hlines) + '\n'

    def _do_wiki_tables(self, text):
        # Optimization.
        if "||" not in text:
            return text

        less_than_tab = self.tab_width - 1
        wiki_table_re = re.compile(r'''
            (?:(?<=\n\n)|\A\n?)            # leading blank line
            ^([ ]{0,%d})\|\|.+?\|\|[ ]*\n  # first line
            (^\1\|\|.+?\|\|\n)*            # any number of subsequent lines
            ''' % less_than_tab, re.M | re.X)
        return wiki_table_re.sub(self._wiki_table_sub, text)

    def _run_span_gamut(self, text):
        # These are all the transformations that occur *within* block-level
        # tags like paragraphs, headers, and list items.

        text = self._do_code_spans(text)

        text = self._escape_special_chars(text)

        # Process anchor and image tags.
        text = self._do_links(text)

        # Make links out of things like `<http://example.com/>`.
        # Must come after _do_links(), because you can use < and >
        # delimiters in inline links like [this](<url>).
        text = self._do_auto_links(text)

        if "link-patterns" in self.extras:
            text = self._do_link_patterns(text)

        text = self._encode_amps_and_angles(text)

        if "strike" in self.extras:
            text = self._do_strike(text)

        text = self._do_italics_and_bold(text)

        if "smarty-pants" in self.extras:
            text = self._do_smart_punctuation(text)

        # Do hard breaks:
        if "break-on-newline" in self.extras:
            text = re.sub(r" *\n", "<br%s\n" % self.empty_element_suffix, text)
        else:
            text = re.sub(r" {2,}\n", " <br%s\n" % self.empty_element_suffix, text)

        return text

    # "Sorta" because auto-links are identified as "tag" tokens.
    _sorta_html_tokenize_re = re.compile(r"""
        (
            # tag
            </?
            (?:\w+)                                     # tag name
            (?:\s+(?:[\w-]+:)?[\w-]+=(?:".*?"|'.*?'))*  # attributes
            \s*/?>
            |
            # auto-link (e.g., <http://www.activestate.com/>)
            <\w+[^>]*>
            |
            <!--.*?-->      # comment
            |
            <\?.*?\?>       # processing instruction
        )
        """, re.X)

    def _escape_special_chars(self, text):
        # Python markdown note: the HTML tokenization here differs from
        # that in Markdown.pl, hence the behaviour for subtle cases can
        # differ (I believe the tokenizer here does a better job because
        # it isn't susceptible to unmatched '<' and '>' in HTML tags).
        # Note, however, that '>' is not allowed in an auto-link URL
        # here.
        escaped = []
        is_html_markup = False
        for token in self._sorta_html_tokenize_re.split(text):
            if is_html_markup:
                # Within tags/HTML-comments/auto-links, encode * and _
                # so they don't conflict with their use in Markdown for
                # italics and strong. We're replacing each such
                # character with its corresponding MD5 checksum value;
                # this is likely overkill, but it should prevent us from
                # colliding with the escape values by accident.
                escaped.append(token.replace('*', self._escape_table['*'])
                                    .replace('_', self._escape_table['_']))
            else:
                escaped.append(self._encode_backslash_escapes(token))
            is_html_markup = not is_html_markup
        return ''.join(escaped)

    def _hash_html_spans(self, text):
        # Used for safe_mode.

        def _is_auto_link(s):
            if ':' in s and self._auto_link_re.match(s):
                return True
            elif '@' in s and self._auto_email_link_re.match(s):
                return True
            return False

        tokens = []
        is_html_markup = False
        for token in self._sorta_html_tokenize_re.split(text):
            if is_html_markup and not _is_auto_link(token):
                sanitized = self._sanitize_html(token)
                key = _hash_text(sanitized)
                self.html_spans[key] = sanitized
                tokens.append(key)
            else:
                tokens.append(token)
            is_html_markup = not is_html_markup
        return ''.join(tokens)

    def _unhash_html_spans(self, text):
        for key, sanitized in list(self.html_spans.items()):
            text = text.replace(key, sanitized)
        return text

    def _sanitize_html(self, s):
        if self.safe_mode == "replace":
            return self.html_removed_text
        elif self.safe_mode == "escape":
            replacements = [
                ('&', '&amp;'),
                ('<', '&lt;'),
                ('>', '&gt;'),
            ]
            for before, after in replacements:
                s = s.replace(before, after)
            return s
        else:
            raise MarkdownError("invalid value for 'safe_mode': %r (must be "
                                "'escape' or 'replace')" % self.safe_mode)
```
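In practice, the two safe modes implemented by `_sanitize_html()` behave as follows (a small hedged sketch; the input string is illustrative):

```python
# Sketch of the two safe_mode behaviours handled above.
raw = "Hello <script>alert(1)</script> world"
Markdown(safe_mode="replace").convert(raw)  # script span -> "[HTML_REMOVED]"
Markdown(safe_mode="escape").convert(raw)   # script span -> &lt;script&gt;... entities
```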
```python
    _inline_link_title = re.compile(r'''
        (                   # \1
          [ \t]+
          (['"])            # quote char = \2
          (?P<title>.*?)
          \2
        )?                  # title is optional
        \)$
        ''', re.X | re.S)
    _tail_of_reference_link_re = re.compile(r'''
        # Match tail of: [text][id]
        [ ]?          # one optional space
        (?:\n[ ]*)?   # one optional newline followed by spaces
        \[
        (?P<id>.*?)
        \]
        ''', re.X | re.S)

    _whitespace = re.compile(r'\s*')

    _strip_anglebrackets = re.compile(r'<(.*)>.*')

    def _find_non_whitespace(self, text, start):
        """Returns the index of the first non-whitespace character in text
        after (and including) start.
        """
        match = self._whitespace.match(text, start)
        return match.end()

    def _find_balanced(self, text, start, open_c, close_c):
        """Returns the index where the open_c and close_c characters balance
        out - the same number of open_c and close_c are encountered - or the
        end of string if it's reached before the balance point is found.
        """
        i = start
        l = len(text)
        count = 1
        while count > 0 and i < l:
            if text[i] == open_c:
                count += 1
            elif text[i] == close_c:
                count -= 1
            i += 1
        return i

    def _extract_url_and_title(self, text, start):
        """Extracts the url and (optional) title from the tail of a link"""
        # text[start] equals the opening parenthesis
        idx = self._find_non_whitespace(text, start+1)
        if idx == len(text):
            return None, None, None
        end_idx = idx
        has_anglebrackets = text[idx] == "<"
        if has_anglebrackets:
            end_idx = self._find_balanced(text, end_idx+1, "<", ">")
        end_idx = self._find_balanced(text, end_idx, "(", ")")
        match = self._inline_link_title.search(text, idx, end_idx)
        if not match:
            return None, None, None
        url, title = text[idx:match.start()], match.group("title")
        if has_anglebrackets:
            url = self._strip_anglebrackets.sub(r'\1', url)
        return url, title, end_idx

    _safe_protocols = re.compile(r'(https?|ftp):', re.I)
    def _do_links(self, text):
        """Turn Markdown link shortcuts into XHTML <a> and <img> tags.

        This is a combination of Markdown.pl's _DoAnchors() and
        _DoImages(). They are done together because that simplified the
        approach. It was necessary to use a different approach than
        Markdown.pl because of the lack of atomic matching support in
        Python's regex engine used in $g_nested_brackets.
        """
        MAX_LINK_TEXT_SENTINEL = 3000  # markdown2 issue 24

        # `anchor_allowed_pos` is used to support img links inside
        # anchors, but not anchors inside anchors. An anchor's start
        # pos must be `>= anchor_allowed_pos`.
        anchor_allowed_pos = 0

        curr_pos = 0
        while True:  # Handle the next link.
            # The next '[' is the start of:
            # - an inline anchor:   [text](url "title")
            # - a reference anchor: [text][id]
            # - an inline img:      ![text](url "title")
            # - a reference img:    ![text][id]
            # - a footnote ref:     [^id]
            #   (Only if 'footnotes' extra enabled)
            # - a footnote defn:    [^id]: ...
            #   (Only if 'footnotes' extra enabled) These have already
            #   been stripped in _strip_footnote_definitions() so no
            #   need to watch for them.
            # - a link definition:  [id]: url "title"
            #   These have already been stripped in
            #   _strip_link_definitions() so no need to watch for them.
            # - not markup:         [...anything else...
            try:
                start_idx = text.index('[', curr_pos)
            except ValueError:
                break
            text_length = len(text)

            # Find the matching closing ']'.
            # Markdown.pl allows *matching* brackets in link text so we
            # will here too. Markdown.pl *doesn't* currently allow
            # matching brackets in img alt text -- we'll differ in that
            # regard.
            bracket_depth = 0
            for p in range(start_idx+1, min(start_idx+MAX_LINK_TEXT_SENTINEL,
                                            text_length)):
                ch = text[p]
                if ch == ']':
                    bracket_depth -= 1
                    if bracket_depth < 0:
                        break
                elif ch == '[':
                    bracket_depth += 1
            else:
                # Closing bracket not found within sentinel length.
                # This isn't markup.
                curr_pos = start_idx + 1
                continue
            link_text = text[start_idx+1:p]

            # Possibly a footnote ref?
            if "footnotes" in self.extras and link_text.startswith("^"):
                normed_id = re.sub(r'\W', '-', link_text[1:])
                if normed_id in self.footnotes:
                    self.footnote_ids.append(normed_id)
                    result = '<sup class="footnote-ref" id="fnref-%s">' \
                             '<a href="#fn-%s">%s</a></sup>' \
                             % (normed_id, normed_id, len(self.footnote_ids))
                    text = text[:start_idx] + result + text[p+1:]
                else:
                    # This id isn't defined, leave the markup alone.
                    curr_pos = p+1
                continue

            # Now determine what this is by the remainder.
            p += 1
            if p == text_length:
                return text

            # Inline anchor or img?
            if text[p] == '(':  # attempt at perf improvement
                url, title, url_end_idx = self._extract_url_and_title(text, p)
                if url is not None:
                    # Handle an inline anchor or img.
                    is_img = start_idx > 0 and text[start_idx-1] == "!"
                    if is_img:
                        start_idx -= 1

                    # We've got to encode these to avoid conflicting
                    # with italics/bold.
                    url = url.replace('*', self._escape_table['*']) \
                             .replace('_', self._escape_table['_'])
                    if title:
                        title_str = ' title="%s"' % (
                            _xml_escape_attr(title)
                                .replace('*', self._escape_table['*'])
                                .replace('_', self._escape_table['_']))
                    else:
                        title_str = ''
                    if is_img:
                        img_class_str = self._html_class_str_from_tag("img")
                        result = '<img src="%s" alt="%s"%s%s%s' \
                            % (_html_escape_url(url, safe_mode=self.safe_mode),
                               _xml_escape_attr(link_text),
                               title_str,
                               img_class_str,
                               self.empty_element_suffix)
                        if "smarty-pants" in self.extras:
                            result = result.replace('"', self._escape_table['"'])
                        curr_pos = start_idx + len(result)
                        text = text[:start_idx] + result + text[url_end_idx:]
                    elif start_idx >= anchor_allowed_pos:
                        if self.safe_mode and not self._safe_protocols.match(url):
                            result_head = '<a href="#"%s>' % (title_str)
                        else:
                            result_head = '<a href="%s"%s>' % (_html_escape_url(url, safe_mode=self.safe_mode), title_str)
                        result = '%s%s</a>' % (result_head, _xml_escape_attr(link_text))
                        if "smarty-pants" in self.extras:
                            result = result.replace('"', self._escape_table['"'])
                        # <img> allowed from curr_pos on, <a> from
                        # anchor_allowed_pos on.
                        curr_pos = start_idx + len(result_head)
                        anchor_allowed_pos = start_idx + len(result)
                        text = text[:start_idx] + result + text[url_end_idx:]
                    else:
                        # Anchor not allowed here.
                        curr_pos = start_idx + 1
                    continue

            # Reference anchor or img?
            else:
                match = self._tail_of_reference_link_re.match(text, p)
                if match:
                    # Handle a reference-style anchor or img.
                    is_img = start_idx > 0 and text[start_idx-1] == "!"
                    if is_img:
                        start_idx -= 1
                    link_id = match.group("id").lower()
                    if not link_id:
                        link_id = link_text.lower()  # for links like [this][]
                    if link_id in self.urls:
                        url = self.urls[link_id]
                        # We've got to encode these to avoid conflicting
                        # with italics/bold.
                        url = url.replace('*', self._escape_table['*']) \
                                 .replace('_', self._escape_table['_'])
                        title = self.titles.get(link_id)
                        if title:
                            title = _xml_escape_attr(title) \
                                .replace('*', self._escape_table['*']) \
                                .replace('_', self._escape_table['_'])
                            title_str = ' title="%s"' % title
                        else:
                            title_str = ''
                        if is_img:
                            img_class_str = self._html_class_str_from_tag("img")
                            result = '<img src="%s" alt="%s"%s%s%s' \
                                % (_html_escape_url(url, safe_mode=self.safe_mode),
                                   _xml_escape_attr(link_text),
                                   title_str,
                                   img_class_str,
                                   self.empty_element_suffix)
                            if "smarty-pants" in self.extras:
                                result = result.replace('"', self._escape_table['"'])
                            curr_pos = start_idx + len(result)
                            text = text[:start_idx] + result + text[match.end():]
                        elif start_idx >= anchor_allowed_pos:
                            if self.safe_mode and not self._safe_protocols.match(url):
                                result_head = '<a href="#"%s>' % (title_str)
                            else:
                                result_head = '<a href="%s"%s>' % (_html_escape_url(url, safe_mode=self.safe_mode), title_str)
                            result = '%s%s</a>' % (result_head, link_text)
                            if "smarty-pants" in self.extras:
                                result = result.replace('"', self._escape_table['"'])
                            # <img> allowed from curr_pos on, <a> from
                            # anchor_allowed_pos on.
                            curr_pos = start_idx + len(result_head)
                            anchor_allowed_pos = start_idx + len(result)
                            text = text[:start_idx] + result + text[match.end():]
                        else:
                            # Anchor not allowed here.
                            curr_pos = start_idx + 1
                    else:
                        # This id isn't defined, leave the markup alone.
                        curr_pos = match.end()
                    continue

            # Otherwise, it isn't markup.
            curr_pos = start_idx + 1

        return text
```
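The two duplication blocks flagged above build the `<img>` result identically in the inline and reference branches of `_do_links()`. A hedged sketch of one way to restructure; the helper name `_img_result` is hypothetical, not part of markdown2:

```python
# Hypothetical dedup sketch: both flagged branches of _do_links() could
# delegate <img> rendering to one helper on the Markdown class.
def _img_result(self, url, link_text, title_str):
    img_class_str = self._html_class_str_from_tag("img")
    result = '<img src="%s" alt="%s"%s%s%s' % (
        _html_escape_url(url, safe_mode=self.safe_mode),
        _xml_escape_attr(link_text),
        title_str, img_class_str, self.empty_element_suffix)
    if "smarty-pants" in self.extras:
        result = result.replace('"', self._escape_table['"'])
    return result
```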
```python
    def header_id_from_text(self, text, prefix, n):
        """Generate a header id attribute value from the given header
        HTML content.

        This is only called if the "header-ids" extra is enabled.
        Subclasses may override this for different header ids.

        @param text {str} The text of the header tag
        @param prefix {str} The requested prefix for header ids. This is the
            value of the "header-ids" extra key, if any. Otherwise, None.
        @param n {int} The <hN> tag number, i.e. `1` for an <h1> tag.
        @returns {str} The value for the header tag's "id" attribute. Return
            None to not have an id attribute and to exclude this header from
            the TOC (if the "toc" extra is specified).
        """
        header_id = _slugify(text)
        if prefix and isinstance(prefix, base_string_type):
            header_id = prefix + '-' + header_id
        if header_id in self._count_from_header_id:
            self._count_from_header_id[header_id] += 1
            header_id += '-%s' % self._count_from_header_id[header_id]
        else:
            self._count_from_header_id[header_id] = 1
            if 0 == len(header_id):
                header_id += '-%s' % self._count_from_header_id[header_id]

        return header_id
```
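Since the docstring above names this method as the subclassing hook for custom header ids, a small hedged sketch:

```python
# Hypothetical override of the documented header-id hook.
class ShortIdMarkdown(Markdown):
    def header_id_from_text(self, text, prefix, n):
        if n >= 4:
            return None  # no id attribute; also excluded from the TOC
        return super(ShortIdMarkdown, self).header_id_from_text(text, prefix, n)
```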
1497 | _toc = None |
||
1498 | def _toc_add_entry(self, level, id, name): |
||
1499 | if self._toc is None: |
||
1500 | self._toc = [] |
||
1501 | self._toc.append((level, id, self._unescape_special_chars(name))) |
||
1502 | |||
1503 | _h_re_base = r''' |
||
1504 | (^(.+)[ \t]*\n(=+|-+)[ \t]*\n+) |
||
1505 | | |
||
1506 | (^(\#{1,6}) # \1 = string of #'s |
||
1507 | [ \t]%s |
||
1508 | (.+?) # \2 = Header text |
||
1509 | [ \t]* |
||
1510 | (?<!\\) # ensure not an escaped trailing '#' |
||
1511 | \#* # optional closing #'s (not counted) |
||
1512 | \n+ |
||
1513 | ) |
||
1514 | ''' |
||
1515 | |||
1516 | _h_re = re.compile(_h_re_base % '*', re.X | re.M) |
||
1517 | _h_re_tag_friendly = re.compile(_h_re_base % '+', re.X | re.M) |
||
1518 | |||
1519 | def _h_sub(self, match): |
||
1520 | if match.group(1) is not None: |
||
1521 | # Setext header |
||
1522 | n = {"=": 1, "-": 2}[match.group(3)[0]] |
||
1523 | header_group = match.group(2) |
||
1524 | else: |
||
1525 | # atx header |
||
1526 | n = len(match.group(5)) |
||
1527 | header_group = match.group(6) |
||
1528 | |||
1529 | demote_headers = self.extras.get("demote-headers") |
||
1530 | if demote_headers: |
||
1531 | n = min(n + demote_headers, 6) |
||
1532 | header_id_attr = "" |
||
1533 | if "header-ids" in self.extras: |
||
1534 | header_id = self.header_id_from_text(header_group, |
||
1535 | self.extras["header-ids"], n) |
||
1536 | if header_id: |
||
1537 | header_id_attr = ' id="%s"' % header_id |
||
1538 | html = self._run_span_gamut(header_group) |
||
1539 | if "toc" in self.extras and header_id: |
||
1540 | self._toc_add_entry(n, header_id, html) |
||
1541 | return "<h%d%s>%s</h%d>\n\n" % (n, header_id_attr, html, n) |
||
1542 | |||
1543 | def _do_headers(self, text): |
||
1544 | # Setext-style headers: |
||
1545 | # Header 1 |
||
1546 | # ======== |
||
1547 | # |
||
1548 | # Header 2 |
||
1549 | # -------- |
||
1550 | |||
1551 | # atx-style headers: |
||
1552 | # # Header 1 |
||
1553 | # ## Header 2 |
||
1554 | # ## Header 2 with closing hashes ## |
||
1555 | # ... |
||
1556 | # ###### Header 6 |
||
1557 | |||
1558 | if 'tag-friendly' in self.extras: |
||
1559 | return self._h_re_tag_friendly.sub(self._h_sub, text) |
||
1560 | return self._h_re.sub(self._h_sub, text) |
||
1561 | |||
1562 | _marker_ul_chars = '*+-' |
||
1563 | _marker_any = r'(?:[%s]|\d+\.)' % _marker_ul_chars |
||
1564 | _marker_ul = '(?:[%s])' % _marker_ul_chars |
||
1565 | _marker_ol = r'(?:\d+\.)' |
||
1566 | |||
1567 | def _list_sub(self, match): |
||
1568 | lst = match.group(1) |
||
1569 | lst_type = "ul" if match.group(3) in self._marker_ul_chars else "ol" |
||
1570 | result = self._process_list_items(lst) |
||
1571 | if self.list_level: |
||
1572 | return "<%s>\n%s</%s>\n" % (lst_type, result, lst_type) |
||
1573 | else: |
||
1574 | return "<%s>\n%s</%s>\n\n" % (lst_type, result, lst_type) |
||
1575 | |||
1576 | def _do_lists(self, text): |
||
1577 | # Form HTML ordered (numbered) and unordered (bulleted) lists. |
||
1578 | |||
1579 | # Iterate over each *non-overlapping* list match. |
||
1580 | pos = 0 |
||
1581 | while True: |
||
1582 | # Find the *first* hit for either list style (ul or ol). We |
||
1583 | # match ul and ol separately to avoid adjacent lists of different |
||
1584 | # types running into each other (see issue #16). |
||
1585 | hits = [] |
||
1586 | for marker_pat in (self._marker_ul, self._marker_ol): |
||
1587 | less_than_tab = self.tab_width - 1 |
||
1588 | whole_list = r''' |
||
1589 | ( # \1 = whole list |
||
1590 | ( # \2 |
||
1591 | [ ]{0,%d} |
||
1592 | (%s) # \3 = first list item marker |
||
1593 | [ \t]+ |
||
1594 | (?!\ *\3\ ) # '- - - ...' isn't a list. See 'not_quite_a_list' test case. |
||
1595 | ) |
||
1596 | (?:.+?) |
||
1597 | ( # \4 |
||
1598 | \Z |
||
1599 | | |
||
1600 | \n{2,} |
||
1601 | (?=\S) |
||
1602 | (?! # Negative lookahead for another list item marker |
||
1603 | [ \t]* |
||
1604 | %s[ \t]+ |
||
1605 | ) |
||
1606 | ) |
||
1607 | ) |
||
1608 | ''' % (less_than_tab, marker_pat, marker_pat) |
||
1609 | if self.list_level: # sub-list |
||
1610 | list_re = re.compile("^"+whole_list, re.X | re.M | re.S) |
||
1611 | else: |
||
1612 | list_re = re.compile(r"(?:(?<=\n\n)|\A\n?)"+whole_list, |
||
1613 | re.X | re.M | re.S) |
||
1614 | match = list_re.search(text, pos) |
||
1615 | if match: |
||
1616 | hits.append((match.start(), match)) |
||
1617 | if not hits: |
||
1618 | break |
||
1619 | hits.sort() |
||
1620 | match = hits[0][1] |
||
1621 | start, end = match.span() |
||
1622 | middle = self._list_sub(match) |
||
1623 | text = text[:start] + middle + text[end:] |
||
1624 | pos = start + len(middle) # start pos for next attempted match |
||
1625 | |||
1626 | return text |
||
1627 | |||
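The two-pass matching above (ul and ol probed separately, earliest hit wins) is what keeps adjacent lists of different types apart. A sketch of the behavior it protects, with invented input:

    import markdown2

    text = "- bullet one\n- bullet two\n\n1. numbered one\n2. numbered two"
    # A single combined marker pattern could swallow both lists into one;
    # matching them separately yields a <ul> followed by an <ol>.
    print(markdown2.markdown(text))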
1628 | _list_item_re = re.compile(r''' |
||
1629 | (\n)? # leading line = \1 |
||
1630 | (^[ \t]*) # leading whitespace = \2 |
||
1631 | (?P<marker>%s) [ \t]+ # list marker = \3 |
||
1632 | ((?:.+?) # list item text = \4 |
||
1633 | (\n{1,2})) # eols = \5 |
||
1634 | (?= \n* (\Z | \2 (?P<next_marker>%s) [ \t]+)) |
||
1635 | ''' % (_marker_any, _marker_any), |
||
1636 | re.M | re.X | re.S) |
||
1637 | |||
1638 | _task_list_item_re = re.compile(r''' |
||
1639 | (\[[\ x]\])[ \t]+ # tasklist marker = \1 |
||
1640 | (.*) # list item text = \2 |
||
1641 | ''', re.M | re.X | re.S) |
||
1642 | |||
1643 | _task_list_wrapper_str = r'<input type="checkbox" class="task-list-item-checkbox" %sdisabled> %s' |
||
1644 | |||
1645 | def _task_list_item_sub(self, match): |
||
1646 | marker = match.group(1) |
||
1647 | item_text = match.group(2) |
||
1648 | if marker == '[x]': |
||
1649 | return self._task_list_wrapper_str % ('checked ', item_text) |
||
1650 | elif marker == '[ ]': |
||
1651 | return self._task_list_wrapper_str % ('', item_text) |
||
1652 | |||
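A short usage sketch for the 'task_list' extra handled above (input invented):

    import markdown2

    text = "- [x] write the parser\n- [ ] write the docs"
    # Each item becomes a disabled checkbox; '[x]' adds the checked attribute.
    print(markdown2.markdown(text, extras=["task_list"]))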
1653 | _last_li_endswith_two_eols = False |
||
1654 | def _list_item_sub(self, match): |
||
1655 | item = match.group(4) |
||
1656 | leading_line = match.group(1) |
||
1657 | if leading_line or "\n\n" in item or self._last_li_endswith_two_eols: |
||
1658 | item = self._run_block_gamut(self._outdent(item)) |
||
1659 | else: |
||
1660 | # Recursion for sub-lists: |
||
1661 | item = self._do_lists(self._outdent(item)) |
||
1662 | if item.endswith('\n'): |
||
1663 | item = item[:-1] |
||
1664 | item = self._run_span_gamut(item) |
||
1665 | self._last_li_endswith_two_eols = (len(match.group(5)) == 2) |
||
1666 | |||
1667 | if "task_list" in self.extras: |
||
1668 | item = self._task_list_item_re.sub(self._task_list_item_sub, item) |
||
1669 | |||
1670 | return "<li>%s</li>\n" % item |
||
1671 | |||
1672 | def _process_list_items(self, list_str): |
||
1673 | # Process the contents of a single ordered or unordered list, |
||
1674 | # splitting it into individual list items. |
||
1675 | |||
1676 | # The self.list_level attribute keeps track of when we're inside a list. |
||
1677 | # Each time we enter a list, we increment it; when we leave a list, |
||
1678 | # we decrement. If it's zero, we're not in a list anymore. |
||
1679 | # |
||
1680 | # We do this because when we're not inside a list, we want to treat |
||
1681 | # something like this: |
||
1682 | # |
||
1683 | # I recommend upgrading to version |
||
1684 | # 8. Oops, now this line is treated |
||
1685 | # as a sub-list. |
||
1686 | # |
||
1687 | # As a single paragraph, despite the fact that the second line starts |
||
1688 | # with a digit-period-space sequence. |
||
1689 | # |
||
1690 | # Whereas when we're inside a list (or sub-list), that line will be |
||
1691 | # treated as the start of a sub-list. What a kludge, huh? This is |
||
1692 | # an aspect of Markdown's syntax that's hard to parse perfectly |
||
1693 | # without resorting to mind-reading. Perhaps the solution is to |
||
1694 | # change the syntax rules such that sub-lists must start with a |
||
1695 | # starting cardinal number; e.g. "1." or "a.". |
||
1696 | self.list_level += 1 |
||
1697 | self._last_li_endswith_two_eols = False |
||
1698 | list_str = list_str.rstrip('\n') + '\n' |
||
1699 | list_str = self._list_item_re.sub(self._list_item_sub, list_str) |
||
1700 | self.list_level -= 1 |
||
1701 | return list_str |
||
1702 | |||
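The version-number kludge described in the comment is easy to observe from the outside. A sketch, using the comment's own example text:

    import markdown2

    text = ("I recommend upgrading to version\n"
            "8. Oops, now this line is treated\n"
            "as a sub-list.")
    # At the top level this stays a single <p>; only inside an existing
    # list would the "8." line start a numbered sub-list.
    print(markdown2.markdown(text))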
1703 | def _get_pygments_lexer(self, lexer_name): |
||
1704 | try: |
||
1705 | from pygments import lexers, util |
||
1706 | except ImportError: |
||
1707 | return None |
||
1708 | try: |
||
1709 | return lexers.get_lexer_by_name(lexer_name) |
||
1710 | except util.ClassNotFound: |
||
1711 | return None |
||
1712 | |||
1713 | def _color_with_pygments(self, codeblock, lexer, **formatter_opts): |
||
1714 | import pygments |
||
1715 | import pygments.formatters |
||
1716 | |||
1717 | class HtmlCodeFormatter(pygments.formatters.HtmlFormatter): |
||
1718 | def _wrap_code(self, inner): |
||
1719 | """A function for use in a Pygments Formatter which |
||
1720 | wraps in <code> tags. |
||
1721 | """ |
||
1722 | yield 0, "<code>" |
||
1723 | for tup in inner: |
||
1724 | yield tup |
||
1725 | yield 0, "</code>" |
||
1726 | |||
1727 | def wrap(self, source, outfile): |
||
1728 | """Return the source with a code, pre, and div.""" |
||
1729 | return self._wrap_div(self._wrap_pre(self._wrap_code(source))) |
||
1730 | |||
1731 | formatter_opts.setdefault("cssclass", "codehilite") |
||
1732 | formatter = HtmlCodeFormatter(**formatter_opts) |
||
1733 | return pygments.highlight(codeblock, lexer, formatter) |
||
1734 | |||
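`_color_with_pygments` receives its formatter options from the value of the 'fenced-code-blocks' extra. A minimal sketch (requires Pygments installed; the option value is an example):

    import markdown2

    text = "```python\nprint('hi')\n```"
    # The dict value is passed through as HtmlFormatter options;
    # "cssclass" defaults to "codehilite" if not given.
    html = markdown2.markdown(
        text, extras={"fenced-code-blocks": {"cssclass": "highlight"}})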
1735 | def _code_block_sub(self, match, is_fenced_code_block=False): |
||
1736 | lexer_name = None |
||
1737 | if is_fenced_code_block: |
||
1738 | lexer_name = match.group(1) |
||
1739 | if lexer_name: |
||
1740 | formatter_opts = self.extras['fenced-code-blocks'] or {} |
||
1741 | codeblock = match.group(2) |
||
1742 | codeblock = codeblock[:-1] # drop one trailing newline |
||
1743 | else: |
||
1744 | codeblock = match.group(1) |
||
1745 | codeblock = self._outdent(codeblock) |
||
1746 | codeblock = self._detab(codeblock) |
||
1747 | codeblock = codeblock.lstrip('\n') # trim leading newlines |
||
1748 | codeblock = codeblock.rstrip() # trim trailing whitespace |
||
1749 | |||
1750 | # Note: "code-color" extra is DEPRECATED. |
||
1751 | if "code-color" in self.extras and codeblock.startswith(":::"): |
||
1752 | lexer_name, rest = codeblock.split('\n', 1) |
||
1753 | lexer_name = lexer_name[3:].strip() |
||
1754 | codeblock = rest.lstrip("\n") # Remove lexer declaration line. |
||
1755 | formatter_opts = self.extras['code-color'] or {} |
||
1756 | |||
1757 | # Use pygments only if not using the highlightjs-lang extra |
||
1758 | if lexer_name and "highlightjs-lang" not in self.extras: |
||
1759 | def unhash_code(codeblock): |
||
1760 | for key, sanitized in list(self.html_spans.items()): |
||
1761 | codeblock = codeblock.replace(key, sanitized) |
||
1762 | replacements = [ |
||
1763 | ("&", "&"), |
||
1764 | ("<", "<"), |
||
1765 | (">", ">") |
||
1766 | ] |
||
1767 | for old, new in replacements: |
||
1768 | codeblock = codeblock.replace(old, new) |
||
1769 | return codeblock |
||
1770 | lexer = self._get_pygments_lexer(lexer_name) |
||
1771 | if lexer: |
||
1772 | codeblock = unhash_code(codeblock) |
||
1773 | colored = self._color_with_pygments(codeblock, lexer, |
||
1774 | **formatter_opts) |
||
1775 | return "\n\n%s\n\n" % colored |
||
1776 | |||
1777 | codeblock = self._encode_code(codeblock) |
||
1778 | pre_class_str = self._html_class_str_from_tag("pre") |
||
1779 | |||
1780 | if "highlightjs-lang" in self.extras and lexer_name: |
||
1781 | code_class_str = ' class="%s"' % lexer_name |
||
1782 | else: |
||
1783 | code_class_str = self._html_class_str_from_tag("code") |
||
1784 | |||
1785 | return "\n\n<pre%s><code%s>%s\n</code></pre>\n\n" % ( |
||
1786 | pre_class_str, code_class_str, codeblock) |
||
1787 | |||
1788 | def _html_class_str_from_tag(self, tag): |
||
1789 | """Get the appropriate ' class="..."' string (note the leading |
||
1790 | space), if any, for the given tag. |
||
1791 | """ |
||
1792 | if "html-classes" not in self.extras: |
||
1793 | return "" |
||
1794 | try: |
||
1795 | html_classes_from_tag = self.extras["html-classes"] |
||
1796 | except TypeError: |
||
1797 | return "" |
||
1798 | else: |
||
1799 | if tag in html_classes_from_tag: |
||
1800 | return ' class="%s"' % html_classes_from_tag[tag] |
||
1801 | return "" |
||
1802 | |||
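A usage sketch for the 'html-classes' extra consulted by `_html_class_str_from_tag` (the class names are invented):

    import markdown2

    text = "    indented code block"
    html = markdown2.markdown(
        text, extras={"html-classes": {"pre": "listing", "code": "plain"}})
    # expected: <pre class="listing"><code class="plain">...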
1803 | def _do_code_blocks(self, text): |
||
1804 | """Process Markdown `<pre><code>` blocks.""" |
||
1805 | code_block_re = re.compile(r''' |
||
1806 | (?:\n\n|\A\n?) |
||
1807 | ( # $1 = the code block -- one or more lines, starting with a space/tab |
||
1808 | (?: |
||
1809 | (?:[ ]{%d} | \t) # Lines must start with a tab or a tab-width of spaces |
||
1810 | .*\n+ |
||
1811 | )+ |
||
1812 | ) |
||
1813 | ((?=^[ ]{0,%d}\S)|\Z) # Lookahead for non-space at line-start, or end of doc |
||
1814 | # Lookahead to make sure this block isn't already in a code block. |
||
1815 | # Needed when syntax highlighting is being used. |
||
1816 | (?![^<]*\</code\>) |
||
1817 | ''' % (self.tab_width, self.tab_width), |
||
1818 | re.M | re.X) |
||
1819 | return code_block_re.sub(self._code_block_sub, text) |
||
1820 | |||
1821 | _fenced_code_block_re = re.compile(r''' |
||
1822 | (?:\n+|\A\n?) |
||
1823 | ^```\s*?([\w+-]+)?\s*?\n # opening fence, $1 = optional lang |
||
1824 | (.*?) # $2 = code block content |
||
1825 | ^```[ \t]*\n # closing fence |
||
1826 | ''', re.M | re.X | re.S) |
||
1827 | |||
1828 | def _fenced_code_block_sub(self, match): |
||
1829 | return self._code_block_sub(match, is_fenced_code_block=True) |
||
1830 | |||
1831 | def _do_fenced_code_blocks(self, text): |
||
1832 | """Process ```-fenced unindented code blocks ('fenced-code-blocks' extra).""" |
||
1833 | return self._fenced_code_block_re.sub(self._fenced_code_block_sub, text) |
||
1834 | |||
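When 'highlightjs-lang' is enabled alongside 'fenced-code-blocks', the branch above skips Pygments and simply emits the fence's language as a class on <code> for a client-side highlighter. A sketch:

    import markdown2

    text = "```python\nprint('hi')\n```"
    html = markdown2.markdown(
        text, extras=["fenced-code-blocks", "highlightjs-lang"])
    # expected: <pre><code class="python">print('hi')\n</code></pre>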
1835 | # Rules for a code span: |
||
1836 | # - backslash escapes are not interpreted in a code span |
||
1837 | # - to include one backtick, or a run of them, the delimiters must |
||
1838 | # be a longer run of backticks |
||
1839 | # - cannot start or end a code span with a backtick; pad with a |
||
1840 | # space and that space will be removed in the emitted HTML |
||
1841 | # See `test/tm-cases/escapes.text` for a number of edge-case |
||
1842 | # examples. |
||
1843 | _code_span_re = re.compile(r''' |
||
1844 | (?<!\\) |
||
1845 | (`+) # \1 = Opening run of ` |
||
1846 | (?!`) # See Note A in test/tm-cases/escapes.text |
||
1847 | (.+?) # \2 = The code block |
||
1848 | (?<!`) |
||
1849 | \1 # Matching closer |
||
1850 | (?!`) |
||
1851 | ''', re.X | re.S) |
||
1852 | |||
1853 | def _code_span_sub(self, match): |
||
1854 | c = match.group(2).strip(" \t") |
||
1855 | c = self._encode_code(c) |
||
1856 | return "<code>%s</code>" % c |
||
1857 | |||
1858 | def _do_code_spans(self, text): |
||
1859 | # * Backtick quotes are used for <code></code> spans. |
||
1860 | # |
||
1861 | # * You can use multiple backticks as the delimiters if you want to |
||
1862 | # include literal backticks in the code span. So, this input: |
||
1863 | # |
||
1864 | # Just type ``foo `bar` baz`` at the prompt. |
||
1865 | # |
||
1866 | # Will translate to: |
||
1867 | # |
||
1868 | # <p>Just type <code>foo `bar` baz</code> at the prompt.</p> |
||
1869 | # |
||
1870 | # There's no arbitrary limit to the number of backticks you |
||
1871 | # can use as delimiters. If you need three consecutive backticks |
||
1872 | # in your code, use four for delimiters, etc. |
||
1873 | # |
||
1874 | # * You can use spaces to get literal backticks at the edges: |
||
1875 | # |
||
1876 | # ... type `` `bar` `` ... |
||
1877 | # |
||
1878 | # Turns to: |
||
1879 | # |
||
1880 | # ... type <code>`bar`</code> ... |
||
1881 | return self._code_span_re.sub(self._code_span_sub, text) |
||
1882 | |||
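The backtick rules in the comments above are easiest to verify directly. A sketch:

    import markdown2

    print(markdown2.markdown("Type ``foo `bar` baz`` at the prompt."))
    # expected: <p>Type <code>foo `bar` baz</code> at the prompt.</p>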
1883 | def _encode_code(self, text): |
||
1884 | """Encode/escape certain characters inside Markdown code runs. |
||
1885 | The point is that in code, these characters are literals, |
||
1886 | and lose their special Markdown meanings. |
||
1887 | """ |
||
1888 | replacements = [ |
||
1889 | # Encode all ampersands; HTML entities are not |
||
1890 | # entities within a Markdown code span. |
||
1891 | ('&', '&'), |
||
1892 | # Do the angle bracket song and dance: |
||
1893 | ('<', '<'), |
||
1894 | ('>', '>'), |
||
1895 | ] |
||
1896 | for before, after in replacements: |
||
1897 | text = text.replace(before, after) |
||
1898 | hashed = _hash_text(text) |
||
1899 | self._escape_table[text] = hashed |
||
1900 | return hashed |
||
1901 | |||
1902 | _strike_re = re.compile(r"~~(?=\S)(.+?)(?<=\S)~~", re.S) |
||
1903 | def _do_strike(self, text): |
||
1904 | text = self._strike_re.sub(r"<strike>\1</strike>", text) |
||
1905 | return text |
||
1906 | |||
1907 | _strong_re = re.compile(r"(\*\*|__)(?=\S)(.+?[*_]*)(?<=\S)\1", re.S) |
||
1908 | _em_re = re.compile(r"(\*|_)(?=\S)(.+?)(?<=\S)\1", re.S) |
||
1909 | _code_friendly_strong_re = re.compile(r"\*\*(?=\S)(.+?[*_]*)(?<=\S)\*\*", re.S) |
||
1910 | _code_friendly_em_re = re.compile(r"\*(?=\S)(.+?)(?<=\S)\*", re.S) |
||
1911 | def _do_italics_and_bold(self, text): |
||
1912 | # <strong> must go first: |
||
1913 | if "code-friendly" in self.extras: |
||
1914 | text = self._code_friendly_strong_re.sub(r"<strong>\1</strong>", text) |
||
1915 | text = self._code_friendly_em_re.sub(r"<em>\1</em>", text) |
||
1916 | else: |
||
1917 | text = self._strong_re.sub(r"<strong>\2</strong>", text) |
||
1918 | text = self._em_re.sub(r"<em>\2</em>", text) |
||
1919 | return text |
||
1920 | |||
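A sketch of what 'code-friendly' changes in `_do_italics_and_bold` (input invented):

    import markdown2

    text = "a snake_case_name next to __dunder__ text"
    # Default rules would emphasize parts of both words; 'code-friendly'
    # restricts <em>/<strong> to the asterisk forms, leaving underscores alone.
    print(markdown2.markdown(text, extras=["code-friendly"]))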
1921 | # "smarty-pants" extra: Very liberal in interpreting a single prime as an |
||
1922 | # apostrophe; e.g. ignores the fact that "round", "bout", "twer", and |
||
1923 | # "twixt" can be written without an initial apostrophe. This is fine because |
||
1924 | # using scare quotes (single quotation marks) is rare. |
||
1925 | _apostrophe_year_re = re.compile(r"'(\d\d)(?=(\s|,|;|\.|\?|!|$))") |
||
1926 | _contractions = ["tis", "twas", "twer", "neath", "o", "n", |
||
1927 | "round", "bout", "twixt", "nuff", "fraid", "sup"] |
||
1928 | def _do_smart_contractions(self, text): |
||
1929 | text = self._apostrophe_year_re.sub(r"’\1", text) |
||
1930 | for c in self._contractions: |
||
1931 | text = text.replace("'%s" % c, "’%s" % c) |
||
1932 | text = text.replace("'%s" % c.capitalize(), |
||
1933 | "’%s" % c.capitalize()) |
||
1934 | return text |
||
1935 | |||
1936 | # Substitute double-quotes before single-quotes. |
||
1937 | _opening_single_quote_re = re.compile(r"(?<!\S)'(?=\S)") |
||
1938 | _opening_double_quote_re = re.compile(r'(?<!\S)"(?=\S)') |
||
1939 | _closing_single_quote_re = re.compile(r"(?<=\S)'") |
||
1940 | _closing_double_quote_re = re.compile(r'(?<=\S)"(?=(\s|,|;|\.|\?|!|$))') |
||
1941 | def _do_smart_punctuation(self, text): |
||
1942 | """Fancifies 'single quotes', "double quotes", and apostrophes. |
||
1943 | Converts --, ---, and ... into en dashes, em dashes, and ellipses. |
||
1944 | |||
1945 | Inspiration is: <http://daringfireball.net/projects/smartypants/> |
||
1946 | See "test/tm-cases/smarty_pants.text" for a full discussion of the |
||
1947 | support here and |
||
1948 | <http://code.google.com/p/python-markdown2/issues/detail?id=42> for a |
||
1949 | discussion of some diversion from the original SmartyPants. |
||
1950 | """ |
||
1951 | if "'" in text: # guard for perf |
||
1952 | text = self._do_smart_contractions(text) |
||
1953 | text = self._opening_single_quote_re.sub("‘", text) |
||
1954 | text = self._closing_single_quote_re.sub("’", text) |
||
1955 | |||
1956 | if '"' in text: # guard for perf |
||
1957 | text = self._opening_double_quote_re.sub("“", text) |
||
1958 | text = self._closing_double_quote_re.sub("”", text) |
||
1959 | |||
1960 | text = text.replace("---", "—") |
||
1961 | text = text.replace("--", "–") |
||
1962 | text = text.replace("...", "…") |
||
1963 | text = text.replace(" . . . ", "…") |
||
1964 | text = text.replace(". . .", "…") |
||
1965 | return text |
||
1966 | |||
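A usage sketch for 'smarty-pants' (input invented):

    import markdown2

    text = "\"Quite the day,\" he said -- 'tis true..."
    # Curly double quotes, an en dash for --, a proper apostrophe on the
    # known contraction 'tis, and an ellipsis for the three dots.
    print(markdown2.markdown(text, extras=["smarty-pants"]))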
1967 | _block_quote_base = r''' |
||
1968 | ( # Wrap whole match in \1 |
||
1969 | ( |
||
1970 | ^[ \t]*>%s[ \t]? # '>' at the start of a line |
||
1971 | .+\n # rest of the first line |
||
1972 | (.+\n)* # subsequent consecutive lines |
||
1973 | \n* # blanks |
||
1974 | )+ |
||
1975 | ) |
||
1976 | ''' |
||
1977 | _block_quote_re = re.compile(_block_quote_base % '', re.M | re.X) |
||
1978 | _block_quote_re_spoiler = re.compile(_block_quote_base % '[ \t]*?!?', re.M | re.X) |
||
1979 | _bq_one_level_re = re.compile('^[ \t]*>[ \t]?', re.M) |
||
1980 | _bq_one_level_re_spoiler = re.compile('^[ \t]*>[ \t]*?![ \t]?', re.M) |
||
1981 | _bq_all_lines_spoilers = re.compile(r'\A(?:^[ \t]*>[ \t]*?!.*[\n\r]*)+\Z', re.M) |
||
1982 | _html_pre_block_re = re.compile(r'(\s*<pre>.+?</pre>)', re.S) |
||
1983 | def _dedent_two_spaces_sub(self, match): |
||
1984 | return re.sub(r'(?m)^ ', '', match.group(1)) |
||
1985 | |||
1986 | def _block_quote_sub(self, match): |
||
1987 | bq = match.group(1) |
||
1988 | is_spoiler = 'spoiler' in self.extras and self._bq_all_lines_spoilers.match(bq) |
||
1989 | # trim one level of quoting |
||
1990 | if is_spoiler: |
||
1991 | bq = self._bq_one_level_re_spoiler.sub('', bq) |
||
1992 | else: |
||
1993 | bq = self._bq_one_level_re.sub('', bq) |
||
1994 | # trim whitespace-only lines |
||
1995 | bq = self._ws_only_line_re.sub('', bq) |
||
1996 | bq = self._run_block_gamut(bq) # recurse |
||
1997 | |||
1998 | bq = re.sub('(?m)^', ' ', bq) |
||
1999 | # These leading spaces screw with <pre> content, so we need to fix that: |
||
2000 | bq = self._html_pre_block_re.sub(self._dedent_two_spaces_sub, bq) |
||
2001 | |||
2002 | if is_spoiler: |
||
2003 | return '<blockquote class="spoiler">\n%s\n</blockquote>\n\n' % bq |
||
2004 | else: |
||
2005 | return '<blockquote>\n%s\n</blockquote>\n\n' % bq |
||
2006 | |||
2007 | def _do_block_quotes(self, text): |
||
2008 | if '>' not in text: |
||
2009 | return text |
||
2010 | if 'spoiler' in self.extras: |
||
2011 | return self._block_quote_re_spoiler.sub(self._block_quote_sub, text) |
||
2012 | else: |
||
2013 | return self._block_quote_re.sub(self._block_quote_sub, text) |
||
2014 | |||
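A usage sketch for the 'spoiler' extra (text invented). Note that `_bq_all_lines_spoilers` requires every line of the quote to carry the `>!` marker:

    import markdown2

    text = ">! The butler did it.\n>! Keep the marker on every line."
    print(markdown2.markdown(text, extras=["spoiler"]))
    # expected: <blockquote class="spoiler"> ... </blockquote>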
2015 | def _form_paragraphs(self, text): |
||
2016 | # Strip leading and trailing lines: |
||
2017 | text = text.strip('\n') |
||
2018 | |||
2019 | # Wrap <p> tags. |
||
2020 | grafs = [] |
||
2021 | for i, graf in enumerate(re.split(r"\n{2,}", text)): |
||
2022 | if graf in self.html_blocks: |
||
2023 | # Unhashify HTML blocks |
||
2024 | grafs.append(self.html_blocks[graf]) |
||
2025 | else: |
||
2026 | cuddled_list = None |
||
2027 | if "cuddled-lists" in self.extras: |
||
2028 | # Need to put back trailing '\n' for `_list_item_re` |
||
2029 | # match at the end of the paragraph. |
||
2030 | li = self._list_item_re.search(graf + '\n') |
||
2031 | # Two of the same list marker in this paragraph: a likely |
||
2032 | # candidate for a list cuddled to preceding paragraph |
||
2033 | # text (issue 33). Note the `[-1]` is a quick way to |
||
2034 | # consider numeric bullets (e.g. "1." and "2.") to be |
||
2035 | # equal. |
||
2036 | if (li and len(li.group(2)) <= 3 and li.group("next_marker") |
||
2037 | and li.group("marker")[-1] == li.group("next_marker")[-1]): |
||
2038 | start = li.start() |
||
2039 | cuddled_list = self._do_lists(graf[start:]).rstrip("\n") |
||
2040 | assert cuddled_list.startswith("<ul>") or cuddled_list.startswith("<ol>") |
||
2041 | graf = graf[:start] |
||
2042 | |||
2043 | # Wrap <p> tags. |
||
2044 | graf = self._run_span_gamut(graf) |
||
2045 | grafs.append("<p>" + graf.lstrip(" \t") + "</p>") |
||
2046 | |||
2047 | if cuddled_list: |
||
2048 | grafs.append(cuddled_list) |
||
2049 | |||
2050 | return "\n\n".join(grafs) |
||
2051 | |||
2052 | def _add_footnotes(self, text): |
||
2053 | if self.footnotes: |
||
2054 | footer = [ |
||
2055 | '<div class="footnotes">', |
||
2056 | '<hr' + self.empty_element_suffix, |
||
2057 | '<ol>', |
||
2058 | ] |
||
2059 | |||
2060 | if not self.footnote_title: |
||
2061 | self.footnote_title = "Jump back to footnote %d in the text." |
||
2062 | if not self.footnote_return_symbol: |
||
2063 | self.footnote_return_symbol = "↩" |
||
2064 | |||
2065 | for i, id in enumerate(self.footnote_ids): |
||
2066 | if i != 0: |
||
2067 | footer.append('') |
||
2068 | footer.append('<li id="fn-%s">' % id) |
||
2069 | footer.append(self._run_block_gamut(self.footnotes[id])) |
||
2070 | try: |
||
2071 | backlink = ('<a href="#fnref-%s" ' + |
||
2072 | 'class="footnoteBackLink" ' + |
||
2073 | 'title="' + self.footnote_title + '">' + |
||
2074 | self.footnote_return_symbol + |
||
2075 | '</a>') % (id, i+1) |
||
2076 | except TypeError: |
||
2077 | log.debug("Footnote error. `footnote_title` " |
||
2078 | "must include parameter. Using defaults.") |
||
2079 | backlink = ('<a href="#fnref-%s" ' |
||
2080 | 'class="footnoteBackLink" ' |
||
2081 | 'title="Jump back to footnote %d in the text.">' |
||
2082 | '↩</a>' % (id, i+1)) |
||
2083 | |||
2084 | if footer[-1].endswith("</p>"): |
||
2085 | footer[-1] = footer[-1][:-len("</p>")] \ |
||
2086 | + ' ' + backlink + "</p>" |
||
2087 | else: |
||
2088 | footer.append("\n<p>%s</p>" % backlink) |
||
2089 | footer.append('</li>') |
||
2090 | footer.append('</ol>') |
||
2091 | footer.append('</div>') |
||
2092 | return text + '\n\n' + '\n'.join(footer) |
||
2093 | else: |
||
2094 | return text |
||
2095 | |||
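A sketch of the customization hooks used by `_add_footnotes`; both keyword arguments belong to the constructor, and `footnote_title` must contain a `%d` placeholder or the defaults are restored:

    import markdown2

    md = markdown2.Markdown(
        extras=["footnotes"],
        footnote_title="Return to footnote %d.",
        footnote_return_symbol="&#8617;")
    text = "An aside.[^note]\n\n[^note]: The footnote body."
    print(md.convert(text))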
2096 | # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin: |
||
2097 | # http://bumppo.net/projects/amputator/ |
||
2098 | _ampersand_re = re.compile(r'&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)') |
||
2099 | _naked_lt_re = re.compile(r'<(?![a-z/?\$!])', re.I) |
||
2100 | _naked_gt_re = re.compile(r'''(?<![a-z0-9?!/'"-])>''', re.I) |
||
2101 | |||
2102 | def _encode_amps_and_angles(self, text): |
||
2103 | # Smart processing for ampersands and angle brackets that need |
||
2104 | # to be encoded. |
||
2105 | text = self._ampersand_re.sub('&', text) |
||
2106 | |||
2107 | # Encode naked <'s |
||
2108 | text = self._naked_lt_re.sub('<', text) |
||
2109 | |||
2110 | # Encode naked >'s |
||
2111 | # Note: Other markdown implementations (e.g. Markdown.pl, PHP |
||
2112 | # Markdown) don't do this. |
||
2113 | text = self._naked_gt_re.sub('>', text) |
||
2114 | return text |
||
2115 | |||
2116 | def _encode_backslash_escapes(self, text): |
||
2117 | for ch, escape in list(self._escape_table.items()): |
||
2118 | text = text.replace("\\"+ch, escape) |
||
2119 | return text |
||
2120 | |||
2121 | _auto_link_re = re.compile(r'<((https?|ftp):[^\'">\s]+)>', re.I) |
||
2122 | def _auto_link_sub(self, match): |
||
2123 | g1 = match.group(1) |
||
2124 | return '<a href="%s">%s</a>' % (g1, g1) |
||
2125 | |||
2126 | _auto_email_link_re = re.compile(r""" |
||
2127 | < |
||
2128 | (?:mailto:)? |
||
2129 | ( |
||
2130 | [-.\w]+ |
||
2131 | \@ |
||
2132 | [-\w]+(\.[-\w]+)*\.[a-z]+ |
||
2133 | ) |
||
2134 | > |
||
2135 | """, re.I | re.X | re.U) |
||
2136 | def _auto_email_link_sub(self, match): |
||
2137 | return self._encode_email_address( |
||
2138 | self._unescape_special_chars(match.group(1))) |
||
2139 | |||
2140 | def _do_auto_links(self, text): |
||
2141 | text = self._auto_link_re.sub(self._auto_link_sub, text) |
||
2142 | text = self._auto_email_link_re.sub(self._auto_email_link_sub, text) |
||
2143 | return text |
||
2144 | |||
2145 | def _encode_email_address(self, addr): |
||
2146 | # Input: an email address, e.g. "foo@example.com" |
||
2147 | # |
||
2148 | # Output: the email address as a mailto link, with each character |
||
2149 | # of the address encoded as either a decimal or hex entity, in |
||
2150 | # the hopes of foiling most address harvesting spam bots. E.g.: |
||
2151 | # |
||
2152 | # <a href="mailto:foo@e |
||
2153 | # xample.com">foo |
||
2154 | # @example.com</a> |
||
2155 | # |
||
2156 | # Based on a filter by Matthew Wickline, posted to the BBEdit-Talk |
||
2157 | # mailing list: <http://tinyurl.com/yu7ue> |
||
2158 | chars = [_xml_encode_email_char_at_random(ch) |
||
2159 | for ch in "mailto:" + addr] |
||
2160 | # Strip the mailto: from the visible part. |
||
2161 | addr = '<a href="%s">%s</a>' \ |
||
2162 | % (''.join(chars), ''.join(chars[7:])) |
||
2163 | return addr |
||
2164 | |||
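The obfuscation is visible with a plain autolinked address (address invented):

    import markdown2

    html = markdown2.markdown("Mail <foo@example.com>.")
    # Every character of "mailto:foo@example.com" comes out as a decimal
    # or hex entity, chosen at random per character.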
2165 | def _do_link_patterns(self, text): |
||
2166 | """Caveat emptor: there isn't much guarding against link |
||
2167 | patterns being formed inside other standard Markdown links, e.g. |
||
2168 | inside a [link def][like this]. |
||
2169 | |||
2170 | Dev Notes: *Could* consider prefixing regexes with a negative |
||
2171 | lookbehind assertion to attempt to guard against this. |
||
2172 | """ |
||
2173 | link_from_hash = {} |
||
2174 | for regex, repl in self.link_patterns: |
||
2175 | replacements = [] |
||
2176 | for match in regex.finditer(text): |
||
2177 | if hasattr(repl, "__call__"): |
||
2178 | href = repl(match) |
||
2179 | else: |
||
2180 | href = match.expand(repl) |
||
2181 | replacements.append((match.span(), href)) |
||
2182 | for (start, end), href in reversed(replacements): |
||
2183 | escaped_href = ( |
||
2184 | href.replace('"', '"') # b/c of attr quote |
||
2185 | # To avoid markdown <em> and <strong>: |
||
2186 | .replace('*', self._escape_table['*']) |
||
2187 | .replace('_', self._escape_table['_'])) |
||
2188 | link = '<a href="%s">%s</a>' % (escaped_href, text[start:end]) |
||
2189 | hash = _hash_text(link) |
||
2190 | link_from_hash[hash] = link |
||
2191 | text = text[:start] + hash + text[end:] |
||
2192 | for hash, link in list(link_from_hash.items()): |
||
2193 | text = text.replace(hash, link) |
||
2194 | return text |
||
2195 | |||
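A usage sketch for the 'link-patterns' extra that drives `_do_link_patterns` (pattern and URL invented):

    import re
    import markdown2

    link_patterns = [
        # (compiled regex, replacement template or callable)
        (re.compile(r"\bissue (\d+)\b", re.I),
         r"https://example.com/issues/\1"),
    ]
    html = markdown2.markdown(
        "See issue 42 for details.",
        extras=["link-patterns"], link_patterns=link_patterns)
    # expected: ... <a href="https://example.com/issues/42">issue 42</a> ...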
2196 | def _unescape_special_chars(self, text): |
||
2197 | # Swap back in all the special characters we've hidden. |
||
2198 | for ch, hash in list(self._escape_table.items()): |
||
2199 | text = text.replace(hash, ch) |
||
2200 | return text |
||
2201 | |||
2202 | def _outdent(self, text): |
||
2203 | # Remove one level of line-leading tabs or spaces |
||
2204 | return self._outdent_re.sub('', text) |
||
2205 | |||