| Total Complexity | 327 |
| Total Lines | 2014 |
| Duplicated Lines | 3.53 % |
| Changes | 0 |
Duplicate code is one of the most pungent code smells. A common rule of thumb is to restructure code once it is duplicated in three or more places.

Common duplication problems have corresponding refactoring solutions, such as extracting the shared code into a method or a base class.

Complex classes like `Markdown` below often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring, as sketched below. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often faster.
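As a concrete illustration, here is a minimal Extract Class sketch with hypothetical names; in the `Markdown` class listed below, the `footnote_*` fields share a prefix, which is exactly the kind of cohesive component the heuristic above points at:

```python
# Hypothetical refactoring sketch -- these names are illustrative, not
# taken from the markdown2 source listed below.

class FootnoteState(object):
    """Extracted component: the state the 'footnotes' extra works with."""
    def __init__(self, title=None, return_symbol=None):
        self.title = title                  # was Markdown.footnote_title
        self.return_symbol = return_symbol  # was Markdown.footnote_return_symbol
        self.definitions = {}               # was Markdown.footnotes
        self.ids = []                       # was Markdown.footnote_ids


class RefactoredMarkdown(object):
    def __init__(self, footnote_title=None, footnote_return_symbol=None):
        # One cohesive field replaces four scattered ones; callers now
        # reach footnote state through self.footnotes.*
        self.footnotes = FootnoteState(footnote_title, footnote_return_symbol)
```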
The source listing under analysis follows (an excerpt of the `Markdown` class from the markdown2 module):

```python
#!/usr/bin/env python

# ... (file lines 2-190 are not shown in this report) ...

class Markdown(object):
    # The dict of "extras" to enable in processing -- a mapping of
    # extra name to argument for the extra. Most extras do not have an
    # argument, in which case the value is None.
    #
    # This can be set via (a) subclassing and (b) the constructor
    # "extras" argument.
    extras = None

    urls = None
    titles = None
    html_blocks = None
    html_spans = None
    html_removed_text = "[HTML_REMOVED]"  # for compat with markdown.py

    # Used to track when we're inside an ordered or unordered list
    # (see _ProcessListItems() for details):
    list_level = 0

    _ws_only_line_re = re.compile(r"^[ \t]+$", re.M)

    def __init__(self, html4tags=False, tab_width=4, safe_mode=None,
                 extras=None, link_patterns=None,
                 footnote_title=None, footnote_return_symbol=None,
                 use_file_vars=False):
        if html4tags:
            self.empty_element_suffix = ">"
        else:
            self.empty_element_suffix = " />"
        self.tab_width = tab_width

        # For compatibility with earlier markdown2.py and with
        # markdown.py's safe_mode being a boolean,
        #   safe_mode == True -> "replace"
        if safe_mode is True:
            self.safe_mode = "replace"
        else:
            self.safe_mode = safe_mode

        # Massaging and building the "extras" info.
        if self.extras is None:
            self.extras = {}
        elif not isinstance(self.extras, dict):
            self.extras = dict([(e, None) for e in self.extras])
        if extras:
            if not isinstance(extras, dict):
                extras = dict([(e, None) for e in extras])
            self.extras.update(extras)
        assert isinstance(self.extras, dict)
        if "toc" in self.extras and "header-ids" not in self.extras:
            self.extras["header-ids"] = None  # "toc" implies "header-ids"
        self._instance_extras = self.extras.copy()

        self.link_patterns = link_patterns
        self.footnote_title = footnote_title
        self.footnote_return_symbol = footnote_return_symbol
        self.use_file_vars = use_file_vars
        self._outdent_re = re.compile(r'^(\t|[ ]{1,%d})' % tab_width, re.M)

        self._escape_table = g_escape_table.copy()
        if "smarty-pants" in self.extras:
            self._escape_table['"'] = _hash_text('"')
            self._escape_table["'"] = _hash_text("'")

    def reset(self):
        self.urls = {}
        self.titles = {}
        self.html_blocks = {}
        self.html_spans = {}
        self.list_level = 0
        self.extras = self._instance_extras.copy()
        if "footnotes" in self.extras:
            self.footnotes = {}
            self.footnote_ids = []
        if "header-ids" in self.extras:
            self._count_from_header_id = {}  # no `defaultdict` in Python 2.4
        if "metadata" in self.extras:
            self.metadata = {}

    # Per <https://developer.mozilla.org/en-US/docs/HTML/Element/a> "rel"
    # should only be used in <a> tags with an "href" attribute.
    _a_nofollow = re.compile(r"""
        <(a)
        (
            [^>]*
            href=   # href is required
            ['"]?   # HTML5 attribute values do not have to be quoted
            [^#'"]  # We don't want to match href values that start with # (like footnotes)
        )
        """,
        re.IGNORECASE | re.VERBOSE
    )

    # Opens the linked document in a new window or tab; like _a_nofollow,
    # this should only be used in <a> tags with an "href" attribute.
    _a_blank = _a_nofollow

    def convert(self, text):
        """Convert the given text."""
        # Main function. The order in which other subs are called here is
        # essential. Link and image substitutions need to happen before
        # _EscapeSpecialChars(), so that any *'s or _'s in the <a>
        # and <img> tags get encoded.

        # Clear the global hashes. If we don't clear these, you get conflicts
        # from other articles when generating a page which contains more than
        # one article (e.g. an index page that shows the N most recent
        # articles):
        self.reset()

        if not isinstance(text, unicode):
            # TODO: perhaps shouldn't presume UTF-8 for string input?
            text = unicode(text, 'utf-8')

        if self.use_file_vars:
            # Look for emacs-style file variable hints.
            emacs_vars = self._get_emacs_vars(text)
            if "markdown-extras" in emacs_vars:
                splitter = re.compile("[ ,]+")
                for e in splitter.split(emacs_vars["markdown-extras"]):
                    if '=' in e:
                        ename, earg = e.split('=', 1)
                        try:
                            earg = int(earg)
                        except ValueError:
                            pass
                    else:
                        ename, earg = e, None
                    self.extras[ename] = earg

        # Standardize line endings:
        text = text.replace("\r\n", "\n")
        text = text.replace("\r", "\n")

        # Make sure $text ends with a couple of newlines:
        text += "\n\n"

        # Convert all tabs to spaces.
        text = self._detab(text)

        # Strip any lines consisting only of spaces and tabs.
        # This makes subsequent regexen easier to write, because we can
        # match consecutive blank lines with /\n+/ instead of something
        # contorted like /[ \t]*\n+/ .
        text = self._ws_only_line_re.sub("", text)

        # Strip metadata from the head and extract it.
        if "metadata" in self.extras:
            text = self._extract_metadata(text)

        text = self.preprocess(text)

        if "fenced-code-blocks" in self.extras and not self.safe_mode:
            text = self._do_fenced_code_blocks(text)

        if self.safe_mode:
            text = self._hash_html_spans(text)

        # Turn block-level HTML blocks into hash entries.
        text = self._hash_html_blocks(text, raw=True)

        if "fenced-code-blocks" in self.extras and self.safe_mode:
            text = self._do_fenced_code_blocks(text)

        # Because numbering references aren't links (yet?), we can do
        # everything associated with counters before we get started.
        if "numbering" in self.extras:
            text = self._do_numbering(text)

        # Strip link definitions, store in hashes.
        if "footnotes" in self.extras:
            # Must do footnotes first because an unlucky footnote defn
            # looks like a link defn:
            #   [^4]: this "looks like a link defn"
            text = self._strip_footnote_definitions(text)
        text = self._strip_link_definitions(text)

        text = self._run_block_gamut(text)

        if "footnotes" in self.extras:
            text = self._add_footnotes(text)

        text = self.postprocess(text)

        text = self._unescape_special_chars(text)

        if self.safe_mode:
            text = self._unhash_html_spans(text)

        if "nofollow" in self.extras:
            text = self._a_nofollow.sub(r'<\1 rel="nofollow"\2', text)

        if "target-blank-links" in self.extras:
            text = self._a_blank.sub(r'<\1 target="_blank"\2', text)

        text += "\n"

        rv = UnicodeWithAttrs(text)
        if "toc" in self.extras:
            rv._toc = self._toc
        if "metadata" in self.extras:
            rv.metadata = self.metadata
        return rv

    def postprocess(self, text):
        """A hook for subclasses to do some postprocessing of the html, if
        desired. This is called before unescaping of special chars and
        unhashing of raw HTML spans.
        """
        return text

    def preprocess(self, text):
        """A hook for subclasses to do some preprocessing of the Markdown, if
        desired. This is called after basic formatting of the text, but prior
        to any extras, safe mode, etc. processing.
        """
        return text
```
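For orientation, here is a minimal usage sketch of the public API shown above. It assumes the full markdown2 module (of which this listing is an excerpt) is importable; the attribute names come straight from `convert()`:

```python
import markdown2

md = markdown2.Markdown(extras=["toc", "metadata"])
html = md.convert("---\ntitle: Demo\n---\n\n# A header\n\nBody text.\n")
print(html)           # the rendered HTML (a UnicodeWithAttrs instance)
print(html.metadata)  # {'title': 'Demo'} -- attached because of the "metadata" extra
print(html._toc)      # [(1, 'a-header', 'A header')] -- attached because of "toc"
```

The listing continues with the metadata front-matter machinery: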
```python
    # The content has metadata if it starts with optional '---'-fenced
    # `key: value` pairs. E.g. (indented for presentation):
    #   ---
    #   foo: bar
    #   another-var: blah blah
    #   ---
    #   # header
    # or:
    #   foo: bar
    #   another-var: blah blah
    #
    #   # header
    _meta_data_pattern = re.compile(r'^(?:---[\ \t]*\n)?(.*:\s+>\n\s+[\S\s]+?)(?=\n\w+\s*:\s*\w+\n|\Z)|([\S\w]+\s*:(?! >)[ \t]*.*\n?)(?:---[\ \t]*\n)?', re.MULTILINE)
    _key_val_pat = re.compile("[\S\w]+\s*:(?! >)[ \t]*.*\n?", re.MULTILINE)
    # This allows a value to continue over multiple lines:
    #   key: >
    #     value
    _key_val_block_pat = re.compile(
        "(.*:\s+>\n\s+[\S\s]+?)(?=\n\w+\s*:\s*\w+\n|\Z)", re.MULTILINE)
    _meta_data_fence_pattern = re.compile(r'^---[\ \t]*\n', re.MULTILINE)
    _meta_data_newline = re.compile("^\n", re.MULTILINE)

    def _extract_metadata(self, text):
        if text.startswith("---"):
            fence_splits = re.split(self._meta_data_fence_pattern, text, maxsplit=2)
            metadata_content = fence_splits[1]
            match = re.findall(self._meta_data_pattern, metadata_content)
            if not match:
                return text
            tail = fence_splits[2]
        else:
            metadata_split = re.split(self._meta_data_newline, text, maxsplit=1)
            metadata_content = metadata_split[0]
            match = re.findall(self._meta_data_pattern, metadata_content)
            if not match:
                return text
            tail = metadata_split[1]

        kv = re.findall(self._key_val_pat, metadata_content)
        kvm = re.findall(self._key_val_block_pat, metadata_content)
        kvm = [item.replace(": >\n", ":", 1) for item in kvm]

        for item in kv + kvm:
            k, v = item.split(":", 1)
            self.metadata[k.strip()] = v.strip()

        return tail
```
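A sketch of the folded `key: >` block form that `_key_val_block_pat` accepts, again assuming the full module; the exact whitespace kept in the folded value varies:

```python
import markdown2

text = (
    "---\n"
    "title: Demo\n"
    "summary: >\n"
    "  spans multiple\n"
    "  indented lines\n"
    "---\n"
    "\n"
    "# Body\n"
)
md = markdown2.Markdown(extras=["metadata"])
md.convert(text)
# md.metadata["title"] == "Demo"; md.metadata["summary"] holds the folded block
```

Next comes the emacs file-variables support behind the `use_file_vars` option: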
```python
    _emacs_oneliner_vars_pat = re.compile(r"-\*-\s*([^\r\n]*?)\s*-\*-", re.UNICODE)
    # This regular expression is intended to match blocks like this:
    #   PREFIX Local Variables: SUFFIX
    #   PREFIX mode: Tcl SUFFIX
    #   PREFIX End: SUFFIX
    # Some notes:
    # - "[ \t]" is used instead of "\s" to specifically exclude newlines
    # - "(\r\n|\n|\r)" is used instead of "$" because the sre engine does
    #   not like anything other than Unix-style line terminators.
    _emacs_local_vars_pat = re.compile(r"""^
        (?P<prefix>(?:[^\r\n|\n|\r])*?)
        [\ \t]*Local\ Variables:[\ \t]*
        (?P<suffix>.*?)(?:\r\n|\n|\r)
        (?P<content>.*?\1End:)
        """, re.IGNORECASE | re.MULTILINE | re.DOTALL | re.VERBOSE)

    def _get_emacs_vars(self, text):
        """Return a dictionary of emacs-style local variables.

        Parsing is done loosely according to this spec (and according to
        some in-practice deviations from this):
        http://www.gnu.org/software/emacs/manual/html_node/emacs/Specifying-File-Variables.html#Specifying-File-Variables
        """
        emacs_vars = {}
        SIZE = pow(2, 13)  # 8kB

        # Search near the start for a '-*-'-style one-liner of variables.
        head = text[:SIZE]
        if "-*-" in head:
            match = self._emacs_oneliner_vars_pat.search(head)
            if match:
                emacs_vars_str = match.group(1)
                assert '\n' not in emacs_vars_str
                emacs_var_strs = [s.strip() for s in emacs_vars_str.split(';')
                                  if s.strip()]
                if len(emacs_var_strs) == 1 and ':' not in emacs_var_strs[0]:
                    # While not in the spec, this form is allowed by emacs:
                    #   -*- Tcl -*-
                    # where the implied "variable" is "mode". This form
                    # is only allowed if there are no other variables.
                    emacs_vars["mode"] = emacs_var_strs[0].strip()
                else:
                    for emacs_var_str in emacs_var_strs:
                        try:
                            variable, value = emacs_var_str.strip().split(':', 1)
                        except ValueError:
                            log.debug("emacs variables error: malformed -*- "
                                      "line: %r", emacs_var_str)
                            continue
                        # Lowercase the variable name because Emacs allows "Mode"
                        # or "mode" or "MoDe", etc.
                        emacs_vars[variable.lower()] = value.strip()

        tail = text[-SIZE:]
        if "Local Variables" in tail:
            match = self._emacs_local_vars_pat.search(tail)
            if match:
                prefix = match.group("prefix")
                suffix = match.group("suffix")
                lines = match.group("content").splitlines(0)
                # print "prefix=%r, suffix=%r, content=%r, lines: %s"\
                #      % (prefix, suffix, match.group("content"), lines)

                # Validate the Local Variables block: proper prefix and suffix
                # usage.
                for i, line in enumerate(lines):
                    if not line.startswith(prefix):
                        log.debug("emacs variables error: line '%s' "
                                  "does not use proper prefix '%s'"
                                  % (line, prefix))
                        return {}
                    # Don't validate suffix on last line. Emacs doesn't care,
                    # neither should we.
                    if i != len(lines)-1 and not line.endswith(suffix):
                        log.debug("emacs variables error: line '%s' "
                                  "does not use proper suffix '%s'"
                                  % (line, suffix))
                        return {}

                # Parse out one emacs var per line.
                continued_for = None
                for line in lines[:-1]:  # no var on the last line ("PREFIX End:")
                    if prefix: line = line[len(prefix):]  # strip prefix
                    if suffix: line = line[:-len(suffix)]  # strip suffix
                    line = line.strip()
                    if continued_for:
                        variable = continued_for
                        if line.endswith('\\'):
                            line = line[:-1].rstrip()
                        else:
                            continued_for = None
                        emacs_vars[variable] += ' ' + line
                    else:
                        try:
                            variable, value = line.split(':', 1)
                        except ValueError:
                            log.debug("local variables error: missing colon "
                                      "in local variables entry: '%s'" % line)
                            continue
                        # Do NOT lowercase the variable name, because Emacs only
                        # allows "mode" (and not "Mode", "MoDe", etc.) in this block.
                        value = value.strip()
                        if value.endswith('\\'):
                            value = value[:-1].rstrip()
                            continued_for = variable
                        else:
                            continued_for = None
                        emacs_vars[variable] = value

        # Unquote values.
        for var, val in list(emacs_vars.items()):
            if len(val) > 1 and (val.startswith('"') and val.endswith('"')
               or val.startswith("'") and val.endswith("'")):
                emacs_vars[var] = val[1:-1]

        return emacs_vars
```
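A sketch of the `-*- ... -*-` one-liner form recognized above, which `convert()` uses to enable extras per document when `use_file_vars=True` (assuming the full module; any known extra name works):

```python
import markdown2

text = "<!-- -*- markdown-extras: wiki-tables -*- -->\n\n# Doc\n\nBody.\n"
md = markdown2.Markdown(use_file_vars=True)
md.convert(text)
assert "wiki-tables" in md.extras  # enabled by the file variable, not the constructor
```

The listing resumes with tab expansion and the block-level HTML machinery: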
```python
    def _detab_line(self, line):
        r"""Recursively convert tabs to spaces in a single line.

        Called from _detab()."""
        if '\t' not in line:
            return line
        chunk1, chunk2 = line.split('\t', 1)
        chunk1 += (' ' * (self.tab_width - len(chunk1) % self.tab_width))
        output = chunk1 + chunk2
        return self._detab_line(output)

    def _detab(self, text):
        r"""Iterate text line by line and convert tabs to spaces.

            >>> m = Markdown()
            >>> m._detab("\tfoo")
            '    foo'
            >>> m._detab("  \tfoo")
            '    foo'
            >>> m._detab("\t  foo")
            '      foo'
            >>> m._detab("  foo")
            '  foo'
            >>> m._detab("  foo\n\tbar\tblam")
            '  foo\n    bar blam'
        """
        if '\t' not in text:
            return text
        output = []
        for line in text.splitlines():
            output.append(self._detab_line(line))
        return '\n'.join(output)

    # I broke out the html5 tags here and add them to _block_tags_a and
    # _block_tags_b. This way html5 tags are easy to keep track of.
    _html5tags = '|article|aside|header|hgroup|footer|nav|section|figure|figcaption'

    _block_tags_a = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del'
    _block_tags_a += _html5tags

    _strict_tag_block_re = re.compile(r"""
        (                       # save in \1
            ^                   # start of line  (with re.M)
            <(%s)               # start tag = \2
            \b                  # word break
            (.*\n)*?            # any number of lines, minimally matching
            </\2>               # the matching end tag
            [ \t]*              # trailing spaces/tabs
            (?=\n+|\Z)          # followed by a newline or end of document
        )
        """ % _block_tags_a,
        re.X | re.M)

    _block_tags_b = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math'
    _block_tags_b += _html5tags

    _liberal_tag_block_re = re.compile(r"""
        (                       # save in \1
            ^                   # start of line  (with re.M)
            <(%s)               # start tag = \2
            \b                  # word break
            (.*\n)*?            # any number of lines, minimally matching
            .*</\2>             # the matching end tag
            [ \t]*              # trailing spaces/tabs
            (?=\n+|\Z)          # followed by a newline or end of document
        )
        """ % _block_tags_b,
        re.X | re.M)

    _html_markdown_attr_re = re.compile(
        r'''\s+markdown=("1"|'1')''')
    def _hash_html_block_sub(self, match, raw=False):
        html = match.group(1)
        if raw and self.safe_mode:
            html = self._sanitize_html(html)
        elif 'markdown-in-html' in self.extras and 'markdown=' in html:
            first_line = html.split('\n', 1)[0]
            m = self._html_markdown_attr_re.search(first_line)
            if m:
                lines = html.split('\n')
                middle = '\n'.join(lines[1:-1])
                last_line = lines[-1]
                first_line = first_line[:m.start()] + first_line[m.end():]
                f_key = _hash_text(first_line)
                self.html_blocks[f_key] = first_line
                l_key = _hash_text(last_line)
                self.html_blocks[l_key] = last_line
                return ''.join(["\n\n", f_key,
                                "\n\n", middle, "\n\n",
                                l_key, "\n\n"])
        key = _hash_text(html)
        self.html_blocks[key] = html
        return "\n\n" + key + "\n\n"

    def _hash_html_blocks(self, text, raw=False):
        """Hashify HTML blocks

        We only want to do this for block-level HTML tags, such as headers,
        lists, and tables. That's because we still want to wrap <p>s around
        "paragraphs" that are wrapped in non-block-level tags, such as anchors,
        phrase emphasis, and spans. The list of tags we're looking for is
        hard-coded.

        @param raw {boolean} indicates if these are raw HTML blocks in
            the original source. It makes a difference in "safe" mode.
        """
        if '<' not in text:
            return text

        # Pass `raw` value into our calls to self._hash_html_block_sub.
        hash_html_block_sub = _curry(self._hash_html_block_sub, raw=raw)

        # First, look for nested blocks, e.g.:
        #   <div>
        #       <div>
        #       tags for inner block must be indented.
        #       </div>
        #   </div>
        #
        # The outermost tags must start at the left margin for this to match, and
        # the inner nested divs must be indented.
        # We need to do this before the next, more liberal match, because the next
        # match will start at the first `<div>` and stop at the first `</div>`.
        text = self._strict_tag_block_re.sub(hash_html_block_sub, text)

        # Now match more liberally, simply from `\n<tag>` to `</tag>\n`
        text = self._liberal_tag_block_re.sub(hash_html_block_sub, text)

        # Special case just for <hr />. It was easier to make a special
        # case than to make the other regex more complicated.
        if "<hr" in text:
            _hr_tag_re = _hr_tag_re_from_tab_width(self.tab_width)
            text = _hr_tag_re.sub(hash_html_block_sub, text)

        # Special case for standalone HTML comments:
        if "<!--" in text:
            start = 0
            while True:
                # Delimiters for next comment block.
                try:
                    start_idx = text.index("<!--", start)
                except ValueError:
                    break
                try:
                    end_idx = text.index("-->", start_idx) + 3
                except ValueError:
                    break

                # Start position for next comment block search.
                start = end_idx

                # Validate whitespace before comment.
                if start_idx:
                    # - Up to `tab_width - 1` spaces before start_idx.
                    for i in range(self.tab_width - 1):
                        if text[start_idx - 1] != ' ':
                            break
                        start_idx -= 1
                        if start_idx == 0:
                            break
                    # - Must be preceded by 2 newlines or hit the start of
                    #   the document.
                    if start_idx == 0:
                        pass
                    elif start_idx == 1 and text[0] == '\n':
                        start_idx = 0  # to match minute detail of Markdown.pl regex
                    elif text[start_idx-2:start_idx] == '\n\n':
                        pass
                    else:
                        break

                # Validate whitespace after comment.
                # - Any number of spaces and tabs.
                while end_idx < len(text):
                    if text[end_idx] not in ' \t':
                        break
                    end_idx += 1
                # - Must be followed by 2 newlines or hit end of text.
                if text[end_idx:end_idx+2] not in ('', '\n', '\n\n'):
                    continue

                # Escape and hash (must match `_hash_html_block_sub`).
                html = text[start_idx:end_idx]
                if raw and self.safe_mode:
                    html = self._sanitize_html(html)
                key = _hash_text(html)
                self.html_blocks[key] = html
                text = text[:start_idx] + "\n\n" + key + "\n\n" + text[end_idx:]

        if "xml" in self.extras:
            # Treat XML processing instructions and namespaced one-liner
            # tags as if they were block HTML tags. E.g., if standalone
            # (i.e. are their own paragraph), the following do not get
            # wrapped in a <p> tag:
            #    <?foo bar?>
            #
            #    <xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="chapter_1.md"/>
            _xml_oneliner_re = _xml_oneliner_re_from_tab_width(self.tab_width)
            text = _xml_oneliner_re.sub(hash_html_block_sub, text)

        return text

    def _strip_link_definitions(self, text):
        # Strips link definitions from text, stores the URLs and titles in
        # hash references.
        less_than_tab = self.tab_width - 1

        # Link defs are in the form:
        #   [id]: url "optional title"
        _link_def_re = re.compile(r"""
            ^[ ]{0,%d}\[(.+)\]: # id = \1
              [ \t]*
              \n?               # maybe *one* newline
              [ \t]*
            <?(.+?)>?           # url = \2
              [ \t]*
            (?:
                \n?             # maybe one newline
                [ \t]*
                (?<=\s)         # lookbehind for whitespace
                ['"(]
                ([^\n]*)        # title = \3
                ['")]
                [ \t]*
            )?  # title is optional
            (?:\n+|\Z)
            """ % less_than_tab, re.X | re.M | re.U)
        return _link_def_re.sub(self._extract_link_def_sub, text)

    def _extract_link_def_sub(self, match):
        id, url, title = match.groups()
        key = id.lower()  # Link IDs are case-insensitive
        self.urls[key] = self._encode_amps_and_angles(url)
        if title:
            self.titles[key] = title
        return ""

    def _do_numbering(self, text):
        '''We handle the special extension for generic numbering for
        tables, figures etc.
        '''
        # First pass to define all the references
        self.regex_defns = re.compile(r'''
            \[\#(\w+)\s* # the counter.  Open square plus hash plus a word \1
            ([^@]*)\s*   # Some optional characters, that aren't an @. \2
            @(\w+)       # the id.  Should this be normed? \3
            ([^\]]*)\]   # The rest of the text up to the terminating ] \4
            ''', re.VERBOSE)
        self.regex_subs = re.compile(r"\[@(\w+)\s*\]")  # [@ref_id]
        counters = {}
        references = {}
        replacements = []
        definition_html = '<figcaption class="{}" id="counter-ref-{}">{}{}{}</figcaption>'
        reference_html = '<a class="{}" href="#counter-ref-{}">{}</a>'
        for match in self.regex_defns.finditer(text):
            # We must have four match groups otherwise this isn't a numbering reference
            if len(match.groups()) != 4:
                continue
            counter = match.group(1)
            text_before = match.group(2)
            ref_id = match.group(3)
            text_after = match.group(4)
            number = counters.get(counter, 1)
            references[ref_id] = (number, counter)
            replacements.append((match.start(0),
                                 definition_html.format(counter,
                                                        ref_id,
                                                        text_before,
                                                        number,
                                                        text_after),
                                 match.end(0)))
            counters[counter] = number + 1
        for repl in reversed(replacements):
            text = text[:repl[0]] + repl[1] + text[repl[2]:]

        # Second pass to replace the references with the right
        # value of the counter.
        # Fwiw, it's vaguely annoying to have to turn the iterator into
        # a list and then reverse it but I can't think of a better thing to do.
        for match in reversed(list(self.regex_subs.finditer(text))):
            number, counter = references.get(match.group(1), (None, None))
            if number is not None:
                repl = reference_html.format(counter,
                                             match.group(1),
                                             number)
            else:
                repl = reference_html.format(match.group(1),
                                             'countererror',
                                             '?' + match.group(1) + '?')
            if "smarty-pants" in self.extras:
                repl = repl.replace('"', self._escape_table['"'])

            text = text[:match.start()] + repl + text[match.end():]
        return text

    def _extract_footnote_def_sub(self, match):
        id, text = match.groups()
        text = _dedent(text, skip_first_line=not text.startswith('\n')).strip()
        normed_id = re.sub(r'\W', '-', id)
        # Ensure footnote text ends with a couple newlines (for some
        # block gamut matches).
        self.footnotes[normed_id] = text + "\n\n"
        return ""

    def _strip_footnote_definitions(self, text):
        """A footnote definition looks like this:

            [^note-id]: Text of the note.

                May include one or more indented paragraphs.

        Where,
        - The 'note-id' can be pretty much anything, though typically it
          is the number of the footnote.
        - The first paragraph may start on the next line, like so:

            [^note-id]:
                Text of the note.
        """
        less_than_tab = self.tab_width - 1
        footnote_def_re = re.compile(r'''
            ^[ ]{0,%d}\[\^(.+)\]:   # id = \1
            [ \t]*
            (                       # footnote text = \2
              # First line need not start with the spaces.
              (?:\s*.*\n+)
              (?:
                (?:[ ]{%d} | \t)    # Subsequent lines must be indented.
                .*\n+
              )*
            )
            # Lookahead for non-space at line-start, or end of doc.
            (?:(?=^[ ]{0,%d}\S)|\Z)
            ''' % (less_than_tab, self.tab_width, self.tab_width),
            re.X | re.M)
        return footnote_def_re.sub(self._extract_footnote_def_sub, text)

    _hr_re = re.compile(r'^[ ]{0,3}([-_*][ ]{0,2}){3,}$', re.M)

    def _run_block_gamut(self, text):
        # These are all the transformations that form block-level
        # tags like paragraphs, headers, and list items.

        if "fenced-code-blocks" in self.extras:
            text = self._do_fenced_code_blocks(text)

        text = self._do_headers(text)

        # Do Horizontal Rules:
        # On the number of spaces in horizontal rules: The spec is fuzzy: "If
        # you wish, you may use spaces between the hyphens or asterisks."
        # Markdown.pl 1.0.1's hr regexes limit the number of spaces between the
        # hr chars to one or two. We'll reproduce that limit here.
        hr = "\n<hr"+self.empty_element_suffix+"\n"
        text = re.sub(self._hr_re, hr, text)

        text = self._do_lists(text)

        if "pyshell" in self.extras:
            text = self._prepare_pyshell_blocks(text)
        if "wiki-tables" in self.extras:
            text = self._do_wiki_tables(text)
        if "tables" in self.extras:
            text = self._do_tables(text)

        text = self._do_code_blocks(text)

        text = self._do_block_quotes(text)

        # We already ran _HashHTMLBlocks() before, in Markdown(), but that
        # was to escape raw HTML in the original Markdown source. This time,
        # we're escaping the markup we've just created, so that we don't wrap
        # <p> tags around block-level tags.
        text = self._hash_html_blocks(text)

        text = self._form_paragraphs(text)

        return text

    def _pyshell_block_sub(self, match):
        lines = match.group(0).splitlines(0)
        _dedentlines(lines)
        indent = ' ' * self.tab_width
        s = ('\n'  # separate from possible cuddled paragraph
             + indent + ('\n'+indent).join(lines)
             + '\n\n')
        return s

    def _prepare_pyshell_blocks(self, text):
        """Ensure that Python interactive shell sessions are put in
        code blocks -- even if not properly indented.
        """
        if ">>>" not in text:
            return text

        less_than_tab = self.tab_width - 1
        _pyshell_block_re = re.compile(r"""
            ^([ ]{0,%d})>>>[ ].*\n  # first line
            ^(\1.*\S+.*\n)*         # any number of subsequent lines
            ^\n                     # ends with a blank line
            """ % less_than_tab, re.M | re.X)

        return _pyshell_block_re.sub(self._pyshell_block_sub, text)

    def _table_sub(self, match):
        trim_space_re = '^[ \t\n]+|[ \t\n]+$'
        trim_bar_re = '^\||\|$'
        split_bar_re = '^\||(?<!\\\\)\|'
        escape_bar_re = '\\\\\|'

        head, underline, body = match.groups()

        # Determine aligns for columns.
        cols = [re.sub(escape_bar_re, '|', cell.strip()) for cell in re.split(split_bar_re, re.sub(trim_bar_re, "", re.sub(trim_space_re, "", underline)))]
        align_from_col_idx = {}
        for col_idx, col in enumerate(cols):
            if col[0] == ':' and col[-1] == ':':
                align_from_col_idx[col_idx] = ' align="center"'
            elif col[0] == ':':
                align_from_col_idx[col_idx] = ' align="left"'
            elif col[-1] == ':':
                align_from_col_idx[col_idx] = ' align="right"'

        # thead
        hlines = ['<table%s>' % self._html_class_str_from_tag('table'), '<thead>', '<tr>']
        cols = [re.sub(escape_bar_re, '|', cell.strip()) for cell in re.split(split_bar_re, re.sub(trim_bar_re, "", re.sub(trim_space_re, "", head)))]
        for col_idx, col in enumerate(cols):
            hlines.append('  <th%s>%s</th>' % (
                align_from_col_idx.get(col_idx, ''),
                self._run_span_gamut(col)
            ))
        hlines.append('</tr>')
        hlines.append('</thead>')

        # tbody
        hlines.append('<tbody>')
        for line in body.strip('\n').split('\n'):
            hlines.append('<tr>')
            cols = [re.sub(escape_bar_re, '|', cell.strip()) for cell in re.split(split_bar_re, re.sub(trim_bar_re, "", re.sub(trim_space_re, "", line)))]
            for col_idx, col in enumerate(cols):
                hlines.append('  <td%s>%s</td>' % (
                    align_from_col_idx.get(col_idx, ''),
                    self._run_span_gamut(col)
                ))
            hlines.append('</tr>')
        hlines.append('</tbody>')
        hlines.append('</table>')

        return '\n'.join(hlines) + '\n'

    def _do_tables(self, text):
        """Copying PHP-Markdown and GFM table syntax. Some regex borrowed from
        https://github.com/michelf/php-markdown/blob/lib/Michelf/Markdown.php#L2538
        """
        less_than_tab = self.tab_width - 1
        table_re = re.compile(r'''
                (?:(?<=\n\n)|\A\n?)             # leading blank line

                ^[ ]{0,%d}                      # allowed whitespace
                (.*[|].*)  \n                   # $1: header row (at least one pipe)

                ^[ ]{0,%d}                      # allowed whitespace
                (                               # $2: underline row
                    # underline row with leading bar
                    (?:  \|\ *:?-+:?\ *  )+  \|?  \n
                    |
                    # or, underline row without leading bar
                    (?:  \ *:?-+:?\ *\|  )+  (?:  \ *:?-+:?\ *  )?  \n
                )

                (                               # $3: data rows
                    (?:
                        ^[ ]{0,%d}(?!\ )        # ensure line begins with 0 to less_than_tab spaces
                        .*\|.*  \n
                    )+
                )
            ''' % (less_than_tab, less_than_tab, less_than_tab), re.M | re.X)
        return table_re.sub(self._table_sub, text)
```
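A short sketch of the PHP-Markdown/GFM table syntax that `_do_tables` matches, with the `:` markers mapped to `align` attributes in `_table_sub` (the module-level `markdown()` convenience function from the full module is assumed):

```python
import markdown2

table_md = (
    "| Name | Score |\n"
    "|:-----|------:|\n"
    "| foo  |     1 |\n"
)
html = markdown2.markdown(table_md, extras=["tables"])
# ':-----' -> align="left" cells, '------:' -> align="right" cells
```

Wiki-style tables and the span-level transformations follow: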
| 1053 | |||
| 1054 | def _wiki_table_sub(self, match): |
||
| 1055 | ttext = match.group(0).strip() |
||
| 1056 | # print 'wiki table: %r' % match.group(0) |
||
| 1057 | rows = [] |
||
| 1058 | for line in ttext.splitlines(0): |
||
| 1059 | line = line.strip()[2:-2].strip() |
||
| 1060 | row = [c.strip() for c in re.split(r'(?<!\\)\|\|', line)] |
||
| 1061 | rows.append(row) |
||
| 1062 | # pprint(rows) |
||
| 1063 | hlines = ['<table%s>' % self._html_class_str_from_tag('table'), '<tbody>'] |
||
| 1064 | for row in rows: |
||
| 1065 | hrow = ['<tr>'] |
||
| 1066 | for cell in row: |
||
| 1067 | hrow.append('<td>') |
||
| 1068 | hrow.append(self._run_span_gamut(cell)) |
||
| 1069 | hrow.append('</td>') |
||
| 1070 | hrow.append('</tr>') |
||
| 1071 | hlines.append(''.join(hrow)) |
||
| 1072 | hlines += ['</tbody>', '</table>'] |
||
| 1073 | return '\n'.join(hlines) + '\n' |
||
| 1074 | |||
| 1075 | def _do_wiki_tables(self, text): |
||
| 1076 | # Optimization. |
||
| 1077 | if "||" not in text: |
||
| 1078 | return text |
||
| 1079 | |||
| 1080 | less_than_tab = self.tab_width - 1 |
||
| 1081 | wiki_table_re = re.compile(r''' |
||
| 1082 | (?:(?<=\n\n)|\A\n?) # leading blank line |
||
| 1083 | ^([ ]{0,%d})\|\|.+?\|\|[ ]*\n # first line |
||
| 1084 | (^\1\|\|.+?\|\|\n)* # any number of subsequent lines |
||
| 1085 | ''' % less_than_tab, re.M | re.X) |
||
| 1086 | return wiki_table_re.sub(self._wiki_table_sub, text) |
||
| 1087 | |||
| 1088 | def _run_span_gamut(self, text): |
||
| 1089 | # These are all the transformations that occur *within* block-level |
||
| 1090 | # tags like paragraphs, headers, and list items. |
||
| 1091 | |||
| 1092 | text = self._do_code_spans(text) |
||
| 1093 | |||
| 1094 | text = self._escape_special_chars(text) |
||
| 1095 | |||
| 1096 | # Process anchor and image tags. |
||
| 1097 | text = self._do_links(text) |
||
| 1098 | |||
| 1099 | # Make links out of things like `<http://example.com/>` |
||
| 1100 | # Must come after _do_links(), because you can use < and > |
||
| 1101 | # delimiters in inline links like [this](<url>). |
||
| 1102 | text = self._do_auto_links(text) |
||
| 1103 | |||
| 1104 | if "link-patterns" in self.extras: |
||
| 1105 | text = self._do_link_patterns(text) |
||
| 1106 | |||
| 1107 | text = self._encode_amps_and_angles(text) |
||
| 1108 | |||
| 1109 | if "strike" in self.extras: |
||
| 1110 | text = self._do_strike(text) |
||
| 1111 | |||
| 1112 | text = self._do_italics_and_bold(text) |
||
| 1113 | |||
| 1114 | if "smarty-pants" in self.extras: |
||
| 1115 | text = self._do_smart_punctuation(text) |
||
| 1116 | |||
| 1117 | # Do hard breaks: |
||
| 1118 | if "break-on-newline" in self.extras: |
||
| 1119 | text = re.sub(r" *\n", "<br%s\n" % self.empty_element_suffix, text) |
||
| 1120 | else: |
||
| 1121 | text = re.sub(r" {2,}\n", " <br%s\n" % self.empty_element_suffix, text) |
||
| 1122 | |||
| 1123 | return text |
||
| 1124 | |||
| 1125 | # "Sorta" because auto-links are identified as "tag" tokens. |
||
| 1126 | _sorta_html_tokenize_re = re.compile(r""" |
||
| 1127 | ( |
||
| 1128 | # tag |
||
| 1129 | </? |
||
| 1130 | (?:\w+) # tag name |
||
| 1131 | (?:\s+(?:[\w-]+:)?[\w-]+=(?:".*?"|'.*?'))* # attributes |
||
| 1132 | \s*/?> |
||
| 1133 | | |
||
| 1134 | # auto-link (e.g., <http://www.activestate.com/>) |
||
| 1135 | <\w+[^>]*> |
||
| 1136 | | |
||
| 1137 | <!--.*?--> # comment |
||
| 1138 | | |
||
| 1139 | <\?.*?\?> # processing instruction |
||
| 1140 | ) |
||
| 1141 | """, re.X) |
||
| 1142 | |||
| 1143 | def _escape_special_chars(self, text): |
||
| 1144 | # Python markdown note: the HTML tokenization here differs from |
||
| 1145 | # that in Markdown.pl, hence the behaviour for subtle cases can |
||
| 1146 | # differ (I believe the tokenizer here does a better job because |
||
| 1147 | # it isn't susceptible to unmatched '<' and '>' in HTML tags). |
||
| 1148 | # Note, however, that '>' is not allowed in an auto-link URL |
||
| 1149 | # here. |
||
| 1150 | escaped = [] |
||
| 1151 | is_html_markup = False |
||
| 1152 | for token in self._sorta_html_tokenize_re.split(text): |
||
| 1153 | if is_html_markup: |
||
| 1154 | # Within tags/HTML-comments/auto-links, encode * and _ |
||
| 1155 | # so they don't conflict with their use in Markdown for |
||
| 1156 | # italics and strong. We're replacing each such |
||
| 1157 | # character with its corresponding MD5 checksum value; |
||
| 1158 | # this is likely overkill, but it should prevent us from |
||
| 1159 | # colliding with the escape values by accident. |
||
| 1160 | escaped.append(token.replace('*', self._escape_table['*']) |
||
| 1161 | .replace('_', self._escape_table['_'])) |
||
| 1162 | else: |
||
| 1163 | escaped.append(self._encode_backslash_escapes(token)) |
||
| 1164 | is_html_markup = not is_html_markup |
||
| 1165 | return ''.join(escaped) |
||
| 1166 | |||
| 1167 | def _hash_html_spans(self, text): |
||
| 1168 | # Used for safe_mode. |
||
| 1169 | |||
| 1170 | def _is_auto_link(s): |
||
| 1171 | if ':' in s and self._auto_link_re.match(s): |
||
| 1172 | return True |
||
| 1173 | elif '@' in s and self._auto_email_link_re.match(s): |
||
| 1174 | return True |
||
| 1175 | return False |
||
| 1176 | |||
| 1177 | tokens = [] |
||
| 1178 | is_html_markup = False |
||
| 1179 | for token in self._sorta_html_tokenize_re.split(text): |
||
| 1180 | if is_html_markup and not _is_auto_link(token): |
||
| 1181 | sanitized = self._sanitize_html(token) |
||
| 1182 | key = _hash_text(sanitized) |
||
| 1183 | self.html_spans[key] = sanitized |
||
| 1184 | tokens.append(key) |
||
| 1185 | else: |
||
| 1186 | tokens.append(token) |
||
| 1187 | is_html_markup = not is_html_markup |
||
| 1188 | return ''.join(tokens) |
||
| 1189 | |||
| 1190 | def _unhash_html_spans(self, text): |
||
| 1191 | for key, sanitized in list(self.html_spans.items()): |
||
| 1192 | text = text.replace(key, sanitized) |
||
| 1193 | return text |
||
| 1194 | |||
| 1195 | def _sanitize_html(self, s): |
||
| 1196 | if self.safe_mode == "replace": |
||
| 1197 | return self.html_removed_text |
||
| 1198 | elif self.safe_mode == "escape": |
||
| 1199 | replacements = [ |
||
| 1200 | ('&', '&'), |
||
| 1201 | ('<', '<'), |
||
| 1202 | ('>', '>'), |
||
| 1203 | ] |
||
| 1204 | for before, after in replacements: |
||
| 1205 | s = s.replace(before, after) |
||
| 1206 | return s |
||
| 1207 | else: |
||
| 1208 | raise MarkdownError("invalid value for 'safe_mode': %r (must be " |
||
| 1209 | "'escape' or 'replace')" % self.safe_mode) |
||
| 1210 | |||
| 1211 | _inline_link_title = re.compile(r''' |
||
| 1212 | ( # \1 |
||
| 1213 | [ \t]+ |
||
| 1214 | (['"]) # quote char = \2 |
||
| 1215 | (?P<title>.*?) |
||
| 1216 | \2 |
||
| 1217 | )? # title is optional |
||
| 1218 | \)$ |
||
| 1219 | ''', re.X | re.S) |
||
| 1220 | _tail_of_reference_link_re = re.compile(r''' |
||
| 1221 | # Match tail of: [text][id] |
||
| 1222 | [ ]? # one optional space |
||
| 1223 | (?:\n[ ]*)? # one optional newline followed by spaces |
||
| 1224 | \[ |
||
| 1225 | (?P<id>.*?) |
||
| 1226 | \] |
||
| 1227 | ''', re.X | re.S) |
||
| 1228 | |||
| 1229 | _whitespace = re.compile(r'\s*') |
||
| 1230 | |||
| 1231 | _strip_anglebrackets = re.compile(r'<(.*)>.*') |
||
| 1232 | |||
| 1233 | def _find_non_whitespace(self, text, start): |
||
| 1234 | """Returns the index of the first non-whitespace character in text |
||
| 1235 | after (and including) start |
||
| 1236 | """ |
||
| 1237 | match = self._whitespace.match(text, start) |
||
| 1238 | return match.end() |
||
| 1239 | |||
| 1240 | def _find_balanced(self, text, start, open_c, close_c): |
||
| 1241 | """Returns the index where the open_c and close_c characters balance |
||
| 1242 | out - the same number of open_c and close_c are encountered - or the |
||
| 1243 | end of string if it's reached before the balance point is found. |
||
| 1244 | """ |
||
| 1245 | i = start |
||
| 1246 | l = len(text) |
||
| 1247 | count = 1 |
||
| 1248 | while count > 0 and i < l: |
||
| 1249 | if text[i] == open_c: |
||
| 1250 | count += 1 |
||
| 1251 | elif text[i] == close_c: |
||
| 1252 | count -= 1 |
||
| 1253 | i += 1 |
||
| 1254 | return i |
||
| 1255 | |||
| 1256 | def _extract_url_and_title(self, text, start): |
||
| 1257 | """Extracts the url and (optional) title from the tail of a link""" |
||
| 1258 | # text[start] equals the opening parenthesis |
||
| 1259 | idx = self._find_non_whitespace(text, start+1) |
||
| 1260 | if idx == len(text): |
||
| 1261 | return None, None, None |
||
| 1262 | end_idx = idx |
||
| 1263 | has_anglebrackets = text[idx] == "<" |
||
| 1264 | if has_anglebrackets: |
||
| 1265 | end_idx = self._find_balanced(text, end_idx+1, "<", ">") |
||
| 1266 | end_idx = self._find_balanced(text, end_idx, "(", ")") |
||
| 1267 | match = self._inline_link_title.search(text, idx, end_idx) |
||
| 1268 | if not match: |
||
| 1269 | return None, None, None |
||
| 1270 | url, title = text[idx:match.start()], match.group("title") |
||
| 1271 | if has_anglebrackets: |
||
| 1272 | url = self._strip_anglebrackets.sub(r'\1', url) |
||
| 1273 | return url, title, end_idx |
||
| 1274 | |||
| 1275 | _safe_protocols = re.compile(r'(https?|ftp):', re.I) |
||
| 1276 | def _do_links(self, text): |
||
| 1277 | """Turn Markdown link shortcuts into XHTML <a> and <img> tags. |
||
| 1278 | |||
| 1279 | This is a combination of Markdown.pl's _DoAnchors() and |
||
| 1280 | _DoImages(). They are done together because that simplified the |
||
| 1281 | approach. It was necessary to use a different approach than |
||
| 1282 | Markdown.pl because of the lack of atomic matching support in |
||
| 1283 | Python's regex engine used in $g_nested_brackets. |
||
| 1284 | """ |
||
| 1285 | MAX_LINK_TEXT_SENTINEL = 3000 # markdown2 issue 24 |
||
| 1286 | |||
| 1287 | # `anchor_allowed_pos` is used to support img links inside |
||
| 1288 | # anchors, but not anchors inside anchors. An anchor's start |
||
| 1289 | # pos must be `>= anchor_allowed_pos`. |
||
| 1290 | anchor_allowed_pos = 0 |
||
| 1291 | |||
| 1292 | curr_pos = 0 |
||
| 1293 | while True: # Handle the next link. |
||
| 1294 | # The next '[' is the start of: |
||
| 1295 | # - an inline anchor: [text](url "title") |
||
| 1296 | # - a reference anchor: [text][id] |
||
| 1297 | # - an inline img:  |
||
| 1298 | # - a reference img: ![text][id] |
||
| 1299 | # - a footnote ref: [^id] |
||
| 1300 | # (Only if 'footnotes' extra enabled) |
||
| 1301 | # - a footnote defn: [^id]: ... |
||
| 1302 | # (Only if 'footnotes' extra enabled) These have already |
||
| 1303 | # been stripped in _strip_footnote_definitions() so no |
||
| 1304 | # need to watch for them. |
||
| 1305 | # - a link definition: [id]: url "title" |
||
| 1306 | # These have already been stripped in |
||
| 1307 | # _strip_link_definitions() so no need to watch for them. |
||
| 1308 | # - not markup: [...anything else... |
||
| 1309 | try: |
||
| 1310 | start_idx = text.index('[', curr_pos) |
||
| 1311 | except ValueError: |
||
| 1312 | break |
||
| 1313 | text_length = len(text) |
||
| 1314 | |||
| 1315 | # Find the matching closing ']'. |
||
| 1316 | # Markdown.pl allows *matching* brackets in link text so we |
||
| 1317 | # will here too. Markdown.pl *doesn't* currently allow |
||
| 1318 | # matching brackets in img alt text -- we'll differ in that |
||
| 1319 | # regard. |
||
| 1320 | bracket_depth = 0 |
||
| 1321 | for p in range(start_idx+1, min(start_idx+MAX_LINK_TEXT_SENTINEL, |
||
| 1322 | text_length)): |
||
| 1323 | ch = text[p] |
||
| 1324 | if ch == ']': |
||
| 1325 | bracket_depth -= 1 |
||
| 1326 | if bracket_depth < 0: |
||
| 1327 | break |
||
| 1328 | elif ch == '[': |
||
| 1329 | bracket_depth += 1 |
||
| 1330 | else: |
||
| 1331 | # Closing bracket not found within sentinel length. |
||
| 1332 | # This isn't markup. |
||
| 1333 | curr_pos = start_idx + 1 |
||
| 1334 | continue |
||
| 1335 | link_text = text[start_idx+1:p] |
||
| 1336 | |||
| 1337 | # Possibly a footnote ref? |
||
| 1338 | if "footnotes" in self.extras and link_text.startswith("^"): |
||
| 1339 | normed_id = re.sub(r'\W', '-', link_text[1:]) |
||
| 1340 | if normed_id in self.footnotes: |
||
| 1341 | self.footnote_ids.append(normed_id) |
||
| 1342 | result = '<sup class="footnote-ref" id="fnref-%s">' \ |
||
| 1343 | '<a href="#fn-%s">%s</a></sup>' \ |
||
| 1344 | % (normed_id, normed_id, len(self.footnote_ids)) |
||
| 1345 | text = text[:start_idx] + result + text[p+1:] |
||
| 1346 | else: |
||
| 1347 | # This id isn't defined, leave the markup alone. |
||
| 1348 | curr_pos = p+1 |
||
| 1349 | continue |
||
| 1350 | |||
| 1351 | # Now determine what this is by the remainder. |
||
| 1352 | p += 1 |
||
| 1353 | if p == text_length: |
||
| 1354 | return text |
||
| 1355 | |||
| 1356 | # Inline anchor or img? |
||
| 1357 | if text[p] == '(': # attempt at perf improvement |
||
| 1358 | url, title, url_end_idx = self._extract_url_and_title(text, p) |
||
| 1359 | if url is not None: |
||
| 1360 | # Handle an inline anchor or img. |
||
| 1361 | is_img = start_idx > 0 and text[start_idx-1] == "!" |
||
| 1362 | if is_img: |
||
| 1363 | start_idx -= 1 |
||
| 1364 | |||
| 1365 | # We've got to encode these to avoid conflicting |
||
| 1366 | # with italics/bold. |
||
| 1367 | url = url.replace('*', self._escape_table['*']) \ |
||
| 1368 | .replace('_', self._escape_table['_']) |
||
| 1369 | if title: |
||
| 1370 | title_str = ' title="%s"' % ( |
||
| 1371 | _xml_escape_attr(title) |
||
| 1372 | .replace('*', self._escape_table['*']) |
||
| 1373 | .replace('_', self._escape_table['_'])) |
||
| 1374 | else: |
||
| 1375 | title_str = '' |
||
| 1376 | View Code Duplication | if is_img: |
|
| 1377 | img_class_str = self._html_class_str_from_tag("img") |
||
| 1378 | result = '<img src="%s" alt="%s"%s%s%s' \ |
||
| 1379 | % (_html_escape_url(url, safe_mode=self.safe_mode), |
||
| 1380 | _xml_escape_attr(link_text), |
||
| 1381 | title_str, |
||
| 1382 | img_class_str, |
||
| 1383 | self.empty_element_suffix) |
||
| 1384 | if "smarty-pants" in self.extras: |
||
| 1385 | result = result.replace('"', self._escape_table['"']) |
||
| 1386 | curr_pos = start_idx + len(result) |
||
| 1387 | text = text[:start_idx] + result + text[url_end_idx:] |
||
| 1388 | elif start_idx >= anchor_allowed_pos: |
||
| 1389 | if self.safe_mode and not self._safe_protocols.match(url): |
||
| 1390 | result_head = '<a href="#"%s>' % (title_str) |
||
| 1391 | else: |
||
| 1392 | result_head = '<a href="%s"%s>' % (_html_escape_url(url, safe_mode=self.safe_mode), title_str) |
||
| 1393 | result = '%s%s</a>' % (result_head, _xml_escape_attr(link_text)) |
||
| 1394 | if "smarty-pants" in self.extras: |
||
| 1395 | result = result.replace('"', self._escape_table['"']) |
||
| 1396 | # <img> allowed from curr_pos on, <a> from |
||
| 1397 | # anchor_allowed_pos on. |
||
| 1398 | curr_pos = start_idx + len(result_head) |
||
| 1399 | anchor_allowed_pos = start_idx + len(result) |
||
| 1400 | text = text[:start_idx] + result + text[url_end_idx:] |
||
| 1401 | else: |
||
| 1402 | # Anchor not allowed here. |
||
| 1403 | curr_pos = start_idx + 1 |
||
| 1404 | continue |
||
| 1405 | |||
| 1406 | # Reference anchor or img? |
||
| 1407 | else: |
||
| 1408 | match = self._tail_of_reference_link_re.match(text, p) |
||
| 1409 | if match: |
||
| 1410 | # Handle a reference-style anchor or img. |
||
| 1411 | is_img = start_idx > 0 and text[start_idx-1] == "!" |
||
| 1412 | if is_img: |
||
| 1413 | start_idx -= 1 |
||
| 1414 | link_id = match.group("id").lower() |
||
| 1415 | if not link_id: |
||
| 1416 | link_id = link_text.lower() # for links like [this][] |
||
| 1417 | if link_id in self.urls: |
||
| 1418 | url = self.urls[link_id] |
||
| 1419 | # We've got to encode these to avoid conflicting |
||
| 1420 | # with italics/bold. |
||
| 1421 | url = url.replace('*', self._escape_table['*']) \ |
||
| 1422 | .replace('_', self._escape_table['_']) |
||
| 1423 | title = self.titles.get(link_id) |
||
| 1424 | if title: |
||
| 1425 | title = _xml_escape_attr(title) \ |
||
| 1426 | .replace('*', self._escape_table['*']) \ |
||
| 1427 | .replace('_', self._escape_table['_']) |
||
| 1428 | title_str = ' title="%s"' % title |
||
| 1429 | else: |
||
| 1430 | title_str = '' |
||
| 1431 | View Code Duplication | if is_img: |
|
| 1432 | img_class_str = self._html_class_str_from_tag("img") |
||
| 1433 | result = '<img src="%s" alt="%s"%s%s%s' \ |
||
| 1434 | % (_html_escape_url(url, safe_mode=self.safe_mode), |
||
| 1435 | _xml_escape_attr(link_text), |
||
| 1436 | title_str, |
||
| 1437 | img_class_str, |
||
| 1438 | self.empty_element_suffix) |
||
| 1439 | if "smarty-pants" in self.extras: |
||
| 1440 | result = result.replace('"', self._escape_table['"']) |
||
| 1441 | curr_pos = start_idx + len(result) |
||
| 1442 | text = text[:start_idx] + result + text[match.end():] |
||
| 1443 | elif start_idx >= anchor_allowed_pos: |
||
| 1444 | if self.safe_mode and not self._safe_protocols.match(url): |
||
| 1445 | result_head = '<a href="#"%s>' % (title_str) |
||
| 1446 | else: |
||
| 1447 | result_head = '<a href="%s"%s>' % (_html_escape_url(url, safe_mode=self.safe_mode), title_str) |
||
| 1448 | result = '%s%s</a>' % (result_head, link_text) |
||
| 1449 | if "smarty-pants" in self.extras: |
||
| 1450 | result = result.replace('"', self._escape_table['"']) |
||
| 1451 | # <img> allowed from curr_pos on, <a> from |
||
| 1452 | # anchor_allowed_pos on. |
||
| 1453 | curr_pos = start_idx + len(result_head) |
||
| 1454 | anchor_allowed_pos = start_idx + len(result) |
||
| 1455 | text = text[:start_idx] + result + text[match.end():] |
||
| 1456 | else: |
||
| 1457 | # Anchor not allowed here. |
||
| 1458 | curr_pos = start_idx + 1 |
||
| 1459 | else: |
||
| 1460 | # This id isn't defined, leave the markup alone. |
||
| 1461 | curr_pos = match.end() |
||
| 1462 | continue |
||
| 1463 | |||
| 1464 | # Otherwise, it isn't markup. |
||
| 1465 | curr_pos = start_idx + 1 |
||
| 1466 | |||
| 1467 | return text |
||
| 1468 | |||
| 1469 | def header_id_from_text(self, text, prefix, n): |
||
| 1470 | """Generate a header id attribute value from the given header |
||
| 1471 | HTML content. |
||
| 1472 | |||
| 1473 | This is only called if the "header-ids" extra is enabled. |
||
| 1474 | Subclasses may override this for different header ids. |
||
| 1475 | |||
| 1476 | @param text {str} The text of the header tag |
||
| 1477 | @param prefix {str} The requested prefix for header ids. This is the |
||
| 1478 | value of the "header-ids" extra key, if any. Otherwise, None. |
||
| 1479 | @param n {int} The <hN> tag number, i.e. `1` for an <h1> tag. |
||
| 1480 | @returns {str} The value for the header tag's "id" attribute. Return |
||
| 1481 | None to not have an id attribute and to exclude this header from |
||
| 1482 | the TOC (if the "toc" extra is specified). |
||
| 1483 | """ |
||
| 1484 | header_id = _slugify(text) |
||
| 1485 | if prefix and isinstance(prefix, base_string_type): |
||
| 1486 | header_id = prefix + '-' + header_id |
||
| 1487 | if header_id in self._count_from_header_id: |
||
| 1488 | self._count_from_header_id[header_id] += 1 |
||
| 1489 | header_id += '-%s' % self._count_from_header_id[header_id] |
||
| 1490 | else: |
||
| 1491 | self._count_from_header_id[header_id] = 1 |
||
| 1492 | if 0 == len(header_id): |
||
| 1493 | header_id += '-%s' % self._count_from_header_id[header_id] |
||
| 1494 | |||
| 1495 | return header_id |
||
| 1496 | |||
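Since `header_id_from_text` is an explicit override hook, a subclass can swap in a different id scheme while reusing the slugify-and-deduplicate logic. A minimal sketch of such an override (the `PrefixedMarkdown` class name and the `doc-` namespace are illustrative, not part of the library):

```python
import markdown2

class PrefixedMarkdown(markdown2.Markdown):
    def header_id_from_text(self, text, prefix, n):
        # Reuse the base slug/dedup behaviour, then namespace the id.
        header_id = super(PrefixedMarkdown, self).header_id_from_text(
            text, prefix, n)
        return "doc-" + header_id if header_id else header_id

html = PrefixedMarkdown(extras=["header-ids"]).convert("# Hello World\n")
# html is roughly '<h1 id="doc-hello-world">Hello World</h1>'
```

Returning None from the override drops the id attribute and excludes the header from any generated TOC, as the docstring notes.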
| 1497 | _toc = None |
||
| 1498 | def _toc_add_entry(self, level, id, name): |
||
| 1499 | if self._toc is None: |
||
| 1500 | self._toc = [] |
||
| 1501 | self._toc.append((level, id, self._unescape_special_chars(name))) |
||
| 1502 | |||
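The accumulated `(level, id, name)` tuples are what the "toc" extra ultimately renders. Assuming the documented module-level API, a short usage sketch:

```python
import markdown2

text = "# Title\n\n## Section one\n\n## Section two\n"
html = markdown2.markdown(text, extras=["toc"])
# "toc" implies "header-ids", so each header gets an id, and the result
# carries the rendered table of contents as an attribute:
print(html.toc_html)  # nested <ul> linking to #title, #section-one, ...
```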
| 1503 | _h_re_base = r''' |
||
| 1504 | (^(.+)[ \t]*\n(=+|-+)[ \t]*\n+) |
||
| 1505 | | |
||
| 1506 | (^(\#{1,6}) # \1 = string of #'s |
||
| 1507 | [ \t]%s |
||
| 1508 | (.+?) # \2 = Header text |
||
| 1509 | [ \t]* |
||
| 1510 | (?<!\\) # ensure not an escaped trailing '#' |
||
| 1511 | \#* # optional closing #'s (not counted) |
||
| 1512 | \n+ |
||
| 1513 | ) |
||
| 1514 | ''' |
||
| 1515 | |||
| 1516 | _h_re = re.compile(_h_re_base % '*', re.X | re.M) |
||
| 1517 | _h_re_tag_friendly = re.compile(_h_re_base % '+', re.X | re.M) |
||
| 1518 | |||
| 1519 | def _h_sub(self, match): |
||
| 1520 | if match.group(1) is not None: |
||
| 1521 | # Setext header |
||
| 1522 | n = {"=": 1, "-": 2}[match.group(3)[0]] |
||
| 1523 | header_group = match.group(2) |
||
| 1524 | else: |
||
| 1525 | # atx header |
||
| 1526 | n = len(match.group(5)) |
||
| 1527 | header_group = match.group(6) |
||
| 1528 | |||
| 1529 | demote_headers = self.extras.get("demote-headers") |
||
| 1530 | if demote_headers: |
||
| 1531 | n = min(n + demote_headers, 6) |
||
| 1532 | header_id_attr = "" |
||
| 1533 | if "header-ids" in self.extras: |
||
| 1534 | header_id = self.header_id_from_text(header_group, |
||
| 1535 | self.extras["header-ids"], n) |
||
| 1536 | if header_id: |
||
| 1537 | header_id_attr = ' id="%s"' % header_id |
||
| 1538 | html = self._run_span_gamut(header_group) |
||
| 1539 | if "toc" in self.extras and header_id: |
||
| 1540 | self._toc_add_entry(n, header_id, html) |
||
| 1541 | return "<h%d%s>%s</h%d>\n\n" % (n, header_id_attr, html, n) |
||
| 1542 | |||
| 1543 | def _do_headers(self, text): |
||
| 1544 | # Setext-style headers: |
||
| 1545 | # Header 1 |
||
| 1546 | # ======== |
||
| 1547 | # |
||
| 1548 | # Header 2 |
||
| 1549 | # -------- |
||
| 1550 | |||
| 1551 | # atx-style headers: |
||
| 1552 | # # Header 1 |
||
| 1553 | # ## Header 2 |
||
| 1554 | # ## Header 2 with closing hashes ## |
||
| 1555 | # ... |
||
| 1556 | # ###### Header 6 |
||
| 1557 | |||
| 1558 | if 'tag-friendly' in self.extras: |
||
| 1559 | return self._h_re_tag_friendly.sub(self._h_sub, text) |
||
| 1560 | return self._h_re.sub(self._h_sub, text) |
||
| 1561 | |||
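Both Setext and atx headers funnel through `_h_sub`, so they normalize to the same `<hN>` output; the "demote-headers" extra then shifts every level down by its integer argument, capped at `<h6>`. A quick illustration, assuming the module-level `markdown()` helper:

```python
import markdown2

text = "Header 1\n========\n\n## Header 2 with closing hashes ##\n"
print(markdown2.markdown(text))
# -> <h1>Header 1</h1> ... <h2>Header 2 with closing hashes</h2>

# Demote every header two levels:
print(markdown2.markdown(text, extras={"demote-headers": 2}))
# -> <h3>...</h3> ... <h4>...</h4>
```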
| 1562 | _marker_ul_chars = '*+-' |
||
| 1563 | _marker_any = r'(?:[%s]|\d+\.)' % _marker_ul_chars |
||
| 1564 | _marker_ul = '(?:[%s])' % _marker_ul_chars |
||
| 1565 | _marker_ol = r'(?:\d+\.)' |
||
| 1566 | |||
| 1567 | def _list_sub(self, match): |
||
| 1568 | lst = match.group(1) |
||
| 1569 | lst_type = "ul" if match.group(3) in self._marker_ul_chars else "ol" |
||
| 1570 | result = self._process_list_items(lst) |
||
| 1571 | if self.list_level: |
||
| 1572 | return "<%s>\n%s</%s>\n" % (lst_type, result, lst_type) |
||
| 1573 | else: |
||
| 1574 | return "<%s>\n%s</%s>\n\n" % (lst_type, result, lst_type) |
||
| 1575 | |||
| 1576 | def _do_lists(self, text): |
||
| 1577 | # Form HTML ordered (numbered) and unordered (bulleted) lists. |
||
| 1578 | |||
| 1579 | # Iterate over each *non-overlapping* list match. |
||
| 1580 | pos = 0 |
||
| 1581 | while True: |
||
| 1582 | # Find the *first* hit for either list style (ul or ol). We |
||
| 1583 | # match ul and ol separately to avoid adjacent lists of different |
||
| 1584 | # types running into each other (see issue #16). |
||
| 1585 | hits = [] |
||
| 1586 | for marker_pat in (self._marker_ul, self._marker_ol): |
||
| 1587 | less_than_tab = self.tab_width - 1 |
||
| 1588 | whole_list = r''' |
||
| 1589 | ( # \1 = whole list |
||
| 1590 | ( # \2 |
||
| 1591 | [ ]{0,%d} |
||
| 1592 | (%s) # \3 = first list item marker |
||
| 1593 | [ \t]+ |
||
| 1594 | (?!\ *\3\ ) # '- - - ...' isn't a list. See 'not_quite_a_list' test case. |
||
| 1595 | ) |
||
| 1596 | (?:.+?) |
||
| 1597 | ( # \4 |
||
| 1598 | \Z |
||
| 1599 | | |
||
| 1600 | \n{2,} |
||
| 1601 | (?=\S) |
||
| 1602 | (?! # Negative lookahead for another list item marker |
||
| 1603 | [ \t]* |
||
| 1604 | %s[ \t]+ |
||
| 1605 | ) |
||
| 1606 | ) |
||
| 1607 | ) |
||
| 1608 | ''' % (less_than_tab, marker_pat, marker_pat) |
||
| 1609 | if self.list_level: # sub-list |
||
| 1610 | list_re = re.compile("^"+whole_list, re.X | re.M | re.S) |
||
| 1611 | else: |
||
| 1612 | list_re = re.compile(r"(?:(?<=\n\n)|\A\n?)"+whole_list, |
||
| 1613 | re.X | re.M | re.S) |
||
| 1614 | match = list_re.search(text, pos) |
||
| 1615 | if match: |
||
| 1616 | hits.append((match.start(), match)) |
||
| 1617 | if not hits: |
||
| 1618 | break |
||
| 1619 | hits.sort() |
||
| 1620 | match = hits[0][1] |
||
| 1621 | start, end = match.span() |
||
| 1622 | middle = self._list_sub(match) |
||
| 1623 | text = text[:start] + middle + text[end:] |
||
| 1624 | pos = start + len(middle) # start pos for next attempted match |
||
| 1625 | |||
| 1626 | return text |
||
| 1627 | |||
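The first-hit-wins loop above is what keeps adjacent lists of different marker types from merging. A small check of that behaviour (the input is illustrative):

```python
import markdown2

text = "- bullet one\n- bullet two\n\n1. numbered one\n2. numbered two\n"
print(markdown2.markdown(text))
# Because ul and ol markers are matched separately, this yields a <ul>
# followed by a distinct <ol> rather than one merged list.
```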
| 1628 | _list_item_re = re.compile(r''' |
||
| 1629 | (\n)? # leading line = \1 |
||
| 1630 | (^[ \t]*) # leading whitespace = \2 |
||
| 1631 | (?P<marker>%s) [ \t]+ # list marker = \3 |
||
| 1632 | ((?:.+?) # list item text = \4 |
||
| 1633 | (\n{1,2})) # eols = \5 |
||
| 1634 | (?= \n* (\Z | \2 (?P<next_marker>%s) [ \t]+)) |
||
| 1635 | ''' % (_marker_any, _marker_any), |
||
| 1636 | re.M | re.X | re.S) |
||
| 1637 | |||
| 1638 | _task_list_item_re = re.compile(r''' |
||
| 1639 | (\[[\ x]\])[ \t]+ # tasklist marker = \1 |
||
| 1640 | (.*) # list item text = \2 |
||
| 1641 | ''', re.M | re.X | re.S) |
||
| 1642 | |||
| 1643 | _task_list_wrapper_str = r'<input type="checkbox" class="task-list-item-checkbox" %sdisabled> %s' |
||
| 1644 | |||
| 1645 | def _task_list_item_sub(self, match): |
||
| 1646 | marker = match.group(1) |
||
| 1647 | item_text = match.group(2) |
||
| 1648 | if marker == '[x]': |
||
| 1649 | return self._task_list_wrapper_str % ('checked ', item_text) |
||
| 1650 | elif marker == '[ ]': |
||
| 1651 | return self._task_list_wrapper_str % ('', item_text) |
||
| 1652 | |||
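For reference, the "task_list" extra that drives `_task_list_item_sub` is enabled like any other extra; a minimal sketch:

```python
import markdown2

text = "- [x] write the parser\n- [ ] write the docs\n"
print(markdown2.markdown(text, extras=["task_list"]))
# Each item becomes an <li> containing a disabled checkbox <input>,
# checked for '[x]' and unchecked for '[ ]'.
```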
| 1653 | _last_li_endswith_two_eols = False |
||
| 1654 | def _list_item_sub(self, match): |
||
| 1655 | item = match.group(4) |
||
| 1656 | leading_line = match.group(1) |
||
| 1657 | if leading_line or "\n\n" in item or self._last_li_endswith_two_eols: |
||
| 1658 | item = self._run_block_gamut(self._outdent(item)) |
||
| 1659 | else: |
||
| 1660 | # Recursion for sub-lists: |
||
| 1661 | item = self._do_lists(self._outdent(item)) |
||
| 1662 | if item.endswith('\n'): |
||
| 1663 | item = item[:-1] |
||
| 1664 | item = self._run_span_gamut(item) |
||
| 1665 | self._last_li_endswith_two_eols = (len(match.group(5)) == 2) |
||
| 1666 | |||
| 1667 | if "task_list" in self.extras: |
||
| 1668 | item = self._task_list_item_re.sub(self._task_list_item_sub, item) |
||
| 1669 | |||
| 1670 | return "<li>%s</li>\n" % item |
||
| 1671 | |||
| 1672 | def _process_list_items(self, list_str): |
||
| 1673 | # Process the contents of a single ordered or unordered list, |
||
| 1674 | # splitting it into individual list items. |
||
| 1675 | |||
| 1676 | # The $g_list_level global keeps track of when we're inside a list. |
||
| 1677 | # Each time we enter a list, we increment it; when we leave a list, |
||
| 1678 | # we decrement. If it's zero, we're not in a list anymore. |
||
| 1679 | # |
||
| 1680 | # We do this because when we're not inside a list, we want to treat |
||
| 1681 | # something like this: |
||
| 1682 | # |
||
| 1683 | # I recommend upgrading to version |
||
| 1684 | # 8. Oops, now this line is treated |
||
| 1685 | # as a sub-list. |
||
| 1686 | # |
||
| 1687 | # As a single paragraph, despite the fact that the second line starts |
||
| 1688 | # with a digit-period-space sequence. |
||
| 1689 | # |
||
| 1690 | # Whereas when we're inside a list (or sub-list), that line will be |
||
| 1691 | # treated as the start of a sub-list. What a kludge, huh? This is |
||
| 1692 | # an aspect of Markdown's syntax that's hard to parse perfectly |
||
| 1693 | # without resorting to mind-reading. Perhaps the solution is to |
||
| 1694 | # change the syntax rules such that sub-lists must start with a |
||
| 1695 | # starting cardinal number; e.g. "1." or "a.". |
||
| 1696 | self.list_level += 1 |
||
| 1697 | self._last_li_endswith_two_eols = False |
||
| 1698 | list_str = list_str.rstrip('\n') + '\n' |
||
| 1699 | list_str = self._list_item_re.sub(self._list_item_sub, list_str) |
||
| 1700 | self.list_level -= 1 |
||
| 1701 | return list_str |
||
| 1702 | |||
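The comment's own example can be checked directly: at the top level (`list_level == 0`) the digit-period-space at the start of the second line is *not* promoted to a list. An illustrative snippet:

```python
import markdown2

text = ("I recommend upgrading to version\n"
        "8. Oops, now this line is treated\n"
        "as a sub-list.\n")
print(markdown2.markdown(text))
# -> a single <p> paragraph; inside a list the same lines would start
#    a sub-list, which is the kludge the comment describes.
```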
| 1703 | def _get_pygments_lexer(self, lexer_name): |
||
| 1704 | try: |
||
| 1705 | from pygments import lexers, util |
||
| 1706 | except ImportError: |
||
| 1707 | return None |
||
| 1708 | try: |
||
| 1709 | return lexers.get_lexer_by_name(lexer_name) |
||
| 1710 | except util.ClassNotFound: |
||
| 1711 | return None |
||
| 1712 | |||
| 1713 | def _color_with_pygments(self, codeblock, lexer, **formatter_opts): |
||
| 1714 | import pygments |
||
| 1715 | import pygments.formatters |
||
| 1716 | |||
| 1717 | class HtmlCodeFormatter(pygments.formatters.HtmlFormatter): |
||
| 1718 | def _wrap_code(self, inner): |
||
| 1719 | """A function for use in a Pygments Formatter which |
||
| 1720 | wraps the highlighted code in <code> tags. |
||
| 1721 | """ |
||
| 1722 | yield 0, "<code>" |
||
| 1723 | for tup in inner: |
||
| 1724 | yield tup |
||
| 1725 | yield 0, "</code>" |
||
| 1726 | |||
| 1727 | def wrap(self, source, outfile): |
||
| 1728 | """Return the source with a code, pre, and div.""" |
||
| 1729 | return self._wrap_div(self._wrap_pre(self._wrap_code(source))) |
||
| 1730 | |||
| 1731 | formatter_opts.setdefault("cssclass", "codehilite") |
||
| 1732 | formatter = HtmlCodeFormatter(**formatter_opts) |
||
| 1733 | return pygments.highlight(codeblock, lexer, formatter) |
||
| 1734 | |||
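The `**formatter_opts` passed here come from the extra's argument dict, so Pygments `HtmlFormatter` options can be supplied per conversion. A sketch, assuming Pygments is installed (the option values are illustrative):

```python
import markdown2

text = "```python\nprint('hi')\n```\n"
html = markdown2.markdown(
    text,
    extras={"fenced-code-blocks": {"cssclass": "highlight", "linenos": True}})
# "cssclass" defaults to "codehilite" when not given (see the
# setdefault call above).
```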
| 1735 | def _code_block_sub(self, match, is_fenced_code_block=False): |
||
| 1736 | lexer_name = None |
||
| 1737 | if is_fenced_code_block: |
||
| 1738 | lexer_name = match.group(1) |
||
| 1739 | if lexer_name: |
||
| 1740 | formatter_opts = self.extras['fenced-code-blocks'] or {} |
||
| 1741 | codeblock = match.group(2) |
||
| 1742 | codeblock = codeblock[:-1] # drop one trailing newline |
||
| 1743 | else: |
||
| 1744 | codeblock = match.group(1) |
||
| 1745 | codeblock = self._outdent(codeblock) |
||
| 1746 | codeblock = self._detab(codeblock) |
||
| 1747 | codeblock = codeblock.lstrip('\n') # trim leading newlines |
||
| 1748 | codeblock = codeblock.rstrip() # trim trailing whitespace |
||
| 1749 | |||
| 1750 | # Note: "code-color" extra is DEPRECATED. |
||
| 1751 | if "code-color" in self.extras and codeblock.startswith(":::"): |
||
| 1752 | lexer_name, rest = codeblock.split('\n', 1) |
||
| 1753 | lexer_name = lexer_name[3:].strip() |
||
| 1754 | codeblock = rest.lstrip("\n") # Remove lexer declaration line. |
||
| 1755 | formatter_opts = self.extras['code-color'] or {} |
||
| 1756 | |||
| 1757 | # Use pygments only if not using the highlightjs-lang extra |
||
| 1758 | if lexer_name and "highlightjs-lang" not in self.extras: |
||
| 1759 | def unhash_code(codeblock): |
||
| 1760 | for key, sanitized in list(self.html_spans.items()): |
||
| 1761 | codeblock = codeblock.replace(key, sanitized) |
||
| 1762 | replacements = [ |
||
| 1763 | ("&", "&"), |
||
| 1764 | ("<", "<"), |
||
| 1765 | (">", ">") |
||
| 1766 | ] |
||
| 1767 | for old, new in replacements: |
||
| 1768 | codeblock = codeblock.replace(old, new) |
||
| 1769 | return codeblock |
||
| 1770 | lexer = self._get_pygments_lexer(lexer_name) |
||
| 1771 | if lexer: |
||
| 1772 | codeblock = unhash_code(codeblock) |
||
| 1773 | colored = self._color_with_pygments(codeblock, lexer, |
||
| 1774 | **formatter_opts) |
||
| 1775 | return "\n\n%s\n\n" % colored |
||
| 1776 | |||
| 1777 | codeblock = self._encode_code(codeblock) |
||
| 1778 | pre_class_str = self._html_class_str_from_tag("pre") |
||
| 1779 | |||
| 1780 | if "highlightjs-lang" in self.extras and lexer_name: |
||
| 1781 | code_class_str = ' class="%s"' % lexer_name |
||
| 1782 | else: |
||
| 1783 | code_class_str = self._html_class_str_from_tag("code") |
||
| 1784 | |||
| 1785 | return "\n\n<pre%s><code%s>%s\n</code></pre>\n\n" % ( |
||
| 1786 | pre_class_str, code_class_str, codeblock) |
||
| 1787 | |||
| 1788 | def _html_class_str_from_tag(self, tag): |
||
| 1789 | """Get the appropriate ' class="..."' string (note the leading |
||
| 1790 | space), if any, for the given tag. |
||
| 1791 | """ |
||
| 1792 | if "html-classes" not in self.extras: |
||
| 1793 | return "" |
||
| 1794 | try: |
||
| 1795 | html_classes_from_tag = self.extras["html-classes"] |
||
| 1796 | except TypeError: |
||
| 1797 | return "" |
||
| 1798 | else: |
||
| 1799 | if tag in html_classes_from_tag: |
||
| 1800 | return ' class="%s"' % html_classes_from_tag[tag] |
||
| 1801 | return "" |
||
| 1802 | |||
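The "html-classes" extra consulted here takes a dict mapping tag names to class strings; a brief sketch:

```python
import markdown2

html = markdown2.markdown(
    "    indented code\n",
    extras={"html-classes": {"pre": "codebox", "code": "plain"}})
# -> roughly <pre class="codebox"><code class="plain">indented code</code></pre>
```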
| 1803 | def _do_code_blocks(self, text): |
||
| 1804 | """Process Markdown `<pre><code>` blocks.""" |
||
| 1805 | code_block_re = re.compile(r''' |
||
| 1806 | (?:\n\n|\A\n?) |
||
| 1807 | ( # $1 = the code block -- one or more lines, starting with a space/tab |
||
| 1808 | (?: |
||
| 1809 | (?:[ ]{%d} | \t) # Lines must start with a tab or a tab-width of spaces |
||
| 1810 | .*\n+ |
||
| 1811 | )+ |
||
| 1812 | ) |
||
| 1813 | ((?=^[ ]{0,%d}\S)|\Z) # Lookahead for non-space at line-start, or end of doc |
||
| 1814 | # Lookahead to make sure this block isn't already in a code block. |
||
| 1815 | # Needed when syntax highlighting is being used. |
||
| 1816 | (?![^<]*\</code\>) |
||
| 1817 | ''' % (self.tab_width, self.tab_width), |
||
| 1818 | re.M | re.X) |
||
| 1819 | return code_block_re.sub(self._code_block_sub, text) |
||
| 1820 | |||
| 1821 | _fenced_code_block_re = re.compile(r''' |
||
| 1822 | (?:\n+|\A\n?) |
||
| 1823 | ^```\s*?([\w+-]+)?\s*?\n # opening fence, $1 = optional lang |
||
| 1824 | (.*?) # $2 = code block content |
||
| 1825 | ^```[ \t]*\n # closing fence |
||
| 1826 | ''', re.M | re.X | re.S) |
||
| 1827 | |||
| 1828 | def _fenced_code_block_sub(self, match): |
||
| 1829 | return self._code_block_sub(match, is_fenced_code_block=True) |
||
| 1830 | |||
| 1831 | def _do_fenced_code_blocks(self, text): |
||
| 1832 | """Process ```-fenced unindented code blocks ('fenced-code-blocks' extra).""" |
||
| 1833 | return self._fenced_code_block_re.sub(self._fenced_code_block_sub, text) |
||
| 1834 | |||
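When client-side highlighting is preferred, the "highlightjs-lang" extra bypasses Pygments and simply records the fence's language as a class on the `<code>` tag; a sketch:

```python
import markdown2

text = "```python\nprint('hi')\n```\n"
print(markdown2.markdown(text, extras=["fenced-code-blocks",
                                       "highlightjs-lang"]))
# -> roughly <pre><code class="python">print('hi')</code></pre>,
#    ready for highlight.js in the browser.
```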
| 1835 | # Rules for a code span: |
||
| 1836 | # - backslash escapes are not interpreted in a code span |
||
| 1837 | # - to include one backtick or a run of backticks, the delimiters must |
||
| 1838 | # be a longer run of backticks |
||
| 1839 | # - cannot start or end a code span with a backtick; pad with a |
||
| 1840 | # space and that space will be removed in the emitted HTML |
||
| 1841 | # See `test/tm-cases/escapes.text` for a number of edge-case |
||
| 1842 | # examples. |
||
| 1843 | _code_span_re = re.compile(r''' |
||
| 1844 | (?<!\\) |
||
| 1845 | (`+) # \1 = Opening run of ` |
||
| 1846 | (?!`) # See Note A test/tm-cases/escapes.text |
||
| 1847 | (.+?) # \2 = The code block |
||
| 1848 | (?<!`) |
||
| 1849 | \1 # Matching closer |
||
| 1850 | (?!`) |
||
| 1851 | ''', re.X | re.S) |
||
| 1852 | |||
| 1853 | def _code_span_sub(self, match): |
||
| 1854 | c = match.group(2).strip(" \t") |
||
| 1855 | c = self._encode_code(c) |
||
| 1856 | return "<code>%s</code>" % c |
||
| 1857 | |||
| 1858 | def _do_code_spans(self, text): |
||
| 1859 | # * Backtick quotes are used for <code></code> spans. |
||
| 1860 | # |
||
| 1861 | # * You can use multiple backticks as the delimiters if you want to |
||
| 1862 | # include literal backticks in the code span. So, this input: |
||
| 1863 | # |
||
| 1864 | # Just type ``foo `bar` baz`` at the prompt. |
||
| 1865 | # |
||
| 1866 | # Will translate to: |
||
| 1867 | # |
||
| 1868 | # <p>Just type <code>foo `bar` baz</code> at the prompt.</p> |
||
| 1869 | # |
||
| 1870 | # There's no arbitrary limit to the number of backticks you |
||
| 1871 | # can use as delimiters. If you need three consecutive backticks |
||
| 1872 | # in your code, use four for delimiters, etc. |
||
| 1873 | # |
||
| 1874 | # * You can use spaces to get literal backticks at the edges: |
||
| 1875 | # |
||
| 1876 | # ... type `` `bar` `` ... |
||
| 1877 | # |
||
| 1878 | # Turns to: |
||
| 1879 | # |
||
| 1880 | # ... type <code>`bar`</code> ... |
||
| 1881 | return self._code_span_re.sub(self._code_span_sub, text) |
||
| 1882 | |||
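The multi-backtick delimiter rules in the comment above can be seen end to end (assuming the module-level helper):

```python
import markdown2

print(markdown2.markdown("Just type ``foo `bar` baz`` at the prompt.\n"))
# -> <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
```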
| 1883 | def _encode_code(self, text): |
||
| 1884 | """Encode/escape certain characters inside Markdown code runs. |
||
| 1885 | The point is that in code, these characters are literals, |
||
| 1886 | and lose their special Markdown meanings. |
||
| 1887 | """ |
||
| 1888 | replacements = [ |
||
| 1889 | # Encode all ampersands; HTML entities are not |
||
| 1890 | # entities within a Markdown code span. |
||
| 1891 | ('&', '&'), |
||
| 1892 | # Do the angle bracket song and dance: |
||
| 1893 | ('<', '<'), |
||
| 1894 | ('>', '>'), |
||
| 1895 | ] |
||
| 1896 | for before, after in replacements: |
||
| 1897 | text = text.replace(before, after) |
||
| 1898 | hashed = _hash_text(text) |
||
| 1899 | self._escape_table[text] = hashed |
||
| 1900 | return hashed |
||
| 1901 | |||
| 1902 | _strike_re = re.compile(r"~~(?=\S)(.+?)(?<=\S)~~", re.S) |
||
| 1903 | def _do_strike(self, text): |
||
| 1904 | text = self._strike_re.sub(r"<strike>\1</strike>", text) |
||
| 1905 | return text |
||
| 1906 | |||
| 1907 | _strong_re = re.compile(r"(\*\*|__)(?=\S)(.+?[*_]*)(?<=\S)\1", re.S) |
||
| 1908 | _em_re = re.compile(r"(\*|_)(?=\S)(.+?)(?<=\S)\1", re.S) |
||
| 1909 | _code_friendly_strong_re = re.compile(r"\*\*(?=\S)(.+?[*_]*)(?<=\S)\*\*", re.S) |
||
| 1910 | _code_friendly_em_re = re.compile(r"\*(?=\S)(.+?)(?<=\S)\*", re.S) |
||
| 1911 | def _do_italics_and_bold(self, text): |
||
| 1912 | # <strong> must go first: |
||
| 1913 | if "code-friendly" in self.extras: |
||
| 1914 | text = self._code_friendly_strong_re.sub(r"<strong>\1</strong>", text) |
||
| 1915 | text = self._code_friendly_em_re.sub(r"<em>\1</em>", text) |
||
| 1916 | else: |
||
| 1917 | text = self._strong_re.sub(r"<strong>\2</strong>", text) |
||
| 1918 | text = self._em_re.sub(r"<em>\2</em>", text) |
||
| 1919 | return text |
||
| 1920 | |||
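The "code-friendly" branch above matters mostly for prose that mentions identifiers. An illustrative comparison:

```python
import markdown2

text = "Use this_is_a_var with *emphasis*.\n"
# Default: the underscores inside "this_is_a_var" are taken as <em> markers.
print(markdown2.markdown(text))
# "code-friendly": only * and ** act as emphasis, so the identifier survives.
print(markdown2.markdown(text, extras=["code-friendly"]))
```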
| 1921 | # "smarty-pants" extra: Very liberal in interpreting a single prime as an |
||
| 1922 | # apostrophe; e.g. ignores the fact that "round", "bout", "twer", and |
||
| 1923 | # "twixt" can be written without an initial apostrophe. This is fine because |
||
| 1924 | # using scare quotes (single quotation marks) is rare. |
||
| 1925 | _apostrophe_year_re = re.compile(r"'(\d\d)(?=(\s|,|;|\.|\?|!|$))") |
||
| 1926 | _contractions = ["tis", "twas", "twer", "neath", "o", "n", |
||
| 1927 | "round", "bout", "twixt", "nuff", "fraid", "sup"] |
||
| 1928 | def _do_smart_contractions(self, text): |
||
| 1929 | text = self._apostrophe_year_re.sub(r"’\1", text) |
||
| 1930 | for c in self._contractions: |
||
| 1931 | text = text.replace("'%s" % c, "’%s" % c) |
||
| 1932 | text = text.replace("'%s" % c.capitalize(), |
||
| 1933 | "’%s" % c.capitalize()) |
||
| 1934 | return text |
||
| 1935 | |||
| 1936 | # Substitute double-quotes before single-quotes. |
||
| 1937 | _opening_single_quote_re = re.compile(r"(?<!\S)'(?=\S)") |
||
| 1938 | _opening_double_quote_re = re.compile(r'(?<!\S)"(?=\S)') |
||
| 1939 | _closing_single_quote_re = re.compile(r"(?<=\S)'") |
||
| 1940 | _closing_double_quote_re = re.compile(r'(?<=\S)"(?=(\s|,|;|\.|\?|!|$))') |
||
| 1941 | def _do_smart_punctuation(self, text): |
||
| 1942 | """Fancifies 'single quotes', "double quotes", and apostrophes. |
||
| 1943 | Converts --, ---, and ... into en dashes, em dashes, and ellipses. |
||
| 1944 | |||
| 1945 | Inspiration is: <http://daringfireball.net/projects/smartypants/> |
||
| 1946 | See "test/tm-cases/smarty_pants.text" for a full discussion of the |
||
| 1947 | support here and |
||
| 1948 | <http://code.google.com/p/python-markdown2/issues/detail?id=42> for a |
||
| 1949 | discussion of some diversion from the original SmartyPants. |
||
| 1950 | """ |
||
| 1951 | if "'" in text: # guard for perf |
||
| 1952 | text = self._do_smart_contractions(text) |
||
| 1953 | text = self._opening_single_quote_re.sub("‘", text) |
||
| 1954 | text = self._closing_single_quote_re.sub("’", text) |
||
| 1955 | |||
| 1956 | if '"' in text: # guard for perf |
||
| 1957 | text = self._opening_double_quote_re.sub("“", text) |
||
| 1958 | text = self._closing_double_quote_re.sub("”", text) |
||
| 1959 | |||
| 1960 | text = text.replace("---", "—") |
||
| 1961 | text = text.replace("--", "–") |
||
| 1962 | text = text.replace("...", "…") |
||
| 1963 | text = text.replace(" . . . ", "…") |
||
| 1964 | text = text.replace(". . .", "…") |
||
| 1965 | return text |
||
| 1966 | |||
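Putting the smarty-pants pieces together (contractions, quotes, dashes, ellipses), an illustrative round trip:

```python
import markdown2

text = "\"Round trip\" -- 'tis done... back in '99.\n"
print(markdown2.markdown(text, extras=["smarty-pants"]))
# Curly double quotes, an en dash for "--", an ellipsis for "...",
# and apostrophes for the contraction and the abbreviated year.
```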
| 1967 | _block_quote_base = r''' |
||
| 1968 | ( # Wrap whole match in \1 |
||
| 1969 | ( |
||
| 1970 | ^[ \t]*>%s[ \t]? # '>' at the start of a line |
||
| 1971 | .+\n # rest of the first line |
||
| 1972 | (.+\n)* # subsequent consecutive lines |
||
| 1973 | \n* # blanks |
||
| 1974 | )+ |
||
| 1975 | ) |
||
| 1976 | ''' |
||
| 1977 | _block_quote_re = re.compile(_block_quote_base % '', re.M | re.X) |
||
| 1978 | _block_quote_re_spoiler = re.compile(_block_quote_base % '[ \t]*?!?', re.M | re.X) |
||
| 1979 | _bq_one_level_re = re.compile('^[ \t]*>[ \t]?', re.M) |
||
| 1980 | _bq_one_level_re_spoiler = re.compile('^[ \t]*>[ \t]*?![ \t]?', re.M) |
||
| 1981 | _bq_all_lines_spoilers = re.compile(r'\A(?:^[ \t]*>[ \t]*?!.*[\n\r]*)+\Z', re.M) |
||
| 1982 | _html_pre_block_re = re.compile(r'(\s*<pre>.+?</pre>)', re.S) |
||
| 1983 | def _dedent_two_spaces_sub(self, match): |
||
| 1984 | return re.sub(r'(?m)^ ', '', match.group(1)) |
||
| 1985 | |||
| 1986 | def _block_quote_sub(self, match): |
||
| 1987 | bq = match.group(1) |
||
| 1988 | is_spoiler = 'spoiler' in self.extras and self._bq_all_lines_spoilers.match(bq) |
||
| 1989 | # trim one level of quoting |
||
| 1990 | if is_spoiler: |
||
| 1991 | bq = self._bq_one_level_re_spoiler.sub('', bq) |
||
| 1992 | else: |
||
| 1993 | bq = self._bq_one_level_re.sub('', bq) |
||
| 1994 | # trim whitespace-only lines |
||
| 1995 | bq = self._ws_only_line_re.sub('', bq) |
||
| 1996 | bq = self._run_block_gamut(bq) # recurse |
||
| 1997 | |||
| 1998 | bq = re.sub('(?m)^', ' ', bq) |
||
| 1999 | # These leading spaces screw with <pre> content, so we need to fix that: |
||
| 2000 | bq = self._html_pre_block_re.sub(self._dedent_two_spaces_sub, bq) |
||
| 2001 | |||
| 2002 | if is_spoiler: |
||
| 2003 | return '<blockquote class="spoiler">\n%s\n</blockquote>\n\n' % bq |
||
| 2004 | else: |
||
| 2005 | return '<blockquote>\n%s\n</blockquote>\n\n' % bq |
||
| 2006 | |||
| 2007 | def _do_block_quotes(self, text): |
||
| 2008 | if '>' not in text: |
||
| 2009 | return text |
||
| 2010 | if 'spoiler' in self.extras: |
||
| 2011 | return self._block_quote_re_spoiler.sub(self._block_quote_sub, text) |
||
| 2012 | else: |
||
| 2013 | return self._block_quote_re.sub(self._block_quote_sub, text) |
||
| 2014 | |||
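The spoiler handling only kicks in when *every* line of the quote uses the `>!` form (see `_bq_all_lines_spoilers`). A sketch with the "spoiler" extra:

```python
import markdown2

text = "> a normal quote\n\n>! the butler did it\n"
print(markdown2.markdown(text, extras=["spoiler"]))
# The '>!' block becomes <blockquote class="spoiler">...</blockquote>;
# the plain one stays a regular <blockquote>.
```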
| 2015 | def _form_paragraphs(self, text): |
||
| 2016 | # Strip leading and trailing lines: |
||
| 2017 | text = text.strip('\n') |
||
| 2018 | |||
| 2019 | # Wrap <p> tags. |
||
| 2020 | grafs = [] |
||
| 2021 | for i, graf in enumerate(re.split(r"\n{2,}", text)): |
||
| 2022 | if graf in self.html_blocks: |
||
| 2023 | # Unhashify HTML blocks |
||
| 2024 | grafs.append(self.html_blocks[graf]) |
||
| 2025 | else: |
||
| 2026 | cuddled_list = None |
||
| 2027 | if "cuddled-lists" in self.extras: |
||
| 2028 | # Need to put back trailing '\n' for `_list_item_re` |
||
| 2029 | # match at the end of the paragraph. |
||
| 2030 | li = self._list_item_re.search(graf + '\n') |
||
| 2031 | # Two of the same list marker in this paragraph: a likely |
||
| 2032 | # candidate for a list cuddled to preceding paragraph |
||
| 2033 | # text (issue 33). Note the `[-1]` is a quick way to |
||
| 2034 | # consider numeric bullets (e.g. "1." and "2.") to be |
||
| 2035 | # equal. |
||
| 2036 | if (li and len(li.group(2)) <= 3 and li.group("next_marker") |
||
| 2037 | and li.group("marker")[-1] == li.group("next_marker")[-1]): |
||
| 2038 | start = li.start() |
||
| 2039 | cuddled_list = self._do_lists(graf[start:]).rstrip("\n") |
||
| 2040 | assert cuddled_list.startswith("<ul>") or cuddled_list.startswith("<ol>") |
||
| 2041 | graf = graf[:start] |
||
| 2042 | |||
| 2043 | # Wrap <p> tags. |
||
| 2044 | graf = self._run_span_gamut(graf) |
||
| 2045 | grafs.append("<p>" + graf.lstrip(" \t") + "</p>") |
||
| 2046 | |||
| 2047 | if cuddled_list: |
||
| 2048 | grafs.append(cuddled_list) |
||
| 2049 | |||
| 2050 | return "\n\n".join(grafs) |
||
| 2051 | |||
| 2052 | def _add_footnotes(self, text): |
||
| 2053 | if self.footnotes: |
||
| 2054 | footer = [ |
||
| 2055 | '<div class="footnotes">', |
||
| 2056 | '<hr' + self.empty_element_suffix, |
||
| 2057 | '<ol>', |
||
| 2058 | ] |
||
| 2059 | |||
| 2060 | if not self.footnote_title: |
||
| 2061 | self.footnote_title = "Jump back to footnote %d in the text." |
||
| 2062 | if not self.footnote_return_symbol: |
||
| 2063 | self.footnote_return_symbol = "↩" |
||
| 2064 | |||
| 2065 | for i, id in enumerate(self.footnote_ids): |
||
| 2066 | if i != 0: |
||
| 2067 | footer.append('') |
||
| 2068 | footer.append('<li id="fn-%s">' % id) |
||
| 2069 | footer.append(self._run_block_gamut(self.footnotes[id])) |
||
| 2070 | try: |
||
| 2071 | backlink = ('<a href="#fnref-%s" ' + |
||
| 2072 | 'class="footnoteBackLink" ' + |
||
| 2073 | 'title="' + self.footnote_title + '">' + |
||
| 2074 | self.footnote_return_symbol + |
||
| 2075 | '</a>') % (id, i+1) |
||
| 2076 | except TypeError: |
||
| 2077 | log.debug("Footnote error. `footnote_title` " |
||
| 2078 | "must include parameter. Using defaults.") |
||
| 2079 | backlink = ('<a href="#fnref-%s" ' |
||
| 2080 | 'class="footnoteBackLink" ' |
||
| 2081 | 'title="Jump back to footnote %d in the text.">' |
||
| 2082 | '↩</a>' % (id, i+1)) |
||
| 2083 | |||
| 2084 | if footer[-1].endswith("</p>"): |
||
| 2085 | footer[-1] = footer[-1][:-len("</p>")] \ |
||
| 2086 | + ' ' + backlink + "</p>" |
||
| 2087 | else: |
||
| 2088 | footer.append("\n<p>%s</p>" % backlink) |
||
| 2089 | footer.append('</li>') |
||
| 2090 | footer.append('</ol>') |
||
| 2091 | footer.append('</div>') |
||
| 2092 | return text + '\n\n' + '\n'.join(footer) |
||
| 2093 | else: |
||
| 2094 | return text |
||
| 2095 | |||
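`footnote_title` is interpolated together with the backlink href via `% (id, i+1)`, which is why it must carry a '%d' slot. The keyword arguments in this sketch are the same ones the `Markdown` constructor accepts:

```python
import markdown2

text = "A claim.[^1]\n\n[^1]: The supporting footnote.\n"
html = markdown2.markdown(
    text,
    extras=["footnotes"],
    footnote_title="Return to footnote %d.",   # needs the %d slot
    footnote_return_symbol="&#8617;")
# Appends a <div class="footnotes"> block with numbered items and a
# backlink from each footnote to its [^1]-style reference.
```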
| 2096 | # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin: |
||
| 2097 | # http://bumppo.net/projects/amputator/ |
||
| 2098 | _ampersand_re = re.compile(r'&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)') |
||
| 2099 | _naked_lt_re = re.compile(r'<(?![a-z/?\$!])', re.I) |
||
| 2100 | _naked_gt_re = re.compile(r'''(?<![a-z0-9?!/'"-])>''', re.I) |
||
| 2101 | |||
| 2102 | def _encode_amps_and_angles(self, text): |
||
| 2103 | # Smart processing for ampersands and angle brackets that need |
||
| 2104 | # to be encoded. |
||
| 2105 | text = self._ampersand_re.sub('&', text) |
||
| 2106 | |||
| 2107 | # Encode naked <'s |
||
| 2108 | text = self._naked_lt_re.sub('<', text) |
||
| 2109 | |||
| 2110 | # Encode naked >'s |
||
| 2111 | # Note: Other markdown implementations (e.g. Markdown.pl, PHP |
||
| 2112 | # Markdown) don't do this. |
||
| 2113 | text = self._naked_gt_re.sub('>', text) |
||
| 2114 | return text |
||
| 2115 | |||
| 2116 | def _encode_backslash_escapes(self, text): |
||
| 2117 | for ch, escape in list(self._escape_table.items()): |
||
| 2118 | text = text.replace("\\"+ch, escape) |
||
| 2119 | return text |
||
| 2120 | |||
| 2121 | _auto_link_re = re.compile(r'<((https?|ftp):[^\'">\s]+)>', re.I) |
||
| 2122 | def _auto_link_sub(self, match): |
||
| 2123 | g1 = match.group(1) |
||
| 2124 | return '<a href="%s">%s</a>' % (g1, g1) |
||
| 2125 | |||
| 2126 | _auto_email_link_re = re.compile(r""" |
||
| 2127 | < |
||
| 2128 | (?:mailto:)? |
||
| 2129 | ( |
||
| 2130 | [-.\w]+ |
||
| 2131 | \@ |
||
| 2132 | [-\w]+(\.[-\w]+)*\.[a-z]+ |
||
| 2133 | ) |
||
| 2134 | > |
||
| 2135 | """, re.I | re.X | re.U) |
||
| 2136 | def _auto_email_link_sub(self, match): |
||
| 2137 | return self._encode_email_address( |
||
| 2138 | self._unescape_special_chars(match.group(1))) |
||
| 2139 | |||
| 2140 | def _do_auto_links(self, text): |
||
| 2141 | text = self._auto_link_re.sub(self._auto_link_sub, text) |
||
| 2142 | text = self._auto_email_link_re.sub(self._auto_email_link_sub, text) |
||
| 2143 | return text |
||
| 2144 | |||
| 2145 | def _encode_email_address(self, addr): |
||
| 2146 | # Input: an email address, e.g. "[email protected]" |
||
| 2147 | # |
||
| 2148 | # Output: the email address as a mailto link, with each character |
||
| 2149 | # of the address encoded as either a decimal or hex entity, in |
||
| 2150 | # the hopes of foiling most address harvesting spam bots. E.g.: |
||
| 2151 | # |
||
| 2152 | # <a href="mailto:foo@e |
||
| 2153 | # xample.com">foo |
||
| 2154 | # @example.com</a> |
||
| 2155 | # |
||
| 2156 | # Based on a filter by Matthew Wickline, posted to the BBEdit-Talk |
||
| 2157 | # mailing list: <http://tinyurl.com/yu7ue> |
||
| 2158 | chars = [_xml_encode_email_char_at_random(ch) |
||
| 2159 | for ch in "mailto:" + addr] |
||
| 2160 | # Strip the mailto: from the visible part. |
||
| 2161 | addr = '<a href="%s">%s</a>' \ |
||
| 2162 | % (''.join(chars), ''.join(chars[7:])) |
||
| 2163 | return addr |
||
| 2164 | |||
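End to end, the obfuscation applies to auto-linked `<addr>` spans; a quick sketch:

```python
import markdown2

html = markdown2.markdown("Mail me: <foo@example.com>\n")
# The href spells out "mailto:foo@example.com" character by character as
# decimal or hex entities chosen at random, so the output varies per run
# but renders identically in a browser.
```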
| 2165 | def _do_link_patterns(self, text): |
||
| 2166 | """Caveat emptor: there isn't much guarding against link |
||
| 2167 | patterns being formed inside other standard Markdown links, e.g. |
||
| 2168 | inside a [link def][like this]. |
||
| 2169 | |||
| 2170 | Dev Notes: *Could* consider prefixing regexes with a negative |
||
| 2171 | lookbehind assertion to attempt to guard against this. |
||
| 2172 | """ |
||
| 2173 | link_from_hash = {} |
||
| 2174 | for regex, repl in self.link_patterns: |
||
| 2175 | replacements = [] |
||
| 2176 | for match in regex.finditer(text): |
||
| 2177 | if hasattr(repl, "__call__"): |
||
| 2178 | href = repl(match) |
||
| 2179 | else: |
||
| 2180 | href = match.expand(repl) |
||
| 2181 | replacements.append((match.span(), href)) |
||
| 2182 | for (start, end), href in reversed(replacements): |
||
| 2183 | escaped_href = ( |
||
| 2184 | href.replace('"', '"') # b/c of attr quote |
||
| 2185 | # To avoid markdown <em> and <strong>: |
||
| 2186 | .replace('*', self._escape_table['*']) |
||
| 2187 | .replace('_', self._escape_table['_'])) |
||
| 2188 | link = '<a href="%s">%s</a>' % (escaped_href, text[start:end]) |
||
| 2189 | hash = _hash_text(link) |
||
| 2190 | link_from_hash[hash] = link |
||
| 2191 | text = text[:start] + hash + text[end:] |
||
| 2192 | for hash, link in list(link_from_hash.items()): |
||
| 2193 | text = text.replace(hash, link) |
||
| 2194 | return text |
||
| 2195 | |||
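The `link_patterns` consumed here are (compiled regex, replacement) pairs, where the replacement is either a template expanded against the match or a callable; both are supplied alongside the "link-patterns" extra. A sketch (the pattern and URL are illustrative):

```python
import re
import markdown2

link_patterns = [
    # Turn "issue 42" into a link; \1 is the captured issue number.
    (re.compile(r"issue\s+#?(\d+)", re.I),
     r"https://example.com/issues/\1"),
]
html = markdown2.markdown(
    "See issue 42 for details.\n",
    extras=["link-patterns"],
    link_patterns=link_patterns)
```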
| 2196 | def _unescape_special_chars(self, text): |
||
| 2197 | # Swap back in all the special characters we've hidden. |
||
| 2198 | for ch, hash in list(self._escape_table.items()): |
||
| 2199 | text = text.replace(hash, ch) |
||
| 2200 | return text |
||
| 2201 | |||
| 2202 | def _outdent(self, text): |
||
| 2203 | # Remove one level of line-leading tabs or spaces |
||
| 2204 | return self._outdent_re.sub('', text) |
||
| 2205 | |||