|
1
|
|
|
# coding: utf-8 |
|
2
|
|
|
import re |
|
3
|
|
|
from tidylib import tidy_document |
|
4
|
|
|
from xml.dom.minidom import parseString |
|
5
|
|
|
|
|
6
|
|
|
|
|
7
|
|
|
def sanitize(html):
    """Return *html* cleaned up and serialised as an ``<en-note>`` element.

    The markup is first normalised to XHTML by ``tidy_document``, parsed
    with minidom, stripped of prohibited elements and attributes, and
    finally the ``<body>`` element is renamed to ``en-note`` and
    serialised with ``toxml()``.

    Raises ``IndexError`` when the tidied document has no ``<body>``.
    """
    # NOTE (carried over from the original author): when the caller uses
    # ``from __future__ import unicode_literals``, tidy_document reportedly
    # rejects any further options (div merging, char-encoding, ...), so
    # only these two are passed. TODO confirm against the tidylib in use.
    tidy_options = {"output-xhtml": 1, "force-output": 1}
    tidied, _tidy_errors = tidy_document(html, options=tidy_options)

    root = parseString(tidied).documentElement
    remove_prohibited_elements(root)
    remove_prohibited_attributes(root)

    # The first <body> becomes the note container.
    note = root.getElementsByTagName("body")[0]
    note.tagName = "en-note"
    return note.toxml()
|
21
|
|
|
|
|
22
|
|
|
|
|
23
|
|
|
def remove_prohibited_elements(document_element):
    """Strip every prohibited tag from below *document_element*.

    Each tag name listed here is handed, in order, to
    ``remove_prohibited_element`` so that the tree fits the Evernote DTD.
    """
    banned_tags = (
        "applet", "base", "basefont", "bgsound", "blink", "button", "dir",
        "embed", "fieldset", "form", "frame", "frameset", "head", "iframe",
        "ilayer", "input", "isindex", "label", "layer", "legend", "link",
        "marquee", "menu", "meta", "noframes", "noscript", "object",
        "optgroup", "option", "param", "plaintext", "script", "select",
        "style", "textarea", "xml", "wbr",
    )
    for banned in banned_tags:
        remove_prohibited_element(banned, document_element)
|
36
|
|
|
|
|
37
|
|
|
|
|
38
|
|
|
def remove_prohibited_element(tag_name, document_element):
    """Detach all elements named *tag_name* found under *document_element*.

    Every match returned by ``getElementsByTagName`` is removed from its
    own parent node, which drops the element together with its subtree.
    """
    for doomed in document_element.getElementsByTagName(tag_name):
        doomed.parentNode.removeChild(doomed)
|
46
|
|
|
|
|
47
|
|
|
|
|
48
|
|
|
def filter_term(att):
    """Tell whether the attribute name *att* has to be stripped.

    An attribute is prohibited when its name starts with ``on`` (event
    handlers) or ``data-``, or when it is one of the explicitly listed
    names. The comparison is case-sensitive; ``remove_attr_prohibited``
    lower-cases the name before calling this function.

    Returns a real ``bool`` in every case (the previous implementation
    fell off the end and returned ``None`` for permitted names).
    """
    prohibited_prefixes = ("on", "data-")
    prohibited_names = (
        "id", "class", "accesskey", "data", "dynsrc", "tabindex",
        "frame", "rules", "width", "trbidi", "imageanchor",
    )
    # str.startswith accepts a tuple, so both prefixes are tested in one call.
    return att.startswith(prohibited_prefixes) or att in prohibited_names
|
54
|
|
|
|
|
55
|
|
|
|
|
56
|
|
|
def remove_child_prohibited_attr(element):
    """Apply ``remove_prohibited_attributes`` to each element child.

    Direct children of *element* whose ``nodeType`` is not 1 (the DOM
    value for element nodes) are skipped.
    """
    for node in element.childNodes:
        if node.nodeType != 1:
            continue
        remove_prohibited_attributes(node)
|
62
|
|
|
|
|
63
|
|
|
|
|
64
|
|
|
def remove_href_prohibited_attr(element):
    """Drop the ``href`` of *element* unless its value starts with ``http``.

    Only the element's own attribute value is inspected. The previous
    implementation searched the serialised XML of the whole subtree, so a
    descendant carrying ``href="http..."`` made the parent keep an
    arbitrary (for example ``javascript:``) link.

    Objects that do not expose the DOM attribute API (``None``, text
    nodes, ...) are ignored, which keeps the best-effort behaviour that
    used to be provided by a bare ``except`` without hiding real errors.
    """
    has_attribute = getattr(element, "hasAttribute", None)
    if has_attribute is None or not has_attribute("href"):
        return

    # The "http" prefix also covers "https", exactly as the old check did.
    if not element.getAttribute("href").startswith("http"):
        element.removeAttribute("href")
|
74
|
|
|
|
|
75
|
|
|
|
|
76
|
|
|
def remove_attr_prohibited(element):
    """Remove from *element* every attribute rejected by ``filter_term``.

    Attribute names are lower-cased before being tested. The names to
    drop are collected first and removed afterwards, so the attribute map
    is never modified while it is being iterated.
    """
    doomed_names = []
    for name in element.attributes.keys():
        if filter_term(name.lower()):
            doomed_names.append(name)

    for name in doomed_names:
        element.removeAttribute(name)
|
82
|
|
|
|
|
83
|
|
|
|
|
84
|
|
|
def remove_prohibited_attributes(element):
    """Clean the attributes of *element* and of its element children.

    Three passes run in this order: prohibited attribute names are
    removed, the ``href`` attribute is checked, and then the element's
    children are processed through ``remove_child_prohibited_attr``.
    """
    cleaning_passes = (
        remove_attr_prohibited,
        remove_href_prohibited_attr,
        remove_child_prohibited_attr,
    )
    for cleaning_pass in cleaning_passes:
        cleaning_pass(element)
|
91
|
|
|
|