1
|
|
|
# coding: utf-8 |
2
|
|
|
import re |
3
|
|
|
from tidylib import tidy_document |
4
|
|
|
from xml.dom.minidom import parseString |
5
|
|
|
|
6
|
|
|
|
7
|
|
|
def sanitize(html):
    """
    Clean ``html`` with tidy and return its body serialized as ``<en-note>``.

    The markup is run through ``tidy_document`` (XHTML output, forced even
    when tidy reports errors), parsed with minidom, stripped of prohibited
    elements and attributes, and the ``<body>`` element is renamed to
    ``en-note`` before being serialized with ``toxml()``.
    """
    # Original author's note: with `from __future__ import unicode_literals`
    # in effect, tidy_document does not accept any other options (div merge,
    # char-encoding, and so on), hence only these two are passed.
    tidy_options = {"output-xhtml": 1, "force-output": 1}
    document, _errors = tidy_document(html, options=tidy_options)

    root = parseString(document).documentElement
    remove_prohibited_elements(root)
    remove_prohibited_attributes(root)

    # Only the first <body> is kept; it becomes the note's root element.
    note = root.getElementsByTagName("body")[0]
    note.tagName = "en-note"
    return note.toxml()
21
|
|
|
|
22
|
|
|
|
23
|
|
|
def remove_prohibited_elements(document_element):
    """
    Drop every element the Evernote DTD does not allow.

    Each prohibited tag name is handed, in order, to
    ``remove_prohibited_element`` together with ``document_element``.
    """
    banned_tags = (
        "applet", "base", "basefont", "bgsound", "blink", "button", "dir",
        "embed", "fieldset", "form", "frame", "frameset", "head", "iframe",
        "ilayer", "input", "isindex", "label", "layer", "legend", "link",
        "marquee", "menu", "meta", "noframes", "noscript", "object",
        "optgroup", "option", "param", "plaintext", "script", "select",
        "style", "textarea", "xml", "wbr",
    )
    for banned in banned_tags:
        remove_prohibited_element(banned, document_element)
36
|
|
|
|
37
|
|
|
|
38
|
|
|
def remove_prohibited_element(tag_name, document_element):
    """
    Detach every descendant of ``document_element`` named ``tag_name``.

    Each matching element is removed from its parent, taking its whole
    subtree with it. ``document_element`` is modified in place.
    """
    for doomed in document_element.getElementsByTagName(tag_name):
        doomed.parentNode.removeChild(doomed)
46
|
|
|
|
47
|
|
|
|
48
|
|
|
def filter_term(att):
    """
    Tell whether the attribute name ``att`` must be stripped.

    An attribute is prohibited when its name starts with ``on`` or
    ``data-``, or when it is one of a fixed set of names. The comparison is
    case-sensitive, so pass the name already lower-cased.

    Returns ``True`` for a prohibited name and ``False`` otherwise (the
    previous implementation fell through and returned ``None`` here).
    """
    prohibited_prefixes = ("on", "data-")
    prohibited_names = {
        "id", "class", "accesskey", "data", "dynsrc", "tabindex",
        "frame", "rules", "width", "trbidi", "imageanchor",
    }
    return att.startswith(prohibited_prefixes) or att in prohibited_names
54
|
|
|
|
55
|
|
|
|
56
|
|
|
def remove_child_prohibited_attr(element):
    """
    Apply ``remove_prohibited_attributes`` to each direct element child.

    Children whose ``nodeType`` is not 1 (the DOM element node type) are
    skipped.
    """
    element_children = [
        node for node in element.childNodes if node.nodeType == 1
    ]
    for node in element_children:
        remove_prohibited_attributes(node)
62
|
|
|
|
63
|
|
|
|
64
|
|
|
def remove_href_prohibited_attr(element):
    """
    Remove ``href`` from ``element`` unless it is an http(s) URL.

    The decision is made from this element's own ``href`` value. The
    previous implementation searched the serialized subtree
    (``element.toxml()``), so an ``href="http..."`` on any descendant, or
    any attribute whose name merely ended in ``href``, caused a prohibited
    link on ``element`` itself to be kept.

    ``element`` is modified in place; nothing is returned. Nodes that do
    not support attributes are left untouched.
    """
    allowed_prefixes = ("http://", "https://")
    try:
        has_href = element.hasAttribute("href")
    except AttributeError:
        # Non-element nodes have no attribute API; keep the original
        # best-effort behavior of silently doing nothing for them.
        return
    if not has_href:
        return
    if not element.getAttribute("href").startswith(allowed_prefixes):
        element.removeAttribute("href")
74
|
|
|
|
75
|
|
|
|
76
|
|
|
def remove_attr_prohibited(element):
    """
    Strip from ``element`` every attribute that ``filter_term`` rejects.

    Attribute names are lower-cased before being checked, but removed under
    their original spelling. ``element`` is modified in place.
    """
    # Snapshot the names first so removal never disturbs the iteration.
    attribute_names = list(element.attributes.keys())
    for name in attribute_names:
        if filter_term(name.lower()):
            element.removeAttribute(name)
82
|
|
|
|
83
|
|
|
|
84
|
|
|
def remove_prohibited_attributes(element):
    """
    Drop the attributes the Evernote DTD does not allow.

    Runs three passes on ``element``, in this order: prohibited attribute
    names, then the ``href`` check, then the same treatment on its child
    elements.
    """
    cleanup_passes = (
        remove_attr_prohibited,
        remove_href_prohibited_attr,
        remove_child_prohibited_attr,
    )
    for cleanup in cleanup_passes:
        cleanup(element)
91
|
|
|
|