Completed
Push — master ( 1119a2...0b7ace )
by Fox
01:13
created

remove_prohibited_attributes()   A

Complexity

Conditions 1

Size

Total Lines 7

Duplication

Lines 0
Ratio 0 %

Importance

Changes 3
Bugs 0 Features 0
Metric Value
cc 1
c 3
b 0
f 0
dl 0
loc 7
rs 9.4285
1
# coding: utf-8
2
import re
3
from tidylib import tidy_document
4
from xml.dom.minidom import parseString
5
6
7
def sanitize(html):
    """
        Tidy the given HTML into XHTML, strip the elements and attributes
        handled by the remove_prohibited_* helpers, and return the <body>
        subtree serialized as an <en-note> element
    """
    # Note from the original author: with
    # `from __future__ import unicode_literals`, tidy_document does not
    # want any other options at all (div merge, char-encoding and so on).
    tidy_options = {"output-xhtml": 1, "force-output": 1}
    xhtml, _errors = tidy_document(html, options=tidy_options)

    dom = parseString(xhtml)
    root = dom.documentElement
    remove_prohibited_elements(root)
    remove_prohibited_attributes(root)

    # Rename <body> so that it serializes under the "en-note" tag name.
    note = root.getElementsByTagName("body")[0]
    note.tagName = "en-note"
    return note.toxml()
21
22
23
def remove_prohibited_elements(document_element):
    """
        To fit the Evernote DTD need, drop every element below
        document_element whose tag name is on the prohibited list
    """
    # One removal pass per tag name, in the listed order.
    for forbidden_tag in (
            "applet", "base", "basefont", "bgsound", "blink", "button",
            "dir", "embed", "fieldset", "form", "frame", "frameset",
            "head", "iframe", "ilayer", "input", "isindex", "label",
            "layer", "legend", "link", "marquee", "menu", "meta",
            "noframes", "noscript", "object", "optgroup", "option",
            "param", "plaintext", "script", "select", "style",
            "textarea", "xml", 'wbr'):
        remove_prohibited_element(forbidden_tag, document_element)
36
37
38
def remove_prohibited_element(tag_name, document_element):
    """
        Detach every descendant of document_element named tag_name
        (together with its own subtree) from its parent node
    """
    for doomed in document_element.getElementsByTagName(tag_name):
        doomed.parentNode.removeChild(doomed)
46
47
48
def filter_term(att):
    """
        Tell whether the attribute name att must be dropped.

        An attribute is prohibited when its name starts with "on" or
        "data-", or when it is one of the explicitly listed names.
        The comparison is case-sensitive, so pass a lowercased name.

        Always returns a bool (True: drop it, False: keep it).
    """
    prohibited_names = (
        "id", "class", "accesskey", "data", "dynsrc", "tabindex",
        "frame", "rules", "width", "trbidi", "imageanchor")
    # startswith accepts a tuple: a single call covers both prefixes.
    return att.startswith(("on", "data-")) or att in prohibited_names
54
55
56
def remove_child_prohibited_attr(element):
    """
        Run remove_prohibited_attributes on each direct child of element
        whose nodeType is 1; other child nodes are skipped
    """
    element_node_type = 1
    for node in element.childNodes:
        if node.nodeType != element_node_type:
            continue
        remove_prohibited_attributes(node)
62
63
64
def remove_href_prohibited_attr(element):
    """
        Drop the href attribute of element unless its value starts
        with "http" (which also covers "https").

        Only the element's own href value is inspected. Looking at the
        serialized subtree instead would let a descendant carrying an
        http link keep a non-http href alive on this element.
        Elements without an href attribute are left untouched.
    """
    if not element.hasAttribute("href"):
        return
    if not element.getAttribute("href").startswith("http"):
        element.removeAttribute("href")
74
75
76
def remove_attr_prohibited(element):
    """
        Remove from element every attribute whose lowercased name is
        flagged by filter_term
    """
    # Collect the names first, then remove, so the attribute map is
    # not modified while it is being walked.
    doomed_names = []
    for name in element.attributes.keys():
        if filter_term(name.lower()):
            doomed_names.append(name)

    for name in doomed_names:
        element.removeAttribute(name)
82
83
84
def remove_prohibited_attributes(element):
    """
        To fit the Evernote DTD need, clean the attributes of element:
        first the prohibited names, then a disallowed href, and finally
        the same treatment for its child elements
    """
    cleaning_steps = (
        remove_attr_prohibited,
        remove_href_prohibited_attr,
        remove_child_prohibited_attr,
    )
    for step in cleaning_steps:
        step(element)
91