strip_illegal_objects()   C
last analyzed

Complexity

Conditions 9

Size

Total Lines 54

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 9
dl 0
loc 54
rs 6.1721
c 0
b 0
f 0

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

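Commonly applied refactorings include Extract Method: move a coherent (often commented) part of the body into its own well-named function.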

1
def youtube_video_whitelist(iframe_tag):
    """
    Given an HTML iframe element, pass it through the filters we impose on
    embedded YouTube video.

    The element's ``src`` attribute is rewritten in place so that links to
    ``www.youtube.com`` (http, https or protocol-relative) point at the
    privacy-friendly ``https://www.youtube-nocookie.com/`` host instead.
    An element without a ``src`` attribute ends up with an empty ``src``.

    Returns the same iframe element that was passed (not a string), so it
    can stay at the position it already occupies in the document.
    """
    import re

    # Replace YouTube embed links with privacy-friendly alternative
    src = iframe_tag.get("src", "")
    iframe_tag['src'] = re.sub(
        r"(https?:)?//www\.youtube\.com/",
        "https://www.youtube-nocookie.com/",
        src,
    )

    return iframe_tag
17
18
def umap_osm_whitelist(iframe_tag):
    """
    Filter hook for iframes that embed an OpenStreetMap uMap
    (umap.openstreetmap.fr).

    No rewriting is applied to such embeds: the iframe element that was
    passed in is handed back unchanged, so it keeps its place in the
    document.
    """
    # Nothing to sanitise for uMap embeds; pass the element straight through.
    return iframe_tag
27
28
def filter_iframes(html, testing=False):
    """
    Given an HTML string, strips iframe tags that do not
    (just) contain an embedded video, OpenStreetMap or any
    other content we deem acceptable.

    An iframe is kept only if all of the following hold:
    - its ``src`` matches one of the matcher regexes below,
    - it has no child content,
    - every attribute it carries is in the allowed attribute set.

    In order to extend this list:
    1. Write a processing function that acceptably processes an iframe
       element of a given form.
    2. Add a matcher below that contains this function, as well as a
       regex that matches the desired src attribute as narrowly as
       possible.

    The ``testing`` argument is accepted for backward compatibility; this
    function does not read it.

    Returns the remaining HTML string.
    """
    from bs4 import BeautifulSoup
    import re

    # Tuple of tuples (compiled regex, function) that define allowed URL patterns and
    # their handling functions. If an src tag of an iframe matches the regex, the iframe
    # will be passed to the function for further processing. Functions should allow one
    # argument, the iframe element to process. Patterns are compiled once, up front,
    # rather than once per iframe.
    matchers = (
        (re.compile(r"^(https?:)?//www\.youtube\.com/embed/[a-zA-Z0-9-_]{8,15}$"),
         youtube_video_whitelist),
        (re.compile(r"^(https?:)?//umap\.openstreetmap\.fr/en/map/[a-zA-Z0-9-_]*\?"),
         umap_osm_whitelist),
    )
    # Allowed attributes in an iframe, in lower case. html.parser lower-cases
    # attribute names while parsing, so a mixed-case entry such as 'frameBorder'
    # could never match and would cause every iframe using it to be stripped.
    allowed_attributes = frozenset(('height', 'src', 'width', 'frameborder'))

    # Parse the input HTML into a DOM
    dom = BeautifulSoup(html, "html.parser")

    for iframe in dom.find_all("iframe"):
        src = iframe.get("src", "")
        # Check whether any one matcher matches
        for pattern, whitelist_function in matchers:
            if pattern.match(src):
                iframe = whitelist_function(iframe)
                break
        else:
            # If no matcher matched, remove the iframe
            iframe.extract()
            continue
        # If iframe tag contains something, remove the iframe
        if iframe.contents:
            iframe.extract()
            continue
        # If iframe contains an illegal attribute, remove the iframe
        if any(attr.lower() not in allowed_attributes for attr in iframe.attrs):
            iframe.extract()

    return str(dom)
84
85
def strip_scripts_not_in_whitelist(html):
    """
    Given an HTML string, will strip all script tags that do not conform to
    one of the whitelist patterns as defined in settings.py
    (``settings.RICHTEXT_SCRIPT_TAG_WHITELIST``).

    A script tag is kept only if its serialised form is exactly equal to the
    serialised form of one of the whitelisted script tags.

    Returns the remaining HTML string.
    """
    from bs4 import BeautifulSoup
    from mezzanine.conf import settings
    import logging
    logger = logging.getLogger(__name__)

    # Parse the whitelist into a set of serialised tags (to make sure the
    # format matches exactly what the parser produces for the input HTML).
    allowed_tags = set()
    for allowed_tag_str in settings.RICHTEXT_SCRIPT_TAG_WHITELIST:
        parsed_tag = BeautifulSoup(allowed_tag_str, "html.parser").find("script")
        # A whitelist entry without a <script> element yields None; skip it
        # instead of storing the meaningless string "None".
        if parsed_tag is not None:
            allowed_tags.add(str(parsed_tag))

    # Parse the input HTML into a DOM
    dom = BeautifulSoup(html, "html.parser")

    # Look for all script tags and match them to the whitelist
    for script_tag in dom.find_all("script"):
        if str(script_tag) in allowed_tags:
            logger.debug("Found whitelisted script tag. Did not strip.")
            continue
        script_tag.extract()
        logger.debug("Found non-whitelisted script tag. Stripped.")
        # Lazy %-formatting: the tag is only serialised if debug logging is on.
        logger.debug("CONF: stripped tag is %s", script_tag)

    return str(dom)
113
114
115
def _object_rejection_reason(object_tag, url_patterns, allowed_attributes):
    """
    Return the reason an object tag must be stripped, or None if it is fine.

    Checks are made in this order: the ``data`` URL must match one of
    ``url_patterns``, every attribute must be in ``allowed_attributes``, and
    the ``type`` attribute must be exactly 'application/pdf'.
    """
    data = object_tag.get("data", "")
    if not any(pattern.match(data) for pattern in url_patterns):
        return "Could not match URL pattern."
    if any(attr not in allowed_attributes for attr in object_tag.attrs):
        return "Found illegal attribute."
    # The value of the type attribute should be 'application/pdf'
    if object_tag.get("type", "") != "application/pdf":
        return "Found illegal filetype."
    return None


def strip_illegal_objects(html):
    """
    Given an HTML string, will strip all object tags that do not embed
    a PDF that is locally stored on this server.

    An object tag is kept only if its ``data`` URL starts with
    ``settings.MEDIA_URL``, it carries no attributes other than
    data/type/width/height, and its ``type`` is 'application/pdf'.
    The reason for each removal is logged at debug level.

    Returns the remaining HTML string.
    """
    from bs4 import BeautifulSoup
    import re
    from mezzanine.conf import settings
    import logging
    logger = logging.getLogger(__name__)

    # Tuple of compiled regexes that define allowed URL patterns. MEDIA_URL is
    # escaped so that it is matched literally: characters such as '.' in the
    # URL must not act as regex wildcards and widen the whitelist.
    # NOTE(review): a prefix match still accepts paths like MEDIA_URL + "../x";
    # confirm whether such URLs need to be rejected as well.
    url_patterns = (re.compile("^{0}".format(re.escape(settings.MEDIA_URL))),)
    # Allowed attributes in an object
    allowed_attributes = frozenset(('data', 'type', 'width', 'height'))

    # Parse the input HTML into a DOM
    dom = BeautifulSoup(html, "html.parser")

    for object_tag in dom.find_all("object"):
        reason = _object_rejection_reason(object_tag, url_patterns, allowed_attributes)
        if reason is not None:
            object_tag.extract()
            logger.debug("Stripped object - %s", reason)

    return str(dom)
169