strip_illegal_objects() - Code Metrics - jonge-democraten/website - Measure and Improve Code Quality continuously with Scrutinizer

strip_illegal_objects() C
last analyzed 2018-07-01 11:02 UTC

↳ Parent: Project

Complexity

Conditions

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	9
dl	0
loc	54
rs	6.1721
c	0
b	0
f	0

How to fix Long Method

def youtube_video_whitelist(iframe_tag):
    """
    Given an HTML iframe element, pass it through the filters we impose on
    embedded YouTube video.

    The element's src attribute is rewritten in place: a leading
    www.youtube.com host (http, https or protocol-relative) is replaced by
    the privacy-friendly https://www.youtube-nocookie.com/ equivalent. A
    missing src attribute is treated as an empty string, so the element
    always ends up with a src attribute.

    Returns the same (modified) iframe element that was passed in, which
    can be left at the position of the element that was passed.
    """
    import re

    # Replace YouTube embed links with privacy-friendly alternative.
    # The pattern is anchored so only the host at the very start of the URL
    # is rewritten; a YouTube URL further on (e.g. in a query string) is
    # left untouched.
    src = iframe_tag.get("src", "")
    iframe_tag['src'] = re.sub(r"^(https?:)?//www\.youtube\.com/", "https://www.youtube-nocookie.com/", src)

    return iframe_tag

def umap_osm_whitelist(iframe_tag):
    """
    Filter hook for embedded OpenStreetMaps iframes
    (umap.openstreetmap.fr).

    No changes are currently imposed on these embeds: the iframe element
    is handed back exactly as it was received, so it can stay at the
    position of the element that was passed.
    """
    return iframe_tag

def filter_iframes(html, testing=False):
    """
    Given an HTML string, strips iframe tags that do not
    (just) contain an embedded video, OpenStreetMap or any
    other content we deem acceptable.

    An iframe is kept only if all of the following hold:
    - its src attribute matches one of the matcher regexes below;
    - it has no child content;
    - it carries no attributes other than the allowed ones.

    In order to extend this list:
    1. Write a processing function that acceptably processes an iframe
       element of a given form.
    2. Add a matcher below that contains this function, as well as a
       regex that matches the desired src attribute as narrowly as
       possible.

    The testing argument is accepted but not used by this function.

    Returns the remaining HTML string.
    """
    from bs4 import BeautifulSoup
    import re

    # Tuple of tuples (regex, function) that define allowed URL patterns and their handling
    # functions. If an src tag of an iframe matches the regex, the iframe will be passed
    # to the function for further processing. Functions should allow one argument, the
    # iframe element to process.
    # Patterns are raw strings (no invalid "\." escapes) and compiled once,
    # outside the per-iframe loop.
    matchers = (
        (re.compile(r"^(https?:)?//www\.youtube\.com/embed/[a-zA-Z0-9_-]{8,15}$"), youtube_video_whitelist),
        (re.compile(r"^(https?:)?//umap\.openstreetmap\.fr/en/map/[a-zA-Z0-9_-]*\?"), umap_osm_whitelist),
    )
    # Tuple of allowed attributes in an iframe. Names are lower case because
    # html.parser lower-cases attribute names; the former 'frameBorder' entry
    # could never match, so iframes with a frameborder attribute were stripped.
    allowed_attributes = ('height', 'src', 'width', 'frameborder')

    # Parse the input HTML into a DOM
    dom = BeautifulSoup(html, "html.parser")

    for iframe in dom.findAll("iframe"):
        src = iframe.get("src", "")
        matched = False
        # Check whether any one matcher matches
        for (expression, whitelist_function) in matchers:
            if expression.match(src):
                iframe = whitelist_function(iframe)
                matched = True
                break
        # If no matcher matched, remove the iframe
        if not matched:
            iframe.extract()
            continue
        # If iframe tag contains something, remove the iframe
        if len(iframe.contents) > 0:
            iframe.extract()
            continue
        # If iframe contains an illegal attribute, remove the iframe
        if any(attr.lower() not in allowed_attributes for attr in iframe.attrs):
            iframe.extract()

    return str(dom)

def strip_scripts_not_in_whitelist(html):
    """
    Remove every script tag from an HTML string unless it is whitelisted.

    The whitelist is read from settings.RICHTEXT_SCRIPT_TAG_WHITELIST. Each
    entry is parsed and its first script tag is serialised again, so that
    the comparison happens between identically formatted tag strings. A
    script tag in the input is kept only when its serialised form equals
    one of those whitelist strings.

    Returns the remaining HTML string.
    """
    import logging
    from bs4 import BeautifulSoup
    from mezzanine.conf import settings

    log = logging.getLogger(__name__)

    # Normalise the whitelist through the same parser used for the input
    whitelist = [
        str(BeautifulSoup(entry, "html.parser").find("script"))
        for entry in settings.RICHTEXT_SCRIPT_TAG_WHITELIST
    ]

    document = BeautifulSoup(html, "html.parser")

    for tag in document.findAll("script"):
        if str(tag) in whitelist:
            log.debug("Found whitelisted script tag. Did not strip.")
            continue
        # Not whitelisted: detach the tag from the document and log it
        tag.extract()
        log.debug("Found non-whitelisted script tag. Stripped.")
        log.debug("CONF: stripped tag is "+str(tag))

    return str(document)


def strip_illegal_objects(html):
    """
    Given an HTML string, will strip all object tags that do not embed
    a PDF that is locally stored on this server.

    An object tag is kept only if all of the following hold:
    - its data attribute starts with settings.MEDIA_URL;
    - it carries no attributes other than data, type, width and height;
    - its type attribute is exactly 'application/pdf'.

    Returns the remaining HTML string.
    """
    from bs4 import BeautifulSoup
    import re
    from mezzanine.conf import settings
    import logging
    logger = logging.getLogger(__name__)

    # Tuple of compiled regexes that define allowed URL patterns.
    # MEDIA_URL is a literal URL prefix, not a regex, so it is escaped:
    # otherwise a "." in it would match any character and widen the
    # whitelist. Compiled once, outside the per-object loop.
    # NOTE(review): an empty MEDIA_URL yields a pattern that matches every
    # data URL - confirm MEDIA_URL is always set in deployment.
    matchers = (re.compile("^{0}".format(re.escape(settings.MEDIA_URL))),)
    # Tuple of allowed attributes in an object
    allowed_attributes = ('data', 'type', 'width', 'height')

    # Parse the input HTML into a DOM
    dom = BeautifulSoup(html, "html.parser")

    for object_tag in dom.findAll("object"):
        data = object_tag.get("data", "")
        filetype = object_tag.get("type", "")
        # If no matcher matched, remove the object
        if not any(matcher.match(data) for matcher in matchers):
            object_tag.extract()
            logger.debug("Stripped object - Could not match URL pattern.")
            continue
        # If object contains an illegal attribute, remove the object
        if any(attr not in allowed_attributes for attr in object_tag.attrs):
            object_tag.extract()
            logger.debug("Stripped object - Found illegal attribute.")
            continue
        # The value of the type attribute should be 'application/pdf'
        if filetype != "application/pdf":
            object_tag.extract()
            logger.debug("Stripped object - Found illegal filetype.")

    return str(dom)


1			def youtube_video_whitelist(iframe_tag):
2			"""
3			Given an HTML iframe element, pass it through the filters we impose on
4			embedded YouTube video.
5
6			Returns the HTML iframe element as a string, which can be reinserted
7			at the position of the element that was passed.
8			"""
9			from bs4 import BeautifulSoup
10			import re
11
12			# Replace YouTube embed links with privacy-friendly alternative
13			src = iframe_tag.get("src", "")
14			iframe_tag['src'] = re.sub(r"(https?:)?//www\.youtube\.com/", "https://www.youtube-nocookie.com/", src)
15
16			return iframe_tag
17
18			def umap_osm_whitelist(iframe_tag):
19			"""
20			Given an HTML iframe element, pass it through the filters we impose on
21			embedded OpenStreetMaps (umap.openstreetmap.fr).
22
23			Returns the HTML iframe element as a string, which can be reinserted
24			at the position of the element that was passed.
25			"""
26			return iframe_tag
27
28			def filter_iframes(html, testing=False):
29			"""
30			Given an HTML string, strips iframe tags that do not
31			(just) contain an embedded video, OpenStreetMap or any
32			other content we deem acceptable.
33
34			In order to extend this list:
35			1. Write a processing function that acceptably processes an iframe
36			element of a given form.
37			2. Add a matcher below that contains this function, as well as a
38			regex that matches the desired src attribute as narrowly as
39			possible.
40
41			Returns the remaining HTML string.
42			"""
43			from bs4 import BeautifulSoup
44			import re
45
46			# Tuple of tuples (regex, function) that define allowed URL patterns and their handling
47			# functions. If an src tag of an iframe matches the regex, the iframe will be passed
48			# to the function for further processing. Functions should allow one argument, the
49			# iframe element to process.
50			matchers = (("^(https?:)?//www\.youtube\.com/embed/[a-zA-Z0-9-_]{8,15}$", youtube_video_whitelist),
51			("^(https?:)?//umap\.openstreetmap\.fr/en/map/[a-zA-Z0-9-_]*\?", umap_osm_whitelist))
52			# Tuple of allowed attributes in an iframe
53			allowed_attributes = ('height', 'src', 'width', 'frameBorder')
54
55			# Parse the input HTML into a DOM
56			dom = BeautifulSoup(html, "html.parser")
57
58			for iframe in dom.findAll("iframe"):
59			src = iframe.get("src", "")
60			matched = False
61			# Check whether any one matcher matches
62			for (expression, whitelist_function) in matchers:
63			exp = re.compile(expression)
64			if exp.match(src):
65			iframe = whitelist_function(iframe)
66			matched = True
67			break
68			# If no matcher matched, remove the iframe
69			if not matched:
70			iframe.extract()
71			continue
72			# If iframe tag contains something, remove the iframe
73			if len(iframe.contents) > 0:
74			iframe.extract()
75			continue
76			# Check for illegal iframe attributes
77			for attr in iframe.attrs:
78			# If iframe contains illegal attribute, remove the iframe
79			if attr not in allowed_attributes:
80			iframe.extract()
81			break
82
83			return str(dom)
84
85			def strip_scripts_not_in_whitelist(html):
86			"""
87			Given an HTML string, will strip all script tags that do not conform to
88			one of the whitelist patterns as defined in settings.py.
89			"""
90			from bs4 import BeautifulSoup
91			from mezzanine.conf import settings
92			import logging
93			logger = logging.getLogger(__name__)
94
95			# Parse the whitelist into a list of tags (to make sure format matches exactly)
96			allowed_tags = []
97			for allowed_tag_str in settings.RICHTEXT_SCRIPT_TAG_WHITELIST:
98			allowed_tags.append(str(BeautifulSoup(allowed_tag_str, "html.parser").find("script")))
99
100			# Parse the input HTML into a DOM
101			dom = BeautifulSoup(html, "html.parser")
102
103			# Look for all script tags and match them to the whitelist
104			for script_tag in dom.findAll("script"):
105			if str(script_tag) not in allowed_tags:
106			script_tag.extract()
107			logger.debug("Found non-whitelisted script tag. Stripped.")
108			logger.debug("CONF: stripped tag is "+str(script_tag))
109			else:
110			logger.debug("Found whitelisted script tag. Did not strip.")
111
112			return str(dom)
113
114
115			def strip_illegal_objects(html):
116			"""
117			Given an HTML string, will strip all object tags that do not embed
118			a PDF that is locally stored on this server.
119
120			Returns the remaining HTML string.
121			"""
122			from bs4 import BeautifulSoup
123			import re
124			from mezzanine.conf import settings
125			import logging
126			logger = logging.getLogger(__name__)
127
128			# Tuple of regexes that define allowed URL patterns
129			matchers = ("^{0}".format(settings.MEDIA_URL),)
130			# Tuple of allowed attributes in an object
131			allowed_attributes = ('data', 'type', 'width', 'height')
132
133			# Parse the input HTML into a DOM
134			dom = BeautifulSoup(html, "html.parser")
135
136			for object_tag in dom.findAll("object"):
137			data = object_tag.get("data", "")
138			filetype = object_tag.get("type", "")
139			matched = False
140			illegal_tag = False
141			# Check whether any one matcher matches
142			for matcher in matchers:
143			exp = re.compile(matcher)
144			if exp.match(data):
145			matched = True
146			break
147			# If no matcher matched, remove the object
148			if not matched:
149			object_tag.extract()
150			logger.debug("Stripped object - Could not match URL pattern.")
151			continue
152			# Check for illegal object attributes
153			for attr in object_tag.attrs:
154			# If object contains illegal attribute, remove the object
155			if attr not in allowed_attributes:
156			illegal_tag = True
157			break
158			if illegal_tag:
159			object_tag.extract()
160			logger.debug("Stripped object - Found illegal attribute.")
161			continue
162			# The value of the type attribute should be 'application/pdf'
163			if filetype != "application/pdf":
164			object_tag.extract()
165			logger.debug("Stripped object - Found illegal filetype.")
166			continue
167
168			return str(dom)
169

jonge-democraten / website

strip_illegal_objects() C last analyzed 2018-07-01 11:02 UTC

Complexity

Size

Duplication

Importance

How to fix Long Method

Long Method

Duplication Side-by-Side

Filter issues like

strip_illegal_objects() C
last analyzed 2018-07-01 11:02 UTC