|
1
|
|
|
def youtube_video_whitelist(iframe_tag):
    """
    Apply the filters we impose on an embedded YouTube video to an iframe.

    Any occurrence of the ``www.youtube.com`` host in the ``src`` attribute
    (with ``http:``, ``https:`` or protocol-relative prefix) is rewritten to
    the privacy-friendly ``https://www.youtube-nocookie.com/`` host.

    ``iframe_tag`` is the iframe element to process. It only needs to support
    ``.get("src", default)`` and item assignment, and it is modified in
    place. If it has no ``src`` attribute, ``src`` is set to an empty string.

    Returns the same (modified) iframe element, so that it can be reinserted
    at the position of the element that was passed.
    """
    import re

    # Replace YouTube embed links with privacy-friendly alternative
    src = iframe_tag.get("src", "")
    iframe_tag["src"] = re.sub(
        r"(https?:)?//www\.youtube\.com/",
        "https://www.youtube-nocookie.com/",
        src,
    )

    return iframe_tag
|
17
|
|
|
|
|
18
|
|
|
def umap_osm_whitelist(iframe_tag):
    """
    Apply the filters we impose on an embedded OpenStreetMap
    (umap.openstreetmap.fr) to an iframe.

    No filtering is currently applied: the element is passed through
    untouched. The function exists so that uMap embeds have their own
    processing hook alongside the other whitelist functions.

    Returns the iframe element that was passed, unchanged, so that it can be
    reinserted at the position of the element that was passed.
    """
    return iframe_tag
|
27
|
|
|
|
|
28
|
|
|
def filter_iframes(html, testing=False):
    """
    Given an HTML string, strips iframe tags that do not
    (just) contain an embedded video, OpenStreetMap or any
    other content we deem acceptable.

    An iframe is kept only if all of the following hold:
      * its src attribute matches one of the matchers below,
      * it has no child content,
      * it carries no attributes other than the allowed ones.

    In order to extend this list:
    1. Write a processing function that acceptably processes an iframe
       element of a given form.
    2. Add a matcher below that contains this function, as well as a
       regex that matches the desired src attribute as narrowly as
       possible.

    ``testing`` is not used by this function; it is kept so that existing
    callers passing it keep working.

    Returns the remaining HTML string.
    """
    from bs4 import BeautifulSoup
    import re

    # Tuple of tuples (regex, function) that define allowed URL patterns and their handling
    # functions. If an src tag of an iframe matches the regex, the iframe will be passed
    # to the function for further processing. Functions should allow one argument, the
    # iframe element to process.
    # Patterns are raw strings (so "\." is a regex escape, not an invalid string
    # escape) and are compiled once instead of once per iframe.
    matchers = (
        (re.compile(r"^(https?:)?//www\.youtube\.com/embed/[a-zA-Z0-9_-]{8,15}$"),
         youtube_video_whitelist),
        (re.compile(r"^(https?:)?//umap\.openstreetmap\.fr/en/map/[a-zA-Z0-9_-]*\?"),
         umap_osm_whitelist),
    )
    # Allowed attributes in an iframe, in lower case. html.parser lower-cases
    # attribute names, so a mixed-case entry such as 'frameBorder' could never
    # match and every iframe carrying that attribute would be stripped.
    allowed_attributes = frozenset(('height', 'src', 'width', 'frameborder'))

    # Parse the input HTML into a DOM
    dom = BeautifulSoup(html, "html.parser")

    for iframe in dom.find_all("iframe"):
        src = iframe.get("src", "")

        # Pick the processing function of the first matcher that matches
        whitelist_function = next(
            (function for expression, function in matchers if expression.match(src)),
            None,
        )
        # If no matcher matched, remove the iframe
        if whitelist_function is None:
            iframe.extract()
            continue
        iframe = whitelist_function(iframe)

        # If iframe tag contains something, remove the iframe
        if len(iframe.contents) > 0:
            iframe.extract()
            continue

        # If iframe contains an illegal attribute, remove the iframe
        if any(attr.lower() not in allowed_attributes for attr in iframe.attrs):
            iframe.extract()

    return str(dom)
|
84
|
|
|
|
|
85
|
|
|
def strip_scripts_not_in_whitelist(html):
    """
    Given an HTML string, will strip all script tags that do not conform to
    one of the whitelist patterns as defined in settings.py
    (``RICHTEXT_SCRIPT_TAG_WHITELIST``).

    A script tag is kept only if its serialised form is exactly equal to the
    serialised form of one of the whitelisted script tags.

    Returns the remaining HTML string.
    """
    from bs4 import BeautifulSoup
    from mezzanine.conf import settings
    import logging
    logger = logging.getLogger(__name__)

    # Parse the whitelist into a set of tags (to make sure format matches exactly).
    # Entries that do not contain a script tag are skipped; they could never
    # match a real script tag anyway.
    allowed_tags = set()
    for allowed_tag_str in settings.RICHTEXT_SCRIPT_TAG_WHITELIST:
        allowed_tag = BeautifulSoup(allowed_tag_str, "html.parser").find("script")
        if allowed_tag is not None:
            allowed_tags.add(str(allowed_tag))

    # Parse the input HTML into a DOM
    dom = BeautifulSoup(html, "html.parser")

    # Look for all script tags and match them to the whitelist
    for script_tag in dom.find_all("script"):
        if str(script_tag) in allowed_tags:
            logger.debug("Found whitelisted script tag. Did not strip.")
            continue
        script_tag.extract()
        logger.debug("Found non-whitelisted script tag. Stripped.")
        # Lazy %-formatting: the tag is only serialised if debug logging is on
        logger.debug("CONF: stripped tag is %s", script_tag)

    return str(dom)
|
113
|
|
|
|
|
114
|
|
|
|
|
115
|
|
|
def strip_illegal_objects(html):
    """
    Given an HTML string, will strip all object tags that do not embed
    a PDF that is locally stored on this server.

    An object tag is kept only if all of the following hold:
      * its data attribute starts with ``settings.MEDIA_URL``,
      * it carries no attributes other than data, type, width and height,
      * its type attribute is exactly 'application/pdf'.

    Returns the remaining HTML string.
    """
    from bs4 import BeautifulSoup
    import re
    from mezzanine.conf import settings
    import logging
    logger = logging.getLogger(__name__)

    # Tuple of compiled regexes that define allowed URL patterns.
    # MEDIA_URL is escaped so it is matched literally; unescaped, characters
    # such as "." in the URL would act as wildcards and widen the whitelist.
    matchers = (re.compile("^{0}".format(re.escape(settings.MEDIA_URL))),)
    # Tuple of allowed attributes in an object
    allowed_attributes = ('data', 'type', 'width', 'height')

    # Parse the input HTML into a DOM
    dom = BeautifulSoup(html, "html.parser")

    for object_tag in dom.find_all("object"):
        data = object_tag.get("data", "")
        filetype = object_tag.get("type", "")

        # If no matcher matched, remove the object
        if not any(exp.match(data) for exp in matchers):
            object_tag.extract()
            logger.debug("Stripped object - Could not match URL pattern.")
            continue

        # If object contains illegal attribute, remove the object
        if any(attr not in allowed_attributes for attr in object_tag.attrs):
            object_tag.extract()
            logger.debug("Stripped object - Found illegal attribute.")
            continue

        # The value of the type attribute should be 'application/pdf'
        if filetype != "application/pdf":
            object_tag.extract()
            logger.debug("Stripped object - Found illegal filetype.")
            continue

    return str(dom)
|
169
|
|
|
|