| Conditions | 8 |
| Total Lines | 56 |
| Lines | 0 |
| Ratio | 0 % |
| Changes | 0 |
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments inside a method's body, this is usually a good sign that the commented part should be extracted into a new method; the comment itself is a good starting point when coming up with a name for that new method.
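Commonly applied refactorings include, for example, Extract Method (moving a coherent fragment of the body into its own well-named method).
If many parameters/temporary variables are present, refactorings such as Replace Temp with Query, Introduce Parameter Object, or Replace Method with Method Object can make the extraction easier.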
| 1 | def youtube_video_whitelist(iframe_tag): |
||
| 28 | def filter_iframes(html, testing=False): |
||
| 29 | """ |
||
| 30 | Given an HTML string, strips iframe tags that do not |
||
| 31 | (just) contain an embedded video, OpenStreetMap or any |
||
| 32 | other content we deem acceptable. |
||
| 33 | |||
| 34 | In order to extend this list: |
||
| 35 | 1. Write a processing function that acceptably processes an iframe |
||
| 36 | element of a given form. |
||
| 37 | 2. Add a matcher below that contains this function, as well as a |
||
| 38 | regex that matches the desired src attribute as narrowly as |
||
| 39 | possible. |
||
| 40 | |||
| 41 | Returns the remaining HTML string. |
||
| 42 | """ |
||
| 43 | from bs4 import BeautifulSoup |
||
| 44 | import re |
||
| 45 | |||
| 46 | # Tuple of tuples (regex, function) that define allowed URL patterns and their handling |
||
| 47 | # functions. If an src tag of an iframe matches the regex, the iframe will be passed |
||
| 48 | # to the function for further processing. Functions should allow one argument, the |
||
| 49 | # iframe element to process. |
||
| 50 | matchers = (("^(https?:)?//www\.youtube\.com/embed/[a-zA-Z0-9-_]{8,15}$", youtube_video_whitelist), |
||
| 51 | ("^(https?:)?//umap\.openstreetmap\.fr/en/map/[a-zA-Z0-9-_]*\?", umap_osm_whitelist)) |
||
| 52 | # Tuple of allowed attributes in an iframe |
||
| 53 | allowed_attributes = ('height', 'src', 'width', 'frameBorder') |
||
| 54 | |||
| 55 | # Parse the input HTML into a DOM |
||
| 56 | dom = BeautifulSoup(html, "html.parser") |
||
| 57 | |||
| 58 | for iframe in dom.findAll("iframe"): |
||
| 59 | src = iframe.get("src", "") |
||
| 60 | matched = False |
||
| 61 | # Check whether any one matcher matches |
||
| 62 | for (expression, whitelist_function) in matchers: |
||
| 63 | exp = re.compile(expression) |
||
| 64 | if exp.match(src): |
||
| 65 | iframe = whitelist_function(iframe) |
||
| 66 | matched = True |
||
| 67 | break |
||
| 68 | # If no matcher matched, remove the iframe |
||
| 69 | if not matched: |
||
| 70 | iframe.extract() |
||
| 71 | continue |
||
| 72 | # If iframe tag contains something, remove the iframe |
||
| 73 | if len(iframe.contents) > 0: |
||
| 74 | iframe.extract() |
||
| 75 | continue |
||
| 76 | # Check for illegal iframe attributes |
||
| 77 | for attr in iframe.attrs: |
||
| 78 | # If iframe contains illegal attribute, remove the iframe |
||
| 79 | if attr not in allowed_attributes: |
||
| 80 | iframe.extract() |
||
| 81 | break |
||
| 82 | |||
| 83 | return str(dom) |
||
| 84 | |||
| 169 |