get_soup_at_url()   F
last analyzed

Complexity
Conditions: 12

Size
Total Lines: 36

Duplication
Lines: 0
Ratio: 0 %

Importance
Changes: 2
Bugs: 0   Features: 0

Metric                        Value
cc (cyclomatic complexity)    12
c (changes)                   2
b (bugs)                      0
f (features)                  0
dl (duplicated lines)         0
loc (lines of code)           36
rs                            2.7855

How to fix: Complexity

Complex functions like get_soup_at_url() often do a lot of different things. To break such a function down, we need to identify cohesive pieces of behaviour within it. A common approach is to look for statements and local variables that share the same prefixes or suffixes, or that only serve one of the optional flags.

Once you have determined which statements belong together, you can apply the Extract Method refactoring and move each block into its own well-named helper function. If several helpers end up sharing state, Extract Class is also a candidate.
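Here, each detect_* flag and the save_in_file debug dump is an independent block, so each can become its own helper. The following is a minimal sketch of that refactoring; the helper names (_print_meta, _print_rel_links, _print_if_angular, _dump_to_files) are illustrative and not part of the analysed code, and get_content is the function from the module whose full listing follows the sketch.

import time
from bs4 import BeautifulSoup


def _print_meta(soup):
    """Print meta tags hinting at known comic plugins."""
    for meta_val in ['generator', 'ComicPress', 'Comic-Easel']:
        meta = soup.find('meta', attrs={'name': meta_val})
        if meta is not None:
            print(meta)


def _print_rel_links(soup):
    """Print <a>/<link> elements with rel='next'."""
    for tag in ['a', 'link']:
        next_ = soup.find(tag, rel='next')
        if next_ is not None:
            print(next_)


def _print_if_angular(soup, url):
    """Print the url if the page looks like an Angular app."""
    html = soup.find('html')
    if html is not None and html.has_attr('ng-app'):
        print(url)


def _dump_to_files(content, soup):
    """Save raw and parsed content to timestamped files for debugging."""
    prefix = 'get_soup_at_url_' + str(time.time() * 1000) + '_'
    with open(prefix + 'raw', 'wb') as f:
        f.write(content)
    with open(prefix + 'content', 'wb') as f:
        f.write(soup.encode('utf-8'))


def get_soup_at_url(url, detect_meta=False, detect_rel=False,
                    detect_angular=False, save_in_file=False):
    """Get content at url as BeautifulSoup; the optional work is delegated."""
    time.sleep(0.4)
    content = get_content(url)  # provided by the analysed module below
    soup = BeautifulSoup(content, "html.parser")
    if detect_meta:
        _print_meta(soup)
    if detect_rel:
        _print_rel_links(soup)
    if detect_angular:
        _print_if_angular(soup, url)
    if save_in_file:
        _dump_to_files(content, soup)
    return soup

The conditional structure is unchanged; only the bodies move, which brings the number of conditions in get_soup_at_url() itself down while keeping the module's behaviour the same. The full analysed source follows.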

#! /usr/bin/python3
# vim: set expandtab tabstop=4 shiftwidth=4 :
"""Module with functions wrapping urllib"""

import http.client
import urllib.request
import urllib.parse
import urllib.error
import json
import shutil
import gzip
import inspect
import logging
import time

from bs4 import BeautifulSoup


def log(string):
    """Dirty logging function."""
    # TODO: https://docs.python.org/2/library/logging.html#logrecord-attributes
    # we do not need to retrieve the function name manually
    logging.debug(inspect.stack()[1][3] + " " + string)


def convert_iri_to_plain_ascii_uri(uri):
    """Convert IRI to plain ASCII URL.
    Based on http://stackoverflow.com/questions/4389572/how-to-fetch-a-non-ascii-url-with-python-urlopen."""
    lis = list(urllib.parse.urlsplit(uri))
    lis[2] = urllib.parse.quote(lis[2])
    url = urllib.parse.urlunsplit(lis)
    if False and url != uri:  # debug output, intentionally disabled
        print(uri, '->', url)
    return url


def urlopen_wrapper(url, referer=None):
    """Wrapper around urllib.request.urlopen (user-agent, etc).

    url is a string
    referer is an optional string
    Returns a file-like object of bytes."""
    log('(url : %s)' % url)
    user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/534.30 (KHTML, like Gecko) Ubuntu/11.04 Chromium/12.0.742.112 Chrome/12.0.742.112 Safari/534.30'
    try:
        req = urllib.request.Request(url, headers={'User-Agent': user_agent, 'Accept': '*/*'})
        if referer:
            req.add_header('Referer', referer)
        response = urllib.request.urlopen(req)
        if response.info().get('Content-Encoding') == 'gzip':
            # Transparently decompress gzip-encoded responses
            return gzip.GzipFile(fileobj=response)
        return response
    except (urllib.error.HTTPError, urllib.error.URLError) as e:
        print("Exception %s for url %s" % (e, url))
        raise


def urljoin_wrapper(base, url):
    """Wrapper around urllib.parse.urljoin.
    Construct a full ("absolute") URL by combining a "base URL" (base) with
    another URL (url)."""
    return urllib.parse.urljoin(base, url)


def get_content(url):
    """Get content at url.

    url is a string
    Returns a bytes object (possibly partial if the read is incomplete)."""
    log('(url : %s)' % url)
    try:
        return urlopen_wrapper(url).read()
    except http.client.IncompleteRead as e:
        print("%s for %s" % (e, url))
        return e.partial


def extensions_are_equivalent(ext1, ext2):
    """Return whether file extensions can be considered as equivalent."""
    synonyms = [{'jpg', 'jpeg'}]
    ext1, ext2 = ext1.lower(), ext2.lower()
    return ext1 == ext2 or any((ext1 in s and ext2 in s) for s in synonyms)


def add_extension_to_filename_if_needed(ext, filename):
    """Given an extension and a filename, add the extension to the filename
    if the filename does not already have this extension (or an extension
    considered to be equivalent)."""
    filename_ext = filename.split('.')[-1]
    if extensions_are_equivalent(ext, filename_ext):
        return filename
    else:
        return filename + '.' + ext


def get_file_at_url(url, path, referer=None):
    """Save content at url in path on file system.
    In theory, this could have been achieved with urlretrieve but it seems
    to be about to get deprecated and adding a user-agent seems to be quite
    awkward.

    url is a string
    path is a string corresponding to the file location
    referer is an optional string
    Returns the path if the file is retrieved properly, None otherwise."""
    log('(url : %s, path : %s)' % (url, path))
    try:
        with urlopen_wrapper(url, referer) as response:
            # Use the Content-Type subtype (e.g. image/png -> png) as the extension
            content_type = response.info().get('Content-Type', '').split('/')
            assert 1 <= len(content_type) <= 2
            if len(content_type) == 2:
                data = content_type[1].split(';')
                path = add_extension_to_filename_if_needed(data[0], path)
            with open(path, 'wb') as out_file:
                shutil.copyfileobj(response, out_file)
                time.sleep(0.4)  # crude throttling between requests
                return path
    except (urllib.error.HTTPError, urllib.error.URLError):
        return None


def get_filename_from_url(url):
    """Get filename from url.

    url is a string
    Returns a string corresponding to the name of the file."""
    return urllib.parse.unquote(url).split('/')[-1]


def load_json_at_url(url):
    """Get content at url as JSON and return it."""
    return json.loads(get_content(url).decode())


def get_soup_at_url(url, detect_meta=False, detect_rel=False, detect_angular=False, save_in_file=False):
    """Get content at url as BeautifulSoup.

    url is a string
    detect_meta is a hacky flag used to detect comics using a similar plugin,
        to be able to reuse code at some point
    detect_rel is a hacky flag to detect next/first comics automatically
    detect_angular is a hacky flag to detect if the page corresponds to an Angular app
    save_in_file is a hacky flag to save content in temp files for debugging
        purposes
    Returns a BeautifulSoup object."""
    time.sleep(0.4)  # crude throttling between requests
    content = get_content(url)
    soup = BeautifulSoup(content, "html.parser")
    if detect_meta:
        for meta_val in ['generator', 'ComicPress', 'Comic-Easel']:
            meta = soup.find('meta', attrs={'name': meta_val})
            if meta is not None:
                print(meta)
    if detect_rel:
        for tag in ['a', 'link']:
            next_ = soup.find(tag, rel='next')
            if next_ is not None:
                print(next_)
    if detect_angular:
        html = soup.find('html')
        if html.has_attr('ng-app'):
            print(url)
    if save_in_file:
        time_ms = time.time() * 1000
        prefix = 'get_soup_at_url_' + str(time_ms) + '_'
        with open(prefix + 'raw', 'wb') as f:
            f.write(content)
        with open(prefix + 'content', 'wb') as f:
            f.write(soup.encode('utf-8'))
    return soup
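For orientation, a typical call sequence might look like the following. The URL is a placeholder and the functions are assumed to be imported from the module listed above (its name is not shown in this report).

# Fetch a page; detect_rel=True prints any rel='next' link that is found.
page_url = 'http://example.com/comics/latest'
soup = get_soup_at_url(page_url, detect_rel=True)
for img in soup.find_all('img'):
    src = img.get('src')
    if src:
        full_url = convert_iri_to_plain_ascii_uri(urljoin_wrapper(page_url, src))
        get_file_at_url(full_url, get_filename_from_url(full_url))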