#!/usr/bin/python3
# vim: set expandtab tabstop=4 shiftwidth=4 :
"""Module with functions wrapping urllib"""

import http.client
import urllib.request
import urllib.parse
import urllib.error
import json
import shutil
import gzip
from bs4 import BeautifulSoup
import inspect
import logging
import time


def log(string):
    """Dirty logging function."""
    # TODO: https://docs.python.org/3/library/logging.html#logrecord-attributes
    # we do not need to retrieve the function name manually
    logging.debug(inspect.stack()[1][3] + " " + string)


def convert_iri_to_plain_ascii_uri(uri):
    """Convert IRI to plain ASCII URL

    Based on http://stackoverflow.com/questions/4389572/how-to-fetch-a-non-ascii-url-with-python-urlopen."""
    lis = list(urllib.parse.urlsplit(uri))
    lis[2] = urllib.parse.quote(lis[2])
    return urllib.parse.urlunsplit(lis)
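# Usage sketch (hypothetical value, not executed on import):
#   convert_iri_to_plain_ascii_uri('http://example.com/café')
#   would return 'http://example.com/caf%C3%A9'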


def urlopen_wrapper(url, referer=None):
    """Wrapper around urllib.request.urlopen (user-agent, etc.).

    url is a string
    referer is an optional string
    Returns a file-like object (the HTTP response, wrapped in a GzipFile
    when the content is gzip-encoded)."""
    log('(url : %s)' % url)
    user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/534.30 (KHTML, like Gecko) Ubuntu/11.04 Chromium/12.0.742.112 Chrome/12.0.742.112 Safari/534.30'
    try:
        req = urllib.request.Request(url, headers={'User-Agent': user_agent, 'Accept': '*/*'})
        if referer:
            req.add_header('Referer', referer)
        response = urllib.request.urlopen(req)
        if response.info().get('Content-Encoding') == 'gzip':
            return gzip.GzipFile(fileobj=response)
        return response
    except (urllib.error.HTTPError, urllib.error.URLError) as e:
        print("Exception %s for url %s" % (e, url))
        raise
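# Usage sketch (hypothetical URL, performs a network request):
#   with urlopen_wrapper('http://example.com', referer='http://example.com') as resp:
#       data = resp.read()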


def urljoin_wrapper(base, url):
    """Wrapper around urllib.parse.urljoin.

    Construct a full ("absolute") URL by combining a "base URL" (base) with
    another URL (url)."""
    return urllib.parse.urljoin(base, url)
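# Usage sketch (hypothetical values, not executed on import):
#   urljoin_wrapper('http://example.com/comics/', 'archive.html')
#   would return 'http://example.com/comics/archive.html'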


def get_content(url):
    """Get content at url.

    url is a string
    Returns a bytes object"""
    log('(url : %s)' % url)
    try:
        return urlopen_wrapper(url).read()
    except http.client.IncompleteRead as e:
        print("%s for %s" % (e, url))
        return e.partial
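# Usage sketch (hypothetical URL, performs a network request):
#   html_bytes = get_content('http://example.com')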


def extensions_are_equivalent(ext1, ext2):
    """Return whether file extensions can be considered as equivalent."""
    synonyms = [{'jpg', 'jpeg'}]
    ext1, ext2 = ext1.lower(), ext2.lower()
    return ext1 == ext2 or any((ext1 in s and ext2 in s) for s in synonyms)
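# Usage sketch (not executed on import):
#   extensions_are_equivalent('JPG', 'jpeg')  # True
#   extensions_are_equivalent('png', 'gif')  # False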


def add_extension_to_filename_if_needed(ext, filename):
    """Given an extension and a filename, add the extension to the filename
    if the filename does not already have this extension (or an extension
    considered to be equivalent)."""
    filename_ext = filename.split('.')[-1]
    if extensions_are_equivalent(ext, filename_ext):
        return filename
    else:
        return filename + '.' + ext
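# Usage sketch (hypothetical values, not executed on import):
#   add_extension_to_filename_if_needed('jpeg', 'strip.jpg')  # 'strip.jpg'
#   add_extension_to_filename_if_needed('png', 'strip')  # 'strip.png'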


def get_file_at_url(url, path, referer=None):
    """Save content at url in path on file system.

    In theory, this could have been achieved with urlretrieve, but that
    function may get deprecated and adding a user-agent to it is quite
    awkward.

    url is a string
    path is a string corresponding to the file location
    referer is an optional string
    Returns the path if the file is retrieved properly, None otherwise."""
    log('(url : %s, path : %s)' % (url, path))
    try:
        with urlopen_wrapper(url, referer) as response:
            content_type = response.info().get('Content-Type', '').split('/')
            assert 1 <= len(content_type) <= 2
            if len(content_type) == 2:
                data = content_type[1].split(';')
                path = add_extension_to_filename_if_needed(data[0], path)
            with open(path, 'wb') as out_file:
                shutil.copyfileobj(response, out_file)
            time.sleep(0.4)
            return path
    except (urllib.error.HTTPError, urllib.error.URLError):
        return None
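# Usage sketch (hypothetical values, performs a network request):
#   saved = get_file_at_url('http://example.com/strip', '/tmp/strip')
#   if saved is not None:
#       print('saved to', saved)  # an extension may have been appended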


def get_filename_from_url(url):
    """Get filename from url

    url is a string
    Returns a string corresponding to the name of the file."""
    return urllib.parse.unquote(url).split('/')[-1]
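# Usage sketch (hypothetical value, not executed on import):
#   get_filename_from_url('http://example.com/comics/my%20strip.png')
#   would return 'my strip.png'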


def load_json_at_url(url):
    """Get content at url as JSON and return it."""
    return json.loads(get_content(url).decode())
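# Usage sketch (hypothetical URL, performs a network request):
#   data = load_json_at_url('http://example.com/api/comics.json')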


def get_soup_at_url(url, detect_meta=False, detect_rel=False, detect_angular=False, save_in_file=False):
    """Get content at url as BeautifulSoup.

    url is a string
    detect_meta is a hacky flag used to detect comics using a similar plugin
    to be able to reuse code at some point
    detect_rel is a hacky flag to detect next/first comics automatically
    detect_angular is a hacky flag to detect if page corresponds to an Angular app
    save_in_file is a hacky flag to save content in temp file for debugging
    purposes
    Returns a BeautifulSoup object."""
    time.sleep(0.4)
    content = get_content(url)
    soup = BeautifulSoup(content, "html.parser")
    if detect_meta:
        for meta_val in ['generator', 'ComicPress', 'Comic-Easel']:
            meta = soup.find('meta', attrs={'name': meta_val})
            if meta is not None:
                print(meta)
    if detect_rel:
        for tag in ['a', 'link']:
            next_ = soup.find(tag, rel='next')
            if next_ is not None:
                print(next_)
    if detect_angular:
        html = soup.find('html')
        if html.has_attr('ng-app'):
            print(url)
    if save_in_file:
        time_ms = time.time() * 1000
        prefix = 'get_soup_at_url_' + str(time_ms) + '_'
        with open(prefix + 'raw', 'wb') as f:
            f.write(content)
        with open(prefix + 'content', 'wb') as f:
            f.write(soup.encode('utf-8'))
    return soup
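# Usage sketch (hypothetical URL, performs a network request):
#   soup = get_soup_at_url('http://example.com')
#   for link in soup.find_all('a'):
#       print(link.get('href'))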
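

# Minimal smoke-test sketch for the network-free helpers; the values below
# are hypothetical and only illustrate the expected behaviour.
if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    print(convert_iri_to_plain_ascii_uri('http://example.com/café'))
    print(urljoin_wrapper('http://example.com/comics/', 'archive.html'))
    print(extensions_are_equivalent('JPG', 'jpeg'))
    print(add_extension_to_filename_if_needed('png', 'strip'))
    print(get_filename_from_url('http://example.com/my%20strip.png'))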