get_encoding() - Code Metrics - Inspection of "Merge pull request #544 from wonderay/update_get_e..." - binux/pyspider - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( 80caf2...0374f2 )

by Roy

created 2016-09-20 21:51 UTC

get_encoding() B

↳ Parent: Project

Complexity

Conditions

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Importance

Changes	1
Bugs	0	Features	0

Metric	Value
cc	5
c	1
b	0
f	0
dl	0
loc	24
rs	8.1671

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<[email protected]>
#         http://binux.me
# Created on 2012-11-02 11:16:02

import cgi
import re
import six
import json
import chardet
import lxml.html
import lxml.etree
from tblib import Traceback
from pyquery import PyQuery
from requests.structures import CaseInsensitiveDict
from requests import HTTPError
from pyspider.libs import utils


class Response(object):
    """Result of a fetch, with lazily computed views of the body.

    The raw body is kept in `content`; `text`, `json`, `doc` and `etree`
    decode/parse it on first access and cache the result on the instance.
    """

    def __init__(self, status_code=None, url=None, orig_url=None, headers=None,
                 content='', cookies=None, error=None, traceback=None, save=None,
                 js_script_result=None, time=0):
        """Store the fetch result.

        `headers` and `cookies` default to a fresh empty container per
        instance (a shared default object would leak state between
        responses).
        """
        self.status_code = status_code
        self.url = url
        self.orig_url = orig_url
        self.headers = CaseInsensitiveDict() if headers is None else headers
        self.content = content
        self.cookies = {} if cookies is None else cookies
        self.error = error
        self.traceback = traceback
        self.save = save
        self.js_script_result = js_script_result
        self.time = time

    def __repr__(self):
        # %s rather than %d so that repr() never raises, even when
        # status_code is still None.
        return u'<Response [%s]>' % (self.status_code,)

    def __bool__(self):
        """Truth value of the response: same as `ok`."""
        return self.ok

    def __nonzero__(self):
        """Python 2 spelling of `__bool__`."""
        return self.ok

    @property
    def ok(self):
        """Return True when `raise_for_status()` does not raise."""
        try:
            self.raise_for_status()
        except Exception:
            # Any failure (stored error, 4xx/5xx status, ...) means "not ok";
            # KeyboardInterrupt/SystemExit are deliberately not swallowed.
            return False
        return True

    @property
    def encoding(self):
        """
        encoding of Response.content.

        If no encoding has been set, it is guessed from the headers, then
        from the content, then by chardet; 'utf-8' is the last resort.
        The guess is cached in `_encoding`.
        """
        if hasattr(self, '_encoding'):
            return self._encoding

        # content is already unicode: nothing to detect (and nothing cached)
        if isinstance(self.content, six.text_type):
            return 'unicode'

        # Try charset from content-type or content
        encoding = get_encoding(self.headers, self.content)

        # Fallback to auto-detected encoding.
        if not encoding and chardet is not None:
            encoding = chardet.detect(self.content[:600])['encoding']

        # gb18030 is used in place of gb2312 when decoding
        if encoding and encoding.lower() == 'gb2312':
            encoding = 'gb18030'

        self._encoding = encoding or 'utf-8'
        return self._encoding

    @encoding.setter
    def encoding(self, value):
        """
        set encoding of content manually
        it will overwrite the guessed encoding
        """
        self._encoding = value
        # drop the cached text so it is decoded again with the new encoding
        self._text = None

    @property
    def text(self):
        """
        Content of the response, in unicode.

        Bytes content is decoded with `self.encoding` (undecodable bytes are
        replaced); the decoded text is cached in `_text`.
        """
        if hasattr(self, '_text') and self._text:
            return self._text
        if not self.content:
            return u''
        if isinstance(self.content, six.text_type):
            return self.content

        content = None
        encoding = self.encoding

        # Decode unicode from given encoding.
        try:
            content = self.content.decode(encoding, 'replace')
        except LookupError:
            # A LookupError is raised if the encoding was not found which could
            # indicate a misspelling or similar mistake.
            #
            # So we fall back to decoding as utf-8.
            content = self.content.decode('utf-8', 'replace')

        self._text = content
        return content

    @property
    def json(self):
        """Return the body parsed as JSON, or None if it is not valid JSON."""
        if hasattr(self, '_json'):
            return self._json
        try:
            self._json = json.loads(self.text or self.content)
        except ValueError:
            self._json = None
        return self._json

    @property
    def doc(self):
        """Returns a PyQuery object of the response's content"""
        if hasattr(self, '_doc'):
            return self._doc
        elements = self.etree
        doc = self._doc = PyQuery(elements)
        doc.make_links_absolute(utils.text(self.url))
        return doc

    @property
    def etree(self):
        """Returns a lxml object of the response's content that can be selected by xpath"""
        if not hasattr(self, '_elements'):
            try:
                parser = lxml.html.HTMLParser(encoding=self.encoding)
                self._elements = lxml.html.fromstring(self.content, parser=parser)
            except LookupError:
                # lxml would raise LookupError when encoding not supported
                # try fromstring without encoding instead.
                # on windows, unicode is not available as encoding for lxml
                self._elements = lxml.html.fromstring(self.content)
        if isinstance(self._elements, lxml.etree._ElementTree):
            self._elements = self._elements.getroot()
        return self._elements

    def raise_for_status(self, allow_redirects=True):
        """Raise the stored error, or an :class:`HTTPError` for a bad status.

        A 304 never raises.  Otherwise a stored `error` is raised (with the
        stored traceback re-attached when there is one), then 4xx/5xx
        statuses raise, and 3xx statuses raise only when `allow_redirects`
        is false.  The raised HTTPError carries this object as `.response`.
        """

        if self.status_code == 304:
            return
        elif self.error:
            if self.traceback:
                # six.reraise needs an exception instance; the stored error
                # may be a plain message string, so wrap it when necessary.
                error = self.error
                if not isinstance(error, BaseException):
                    error = Exception(error)
                six.reraise(Exception, error, Traceback.from_string(self.traceback).as_traceback())
            http_error = HTTPError(self.error)
        elif (self.status_code >= 300) and (self.status_code < 400) and not allow_redirects:
            http_error = HTTPError('%s Redirection' % (self.status_code))
        elif (self.status_code >= 400) and (self.status_code < 500):
            http_error = HTTPError('%s Client Error' % (self.status_code))
        elif (self.status_code >= 500) and (self.status_code < 600):
            http_error = HTTPError('%s Server Error' % (self.status_code))
        else:
            return

        http_error.response = self
        raise http_error

    def isok(self):
        """Method form of the `ok` property."""
        return self.ok


def rebuild_response(r):
    """Build a Response from the dict-like fetch result `r`.

    Missing keys fall back to defaults: status_code 599, empty url/content,
    empty headers/cookies, time 0, and orig_url defaulting to url.
    """
    url = r.get('url', '')
    fields = dict(
        status_code=r.get('status_code', 599),
        url=url,
        orig_url=r.get('orig_url', url),
        headers=CaseInsensitiveDict(r.get('headers', {})),
        content=r.get('content', ''),
        cookies=r.get('cookies', {}),
        error=r.get('error'),
        traceback=r.get('traceback'),
        save=r.get('save'),
        js_script_result=r.get('js_script_result'),
        time=r.get('time', 0),
    )
    return Response(**fields)


def get_encoding(headers, content):
    """Return the charset declared by the headers or by the page itself.

    The `content-type` header wins when it carries a non-empty charset
    parameter.  Otherwise the content is searched, in order, for a
    `<meta ... charset=...>` tag, a `<meta ... content=...charset=...>`
    pragma, and a leading `<?xml ... encoding=...>` declaration.
    Returns None when nothing is found.
    """
    header_value = headers.get('content-type')
    if header_value:
        _, params = cgi.parse_header(header_value)
        if 'charset' in params:
            declared = params['charset'].strip("'\"")
            if declared:
                return declared

    # On Python 3 only the first 1000 units are converted and scanned;
    # on Python 2 the content is scanned as given.
    if six.PY3:
        content = utils.pretty_unicode(content[:1000])

    patterns = (
        re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I),
        re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I),
        re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]'),
    )
    for pattern in patterns:
        found = pattern.findall(content)
        if found:
            return found[0]

    return None


1			#!/usr/bin/env python
2			# -*- encoding: utf-8 -*-
3			# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
4			# Author: Binux<[email protected]>
5			# http://binux.me
6			# Created on 2012-11-02 11:16:02
7
8			import cgi
9			import re
10			import six
11			import json
12			import chardet
13			import lxml.html
14			import lxml.etree
15			from tblib import Traceback
16			from pyquery import PyQuery
17			from requests.structures import CaseInsensitiveDict
18			from requests import HTTPError
19			from pyspider.libs import utils
20
21
22			class Response(object):
23
24			def __init__(self, status_code=None, url=None, orig_url=None, headers=CaseInsensitiveDict(),
25			content='', cookies={}, error=None, traceback=None, save=None, js_script_result=None, time=0):
26			self.status_code = status_code
27			self.url = url
28			self.orig_url = orig_url
29			self.headers = headers
30			self.content = content
31			self.cookies = cookies
32			self.error = error
33			self.traceback = traceback
34			self.save = save
35			self.js_script_result = js_script_result
36			self.time = time
37
38			def __repr__(self):
39			return u'<Response [%d]>' % self.status_code
40
41			def __bool__(self):
42			"""Returns true if `status_code` is 200 and no error"""
43			return self.ok
44
45			def __nonzero__(self):
46			"""Returns true if `status_code` is 200 and no error."""
47			return self.ok
48
49			@property
50			def ok(self):
51			"""Return true if `status_code` is 200 and no error."""
52			try:
53			self.raise_for_status()
54			except:
55			return False
56			return True
57
58			@property
59			def encoding(self):
60			"""
61			encoding of Response.content.
62
63			if Response.encoding is None, encoding will be guessed
64			by header or content or chardet if available.
65			"""
66			if hasattr(self, '_encoding'):
67			return self._encoding
68
69			# content is unicode
70			if isinstance(self.content, six.text_type):
71			return 'unicode'
72
73			# Try charset from content-type or content
74			encoding = get_encoding(self.headers, self.content)
75
76			# Fallback to auto-detected encoding.
77			if not encoding and chardet is not None:
78			encoding = chardet.detect(self.content[:600])['encoding']
79
80			if encoding and encoding.lower() == 'gb2312':
81			encoding = 'gb18030'
82
83			self._encoding = encoding or 'utf-8'
84			return self._encoding
85
86			@encoding.setter
87			def encoding(self, value):
88			"""
89			set encoding of content manually
90			it will overwrite the guessed encoding
91			"""
92			self._encoding = value
93			self._text = None
94
95			@property
96			def text(self):
97			"""
98			Content of the response, in unicode.
99
100			if Response.encoding is None and chardet module is available, encoding
101			will be guessed.
102			"""
103			if hasattr(self, '_text') and self._text:
104			return self._text
105			if not self.content:
106			return u''
107			if isinstance(self.content, six.text_type):
108			return self.content
109
110			content = None
111			encoding = self.encoding
112
113			# Decode unicode from given encoding.
114			try:
115			content = self.content.decode(encoding, 'replace')
116			except LookupError:
117			# A LookupError is raised if the encoding was not found which could
118			# indicate a misspelling or similar mistake.
119			#
120			# So we try blindly encoding.
121			content = self.content.decode('utf-8', 'replace')
122
123			self._text = content
124			return content
125
126			@property
127			def json(self):
128			"""Returns the json-encoded content of the response, if any."""
129			if hasattr(self, '_json'):
130			return self._json
131			try:
132			self._json = json.loads(self.text or self.content)
133			except ValueError:
134			self._json = None
135			return self._json
136
137			@property
138			def doc(self):
139			"""Returns a PyQuery object of the response's content"""
140			if hasattr(self, '_doc'):
141			return self._doc
142			elements = self.etree
143			doc = self._doc = PyQuery(elements)
144			doc.make_links_absolute(utils.text(self.url))
145			return doc
146
147			@property
148			def etree(self):
149			"""Returns a lxml object of the response's content that can be selected by xpath"""
150			if not hasattr(self, '_elements'):
151			try:
152			parser = lxml.html.HTMLParser(encoding=self.encoding)
153			self._elements = lxml.html.fromstring(self.content, parser=parser)
154			except LookupError:
155			# lxml would raise LookupError when encoding not supported
156			# try fromstring without encoding instead.
157			# on windows, unicode is not available as encoding for lxml
158			self._elements = lxml.html.fromstring(self.content)
159			if isinstance(self._elements, lxml.etree._ElementTree):
160			self._elements = self._elements.getroot()
161			return self._elements
162
163			def raise_for_status(self, allow_redirects=True):
164			"""Raises stored :class:`HTTPError` or :class:`URLError`, if one occurred."""
165
166			if self.status_code == 304:
167			return
168			elif self.error:
169			if self.traceback:
170			six.reraise(Exception, self.error, Traceback.from_string(self.traceback).as_traceback())
171			http_error = HTTPError(self.error)
172			elif (self.status_code >= 300) and (self.status_code < 400) and not allow_redirects:
173			http_error = HTTPError('%s Redirection' % (self.status_code))
174			elif (self.status_code >= 400) and (self.status_code < 500):
175			http_error = HTTPError('%s Client Error' % (self.status_code))
176			elif (self.status_code >= 500) and (self.status_code < 600):
177			http_error = HTTPError('%s Server Error' % (self.status_code))
178			else:
179			return
180
181			http_error.response = self
182			raise http_error
183
184			def isok(self):
185			try:
186			self.raise_for_status()
187			return True
188			except:
189			return False
190
191
192			def rebuild_response(r):
193			response = Response(
194			status_code=r.get('status_code', 599),
195			url=r.get('url', ''),
196			headers=CaseInsensitiveDict(r.get('headers', {})),
197			content=r.get('content', ''),
198			cookies=r.get('cookies', {}),
199			error=r.get('error'),
200			traceback=r.get('traceback'),
201			time=r.get('time', 0),
202			orig_url=r.get('orig_url', r.get('url', '')),
203			js_script_result=r.get('js_script_result'),
204			save=r.get('save'),
205			)
206			return response
207
208
209			def get_encoding(headers, content):
210			"""Get encoding from request headers or page head."""
211			encoding = None
212
213			content_type = headers.get('content-type')
214			if content_type:
215			_, params = cgi.parse_header(content_type)
216			if 'charset' in params:
217			encoding = params['charset'].strip("'\"")
218
219			if not encoding:
220			content = utils.pretty_unicode(content[:1000]) if six.PY3 else content
221
222			charset_re = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]',
223			flags=re.I)
224			pragma_re = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]',
225			flags=re.I)
226			xml_re = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
227			encoding = (charset_re.findall(content) +
228			pragma_re.findall(content) +
229			xml_re.findall(content))
230			encoding = encoding and encoding[0] or None
231
232			return encoding
233

binux / pyspider

Push — master ( 80caf2...0374f2 )

get_encoding() B

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like