Completed
Push — master ( 80caf2...0374f2 )
by Roy
11s
created

get_encoding()   B

Complexity

Conditions 5

Size

Total Lines 24

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 5
c 1
b 0
f 0
dl 0
loc 24
rs 8.1671
1
#!/usr/bin/env python
2
# -*- encoding: utf-8 -*-
3
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
4
# Author: Binux<[email protected]>
5
#         http://binux.me
6
# Created on 2012-11-02 11:16:02
7
8
import cgi
9
import re
10
import six
11
import json
12
import chardet
13
import lxml.html
14
import lxml.etree
15
from tblib import Traceback
16
from pyquery import PyQuery
17
from requests.structures import CaseInsensitiveDict
18
from requests import HTTPError
19
from pyspider.libs import utils
20
21
22
class Response(object):
    """A fetched response: status, headers and body, plus lazily built views of the body.

    The decoded text, parsed JSON, PyQuery document and lxml tree are each
    computed on first access and cached on the instance.
    """

    def __init__(self, status_code=None, url=None, orig_url=None, headers=None,
                 content='', cookies=None, error=None, traceback=None, save=None, js_script_result=None, time=0):
        """
        Store the raw fields of a fetch result.

        `headers` and `cookies` default to None and are replaced by a fresh
        empty container per instance, so that responses never share (and
        accidentally mutate) one default object.
        """
        self.status_code = status_code
        self.url = url
        self.orig_url = orig_url
        self.headers = CaseInsensitiveDict() if headers is None else headers
        self.content = content
        self.cookies = {} if cookies is None else cookies
        self.error = error
        self.traceback = traceback
        self.save = save
        self.js_script_result = js_script_result
        self.time = time

    def __repr__(self):
        # %s rather than %d: status_code defaults to None, which %d rejects
        # with a TypeError. Integer status codes render exactly as before.
        return u'<Response [%s]>' % self.status_code

    def __bool__(self):
        """Returns true if `raise_for_status()` would not raise."""
        return self.ok

    def __nonzero__(self):
        """Python 2 spelling of `__bool__`."""
        return self.ok

    @property
    def ok(self):
        """Return true if `raise_for_status()` does not raise, false otherwise."""
        try:
            self.raise_for_status()
        except Exception:
            # Only ordinary errors mean "not ok"; KeyboardInterrupt and
            # SystemExit are allowed to propagate.
            return False
        return True

    @property
    def encoding(self):
        """
        encoding of Response.content.

        if Response.encoding is None, encoding will be guessed
        by header or content or chardet if available.
        """
        # A manually assigned None (or empty) encoding means "not set", so it
        # falls through to guessing instead of being returned as-is.
        known = getattr(self, '_encoding', None)
        if known:
            return known

        # content is unicode
        if isinstance(self.content, six.text_type):
            return 'unicode'

        # Try charset from content-type or content
        encoding = get_encoding(self.headers, self.content)

        # Fallback to auto-detected encoding.
        if not encoding and chardet is not None:
            encoding = chardet.detect(self.content[:600])['encoding']

        # Pages labelled gb2312 are decoded with gb18030 instead.
        if encoding and encoding.lower() == 'gb2312':
            encoding = 'gb18030'

        self._encoding = encoding or 'utf-8'
        return self._encoding

    @encoding.setter
    def encoding(self, value):
        """
        set encoding of content manually
        it will overwrite the guessed encoding
        """
        self._encoding = value
        # Drop the cached text so it is decoded again with the new encoding.
        self._text = None

    @property
    def text(self):
        """
        Content of the response, in unicode.

        if Response.encoding is None and chardet module is available, encoding
        will be guessed.
        """
        if hasattr(self, '_text') and self._text:
            return self._text
        if not self.content:
            return u''
        if isinstance(self.content, six.text_type):
            return self.content

        content = None
        encoding = self.encoding

        # Decode unicode from given encoding.
        try:
            content = self.content.decode(encoding, 'replace')
        except LookupError:
            # A LookupError is raised if the encoding was not found which could
            # indicate a misspelling or similar mistake.
            #
            # So we fall back to decoding as utf-8.
            content = self.content.decode('utf-8', 'replace')

        self._text = content
        return content

    @property
    def json(self):
        """Returns the json-decoded content of the response, or None if it is not valid JSON."""
        if hasattr(self, '_json'):
            return self._json
        try:
            self._json = json.loads(self.text or self.content)
        except ValueError:
            self._json = None
        return self._json

    @property
    def doc(self):
        """Returns a PyQuery object of the response's content, with links made absolute against `url`."""
        if hasattr(self, '_doc'):
            return self._doc
        elements = self.etree
        doc = self._doc = PyQuery(elements)
        doc.make_links_absolute(utils.text(self.url))
        return doc

    @property
    def etree(self):
        """Returns a lxml object of the response's content that can be selected by xpath"""
        if not hasattr(self, '_elements'):
            try:
                parser = lxml.html.HTMLParser(encoding=self.encoding)
                self._elements = lxml.html.fromstring(self.content, parser=parser)
            except LookupError:
                # lxml would raise LookupError when encoding not supported
                # try fromstring without encoding instead.
                # on windows, unicode is not available as encoding for lxml
                self._elements = lxml.html.fromstring(self.content)
        if isinstance(self._elements, lxml.etree._ElementTree):
            self._elements = self._elements.getroot()
        return self._elements

    def raise_for_status(self, allow_redirects=True):
        """
        Raise the stored error, or an :class:`HTTPError` for a 4xx/5xx status.

        A 304 status never raises, even when an error is stored. A 3xx status
        raises only when `allow_redirects` is false. When both `error` and
        `traceback` are stored, the error is re-raised with the traceback
        rebuilt from its string form; otherwise the error is wrapped in an
        :class:`HTTPError` carrying this response.
        """

        if self.status_code == 304:
            return
        elif self.error:
            if self.traceback:
                # six.reraise needs an exception instance on Python 3 (it
                # reads value.__traceback__), so wrap a plain error message.
                if isinstance(self.error, BaseException):
                    exc = self.error
                else:
                    exc = Exception(self.error)
                six.reraise(type(exc), exc, Traceback.from_string(self.traceback).as_traceback())
            http_error = HTTPError(self.error)
        elif (self.status_code >= 300) and (self.status_code < 400) and not allow_redirects:
            http_error = HTTPError('%s Redirection' % (self.status_code))
        elif (self.status_code >= 400) and (self.status_code < 500):
            http_error = HTTPError('%s Client Error' % (self.status_code))
        elif (self.status_code >= 500) and (self.status_code < 600):
            http_error = HTTPError('%s Server Error' % (self.status_code))
        else:
            return

        http_error.response = self
        raise http_error

    def isok(self):
        """Method form of the `ok` property."""
        return self.ok
190
191
192
def rebuild_response(r):
    """
    Build a Response from the mapping `r`.

    Missing keys fall back to defaults: `status_code` to 599, `url` and
    `content` to an empty string, `headers` and `cookies` to empty
    containers, `time` to 0 and `orig_url` to the value of `url`.
    The remaining fields default to None.
    """
    fetched_url = r.get('url', '')
    return Response(
        status_code=r.get('status_code', 599),
        url=fetched_url,
        orig_url=r.get('orig_url', fetched_url),
        headers=CaseInsensitiveDict(r.get('headers', {})),
        content=r.get('content', ''),
        cookies=r.get('cookies', {}),
        error=r.get('error'),
        traceback=r.get('traceback'),
        save=r.get('save'),
        js_script_result=r.get('js_script_result'),
        time=r.get('time', 0),
    )
207
208
209
def get_encoding(headers, content):
    """
    Get encoding from the content-type header or from the page head.

    The charset parameter of the `content-type` header wins when present and
    non-empty. Otherwise the content is searched, in order, for a
    `<meta ... charset=...>` tag, a `<meta ... content=...charset=...>` pragma
    and a leading `<?xml ... encoding=...>` declaration. Returns None when
    nothing is found.
    """
    content_type = headers.get('content-type')
    if content_type:
        params = cgi.parse_header(content_type)[1]
        if 'charset' in params:
            declared = params['charset'].strip("'\"")
            if declared:
                return declared

    # On Python 3 only the first 1000 items of content are inspected, after
    # utils.pretty_unicode (presumably a bytes-to-text conversion, confirm
    # against pyspider.libs.utils). On Python 2 the content is used unchanged.
    if six.PY3:
        content = utils.pretty_unicode(content[:1000])

    patterns = (
        re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I),
        re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I),
        re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]'),
    )
    # The first pattern with a match decides; within it, the earliest match.
    for pattern in patterns:
        found = pattern.findall(content)
        if found:
            return found[0]
    return None
233