Completed
Push — master ( 1aa254...d5cc3f )
by Roy
01:59
created

pyspider.libs.Response.doc()   A

Complexity

Conditions 2

Size

Total Lines 9

Duplication

Lines 0
Ratio 0 %
Metric Value
cc 2
dl 0
loc 9
rs 9.6666
1
#!/usr/bin/env python
2
# -*- encoding: utf-8 -*-
3
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
4
# Author: Binux<[email protected]>
5
#         http://binux.me
6
# Created on 2012-11-02 11:16:02
7
8
import six
9
import json
10
import chardet
11
import lxml.html
12
import lxml.etree
13
from pyquery import PyQuery
14
from requests.structures import CaseInsensitiveDict
15
from requests.utils import get_encoding_from_headers
16
try:
17
    from requests.utils import get_encodings_from_content
18
except ImportError:
19
    get_encodings_from_content = None
20
from requests import HTTPError
21
from pyspider.libs import utils
22
23
24
class Response(object):
    """Container for a fetch result with lazily computed views of the body.

    Plain attributes (``status_code``, ``url``, ``orig_url``, ``headers``,
    ``content``, ``cookies``, ``error``, ``save``, ``js_script_result``,
    ``time``) hold the raw result.  The properties ``encoding``, ``text``,
    ``json``, ``doc`` and ``etree`` are derived from ``content`` on first
    access and cached on the instance.
    """

    def __init__(self):
        self.status_code = None
        self.url = None
        self.orig_url = None
        self.headers = CaseInsensitiveDict()
        self.content = ''
        self.cookies = {}
        self.error = None
        self.save = None
        self.js_script_result = None
        self.time = 0

    def __repr__(self):
        # %s instead of %d: status_code is None until a result is assigned,
        # and %d would raise TypeError for None.
        return u'<Response [%s]>' % self.status_code

    def __bool__(self):
        """Return True when `raise_for_status` would not raise (see `ok`)."""
        return self.ok

    # Python 2 looks up __nonzero__ for truth testing.
    __nonzero__ = __bool__

    @property
    def ok(self):
        """Return True when `raise_for_status` completes without raising."""
        try:
            self.raise_for_status()
        except Exception:
            # Any failure (stored error, 4xx/5xx, unusable status_code)
            # means "not ok"; KeyboardInterrupt/SystemExit still propagate.
            return False
        return True

    @property
    def encoding(self):
        """
        Encoding of Response.content.

        If no encoding has been set or guessed yet, it is guessed from the
        headers, then from the content itself, then by chardet, and finally
        falls back to 'utf-8'.  The result is cached in ``_encoding``.
        Returns the marker string 'unicode' when content is already text.
        """
        if hasattr(self, '_encoding'):
            return self._encoding

        # content is unicode
        if isinstance(self.content, six.text_type):
            return 'unicode'

        # Try charset from content-type
        encoding = get_encoding_from_headers(self.headers)
        # NOTE(review): ISO-8859-1 is discarded, presumably because it is
        # what requests reports for text/* without an explicit charset.
        if encoding == 'ISO-8859-1':
            encoding = None

        # Try charset from content
        if not encoding and get_encodings_from_content:
            if six.PY3:
                # Only the first 100 bytes are inspected on Python 3.
                head = utils.pretty_unicode(self.content[:100])
                found = get_encodings_from_content(head)
            else:
                found = get_encodings_from_content(self.content)
            encoding = found[0] if found else None

        # Fallback to auto-detected encoding.
        if not encoding and chardet is not None:
            encoding = chardet.detect(self.content)['encoding']

        # gb18030 is a superset of gb2312, so decode with the wider codec.
        if encoding and encoding.lower() == 'gb2312':
            encoding = 'gb18030'

        self._encoding = encoding or 'utf-8'
        return self._encoding

    @encoding.setter
    def encoding(self, value):
        """
        Set the encoding of content manually.

        This overrides the guessed encoding and drops the cached ``text``.
        Already-built ``etree``/``doc``/``json`` caches are left untouched.
        """
        self._encoding = value
        self._text = None

    @property
    def text(self):
        """
        Content of the response, in unicode.

        Decoded with `encoding` (undecodable bytes are replaced); if that
        codec name is unknown, utf-8 is used instead.  A non-empty result
        is cached in ``_text``.
        """
        cached = getattr(self, '_text', None)
        if cached:
            return cached
        if not self.content:
            return u''
        if isinstance(self.content, six.text_type):
            return self.content

        encoding = self.encoding

        # Decode unicode from given encoding.
        try:
            content = self.content.decode(encoding, 'replace')
        except LookupError:
            # A LookupError is raised if the encoding was not found, which
            # could indicate a misspelling or similar mistake.
            #
            # So we blindly try utf-8.
            content = self.content.decode('utf-8', 'replace')

        self._text = content
        return content

    @property
    def json(self):
        """Return the parsed JSON body, or None if it is not valid JSON (cached)."""
        if hasattr(self, '_json'):
            return self._json
        try:
            self._json = json.loads(self.text or self.content)
        except ValueError:
            self._json = None
        return self._json

    @property
    def doc(self):
        """Return a PyQuery object of the response's content (cached)."""
        if hasattr(self, '_doc'):
            return self._doc
        elements = self.etree
        doc = self._doc = PyQuery(elements)
        # Resolve relative links against the response URL.
        doc.make_links_absolute(self.url)
        return doc

    @property
    def etree(self):
        """Return an lxml element of the response's content, selectable by xpath (cached)."""
        if not hasattr(self, '_elements'):
            try:
                parser = lxml.html.HTMLParser(encoding=self.encoding)
                self._elements = lxml.html.fromstring(self.content, parser=parser)
            except LookupError:
                # lxml raises LookupError when the encoding is not supported;
                # try fromstring without an explicit encoding instead.
                # On Windows, 'unicode' is not available as an encoding for lxml.
                self._elements = lxml.html.fromstring(self.content)
        if isinstance(self._elements, lxml.etree._ElementTree):
            self._elements = self._elements.getroot()
        return self._elements

    def raise_for_status(self, allow_redirects=True):
        """Raise :class:`HTTPError` if the response carries an error.

        A 304 status never raises.  Otherwise a stored ``error`` raises
        first; then 3xx raises only when `allow_redirects` is false, and
        4xx/5xx always raise.  The raised exception has ``response`` set to
        this object.
        """
        if self.status_code == 304:
            return
        elif self.error:
            http_error = HTTPError(self.error)
        elif 300 <= self.status_code < 400 and not allow_redirects:
            http_error = HTTPError('%s Redirection' % (self.status_code))
        elif 400 <= self.status_code < 500:
            http_error = HTTPError('%s Client Error' % (self.status_code))
        elif 500 <= self.status_code < 600:
            http_error = HTTPError('%s Server Error' % (self.status_code))
        else:
            return

        http_error.response = self
        raise http_error

    def isok(self):
        """Method form of the `ok` property."""
        return self.ok
198
199
200
def rebuild_response(r):
    """Build a :class:`Response` from the mapping `r`.

    Each attribute is read from the key of the same name, with a fallback
    when the key is missing: status_code 599, url/content '', cookies {},
    time 0, and None for error, js_script_result and save.  ``headers`` is
    wrapped in a CaseInsensitiveDict and ``orig_url`` falls back to the
    rebuilt ``url``.
    """
    resp = Response()
    # (attribute/key name, fallback when the key is absent)
    plain_fields = (
        ('status_code', 599),
        ('url', ''),
        ('content', ''),
        ('cookies', {}),
        ('error', None),
        ('time', 0),
        ('js_script_result', None),
        ('save', None),
    )
    for field, fallback in plain_fields:
        setattr(resp, field, r.get(field, fallback))
    resp.headers = CaseInsensitiveDict(r.get('headers', {}))
    # Must come after url is assigned: url is the fallback for orig_url.
    resp.orig_url = r.get('orig_url', resp.url)
    return resp
213