| 1 |  |  | #!/usr/bin/env python | 
            
                                                                                                            
                            
            
                                    
            
            
                | 2 |  |  | # -*- encoding: utf-8 -*- | 
            
                                                                                                            
                            
            
                                    
            
            
                | 3 |  |  | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 4 |  |  | # Author: Binux<[email protected]> | 
            
                                                                                                            
                            
            
                                    
            
            
                | 5 |  |  | #         http://binux.me | 
            
                                                                                                            
                            
            
                                    
            
            
                | 6 |  |  | # Created on 2012-11-02 11:16:02 | 
            
                                                                                                            
                            
            
                                    
            
            
                | 7 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 8 |  |  | import six | 
            
                                                                                                            
                            
            
                                    
            
            
                | 9 |  |  | import json | 
            
                                                                                                            
                            
            
                                    
            
            
                | 10 |  |  | import chardet | 
            
                                                                                                            
                            
            
                                    
            
            
                | 11 |  |  | import lxml.html | 
            
                                                                                                            
                            
            
                                    
            
            
                | 12 |  |  | import lxml.etree | 
            
                                                                                                            
                            
            
                                    
            
            
                | 13 |  |  | from pyquery import PyQuery | 
            
                                                                                                            
                            
            
                                    
            
            
                | 14 |  |  | from requests.structures import CaseInsensitiveDict | 
            
                                                                                                            
                            
            
                                    
            
            
                | 15 |  |  | from requests.utils import get_encoding_from_headers | 
            
                                                                                                            
                            
            
                                    
            
            
                | 16 |  |  | try: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 17 |  |  |     from requests.utils import get_encodings_from_content | 
            
                                                                                                            
                            
            
                                    
            
            
                | 18 |  |  | except ImportError: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 19 |  |  |     get_encodings_from_content = None | 
            
                                                                                                            
                            
            
                                    
            
            
                | 20 |  |  | from requests import HTTPError | 
            
                                                                                                            
                            
            
                                    
            
            
                | 21 |  |  | from pyspider.libs import utils | 
            
                                                                                                            
                            
            
                                    
            
            
                | 22 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 23 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 24 |  |  | class Response(object): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 25 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                def __init__(self):
                    """Create an empty response with every field set to its default."""
                    # Fields that start out unset.
                    self.url = None
                    self.orig_url = None
                    self.status_code = None
                    self.error = None
                    self.save = None
                    self.js_script_result = None

                    # Fields that start out as empty containers / empty payload.
                    self.content = ''
                    self.cookies = {}
                    self.headers = CaseInsensitiveDict()

                    # Numeric field, zero until something assigns it.
                    self.time = 0
            
                                                                                                            
                                                                
            
                                    
            
            
                | 37 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 38 |  |  |     def __repr__(self): | 
            
                                                                        
                            
            
                                    
            
            
                | 39 |  |  |         return u'<Response [%d]>' % self.status_code | 
            
                                                                                                            
                            
            
                                    
            
            
                | 40 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 41 |  |  |     def __bool__(self): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 42 |  |  |         """Returns true if `status_code` is 200 and no error""" | 
            
                                                                                                            
                            
            
                                    
            
            
                | 43 |  |  |         return self.ok | 
            
                                                                                                            
                            
            
                                    
            
            
                | 44 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 45 |  |  |     def __nonzero__(self): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 46 |  |  |         """Returns true if `status_code` is 200 and no error.""" | 
            
                                                                                                            
                            
            
                                    
            
            
                | 47 |  |  |         return self.ok | 
            
                                                                                                            
                            
            
                                    
            
            
                | 48 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 49 |  |  |     @property | 
            
                                                                                                            
                            
            
                                    
            
            
                | 50 |  |  |     def ok(self): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 51 |  |  |         """Return true if `status_code` is 200 and no error.""" | 
            
                                                                                                            
                            
            
                                    
            
            
                | 52 |  |  |         try: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 53 |  |  |             self.raise_for_status() | 
            
                                                                                                            
                            
            
                                    
            
            
                | 54 |  |  |         except: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 55 |  |  |             return False | 
            
                                                                                                            
                            
            
                                    
            
            
                | 56 |  |  |         return True | 
            
                                                                                                            
                            
            
                                    
            
            
                | 57 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 58 |  |  |     @property | 
            
                                                                                                            
                            
            
                                    
            
            
                | 59 |  |  |     def encoding(self): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 60 |  |  |         """ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 61 |  |  |         encoding of Response.content. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 62 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 63 |  |  |         if Response.encoding is None, encoding will be guessed | 
            
                                                                                                            
                            
            
                                    
            
            
                | 64 |  |  |         by header or content or chardet if available. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 65 |  |  |         """ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 66 |  |  |         if hasattr(self, '_encoding'): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 67 |  |  |             return self._encoding | 
            
                                                                                                            
                            
            
                                    
            
            
                | 68 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 69 |  |  |         # content is unicode | 
            
                                                                                                            
                            
            
                                    
            
            
                | 70 |  |  |         if isinstance(self.content, six.text_type): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 71 |  |  |             return 'unicode' | 
            
                                                                                                            
                            
            
                                    
            
            
                | 72 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 73 |  |  |         # Try charset from content-type | 
            
                                                                                                            
                            
            
                                    
            
            
                | 74 |  |  |         encoding = get_encoding_from_headers(self.headers) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 75 |  |  |         if encoding == 'ISO-8859-1': | 
            
                                                                                                            
                            
            
                                    
            
            
                | 76 |  |  |             encoding = None | 
            
                                                                                                            
                            
            
                                    
            
            
                | 77 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 78 |  |  |         # Try charset from content | 
            
                                                                                                            
                            
            
                                    
            
            
                | 79 |  |  |         if not encoding and get_encodings_from_content: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 80 |  |  |             if six.PY3: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 81 |  |  |                 encoding = get_encodings_from_content(utils.pretty_unicode(self.content[:100])) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 82 |  |  |             else: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 83 |  |  |                 encoding = get_encodings_from_content(self.content) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 84 |  |  |             encoding = encoding and encoding[0] or None | 
            
                                                                                                            
                            
            
                                    
            
            
                | 85 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 86 |  |  |         # Fallback to auto-detected encoding. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 87 |  |  |         if not encoding and chardet is not None: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 88 |  |  |             encoding = chardet.detect(self.content)['encoding'] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 89 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 90 |  |  |         if encoding and encoding.lower() == 'gb2312': | 
            
                                                                                                            
                            
            
                                    
            
            
                | 91 |  |  |             encoding = 'gb18030' | 
            
                                                                                                            
                            
            
                                    
            
            
                | 92 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 93 |  |  |         self._encoding = encoding or 'utf-8' | 
            
                                                                                                            
                            
            
                                    
            
            
                | 94 |  |  |         return self._encoding | 
            
                                                                                                            
                            
            
                                    
            
            
                | 95 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 96 |  |  |     @encoding.setter | 
            
                                                                                                            
                            
            
                                    
            
            
                | 97 |  |  |     def encoding(self, value): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 98 |  |  |         """ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 99 |  |  |         set encoding of content manually | 
            
                                                                                                            
                            
            
                                    
            
            
                | 100 |  |  |         it will overwrite the guessed encoding | 
            
                                                                                                            
                            
            
                                    
            
            
                | 101 |  |  |         """ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 102 |  |  |         self._encoding = value | 
            
                                                                                                            
                            
            
                                    
            
            
                | 103 |  |  |         self._text = None | 
            
                                                                                                            
                            
            
                                    
            
            
                | 104 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 105 |  |  |     @property | 
            
                                                                                                            
                            
            
                                    
            
            
                | 106 |  |  |     def text(self): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 107 |  |  |         """ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 108 |  |  |         Content of the response, in unicode. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 109 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 110 |  |  |         if Response.encoding is None and chardet module is available, encoding | 
            
                                                                                                            
                            
            
                                    
            
            
                | 111 |  |  |         will be guessed. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 112 |  |  |         """ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 113 |  |  |         if hasattr(self, '_text') and self._text: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 114 |  |  |             return self._text | 
            
                                                                                                            
                            
            
                                    
            
            
                | 115 |  |  |         if not self.content: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 116 |  |  |             return u'' | 
            
                                                                                                            
                            
            
                                    
            
            
                | 117 |  |  |         if isinstance(self.content, six.text_type): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 118 |  |  |             return self.content | 
            
                                                                                                            
                            
            
                                    
            
            
                | 119 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 120 |  |  |         content = None | 
            
                                                                                                            
                            
            
                                    
            
            
                | 121 |  |  |         encoding = self.encoding | 
            
                                                                                                            
                            
            
                                    
            
            
                | 122 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 123 |  |  |         # Decode unicode from given encoding. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 124 |  |  |         try: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 125 |  |  |             content = self.content.decode(encoding, 'replace') | 
            
                                                                                                            
                            
            
                                    
            
            
                | 126 |  |  |         except LookupError: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 127 |  |  |             # A LookupError is raised if the encoding was not found which could | 
            
                                                                                                            
                            
            
                                    
            
            
                | 128 |  |  |             # indicate a misspelling or similar mistake. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 129 |  |  |             # | 
            
                                                                                                            
                            
            
                                    
            
            
                | 130 |  |  |             # So we try blindly encoding. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 131 |  |  |             content = self.content.decode('utf-8', 'replace') | 
            
                                                                                                            
                            
            
                                    
            
            
                | 132 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 133 |  |  |         self._text = content | 
            
                                                                                                            
                            
            
                                    
            
            
                | 134 |  |  |         return content | 
            
                                                                                                            
                            
            
                                    
            
            
                | 135 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 136 |  |  |     @property | 
            
                                                                                                            
                            
            
                                    
            
            
                | 137 |  |  |     def json(self): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 138 |  |  |         """Returns the json-encoded content of the response, if any.""" | 
            
                                                                                                            
                            
            
                                    
            
            
                | 139 |  |  |         if hasattr(self, '_json'): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 140 |  |  |             return self._json | 
            
                                                                                                            
                            
            
                                    
            
            
                | 141 |  |  |         try: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 142 |  |  |             self._json = json.loads(self.text or self.content) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 143 |  |  |         except ValueError: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 144 |  |  |             self._json = None | 
            
                                                                                                            
                            
            
                                    
            
            
                | 145 |  |  |         return self._json | 
            
                                                                                                            
                            
            
                                    
            
            
                | 146 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 147 |  |  |     @property | 
            
                                                                                                            
                            
            
                                    
            
            
                | 148 |  |  |     def doc(self): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 149 |  |  |         """Returns a PyQuery object of the response's content""" | 
            
                                                                                                            
                            
            
                                    
            
            
                | 150 |  |  |         if hasattr(self, '_doc'): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 151 |  |  |             return self._doc | 
            
                                                                                                            
                            
            
                                    
            
            
                | 152 |  |  |         elements = self.etree | 
            
                                                                                                            
                            
            
                                    
            
            
                | 153 |  |  |         doc = self._doc = PyQuery(elements) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 154 |  |  |         doc.make_links_absolute(self.url) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 155 |  |  |         return doc | 
            
                                                                                                            
                            
            
                                    
            
            
                | 156 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 157 |  |  |     @property | 
            
                                                                                                            
                            
            
                                    
            
            
                | 158 |  |  |     def etree(self): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 159 |  |  |         """Returns a lxml object of the response's content that can be selected by xpath""" | 
            
                                                                                                            
                            
            
                                    
            
            
                | 160 |  |  |         if not hasattr(self, '_elements'): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 161 |  |  |             try: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 162 |  |  |                 parser = lxml.html.HTMLParser(encoding=self.encoding) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 163 |  |  |                 self._elements = lxml.html.fromstring(self.content, parser=parser) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 164 |  |  |             except LookupError: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 165 |  |  |                 # lxml would raise LookupError when encoding not supported | 
            
                                                                                                            
                            
            
                                    
            
            
                | 166 |  |  |                 # try fromstring without encoding instead. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 167 |  |  |                 # on windows, unicode is not availabe as encoding for lxml | 
            
                                                                                                            
                            
            
                                    
            
            
                | 168 |  |  |                 self._elements = lxml.html.fromstring(self.content) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 169 |  |  |         if isinstance(self._elements, lxml.etree._ElementTree): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 170 |  |  |             self._elements = self._elements.getroot() | 
            
                                                                                                            
                            
            
                                    
            
            
                | 171 |  |  |         return self._elements | 
            
                                                                                                            
                            
            
                                    
            
            
                | 172 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 173 |  |  |     def raise_for_status(self, allow_redirects=True): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 174 |  |  |         """Raises stored :class:`HTTPError` or :class:`URLError`, if one occurred.""" | 
            
                                                                                                            
                            
            
                                    
            
            
                | 175 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 176 |  |  |         if self.status_code == 304: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 177 |  |  |             return | 
            
                                                                                                            
                            
            
                                    
            
            
                | 178 |  |  |         elif self.error: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 179 |  |  |             http_error = HTTPError(self.error) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 180 |  |  |         elif (self.status_code >= 300) and (self.status_code < 400) and not allow_redirects: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 181 |  |  |             http_error = HTTPError('%s Redirection' % (self.status_code)) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 182 |  |  |         elif (self.status_code >= 400) and (self.status_code < 500): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 183 |  |  |             http_error = HTTPError('%s Client Error' % (self.status_code)) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 184 |  |  |         elif (self.status_code >= 500) and (self.status_code < 600): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 185 |  |  |             http_error = HTTPError('%s Server Error' % (self.status_code)) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 186 |  |  |         else: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 187 |  |  |             return | 
            
                                                                                                            
                            
            
                                    
            
            
                | 188 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 189 |  |  |         http_error.response = self | 
            
                                                                                                            
                            
            
                                    
            
            
                | 190 |  |  |         raise http_error | 
            
                                                                                                            
                            
            
                                    
            
            
                | 191 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 192 |  |  |     def isok(self): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 193 |  |  |         try: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 194 |  |  |             self.raise_for_status() | 
            
                                                                                                            
                            
            
                                    
            
            
                | 195 |  |  |             return True | 
            
                                                                                                            
                            
            
                                    
            
            
                | 196 |  |  |         except: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 197 |  |  |             return False | 
            
                                                                                                            
                            
            
                                    
            
            
                | 198 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 199 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 200 |  |  | def rebuild_response(r): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 201 |  |  |     response = Response() | 
            
                                                                                                            
                            
            
                                    
            
            
                | 202 |  |  |     response.status_code = r.get('status_code', 599) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 203 |  |  |     response.url = r.get('url', '') | 
            
                                                                                                            
                            
            
                                    
            
            
                | 204 |  |  |     response.headers = CaseInsensitiveDict(r.get('headers', {})) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 205 |  |  |     response.content = r.get('content', '') | 
            
                                                                                                            
                            
            
                                    
            
            
                | 206 |  |  |     response.cookies = r.get('cookies', {}) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 207 |  |  |     response.error = r.get('error') | 
            
                                                                                                            
                            
            
                                    
            
            
                | 208 |  |  |     response.time = r.get('time', 0) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 209 |  |  |     response.orig_url = r.get('orig_url', response.url) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 210 |  |  |     response.js_script_result = r.get('js_script_result') | 
            
                                                                                                            
                            
            
                                    
            
            
                | 211 |  |  |     response.save = r.get('save') | 
            
                                                                                                            
                                                                
            
                                    
            
            
                | 212 |  |  |     return response | 
            
                                                        
            
                                    
            
            
                | 213 |  |  |  |