@@ -69,13 +69,13 @@ discard block |
||
| 69 | 69 | $this->configurations = $configurations; |
| 70 | 70 | |
| 71 | 71 | // instancia o client http |
| 72 | - $this->client = new ClientHttp(); |
|
| 72 | + $this->client = new ClientHttp(); |
|
| 73 | 73 | |
| 74 | - // Executa um request para URL do serviço, retornando o cookie da requisição primária |
|
| 75 | - $this->instanceResponse = $this->client->request('GET', $this->configurations['home']); |
|
| 74 | + // Executa um request para URL do serviço, retornando o cookie da requisição primária |
|
| 75 | + $this->instanceResponse = $this->client->request('GET', $this->configurations['home']); |
|
| 76 | 76 | |
| 77 | - // Captura o cookie da requisição, será usuado posteriormente |
|
| 78 | - $this->cookie = $this->client->cookie(); |
|
| 77 | + // Captura o cookie da requisição, será usuado posteriormente |
|
| 78 | + $this->cookie = $this->client->cookie(); |
|
| 79 | 79 | |
| 80 | 80 | return $this; |
| 81 | 81 | } |
@@ -109,68 +109,68 @@ discard block |
||
| 109 | 109 | throw new ImageNotFound("Impossible to crawler image from response", 1); |
| 110 | 110 | } |
| 111 | 111 | |
| 112 | - $paramBot = $this->instanceResponse->filter( |
|
| 113 | - array_get($this->configurations, 'selectors.paramBot') |
|
| 114 | - ); |
|
| 112 | + $paramBot = $this->instanceResponse->filter( |
|
| 113 | + array_get($this->configurations, 'selectors.paramBot') |
|
| 114 | + ); |
|
| 115 | 115 | |
| 116 | - if(!$paramBot->count()){ |
|
| 116 | + if(!$paramBot->count()){ |
|
| 117 | 117 | throw new ImageNotFound("Impossible to crawler parambot from response", 1); |
| 118 | 118 | } |
| 119 | 119 | |
| 120 | 120 | // Inicia instancia do cURL |
| 121 | - $curl = new Curl; |
|
| 122 | - |
|
| 123 | - // Inicia uma requisição para capturar a imagem do captcha |
|
| 124 | - // informando cookie da requisição passada e os headers |
|
| 125 | - // |
|
| 126 | - // to-do: implementar guzzlehttp? |
|
| 127 | - // ele é melhor que o curl? ou mais organizado? |
|
| 128 | - $curl->init($this->configurations['base'] . $imageSrc->attr('src')); |
|
| 129 | - |
|
| 130 | - $this->params['parambot'] = trim($paramBot->attr('value')); |
|
| 131 | - |
|
| 132 | - // headers da requisição |
|
| 133 | - $curl->options([ |
|
| 134 | - CURLOPT_COOKIEJAR => 'cookiejar', |
|
| 135 | - CURLOPT_HTTPHEADER => array( |
|
| 136 | - "Pragma: no-cache", |
|
| 137 | - "Origin: " . $this->configurations['base'], |
|
| 138 | - "Host: ". array_get($this->configurations, 'headers.Host'), |
|
| 139 | - "User-Agent: Mozilla/5.0 (Windows NT 6.1; rv:32.0) Gecko/20100101 Firefox/32.0", |
|
| 140 | - "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", |
|
| 141 | - "Accept-Language: pt-BR,pt;q=0.8,en-US;q=0.5,en;q=0.3", |
|
| 142 | - "Accept-Encoding: gzip, deflate", |
|
| 143 | - "Referer: " . $this->configurations['captcha'], |
|
| 144 | - "Cookie: flag=1; ". $this->cookie, |
|
| 145 | - "Connection: keep-alive" |
|
| 146 | - ), |
|
| 147 | - CURLOPT_RETURNTRANSFER => true, |
|
| 148 | - CURLOPT_FOLLOWLOCATION => 1, |
|
| 149 | - CURLOPT_BINARYTRANSFER => TRUE, |
|
| 150 | - CURLOPT_CONNECTTIMEOUT => 10, |
|
| 151 | - CURLOPT_TIMEOUT => 10, |
|
| 152 | - ]); |
|
| 153 | - |
|
| 154 | - // executa o curl, logo após fechando a conexão |
|
| 155 | - $curl->exec(); |
|
| 156 | - $curl->close(); |
|
| 157 | - |
|
| 158 | - // captura do retorno do curl |
|
| 159 | - // o esperado deverá ser o HTML da imagem |
|
| 160 | - $this->captcha = $curl->response(); |
|
| 161 | - |
|
| 162 | - // é uma imagem o retorno? |
|
| 163 | - if(@imagecreatefromstring($this->captcha) == false) |
|
| 164 | - { |
|
| 165 | - throw new NoCaptchaResponse('Não foi possível capturar o captcha'); |
|
| 166 | - } |
|
| 167 | - |
|
| 168 | - // constroe o base64 da imagem para o usuário digitar |
|
| 169 | - // to-do: um serviço automatizado para decifrar o captcha? |
|
| 170 | - // talvez deathbycaptcha? |
|
| 171 | - $this->captchaImage = 'data:image/png;base64,' . base64_encode($this->captcha); |
|
| 172 | - |
|
| 173 | - return $this->captchaImage; |
|
| 121 | + $curl = new Curl; |
|
| 122 | + |
|
| 123 | + // Inicia uma requisição para capturar a imagem do captcha |
|
| 124 | + // informando cookie da requisição passada e os headers |
|
| 125 | + // |
|
| 126 | + // to-do: implementar guzzlehttp? |
|
| 127 | + // ele é melhor que o curl? ou mais organizado? |
|
| 128 | + $curl->init($this->configurations['base'] . $imageSrc->attr('src')); |
|
| 129 | + |
|
| 130 | + $this->params['parambot'] = trim($paramBot->attr('value')); |
|
| 131 | + |
|
| 132 | + // headers da requisição |
|
| 133 | + $curl->options([ |
|
| 134 | + CURLOPT_COOKIEJAR => 'cookiejar', |
|
| 135 | + CURLOPT_HTTPHEADER => array( |
|
| 136 | + "Pragma: no-cache", |
|
| 137 | + "Origin: " . $this->configurations['base'], |
|
| 138 | + "Host: ". array_get($this->configurations, 'headers.Host'), |
|
| 139 | + "User-Agent: Mozilla/5.0 (Windows NT 6.1; rv:32.0) Gecko/20100101 Firefox/32.0", |
|
| 140 | + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", |
|
| 141 | + "Accept-Language: pt-BR,pt;q=0.8,en-US;q=0.5,en;q=0.3", |
|
| 142 | + "Accept-Encoding: gzip, deflate", |
|
| 143 | + "Referer: " . $this->configurations['captcha'], |
|
| 144 | + "Cookie: flag=1; ". $this->cookie, |
|
| 145 | + "Connection: keep-alive" |
|
| 146 | + ), |
|
| 147 | + CURLOPT_RETURNTRANSFER => true, |
|
| 148 | + CURLOPT_FOLLOWLOCATION => 1, |
|
| 149 | + CURLOPT_BINARYTRANSFER => TRUE, |
|
| 150 | + CURLOPT_CONNECTTIMEOUT => 10, |
|
| 151 | + CURLOPT_TIMEOUT => 10, |
|
| 152 | + ]); |
|
| 153 | + |
|
| 154 | + // executa o curl, logo após fechando a conexão |
|
| 155 | + $curl->exec(); |
|
| 156 | + $curl->close(); |
|
| 157 | + |
|
| 158 | + // captura do retorno do curl |
|
| 159 | + // o esperado deverá ser o HTML da imagem |
|
| 160 | + $this->captcha = $curl->response(); |
|
| 161 | + |
|
| 162 | + // é uma imagem o retorno? |
|
| 163 | + if(@imagecreatefromstring($this->captcha) == false) |
|
| 164 | + { |
|
| 165 | + throw new NoCaptchaResponse('Não foi possível capturar o captcha'); |
|
| 166 | + } |
|
| 167 | + |
|
| 168 | + // constroe o base64 da imagem para o usuário digitar |
|
| 169 | + // to-do: um serviço automatizado para decifrar o captcha? |
|
| 170 | + // talvez deathbycaptcha? |
|
| 171 | + $this->captchaImage = 'data:image/png;base64,' . base64_encode($this->captcha); |
|
| 172 | + |
|
| 173 | + return $this->captchaImage; |
|
| 174 | 174 | } |
| 175 | 175 | |
| 176 | 176 | /** |
@@ -212,58 +212,58 @@ discard block |
||
| 212 | 212 | public function getData($document, $cookie, $captcha, $params, $configurations) |
| 213 | 213 | { |
| 214 | 214 | // prepara o form |
| 215 | - $postParams = [ |
|
| 216 | - 'cnpj' => $document, // apenas números |
|
| 217 | - 'Key' => $captcha, |
|
| 218 | - 'botao' => 'Consulta por CNPJ', |
|
| 219 | - 'hidFlag' => '1', |
|
| 220 | - 'ie' => '', |
|
| 221 | - 'servico' => 'cnpj', |
|
| 222 | - 'paramBot' => $params['parambot'] |
|
| 223 | - ]; |
|
| 224 | - |
|
| 225 | - // inicia o cURL |
|
| 226 | - $curl = new Curl; |
|
| 227 | - |
|
| 228 | - // vamos registrar qual serviço será consultado |
|
| 229 | - $curl->init($configurations['data']); |
|
| 230 | - |
|
| 231 | - // define os headers para requisição curl. |
|
| 232 | - $curl->options( |
|
| 233 | - array( |
|
| 234 | - CURLOPT_HTTPHEADER => array( |
|
| 235 | - "Origin: http://pfeserv1.fazenda.sp.gov.br", |
|
| 236 | - "Host: pfeserv1.fazenda.sp.gov.br", |
|
| 237 | - "User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/49.0.2623.108 Chrome/49.0.2623.108 Safari/537.36", |
|
| 238 | - "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", |
|
| 239 | - "Accept-Language: pt-BR,pt;q=0.8,en-US;q=0.6,en;q=0.4,es;q=0.2", |
|
| 240 | - "Accept-Encoding: gzip, deflate", |
|
| 241 | - "Referer: http://pfeserv1.fazenda.sp.gov.br/sintegrapfe/consultaSintegraServlet", |
|
| 242 | - "Cookie: flag=1; ". $cookie, |
|
| 243 | - "Connection: keep-alive" |
|
| 244 | - ), |
|
| 245 | - CURLOPT_RETURNTRANSFER => 1, |
|
| 246 | - CURLOPT_BINARYTRANSFER => 1, |
|
| 247 | - CURLOPT_FOLLOWLOCATION => 1, |
|
| 248 | - ) |
|
| 249 | - ); |
|
| 250 | - |
|
| 251 | - // efetua a chamada passando os parametros de form |
|
| 252 | - $curl->post($postParams); |
|
| 253 | - $curl->exec(); |
|
| 254 | - |
|
| 255 | - // completa a chamda |
|
| 256 | - $curl->close(); |
|
| 257 | - |
|
| 258 | - // vamos capturar retorno, que deverá ser o HTML para scrapping |
|
| 259 | - $html = $curl->response(); |
|
| 260 | - |
|
| 261 | - if(empty($html)) { |
|
| 262 | - throw new NoServiceResponse('No response from service', 99); |
|
| 263 | - } |
|
| 264 | - |
|
| 265 | - $crawler = new Crawler($html, array_get($configurations, 'selectors.data')); |
|
| 266 | - |
|
| 267 | - return $crawler; |
|
| 215 | + $postParams = [ |
|
| 216 | + 'cnpj' => $document, // apenas números |
|
| 217 | + 'Key' => $captcha, |
|
| 218 | + 'botao' => 'Consulta por CNPJ', |
|
| 219 | + 'hidFlag' => '1', |
|
| 220 | + 'ie' => '', |
|
| 221 | + 'servico' => 'cnpj', |
|
| 222 | + 'paramBot' => $params['parambot'] |
|
| 223 | + ]; |
|
| 224 | + |
|
| 225 | + // inicia o cURL |
|
| 226 | + $curl = new Curl; |
|
| 227 | + |
|
| 228 | + // vamos registrar qual serviço será consultado |
|
| 229 | + $curl->init($configurations['data']); |
|
| 230 | + |
|
| 231 | + // define os headers para requisição curl. |
|
| 232 | + $curl->options( |
|
| 233 | + array( |
|
| 234 | + CURLOPT_HTTPHEADER => array( |
|
| 235 | + "Origin: http://pfeserv1.fazenda.sp.gov.br", |
|
| 236 | + "Host: pfeserv1.fazenda.sp.gov.br", |
|
| 237 | + "User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/49.0.2623.108 Chrome/49.0.2623.108 Safari/537.36", |
|
| 238 | + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", |
|
| 239 | + "Accept-Language: pt-BR,pt;q=0.8,en-US;q=0.6,en;q=0.4,es;q=0.2", |
|
| 240 | + "Accept-Encoding: gzip, deflate", |
|
| 241 | + "Referer: http://pfeserv1.fazenda.sp.gov.br/sintegrapfe/consultaSintegraServlet", |
|
| 242 | + "Cookie: flag=1; ". $cookie, |
|
| 243 | + "Connection: keep-alive" |
|
| 244 | + ), |
|
| 245 | + CURLOPT_RETURNTRANSFER => 1, |
|
| 246 | + CURLOPT_BINARYTRANSFER => 1, |
|
| 247 | + CURLOPT_FOLLOWLOCATION => 1, |
|
| 248 | + ) |
|
| 249 | + ); |
|
| 250 | + |
|
| 251 | + // efetua a chamada passando os parametros de form |
|
| 252 | + $curl->post($postParams); |
|
| 253 | + $curl->exec(); |
|
| 254 | + |
|
| 255 | + // completa a chamda |
|
| 256 | + $curl->close(); |
|
| 257 | + |
|
| 258 | + // vamos capturar retorno, que deverá ser o HTML para scrapping |
|
| 259 | + $html = $curl->response(); |
|
| 260 | + |
|
| 261 | + if(empty($html)) { |
|
| 262 | + throw new NoServiceResponse('No response from service', 99); |
|
| 263 | + } |
|
| 264 | + |
|
| 265 | + $crawler = new Crawler($html, array_get($configurations, 'selectors.data')); |
|
| 266 | + |
|
| 267 | + return $crawler; |
|
| 268 | 268 | } |
| 269 | 269 | } |
| 270 | 270 | \ No newline at end of file |
@@ -86,7 +86,7 @@ discard block |
||
| 86 | 86 | */ |
| 87 | 87 | private function hasRequested() |
| 88 | 88 | { |
| 89 | - if(!$this->instanceResponse) { |
|
| 89 | + if (!$this->instanceResponse) { |
|
| 90 | 90 | throw new NoServiceCall("No request from this service, please call first method request", 1); |
| 91 | 91 | } |
| 92 | 92 | |
@@ -105,7 +105,7 @@ discard block |
||
| 105 | 105 | array_get($this->configurations, 'selectors.image') |
| 106 | 106 | ); |
| 107 | 107 | |
| 108 | - if(!$imageSrc->count()){ |
|
| 108 | + if (!$imageSrc->count()) { |
|
| 109 | 109 | throw new ImageNotFound("Impossible to crawler image from response", 1); |
| 110 | 110 | } |
| 111 | 111 | |
@@ -113,7 +113,7 @@ discard block |
||
| 113 | 113 | array_get($this->configurations, 'selectors.paramBot') |
| 114 | 114 | ); |
| 115 | 115 | |
| 116 | - if(!$paramBot->count()){ |
|
| 116 | + if (!$paramBot->count()) { |
|
| 117 | 117 | throw new ImageNotFound("Impossible to crawler parambot from response", 1); |
| 118 | 118 | } |
| 119 | 119 | |
@@ -135,13 +135,13 @@ discard block |
||
| 135 | 135 | CURLOPT_HTTPHEADER => array( |
| 136 | 136 | "Pragma: no-cache", |
| 137 | 137 | "Origin: " . $this->configurations['base'], |
| 138 | - "Host: ". array_get($this->configurations, 'headers.Host'), |
|
| 138 | + "Host: " . array_get($this->configurations, 'headers.Host'), |
|
| 139 | 139 | "User-Agent: Mozilla/5.0 (Windows NT 6.1; rv:32.0) Gecko/20100101 Firefox/32.0", |
| 140 | 140 | "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", |
| 141 | 141 | "Accept-Language: pt-BR,pt;q=0.8,en-US;q=0.5,en;q=0.3", |
| 142 | 142 | "Accept-Encoding: gzip, deflate", |
| 143 | 143 | "Referer: " . $this->configurations['captcha'], |
| 144 | - "Cookie: flag=1; ". $this->cookie, |
|
| 144 | + "Cookie: flag=1; " . $this->cookie, |
|
| 145 | 145 | "Connection: keep-alive" |
| 146 | 146 | ), |
| 147 | 147 | CURLOPT_RETURNTRANSFER => true, |
@@ -160,7 +160,7 @@ discard block |
||
| 160 | 160 | $this->captcha = $curl->response(); |
| 161 | 161 | |
| 162 | 162 | // é uma imagem o retorno? |
| 163 | - if(@imagecreatefromstring($this->captcha) == false) |
|
| 163 | + if (@imagecreatefromstring($this->captcha) == false) |
|
| 164 | 164 | { |
| 165 | 165 | throw new NoCaptchaResponse('Não foi possível capturar o captcha'); |
| 166 | 166 | } |
@@ -239,7 +239,7 @@ discard block |
||
| 239 | 239 | "Accept-Language: pt-BR,pt;q=0.8,en-US;q=0.6,en;q=0.4,es;q=0.2", |
| 240 | 240 | "Accept-Encoding: gzip, deflate", |
| 241 | 241 | "Referer: http://pfeserv1.fazenda.sp.gov.br/sintegrapfe/consultaSintegraServlet", |
| 242 | - "Cookie: flag=1; ". $cookie, |
|
| 242 | + "Cookie: flag=1; " . $cookie, |
|
| 243 | 243 | "Connection: keep-alive" |
| 244 | 244 | ), |
| 245 | 245 | CURLOPT_RETURNTRANSFER => 1, |
@@ -258,7 +258,7 @@ discard block |
||
| 258 | 258 | // vamos capturar retorno, que deverá ser o HTML para scrapping |
| 259 | 259 | $html = $curl->response(); |
| 260 | 260 | |
| 261 | - if(empty($html)) { |
|
| 261 | + if (empty($html)) { |
|
| 262 | 262 | throw new NoServiceResponse('No response from service', 99); |
| 263 | 263 | } |
| 264 | 264 | |
@@ -34,7 +34,7 @@ discard block |
||
| 34 | 34 | */ |
| 35 | 35 | protected function registerReceitaFederal() |
| 36 | 36 | { |
| 37 | - $this->app->bind('ReceitaFederal', function(){ |
|
| 37 | + $this->app->bind('ReceitaFederal', function() { |
|
| 38 | 38 | return new RFSearch; |
| 39 | 39 | }); |
| 40 | 40 | } |
@@ -46,7 +46,7 @@ discard block |
||
| 46 | 46 | */ |
| 47 | 47 | protected function registerSintegra() |
| 48 | 48 | { |
| 49 | - $this->app->bind('Sintegra', function(){ |
|
| 49 | + $this->app->bind('Sintegra', function() { |
|
| 50 | 50 | return new Search; |
| 51 | 51 | }); |
| 52 | 52 | } |
@@ -33,7 +33,7 @@ |
||
| 33 | 33 | /** |
| 34 | 34 | * Verifica antes de fazer o crawler se possui erros |
| 35 | 35 | * na requisição |
| 36 | - * @return boolean |
|
| 36 | + * @return boolean|null |
|
| 37 | 37 | */ |
| 38 | 38 | public function hasError() |
| 39 | 39 | { |
@@ -37,7 +37,7 @@ discard block |
||
| 37 | 37 | */ |
| 38 | 38 | public function hasError() |
| 39 | 39 | { |
| 40 | - if(count( $this->selectors) == 0) { |
|
| 40 | + if (count($this->selectors) == 0) { |
|
| 41 | 41 | throw new NoSelectorsConfigured("NoSelectorsConfigured", 1); |
| 42 | 42 | } |
| 43 | 43 | |
@@ -45,12 +45,12 @@ discard block |
||
| 45 | 45 | // é página de erro da receita federal |
| 46 | 46 | $node = $this->filter($this->selectors['error']); |
| 47 | 47 | |
| 48 | - if($node->count()){ |
|
| 49 | - throw new ErrorFoundData( $this->clearString($node->text()), 1); |
|
| 48 | + if ($node->count()) { |
|
| 49 | + throw new ErrorFoundData($this->clearString($node->text()), 1); |
|
| 50 | 50 | } |
| 51 | 51 | |
| 52 | 52 | // CNPJ informado é válido? |
| 53 | - if($this->filter('#imgCaptcha')->count()){ |
|
| 53 | + if ($this->filter('#imgCaptcha')->count()) { |
|
| 54 | 54 | throw new InvalidCaptcha('Captcha inválido', 99); |
| 55 | 55 | } |
| 56 | 56 | } |
@@ -67,16 +67,16 @@ discard block |
||
| 67 | 67 | $this->hasError(); |
| 68 | 68 | |
| 69 | 69 | foreach ($this->selectors as $name => $selector) { |
| 70 | - if(is_string($selector)){ |
|
| 70 | + if (is_string($selector)) { |
|
| 71 | 71 | $node = $this->scrap($selector); |
| 72 | 72 | |
| 73 | - if($node->count()){ |
|
| 73 | + if ($node->count()) { |
|
| 74 | 74 | $scrapped[$name] = $this->clearString($node->text()); |
| 75 | 75 | } |
| 76 | - }elseif(is_array($selector)){ |
|
| 76 | + }elseif (is_array($selector)) { |
|
| 77 | 77 | foreach ($selector as $selector => $repeat) { |
| 78 | 78 | $node = $this->scrap($selector); |
| 79 | - if($node->count()){ |
|
| 79 | + if ($node->count()) { |
|
| 80 | 80 | foreach ($node->filter($repeat) as $loop) |
| 81 | 81 | { |
| 82 | 82 | $scrapped[$name][] = $this->clearString($loop->nodeValue); |
@@ -73,7 +73,7 @@ |
||
| 73 | 73 | if($node->count()){ |
| 74 | 74 | $scrapped[$name] = $this->clearString($node->text()); |
| 75 | 75 | } |
| 76 | - }elseif(is_array($selector)){ |
|
| 76 | + } elseif(is_array($selector)){ |
|
| 77 | 77 | foreach ($selector as $selector => $repeat) { |
| 78 | 78 | $node = $this->scrap($selector); |
| 79 | 79 | if($node->count()){ |
@@ -10,89 +10,89 @@ |
||
| 10 | 10 | class Crawler extends BaseCrawler implements CrawlerInterface |
| 11 | 11 | { |
| 12 | 12 | |
| 13 | - /** |
|
| 14 | - * [$selectors description] |
|
| 15 | - * @var [type] |
|
| 16 | - */ |
|
| 17 | - private $selectors = []; |
|
| 13 | + /** |
|
| 14 | + * [$selectors description] |
|
| 15 | + * @var [type] |
|
| 16 | + */ |
|
| 17 | + private $selectors = []; |
|
| 18 | 18 | |
| 19 | - /** |
|
| 20 | - * [__construct description] |
|
| 21 | - * @param [type] $html [description] |
|
| 22 | - * @param array $selectors [description] |
|
| 23 | - */ |
|
| 24 | - public function __construct($html, $selectors) |
|
| 25 | - { |
|
| 26 | - $this->selectors = $selectors; |
|
| 19 | + /** |
|
| 20 | + * [__construct description] |
|
| 21 | + * @param [type] $html [description] |
|
| 22 | + * @param array $selectors [description] |
|
| 23 | + */ |
|
| 24 | + public function __construct($html, $selectors) |
|
| 25 | + { |
|
| 26 | + $this->selectors = $selectors; |
|
| 27 | 27 | |
| 28 | - parent::__construct($html); |
|
| 29 | - } |
|
| 28 | + parent::__construct($html); |
|
| 29 | + } |
|
| 30 | 30 | |
| 31 | - /** |
|
| 32 | - * Verifica antes de fazer o crawler se possui erros |
|
| 33 | - * na requisição |
|
| 34 | - * @return boolean|null |
|
| 35 | - */ |
|
| 36 | - public function hasError() |
|
| 37 | - { |
|
| 38 | - $node = $this->scrap($this->selectors['razao_social']); |
|
| 31 | + /** |
|
| 32 | + * Verifica antes de fazer o crawler se possui erros |
|
| 33 | + * na requisição |
|
| 34 | + * @return boolean|null |
|
| 35 | + */ |
|
| 36 | + public function hasError() |
|
| 37 | + { |
|
| 38 | + $node = $this->scrap($this->selectors['razao_social']); |
|
| 39 | 39 | |
| 40 | - if (!$node->count()) |
|
| 41 | - { |
|
| 42 | - throw new ErrorFoundData($this->clearString($this->scrap($this->selectors['error'])->text()), 1); |
|
| 43 | - } |
|
| 44 | - } |
|
| 40 | + if (!$node->count()) |
|
| 41 | + { |
|
| 42 | + throw new ErrorFoundData($this->clearString($this->scrap($this->selectors['error'])->text()), 1); |
|
| 43 | + } |
|
| 44 | + } |
|
| 45 | 45 | |
| 46 | - /** |
|
| 47 | - * Extrai informações do HTML através do DOM |
|
| 48 | - * |
|
| 49 | - * @return array |
|
| 50 | - */ |
|
| 51 | - public function scraping() |
|
| 52 | - { |
|
| 53 | - $scrapped = []; |
|
| 46 | + /** |
|
| 47 | + * Extrai informações do HTML através do DOM |
|
| 48 | + * |
|
| 49 | + * @return array |
|
| 50 | + */ |
|
| 51 | + public function scraping() |
|
| 52 | + { |
|
| 53 | + $scrapped = []; |
|
| 54 | 54 | |
| 55 | - $this->hasError(); |
|
| 55 | + $this->hasError(); |
|
| 56 | 56 | |
| 57 | - foreach ($this->selectors as $name => $selector) { |
|
| 58 | - if (is_string($selector)) { |
|
| 59 | - $node = $this->scrap($selector); |
|
| 57 | + foreach ($this->selectors as $name => $selector) { |
|
| 58 | + if (is_string($selector)) { |
|
| 59 | + $node = $this->scrap($selector); |
|
| 60 | 60 | |
| 61 | - if ($node->count()) { |
|
| 62 | - $scrapped[$name] = $this->clearString($node->text()); |
|
| 63 | - } |
|
| 64 | - }elseif (is_array($selector)) { |
|
| 65 | - foreach ($selector as $selector => $repeat) { |
|
| 66 | - $node = $this->scrap($selector); |
|
| 67 | - if ($node->count()) { |
|
| 68 | - foreach ($node->filter($repeat) as $loop) |
|
| 69 | - { |
|
| 70 | - $scrapped[$name][] = $this->clearString($loop->nodeValue); |
|
| 71 | - } |
|
| 72 | - } |
|
| 73 | - } |
|
| 74 | - } |
|
| 75 | - } |
|
| 61 | + if ($node->count()) { |
|
| 62 | + $scrapped[$name] = $this->clearString($node->text()); |
|
| 63 | + } |
|
| 64 | + }elseif (is_array($selector)) { |
|
| 65 | + foreach ($selector as $selector => $repeat) { |
|
| 66 | + $node = $this->scrap($selector); |
|
| 67 | + if ($node->count()) { |
|
| 68 | + foreach ($node->filter($repeat) as $loop) |
|
| 69 | + { |
|
| 70 | + $scrapped[$name][] = $this->clearString($loop->nodeValue); |
|
| 71 | + } |
|
| 72 | + } |
|
| 73 | + } |
|
| 74 | + } |
|
| 75 | + } |
|
| 76 | 76 | |
| 77 | - return $scrapped; |
|
| 78 | - } |
|
| 77 | + return $scrapped; |
|
| 78 | + } |
|
| 79 | 79 | |
| 80 | - /** |
|
| 81 | - * Limpa o valor repassado |
|
| 82 | - * @param string $string |
|
| 83 | - * @return string |
|
| 84 | - */ |
|
| 85 | - public function clearString($string) |
|
| 86 | - { |
|
| 87 | - return trim(preg_replace(['/[\s]+/mu'], ' ', $string)); |
|
| 88 | - } |
|
| 80 | + /** |
|
| 81 | + * Limpa o valor repassado |
|
| 82 | + * @param string $string |
|
| 83 | + * @return string |
|
| 84 | + */ |
|
| 85 | + public function clearString($string) |
|
| 86 | + { |
|
| 87 | + return trim(preg_replace(['/[\s]+/mu'], ' ', $string)); |
|
| 88 | + } |
|
| 89 | 89 | |
| 90 | - /** |
|
| 91 | - * Filtra selector no crawler |
|
| 92 | - */ |
|
| 93 | - public function scrap($selector) |
|
| 94 | - { |
|
| 95 | - $node = $this->filter($selector); |
|
| 96 | - return $node; |
|
| 97 | - } |
|
| 90 | + /** |
|
| 91 | + * Filtra selector no crawler |
|
| 92 | + */ |
|
| 93 | + public function scrap($selector) |
|
| 94 | + { |
|
| 95 | + $node = $this->filter($selector); |
|
| 96 | + return $node; |
|
| 97 | + } |
|
| 98 | 98 | } |
| 99 | 99 | \ No newline at end of file |
@@ -61,7 +61,7 @@ |
||
| 61 | 61 | if ($node->count()) { |
| 62 | 62 | $scrapped[$name] = $this->clearString($node->text()); |
| 63 | 63 | } |
| 64 | - }elseif (is_array($selector)) { |
|
| 64 | + } elseif (is_array($selector)) { |
|
| 65 | 65 | foreach ($selector as $selector => $repeat) { |
| 66 | 66 | $node = $this->scrap($selector); |
| 67 | 67 | if ($node->count()) { |
@@ -119,13 +119,13 @@ discard block |
||
| 119 | 119 | CURLOPT_HTTPHEADER => array( |
| 120 | 120 | "Pragma: no-cache", |
| 121 | 121 | "Origin: " . $this->configurations['base'], |
| 122 | - "Host: ". array_get($this->configurations, 'headers.Host'), |
|
| 122 | + "Host: " . array_get($this->configurations, 'headers.Host'), |
|
| 123 | 123 | "User-Agent: Mozilla/5.0 (Windows NT 6.1; rv:32.0) Gecko/20100101 Firefox/32.0", |
| 124 | 124 | "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", |
| 125 | 125 | "Accept-Language: pt-BR,pt;q=0.8,en-US;q=0.5,en;q=0.3", |
| 126 | 126 | "Accept-Encoding: gzip, deflate", |
| 127 | 127 | "Referer: " . $this->configurations['home'], |
| 128 | - "Cookie: flag=1; ". $this->cookie, |
|
| 128 | + "Cookie: flag=1; " . $this->cookie, |
|
| 129 | 129 | "Connection: keep-alive" |
| 130 | 130 | ), |
| 131 | 131 | CURLOPT_RETURNTRANSFER => true, |
@@ -144,7 +144,7 @@ discard block |
||
| 144 | 144 | $this->captcha = $curl->response(); |
| 145 | 145 | |
| 146 | 146 | // é uma imagem o retorno? |
| 147 | - if(@imagecreatefromstring($this->captcha) == false) |
|
| 147 | + if (@imagecreatefromstring($this->captcha) == false) |
|
| 148 | 148 | { |
| 149 | 149 | throw new NoCaptchaResponse('Não foi possível capturar o captcha'); |
| 150 | 150 | } |
@@ -218,13 +218,13 @@ discard block |
||
| 218 | 218 | CURLOPT_HTTPHEADER => array( |
| 219 | 219 | "Pragma: no-cache", |
| 220 | 220 | "Origin: " . $this->configurations['base'], |
| 221 | - "Host: ". array_get($configurations, 'headers.Host'), |
|
| 221 | + "Host: " . array_get($configurations, 'headers.Host'), |
|
| 222 | 222 | "User-Agent: Mozilla/5.0 (Windows NT 6.1; rv:32.0) Gecko/20100101 Firefox/32.0", |
| 223 | 223 | "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", |
| 224 | 224 | "Accept-Language: pt-BR,pt;q=0.8,en-US;q=0.5,en;q=0.3", |
| 225 | 225 | "Accept-Encoding: gzip, deflate", |
| 226 | - "Referer: " . $this->configurations['home'] .'?cnpj='. $document, |
|
| 227 | - "Cookie: flag=1; ". $cookie, |
|
| 226 | + "Referer: " . $this->configurations['home'] . '?cnpj=' . $document, |
|
| 227 | + "Cookie: flag=1; " . $cookie, |
|
| 228 | 228 | "Connection: keep-alive" |
| 229 | 229 | ), |
| 230 | 230 | CURLOPT_RETURNTRANSFER => 1, |
@@ -243,7 +243,7 @@ discard block |
||
| 243 | 243 | // vamos capturar retorno, que deverá ser o HTML para scrapping |
| 244 | 244 | $html = $curl->response(); |
| 245 | 245 | |
| 246 | - if(empty($html)) { |
|
| 246 | + if (empty($html)) { |
|
| 247 | 247 | throw new NoServiceResponse('No response from service', 99); |
| 248 | 248 | } |
| 249 | 249 | |
@@ -7,105 +7,105 @@ |
||
| 7 | 7 | class Curl |
| 8 | 8 | { |
| 9 | 9 | |
| 10 | - /** |
|
| 11 | - * [$url description] |
|
| 12 | - * @var [type] |
|
| 13 | - */ |
|
| 14 | - private $url; |
|
| 15 | - |
|
| 16 | - /** |
|
| 17 | - * [$options description] |
|
| 18 | - * @var [type] |
|
| 19 | - */ |
|
| 20 | - private $options; |
|
| 21 | - |
|
| 22 | - /** |
|
| 23 | - * [$instance description] |
|
| 24 | - * @var [type] |
|
| 25 | - */ |
|
| 26 | - private $instance; |
|
| 27 | - |
|
| 28 | - /** |
|
| 29 | - * [$response description] |
|
| 30 | - * @var [type] |
|
| 31 | - */ |
|
| 32 | - private $response; |
|
| 33 | - |
|
| 34 | - /** |
|
| 35 | - * [init description] |
|
| 36 | - * @return [type] [description] |
|
| 37 | - */ |
|
| 38 | - public function init($url) |
|
| 39 | - { |
|
| 40 | - $this->instance = curl_init($url); |
|
| 41 | - |
|
| 42 | - $this->url = $url; |
|
| 43 | - |
|
| 44 | - return $this; |
|
| 45 | - } |
|
| 46 | - |
|
| 47 | - /** |
|
| 48 | - * [options description] |
|
| 49 | - * @param array $options [description] |
|
| 50 | - * @return [type] [description] |
|
| 51 | - */ |
|
| 52 | - public function options(array $options) |
|
| 53 | - { |
|
| 54 | - $this->options = $options; |
|
| 55 | - |
|
| 56 | - curl_setopt_array($this->instance, $this->options); |
|
| 57 | - |
|
| 58 | - return $this; |
|
| 59 | - } |
|
| 60 | - |
|
| 61 | - /** |
|
| 62 | - * [post description] |
|
| 63 | - * @return [type] [description] |
|
| 64 | - */ |
|
| 65 | - public function post(array $fields) |
|
| 66 | - { |
|
| 67 | - $this->option(CURLOPT_POST, count($fields)); |
|
| 68 | - $this->option(CURLOPT_POSTFIELDS, http_build_query($fields)); |
|
| 69 | - |
|
| 70 | - return $this; |
|
| 71 | - } |
|
| 72 | - |
|
| 73 | - /** |
|
| 74 | - * Set option in cURL |
|
| 75 | - * @param integer $option |
|
| 76 | - * @param mix $value |
|
| 77 | - */ |
|
| 78 | - public function option($option, $value) |
|
| 79 | - { |
|
| 80 | - curl_setopt($this->instance, $option, $value); |
|
| 81 | - } |
|
| 82 | - |
|
| 83 | - /** |
|
| 84 | - * [exec description] |
|
| 85 | - * @return [type] [description] |
|
| 86 | - */ |
|
| 87 | - public function exec() |
|
| 88 | - { |
|
| 89 | - $this->response = curl_exec($this->instance); |
|
| 90 | - } |
|
| 91 | - |
|
| 92 | - /** |
|
| 93 | - * [close description] |
|
| 94 | - * @return [type] [description] |
|
| 95 | - */ |
|
| 96 | - public function close() |
|
| 97 | - { |
|
| 98 | - curl_close($this->instance); |
|
| 99 | - |
|
| 100 | - return $this; |
|
| 101 | - } |
|
| 102 | - |
|
| 103 | - /** |
|
| 104 | - * [response description] |
|
| 105 | - * @return [type] [description] |
|
| 106 | - */ |
|
| 107 | - public function response() |
|
| 108 | - { |
|
| 109 | - return $this->response; |
|
| 110 | - } |
|
| 10 | + /** |
|
| 11 | + * [$url description] |
|
| 12 | + * @var [type] |
|
| 13 | + */ |
|
| 14 | + private $url; |
|
| 15 | + |
|
| 16 | + /** |
|
| 17 | + * [$options description] |
|
| 18 | + * @var [type] |
|
| 19 | + */ |
|
| 20 | + private $options; |
|
| 21 | + |
|
| 22 | + /** |
|
| 23 | + * [$instance description] |
|
| 24 | + * @var [type] |
|
| 25 | + */ |
|
| 26 | + private $instance; |
|
| 27 | + |
|
| 28 | + /** |
|
| 29 | + * [$response description] |
|
| 30 | + * @var [type] |
|
| 31 | + */ |
|
| 32 | + private $response; |
|
| 33 | + |
|
| 34 | + /** |
|
| 35 | + * [init description] |
|
| 36 | + * @return [type] [description] |
|
| 37 | + */ |
|
| 38 | + public function init($url) |
|
| 39 | + { |
|
| 40 | + $this->instance = curl_init($url); |
|
| 41 | + |
|
| 42 | + $this->url = $url; |
|
| 43 | + |
|
| 44 | + return $this; |
|
| 45 | + } |
|
| 46 | + |
|
| 47 | + /** |
|
| 48 | + * [options description] |
|
| 49 | + * @param array $options [description] |
|
| 50 | + * @return [type] [description] |
|
| 51 | + */ |
|
| 52 | + public function options(array $options) |
|
| 53 | + { |
|
| 54 | + $this->options = $options; |
|
| 55 | + |
|
| 56 | + curl_setopt_array($this->instance, $this->options); |
|
| 57 | + |
|
| 58 | + return $this; |
|
| 59 | + } |
|
| 60 | + |
|
| 61 | + /** |
|
| 62 | + * [post description] |
|
| 63 | + * @return [type] [description] |
|
| 64 | + */ |
|
| 65 | + public function post(array $fields) |
|
| 66 | + { |
|
| 67 | + $this->option(CURLOPT_POST, count($fields)); |
|
| 68 | + $this->option(CURLOPT_POSTFIELDS, http_build_query($fields)); |
|
| 69 | + |
|
| 70 | + return $this; |
|
| 71 | + } |
|
| 72 | + |
|
| 73 | + /** |
|
| 74 | + * Set option in cURL |
|
| 75 | + * @param integer $option |
|
| 76 | + * @param mix $value |
|
| 77 | + */ |
|
| 78 | + public function option($option, $value) |
|
| 79 | + { |
|
| 80 | + curl_setopt($this->instance, $option, $value); |
|
| 81 | + } |
|
| 82 | + |
|
| 83 | + /** |
|
| 84 | + * [exec description] |
|
| 85 | + * @return [type] [description] |
|
| 86 | + */ |
|
| 87 | + public function exec() |
|
| 88 | + { |
|
| 89 | + $this->response = curl_exec($this->instance); |
|
| 90 | + } |
|
| 91 | + |
|
| 92 | + /** |
|
| 93 | + * [close description] |
|
| 94 | + * @return [type] [description] |
|
| 95 | + */ |
|
| 96 | + public function close() |
|
| 97 | + { |
|
| 98 | + curl_close($this->instance); |
|
| 99 | + |
|
| 100 | + return $this; |
|
| 101 | + } |
|
| 102 | + |
|
| 103 | + /** |
|
| 104 | + * [response description] |
|
| 105 | + * @return [type] [description] |
|
| 106 | + */ |
|
| 107 | + public function response() |
|
| 108 | + { |
|
| 109 | + return $this->response; |
|
| 110 | + } |
|
| 111 | 111 | } |
| 112 | 112 | \ No newline at end of file |