@@ -148,47 +148,47 @@ |
||
/**
 * Extract embedded field markers from a block of text.
 *
 * A marker has the form [**[<first>]**[<second>]**], where <second> is
 * treated as base64 and decoded.
 *
 * @param string $data Text to scan for markers.
 * @return array Match groups indexed by position:
 *               [0] full marker text as found in $data,
 *               [1] first capture group,
 *               [2] second capture group, base64-decoded,
 *               [3] decoded value with tags stripped and whitespace runs
 *                   collapsed to a single space.
 *               All four indexes are always present; each is an empty
 *               array when nothing matched or the pattern failed.
 */
private function getHTMLFieldsData($data) {
    $matches = array();
    $found = preg_match_all('/\[\*\*\[(.*?)\]\*\*\[(.*?)\]\*\*\]/im', $data, $matches);

    // preg_match_all() returns false on a PCRE error; hand back the same
    // shape as a "no matches" result instead of iterating an undefined index.
    if ($found === false || !isset($matches[2])) {
        return array(array(), array(), array(), array());
    }

    // Create index 3 up front so callers can rely on it even with zero matches.
    $matches[3] = array();

    foreach ($matches[2] as $key => $field_text) {
        $decoded = base64_decode($field_text);
        $matches[2][$key] = $decoded;
        $matches[3][$key] = preg_replace('/[\s]+/mu', ' ', strip_tags($decoded));
    }

    return $matches;
}
| 159 | 159 | |
/**
 * Fetch a URL under the site's base URL with cURL and return the final
 * response's headers and whitespace-normalised body.
 *
 * Redirects are followed (up to 10). Embedded field markers in the body are
 * replaced by their decoded content via getHTMLFieldsData().
 *
 * @param string      $url   Path appended to Director::absoluteBaseURL().
 * @param string|null $agent Value for the User-Agent option, if any.
 * @return array On a 200 response: 'headers' (header block of the final
 *               response), 'body', 'field_data' and 'url_fetched' (path of
 *               the effective URL). Otherwise: 'headers' => false and
 *               'body' => false only.
 */
public function loadPage($url, $agent = null) {
    $failure = array('headers' => false, 'body' => false);

    $crawl_id = GlobalAutoLinkSettings::get_current()->CrawlID;

    $ch = curl_init();
    if ($ch === false) {
        return $failure;
    }

    // NOTE(review): if absoluteBaseURL() already ends in '/', this produces a
    // double slash; left unchanged because 'url_fetched' is derived from it.
    curl_setopt($ch, CURLOPT_URL, Director::absoluteBaseURL().'/'.$url);
    curl_setopt($ch, CURLOPT_HEADER, true);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_MAXREDIRS, 10);
    curl_setopt($ch, CURLOPT_USERAGENT, $agent);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);
    // NOTE(review): peer verification is disabled, so the TLS certificate is
    // not checked. Confirm this is intentional before relying on it.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($ch, CURLOPT_HTTPHEADER, array('X-Crawl-Id: '.$crawl_id));

    $data = curl_exec($ch);

    // Read everything needed from the handle before closing it.
    $status = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $fetched = parse_url(curl_getinfo($ch, CURLINFO_EFFECTIVE_URL), PHP_URL_PATH);
    $header_size = curl_getinfo($ch, CURLINFO_HEADER_SIZE);

    curl_close($ch);

    // Use the status code cURL reports rather than searching the header text
    // for ' 200 ', which can match inside other header values and misses
    // status lines that carry no reason phrase.
    if ($data === false || $status !== 200) {
        return $failure;
    }

    // With redirects there is one header block per response; keep the last.
    $header_blocks = explode("\r\n\r\n", substr($data, 0, $header_size));
    array_pop($header_blocks); // Remove last element as it will always be empty
    $header = array_pop($header_blocks);

    $raw_body = (string) substr($data, $header_size);
    $body = preg_replace('/[\s]+/mu', ' ', $raw_body);
    if ($body === null) {
        // The /u modifier rejects invalid UTF-8; fall back to a byte-wise
        // pass rather than silently dropping the whole body.
        $body = preg_replace('/[\s]+/m', ' ', $raw_body);
    }

    $field_data = $this->getHTMLFieldsData($body);
    $body = str_replace($field_data[0], $field_data[2], $body);

    return array(
        'headers' => $header,
        'body' => $body,
        'field_data' => $field_data,
        'url_fetched' => $fetched,
    );
}
| 193 | 193 | |
| 194 | 194 | /** |