dylangrech92 /
seotoolbox
This project does not seem to handle request data directly as such no vulnerable execution paths were found.
These files may still be pulled in indirectly, for example via an include
or via PHP's auto-loading mechanism.
These results are based on our legacy PHP analysis, consider migrating to our new PHP analysis engine instead. Learn more
<?php

/**
 * Controller behind the /seotest crawler UI.
 *
 * Serves the crawler's bootstrap data (urlsAndSettings) and acts as a
 * same-origin curl proxy (getPage / getPageData) so the JS crawler can
 * inspect pages even when the admin interface runs on a different
 * (sub)domain. Access is restricted to members holding the
 * CMS_ACCESS_SEOToolboxAdmin permission (checked in init()).
 */
class SEOTestSiteTreeController extends Controller {

    // Optional domain to crawl instead of the current site's own base URL.
    // When null, Director::absoluteBaseURL() is used — see getCurlDomain().
    private static $alternate_domain = null;

    private static $desktop_user_agent = 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36';

    private static $mobile_user_agent = 'Mozilla/5.0 (Linux; <Android Version>; <Build Tag etc.>) AppleWebKit/<WebKit Rev> (KHTML, like Gecko) Chrome/<Chrome Rev> Mobile Safari/<WebKit Rev>';

    /**
     * Array of regex that will be used by the crawler.
     * If the url we're going to crawl matches any filter in here, it will be ignored
     */
    private static $ignore_paths = array();

    private static $allowed_actions = array('urlsAndSettings', 'getPageData', 'getPage');

    /**
     * Deny access to anyone without the SEO toolbox permission and queue
     * the crawler UI's CSS/JS requirements.
     *
     * @return SS_HTTPResponse|void redirect response when access is denied
     */
    public function init() {
        parent::init();

        Requirements::clear();

        // Unauthenticated or unauthorised users are bounced to the login
        // form with a BackURL pointing back at this tool.
        if (!Member::currentUser() || !Permission::check('CMS_ACCESS_SEOToolboxAdmin')){
            return $this->redirect(Security::login_url().'?BackURL=/seotest');
        }

        Requirements::css(SEOTOOLBOX_DIR.'/third-party/bootstrap/css/bootstrap.min.css');
        Requirements::css(SEOTOOLBOX_DIR.'/third-party/bootstrap/css/bootstrap-theme.min.css');
        Requirements::combine_files('seotest.css', array(
            SEOTOOLBOX_DIR.'/css/fonts/lato/lato.css',
            SEOTOOLBOX_DIR.'/css/seotest.css'
        ));

        Requirements::combine_files('seotest.js', array(
            SEOTOOLBOX_DIR.'/third-party/jquery-1.12.0.js',
            SEOTOOLBOX_DIR.'/js/crawler_event_handler.js',
            SEOTOOLBOX_DIR.'/js/crawler_painter.js',
            SEOTOOLBOX_DIR.'/js/crawler.js',
            SEOTOOLBOX_DIR.'/js/crawler_file_tester.js',
            SEOTOOLBOX_DIR.'/js/default_tests.js',
            SEOTOOLBOX_DIR.'/js/crawler_init.js'
        ));
    }

    /**
     * Curl the passed url.
     *
     * This is still run on the same domain as where your admin domain is located
     * however curl can deal with redirects much better then ajax therefore giving a
     * more accurate result. Also it prepares all the important data into an array object
     * which is then encoded and sent to JS for parsing
     *
     * Object Contents
     * obj['headers']    = Headers that we got back from the curl
     * obj['body']       = HTML of the page (false when the fetch failed)
     * obj['phrases']    = A list of sentences as extracted from the DOM
     * obj['field_data'] = A result from preg_match with all the html fields visible on the page
     *
     * @param SS_HTTPRequest $request expects GET vars 'u' (url path) and 'agent'
     * @return string JSON-encoded payload
     */
    public function getPageData(SS_HTTPRequest $request) {
        $agent = ($request->getVar('agent') == 'mobile')
            ? $this->config()->get('mobile_user_agent')
            : $this->config()->get('desktop_user_agent');

        $curl = $this->loadPage($request->getVar('u'), $agent);
        // Don't try to extract words from a failed fetch (body === false).
        $curl['phrases'] = ($curl['body'] === false)
            ? array()
            : $this->extractWords($curl['body']);

        Requirements::clear();
        return json_encode($curl);
    }

    /**
     * Get the page contents of the requested url.
     * This is used as a proxy so that users running the admin on a subdomain
     * still get the data from their main domain
     *
     * @param SS_HTTPRequest $request expects GET vars 'u' (url path) and 'agent'
     * @return string page body, or an empty string when the curl failed
     */
    public function getPage(SS_HTTPRequest $request){
        $agent = ($request->getVar('agent') == 'mobile')
            ? $this->config()->get('mobile_user_agent')
            : $this->config()->get('desktop_user_agent');

        $ch = $this->setupCurl($request->getVar('u'), $agent);
        $data = curl_exec($ch);
        // curl_exec() returns false on failure; guard before slicing headers off.
        $body = ($data === false) ? '' : $this->getPageBody($ch, $data);
        curl_close($ch);

        Requirements::clear();

        return $body;
    }

    /**
     * Break down the $html provided and returns all words that have an SEO significance
     *
     * @param string $html
     * @return array filtered list of non-empty phrases (keys not re-indexed)
     */
    private function extractWords($html) {
        mb_internal_encoding('UTF-8');
        // Decode numeric entities (&#NNN;) to their UTF-8 characters.
        // html_entity_decode replaces mb_convert_encoding's "HTML-ENTITIES"
        // target, which is deprecated as of PHP 8.2.
        $html = preg_replace_callback(
            "/(&#[0-9]+;)/",
            function($m) { return html_entity_decode($m[1], ENT_QUOTES, 'UTF-8'); },
            $html
        );
        $html = str_replace(array("\n", "\r"), ' ', mb_strtolower($html));
        $phrases = array();
        // Each entry: 'find' captures SEO-relevant text at group 'find_pos';
        // optional 'replace' strips the matched tag from $html so later,
        // broader patterns don't re-capture the same text.
        $regex_find_replace = array(
            array(
                'find' => '/<meta(.*?)name="(.*?)description"(.*?)content="(.*?)"(.*?)[>]/m',
                'find_pos' => 4,
                'replace' => '/<meta(.*?)[>]/i'
            ),
            array(
                'find' => '/<(img|a)[^<]*title=(\'|")(.*?)(\'|")[^<]*[>]/m',
                'find_pos' => 3
            ),
            array(
                'find' => '/<img[^<]*alt=(\'|")(.*?)(\'|")[^<]*[>]/m',
                'find_pos' => 2,
                'replace' => '/<img(.*?)[>]/i'
            ),
            array(
                'find' => '/<(.*?)>(.*?)<\/[a-zA-Z0-9]++>/m',
                'find_pos' => 2,
            )
        );

        foreach ($regex_find_replace as $commands) {
            if (isset($commands['find'])) {
                preg_match_all($commands['find'], $html, $matches);
                // Normalise each captured phrase: strip tags, collapse whitespace.
                array_walk($matches[$commands['find_pos']], function(&$phrase) {
                    $words = explode(' ', strip_tags($phrase));
                    array_walk($words, function(&$w) {
                        $w = trim(preg_replace('/\s+/', ' ', strip_tags($w)));
                    });
                    $phrase = preg_replace('/\s+/', ' ', implode(' ', $words));
                });
                $phrases = array_merge($phrases, $matches[$commands['find_pos']]);
            }

            if (isset($commands['replace'])) {
                $html = preg_replace($commands['replace'], ' ', $html);
            }
        }

        // Remove the empty elements
        return array_filter($phrases, function($phrase) { return strlen(trim($phrase)) > 0; });
    }

    /**
     * Returns the first batch of urls the crawler will use
     * and it's settings in json format
     *
     * @param SS_HTTPRequest $request unused, but required by the
     *        SilverStripe controller-action signature
     * @return string JSON with 'urls' (ID => AbsoluteLink) and 'settings'
     */
    public function urlsAndSettings(SS_HTTPRequest $request) {
        Requirements::clear();
        return json_encode(array(
            'urls' => Versioned::get_by_stage('SiteTree', 'Live')
                ->exclude('ClassName', 'RedirectorPage')
                ->exclude('ClassName', 'ErrorPage')
                ->map('ID', 'AbsoluteLink')
                ->toArray(),

            'settings' => array(
                'ignore_paths' => $this->config()->get('ignore_paths'),
                'crawl_id' => GlobalAutoLinkSettings::get_current()->CrawlID
            )
        ));
    }

    /**
     * Parses the data that we got from curling the crawl version of the page
     * and splits the html fields into an array
     *
     * Markers look like [**[raw]**[base64]**]; index 2 is decoded in place
     * and index 3 gets a tag-stripped, whitespace-collapsed copy.
     *
     * @param string $data
     * @return array preg_match_all-style matches array
     */
    private function getHTMLFieldsData($data){
        preg_match_all('/\[\*\*\[(.*?)\]\*\*\[(.*?)\]\*\*\]/im', $data, $matches);
        foreach ($matches[2] as $key => $field_text) {
            $matches[2][$key] = base64_decode($field_text);
            $matches[3][$key] = preg_replace('/[\s]+/mu', ' ', strip_tags($matches[2][$key]));
        }
        return $matches;
    }

    /**
     * Setup a curl request
     *
     * @param string $url path (relative to the curl domain)
     * @param string $agent user-agent string to send
     * @param bool $useCrawlID when true, sends the X-Crawl-Id header so the
     *        site renders its crawl-annotated version of the page
     * @return resource curl handle — caller must curl_close() it
     */
    public function setupCurl($url, $agent, $useCrawlID = false){
        $ch = curl_init();
        curl_setopt( $ch, CURLOPT_URL, $this->getCurlURL($url) );
        curl_setopt( $ch, CURLOPT_HEADER, true );
        curl_setopt( $ch, CURLOPT_RETURNTRANSFER, true );
        curl_setopt( $ch, CURLOPT_FOLLOWLOCATION, true );
        curl_setopt( $ch, CURLOPT_MAXREDIRS, 10 );
        curl_setopt( $ch, CURLOPT_USERAGENT, $agent );
        curl_setopt( $ch, CURLOPT_CONNECTTIMEOUT, 10 );
        curl_setopt( $ch, CURLOPT_TIMEOUT, 30 );
        // SECURITY: peer verification is disabled, presumably to tolerate
        // self-signed certs on staging domains. This permits MITM of crawled
        // responses — consider making it configurable and defaulting to true.
        curl_setopt( $ch, CURLOPT_SSL_VERIFYPEER, false );
        if( $useCrawlID ){
            $crawl_id = GlobalAutoLinkSettings::get_current()->CrawlID;
            curl_setopt( $ch, CURLOPT_HTTPHEADER, array( 'X-Crawl-Id: '.$crawl_id ) );
        }
        return $ch;
    }

    /**
     * Return the domain to use to curl the page
     *
     * @return array|scalar|string the alternate_domain config value when set,
     *         otherwise the site's absolute base URL
     */
    public function getCurlDomain(){
        return ( self::config()->get('alternate_domain') != null )
            ? self::config()->get('alternate_domain')
            : Director::absoluteBaseURL();
    }

    /**
     * Return a url ready to be curled
     *
     * Trims the joining slashes on both sides: Director::absoluteBaseURL()
     * conventionally ends with '/', so naive "$domain/$url" concatenation
     * would produce a double slash.
     *
     * @param string $url
     * @return string
     */
    public function getCurlURL($url){
        $domain = rtrim($this->getCurlDomain(), '/');
        return $domain.'/'.ltrim((string) $url, '/');
    }

    /**
     * Get the page headers from a curl response
     *
     * With CURLOPT_FOLLOWLOCATION each redirect adds its own header block;
     * this returns only the final response's headers.
     *
     * @param resource $ch
     * @param string $data raw response (headers + body)
     * @return string
     */
    public function getPageHeaders($ch, $data){
        $header_size = curl_getinfo( $ch, CURLINFO_HEADER_SIZE );
        $header = explode( "\r\n\r\n", substr( $data, 0, $header_size ) );
        array_pop( $header ); // Remove last element as it will always be empty
        return array_pop( $header );
    }

    /**
     * Get the body of a curl response
     *
     * @param resource $ch
     * @param string $data raw response (headers + body)
     * @return string
     */
    public function getPageBody($ch, $data){
        $header_size = curl_getinfo( $ch, CURLINFO_HEADER_SIZE );
        return substr( $data, $header_size );
    }

    /**
     * Curl the passed $url using the X-Crawl-ID header and parse the data
     * into an array
     *
     * @param string $url
     * @param null|string $agent
     * @return array keys: 'headers', 'body' (both false on failure/non-200),
     *         plus 'field_data' and 'url_fetched' on success
     */
    public function loadPage($url, $agent = null){
        $ch = $this->setupCurl($url, $agent, true);
        $data = curl_exec($ch);

        // curl_exec() returns false on network failure; bail out before
        // feeding false into the header/body slicing helpers.
        if ($data === false) {
            curl_close($ch);
            return array( 'headers' => false, 'body' => false );
        }

        $fetched = str_replace($this->getCurlDomain(), '', curl_getinfo($ch, CURLINFO_EFFECTIVE_URL));
        $header = $this->getPageHeaders($ch, $data);
        $body = preg_replace('/[\s]+/mu', ' ', $this->getPageBody($ch, $data));

        curl_close( $ch );

        // strpos() returns false when not found, but also 0 at position zero —
        // the explicit === false comparison avoids the falsy-return footgun.
        if ( strpos( $header, ' 200 ' ) === false ) {
            return array( 'headers' => false, 'body' => false );
        }

        $field_data = $this->getHTMLFieldsData($body);
        // Swap each [**[...]**[...]**] marker for its readable text version.
        $body = str_replace($field_data[0], $field_data[2], $body);

        return array( 'headers' => $header, 'body' => $body, 'field_data' => $field_data, 'url_fetched' => $fetched );
    }

    /**
     * If ErrorPage exists for Error Code 503 return it
     * else create it and return it
     *
     * @return ErrorPage
     */
    public static function getPermissionDeniedPage() {
        $page = ErrorPage::get()->find('ErrorCode', 503);
        if (!$page) {
            $page = ErrorPage::create(array(
                'ErrorCode' => 503,
                'Title' => 'Permission Denied'
            ));
            $page->write();
        }

        return $page;
    }
}
This check looks for parameters that have been defined for a function or method, but which are not used in the method body.