<?php

class SEOTestSiteTreeController extends Controller {

    private static $alternate_domain = null;

    private static $desktop_user_agent = 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36';

    private static $mobile_user_agent = 'Mozilla/5.0 (Linux; <Android Version>; <Build Tag etc.>) AppleWebKit/<WebKit Rev> (KHTML, like Gecko) Chrome/<Chrome Rev> Mobile Safari/<WebKit Rev>';

    /**
     * Array of regexes used by the crawler.
     * If a URL about to be crawled matches any filter in this list, it is ignored.
     */
    private static $ignore_paths = array();

    private static $allowed_actions = array('urlsAndSettings', 'getPageData', 'getPage');

    public function init() {
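        // The original method body is elided in this extract. What follows is
        // a hypothetical sketch, assuming the controller is locked down to
        // administrators: Permission::check() and Security::permissionFailure()
        // are standard SilverStripe 3 API, but their use here is an assumption.
        parent::init();

        if (!Permission::check('ADMIN')) {
            Security::permissionFailure($this);
        }
    }
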
    /**
     * Curl the passed URL.
     *
     * This still runs on the same domain as the admin, but curl can deal with
     * redirects much better than AJAX, giving a more accurate result. It also
     * gathers all the important data into an array, which is then encoded and
     * sent to the JavaScript layer for parsing.
     *
     * Object contents:
     * obj['header']     = headers returned by the curl request
     * obj['body']       = HTML of the page
     * obj['phrases']    = a list of sentences extracted from the DOM
     * obj['field_data'] = a preg_match result with all the HTML fields visible on the page
     *
     * @param SS_HTTPRequest $request
     * @return string
     */
    public function getPageData(SS_HTTPRequest $request) {
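        // The original body is elided in this extract. A hypothetical sketch
        // based on the docblock above: curl the requested URL via loadPage()
        // and hand the resulting array to the JS layer as JSON. The 'url' and
        // 'agent' GET variable names are assumptions.
        $url   = $request->getVar('url');
        $agent = $request->getVar('agent');

        return Convert::raw2json($this->loadPage($url, $agent));
    }
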
    /**
     * Get the page contents of the requested URL.
     * This is used as a proxy so that users running the admin on a subdomain
     * still get the data from their main domain.
     *
     * @param SS_HTTPRequest $request
     * @return string
     */
    public function getPage(SS_HTTPRequest $request) {
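        // Elided in this extract. A hypothetical sketch of the proxy described
        // above: curl the page on the main domain and return the raw body.
        // The 'url' GET variable name is an assumption.
        $url = $this->getCurlURL($request->getVar('url'));
        $ch  = $this->setupCurl($url, Config::inst()->get('SEOTestSiteTreeController', 'desktop_user_agent'));

        $data = curl_exec($ch);
        $body = $this->getPageBody($ch, $data);
        curl_close($ch);

        return $body;
    }
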
    /**
     * Break down the $html provided and return all words that have SEO
     * significance.
     *
     * @param string $html
     * @return array
     */
    private function extractWords($html) {
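        // Elided in this extract. A hypothetical sketch: strip the markup,
        // split the visible text into lowercase words and drop anything too
        // short to carry SEO weight. The three-character cut-off is an
        // assumption, not the module's documented rule.
        $text  = strip_tags($html);
        $words = preg_split('/\W+/', strtolower($text), -1, PREG_SPLIT_NO_EMPTY);

        return array_values(array_filter($words, function ($word) {
            return strlen($word) > 3;
        }));
    }
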
    /**
     * Returns the first batch of URLs the crawler will use,
     * along with its settings, in JSON format.
     *
     * @param SS_HTTPRequest $request
     * @return string
     */
    public function urlsAndSettings(SS_HTTPRequest $request) {
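        // Elided in this extract. A hypothetical sketch assembling the payload
        // the docblock describes; the exact keys sent to the JS crawler are
        // assumptions.
        $data = array(
            'urls'         => SiteTree::get()->column('URLSegment'),
            'ignore_paths' => Config::inst()->get('SEOTestSiteTreeController', 'ignore_paths'),
        );

        return Convert::raw2json($data);
    }
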
    /**
     * Parses the data we got from curling the crawl version of the page
     * and splits the HTML fields into an array.
     *
     * @param string $data
     * @return array
     */
    private function getHTMLFieldsData($data) {
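        // Elided in this extract. A hypothetical sketch matching the docblock
        // on getPageData() ("a preg_match result with all the HTML fields
        // visible on the page"); the exact pattern is an assumption.
        preg_match_all('/<(input|textarea|select)[^>]*>/i', $data, $matches);

        return $matches;
    }
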
    /**
     * Set up a curl request.
     *
     * @param string $url
     * @param string $agent
     * @param bool $useCrawlID
     *
     * @return resource
     */
    public function setupCurl($url, $agent, $useCrawlID = false) {
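        // Elided in this extract. A hypothetical sketch of a standard curl
        // set-up. CURLOPT_HEADER must be enabled for getPageHeaders() and
        // getPageBody() below to split the response on CURLINFO_HEADER_SIZE;
        // the session-based X-Crawl-ID value is an assumption.
        $ch = curl_init($url);

        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($ch, CURLOPT_HEADER, true);
        curl_setopt($ch, CURLOPT_USERAGENT, $agent);

        if ($useCrawlID) {
            curl_setopt($ch, CURLOPT_HTTPHEADER, array('X-Crawl-ID: ' . session_id()));
        }

        return $ch;
    }
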
    /**
     * Return the domain to use to curl the page.
     *
     * @return string
     */
    public function getCurlDomain() {
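        // Elided in this extract. A hypothetical sketch: prefer the configured
        // alternate domain, otherwise fall back to the current protocol and host.
        $alternate = Config::inst()->get('SEOTestSiteTreeController', 'alternate_domain');

        return $alternate ? $alternate : Director::protocolAndHost();
    }
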
    /**
     * Return a URL ready to be curled.
     *
     * @param string $url
     * @return string
     */
    public function getCurlURL($url) {
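        // Elided in this extract. A hypothetical sketch: join the relative
        // $url onto the curl domain resolved above.
        return Controller::join_links($this->getCurlDomain(), $url);
    }
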
    /**
     * Get the page headers from a curl response.
     *
     * @param resource $ch
     * @param string $data
     * @return string
     */
    public function getPageHeaders($ch, $data) {
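        // Elided in this extract. A sketch of the usual CURLOPT_HEADER split:
        // the first CURLINFO_HEADER_SIZE bytes of the response are the headers.
        $headerSize = curl_getinfo($ch, CURLINFO_HEADER_SIZE);

        return substr($data, 0, $headerSize);
    }
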
    /**
     * Get the body of a curl response.
     *
     * @param resource $ch
     * @param string $data
     * @return string
     */
    public function getPageBody($ch, $data) {
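        // Elided in this extract. The counterpart of getPageHeaders(): the
        // body is everything after the header block.
        $headerSize = curl_getinfo($ch, CURLINFO_HEADER_SIZE);

        return substr($data, $headerSize);
    }
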
    /**
     * Curl the passed $url using the X-Crawl-ID header and parse the data
     * into an array.
     *
     * @param string $url
     * @param null|string $agent
     * @return array
     */
    public function loadPage($url, $agent = null) {
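        // Elided in this extract. A hypothetical sketch tying the helpers
        // together, shaped after the object layout documented on getPageData();
        // the exact wiring is an assumption.
        if ($agent === null) {
            $agent = Config::inst()->get('SEOTestSiteTreeController', 'desktop_user_agent');
        }

        $ch   = $this->setupCurl($this->getCurlURL($url), $agent, true);
        $data = curl_exec($ch);
        $body = $this->getPageBody($ch, $data);

        $result = array(
            'header'     => $this->getPageHeaders($ch, $data),
            'body'       => $body,
            'phrases'    => $this->extractWords($body),
            'field_data' => $this->getHTMLFieldsData($body),
        );

        curl_close($ch);

        return $result;
    }
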
    /**
     * If an ErrorPage exists for error code 503, return it;
     * otherwise create it and return it.
     *
     * @return ErrorPage
     */
    public static function getPermissionDeniedPage() {
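        // Elided in this extract. A hypothetical sketch of the documented
        // behaviour: fetch the 503 ErrorPage, creating and publishing one if
        // none exists yet. ErrorPage and doPublish() are standard SS3 API;
        // the page title is an assumption.
        $page = ErrorPage::get()->filter('ErrorCode', 503)->first();

        if (!$page) {
            $page = ErrorPage::create();
            $page->ErrorCode = 503;
            $page->Title     = 'Permission Denied';
            $page->write();
            $page->doPublish();
        }

        return $page;
    }
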
}