1 | <?php |
||
3 | class SEOTestSiteTreeController extends Controller { |
||
4 | |||
5 | private static $desktop_user_agent = 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36'; |
||
6 | private static $mobile_user_agent = 'Mozilla/5.0 (Linux; <Android Version>; <Build Tag etc.>) AppleWebKit/<WebKit Rev> (KHTML, like Gecko) Chrome/<Chrome Rev> Mobile Safari/<WebKit Rev>'; |
||
7 | |||
8 | /** |
||
9 | * Array of regular expressions used by the crawler. |
||
10 | * If the URL we're going to crawl matches any pattern in here, it will be ignored. |
||
11 | */ |
||
12 | private static $ignore_paths = array(); |
||
13 | |||
14 | private static $allowed_actions = array('urlsAndSettings', 'getPageData', 'getPage'); |
||
15 | |||
16 | public function init() { |
||
41 | |||
42 | /** |
||
43 | * Curl the passed url. |
||
44 | * |
||
45 | * This is still run on the same domain as where your admin domain is located |
||
46 | * however curl can deal with redirects much better than ajax therefore giving a |
||
47 | * more accurate result. Also it prepares all the important data into an array object |
||
48 | * which is then encoded and sent to JS for parsing |
||
49 | * |
||
50 | * Object Contents |
||
51 | * obj['header'] = Headers that we got back from the curl |
||
52 | * obj['body'] = HTML of the page |
||
53 | * obj['phrases'] = A list of sentences extracted from the DOM |
||
54 | * obj['field_data'] = A result from preg_match with all the html fields visible on the page |
||
55 | * |
||
56 | * @param SS_HTTPRequest $request |
||
57 | * @return string |
||
58 | */ |
||
59 | public function getPageData(SS_HTTPRequest $request) { |
||
70 | |||
71 | /** |
||
72 | * Get the page contents of the requested url. |
||
73 | * This is used as a proxy so that users running the admin on a subdomain |
||
74 | * still get the data from their main domain |
||
75 | * |
||
76 | * @param SS_HTTPRequest $request |
||
77 | * @return string |
||
78 | */ |
||
79 | public function getPage(SS_HTTPRequest $request){ |
||
93 | |||
94 | /** |
||
95 | * Breaks down the provided $html and returns all words that have SEO significance |
||
96 | * |
||
97 | * @param string $html |
||
98 | * @return array |
||
99 | */ |
||
100 | private function extractWords($html) { |
||
150 | |||
151 | /** |
||
152 | * Returns the first batch of urls the crawler will use |
||
153 | * and its settings in JSON format |
||
154 | * |
||
155 | * @param SS_HTTPRequest $request |
||
156 | * @return string |
||
157 | */ |
||
158 | public function urlsAndSettings(SS_HTTPRequest $request) { |
||
173 | |||
174 | /** |
||
175 | * Parses the data that we got from curling the crawl version of the page |
||
176 | * and splits the html fields into an array |
||
177 | * |
||
178 | * @param string $data |
||
179 | * @return array |
||
180 | */ |
||
181 | private function getHTMLFieldsData($data){ |
||
189 | |||
190 | /** |
||
191 | * Setup a curl request |
||
192 | * |
||
193 | * @param string $url |
||
194 | * @param string $agent |
||
195 | * @param bool $useCrawlID |
||
196 | * |
||
197 | * @return resource |
||
198 | */ |
||
199 | public function setupCurl($url, $agent, $useCrawlID = false){ |
||
216 | |||
217 | /** |
||
218 | * Get the page headers from a curl response |
||
219 | * |
||
220 | * @param resource $ch |
||
221 | * @param string $data |
||
222 | * @return string |
||
223 | */ |
||
224 | public function getPageHeaders($ch, $data){ |
||
230 | |||
231 | /** |
||
232 | * Get the body of a curl response |
||
233 | * |
||
234 | * @param resource $ch |
||
235 | * @param string $data |
||
236 | * @return string |
||
237 | */ |
||
238 | public function getPageBody($ch, $data){ |
||
242 | |||
243 | /** |
||
244 | * Curl the passed $url using the X-Crawl-ID header and parse the data |
||
245 | * into an array |
||
246 | * |
||
247 | * @param string $url |
||
248 | * @param string|null $agent |
||
249 | * @return array |
||
250 | */ |
||
251 | public function loadPage($url, $agent=null){ |
||
269 | |||
270 | /** |
||
271 | * If an ErrorPage exists for error code 503, return it; |
||
272 | * otherwise create it and return it. |
||
273 | * |
||
274 | * @return ErrorPage |
||
275 | */ |
||
276 | public static function getPermissionDeniedPage() { |
||
288 | } |
||
289 |