Completed
Push — master ( 090c72...ff33da )
by Dylan
04:20 queued 01:54
created

SEOTestSiteTreeController::getCurlDomain()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 5
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 5
rs 9.4285
c 0
b 0
f 0
cc 2
eloc 4
nc 2
nop 0
1
<?php
2
3
class SEOTestSiteTreeController extends Controller {

    /**
     * Optional domain to curl instead of the current one.
     * Useful when the CMS runs on a subdomain but the live site lives elsewhere.
     * (SilverStripe config value.)
     *
     * @var string|null
     */
    private static $alternate_domain    = null;

    /**
     * User agent string sent when crawling as a desktop browser. (config)
     *
     * @var string
     */
    private static $desktop_user_agent  = 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36';

    /**
     * User agent string sent when crawling as a mobile browser. (config)
     *
     * @var string
     */
    private static $mobile_user_agent   = 'Mozilla/5.0 (Linux; <Android Version>; <Build Tag etc.>) AppleWebKit/<WebKit Rev> (KHTML, like Gecko) Chrome/<Chrome Rev> Mobile Safari/<WebKit Rev>';

    /**
     * Array of regex that will be used by the crawler.
     * If the url we're going to crawl matches any filter in here, it will be ignored
     *
     * @var array
     */
    private static $ignore_paths = array();

    /**
     * Actions reachable over HTTP on this controller.
     *
     * @var array
     */
    private static $allowed_actions = array('urlsAndSettings', 'getPageData', 'getPage');

    /**
     * Gate access behind CMS_ACCESS_SEOToolboxAdmin and load the
     * crawler front-end assets (bootstrap, combined css/js bundles).
     */
    public function init() {
        parent::init();

        Requirements::clear();

        if (!Member::currentUser()  || !Permission::check('CMS_ACCESS_SEOToolboxAdmin')){
            return $this->redirect(Security::login_url().'?BackURL=/seotest');
        }

        Requirements::css(SEOTOOLBOX_DIR.'/third-party/bootstrap/css/bootstrap.min.css');
        Requirements::css(SEOTOOLBOX_DIR.'/third-party/bootstrap/css/bootstrap-theme.min.css');
        Requirements::combine_files('seotest.css', array(
            SEOTOOLBOX_DIR.'/css/fonts/lato/lato.css',
            SEOTOOLBOX_DIR.'/css/seotest.css'
        ));

        Requirements::combine_files('seotest.js', array(
            SEOTOOLBOX_DIR.'/third-party/jquery-1.12.0.js',
            SEOTOOLBOX_DIR.'/js/crawler_event_handler.js',
            SEOTOOLBOX_DIR.'/js/crawler_painter.js',
            SEOTOOLBOX_DIR.'/js/crawler.js',
            SEOTOOLBOX_DIR.'/js/crawler_file_tester.js',
            SEOTOOLBOX_DIR.'/js/default_tests.js',
            SEOTOOLBOX_DIR.'/js/crawler_init.js'
        ));
    }

    /**
     * Curl the passed url.
     *
     * This is still run on the same domain as where your admin domain is located
     * however curl can deal with redirects much better then ajax therefore giving a
     * more accurate result. Also it prepares all the important data into an array object
     * which is then encoded and sent to JS for parsing
     *
     * Object Contents
     * obj['headers'] = Headers that we got back from the curl
     * obj['body'] = HTML of the page
     * obj['phrases'] = A list of sentences as extracted from the DOM
     * obj['field_data'] = A result from preg_match with all the html fields visible on the page
     *
     * @param SS_HTTPRequest $request Expects GET vars 'u' (url) and 'agent' ('mobile' or anything else for desktop)
     * @return string JSON encoded result array
     */
    public function getPageData(SS_HTTPRequest $request) {
        $agent = ($request->getVar('agent') == 'mobile')
            ? $this->config()->get('mobile_user_agent')
            : $this->config()->get('desktop_user_agent');

        $curl = $this->loadPage($request->getVar('u'), $agent);
        // loadPage() returns 'body' => false on failure; extractWords() on false yields no phrases
        $curl['phrases'] = $this->extractWords($curl['body']);

        Requirements::clear();
        return json_encode($curl);
    }

    /**
     * Get the page contents of the requested url.
     * This is used as a proxy so that users running the admin on a subdomain
     * still get the data from their main domain
     *
     * @param SS_HTTPRequest $request Expects GET vars 'u' (url) and 'agent'
     * @return string Raw response body, or empty string when curl fails
     */
    public function getPage(SS_HTTPRequest $request){
        $agent = ($request->getVar('agent') == 'mobile')
            ? $this->config()->get('mobile_user_agent')
            : $this->config()->get('desktop_user_agent');

        $ch = $this->setupCurl($request->getVar('u'), $agent);
        $data = curl_exec($ch);
        // Guard against curl failure: substr(false, ...) would warn / TypeError on modern PHP
        $body = ($data === false) ? '' : $this->getPageBody($ch, $data);
        curl_close($ch);

        Requirements::clear();

        return $body;
    }

    /**
     * Break down the $html provided and returns all words that have an SEO significance
     *
     * Significant sources: meta description content, img/a title attributes,
     * img alt attributes, and the text content of any tag pair.
     *
     * @param string $html
     * @return array Non-empty, whitespace-normalised phrases (keys not re-indexed)
     */
    private function extractWords($html) {
        mb_internal_encoding('UTF-8');
        // Decode numeric HTML entities (&#nnn;) into real UTF-8 characters.
        // NOTE(review): "HTML-ENTITIES" is deprecated for mb_convert_encoding() as of
        // PHP 8.2 — consider html_entity_decode() when the minimum PHP version rises.
        $html = preg_replace_callback(
            "/(&#[0-9]+;)/",
            function($m) {return mb_convert_encoding($m[1], "UTF-8", "HTML-ENTITIES"); },
            $html
        );
        $html = str_replace(array("\n", "\r"), ' ', mb_strtolower($html));
        $phrases = array();
        // Each entry: 'find' captures the phrase at index 'find_pos';
        // optional 'replace' strips the consumed tags so later, broader
        // patterns don't re-match the same markup.
        $regex_find_replace = array(
            array(
                'find'      => '/<meta(.*?)name="(.*?)description"(.*?)content="(.*?)"(.*?)[>]/m',
                'find_pos'  => 4,
                'replace'   => '/<meta(.*?)[>]/i'
            ),
            array(
                'find'      => '/<(img|a)[^<]*title=(\'|")(.*?)(\'|")[^<]*[>]/m',
                'find_pos'  => 3
            ),
            array(
                'find'      => '/<img[^<]*alt=(\'|")(.*?)(\'|")[^<]*[>]/m',
                'find_pos'  => 2,
                'replace'   => '/<img(.*?)[>]/i'
            ),
            array(
                'find'      => '/<(.*?)>(.*?)<\/[a-zA-Z0-9]++>/m',
                'find_pos'  => 2,
            )
        );

        foreach ($regex_find_replace as $commands) {
            if (isset($commands['find'])) {
                preg_match_all($commands['find'], $html, $matches);
                // Strip tags and collapse all runs of whitespace inside each phrase
                array_walk($matches[$commands['find_pos']], function(&$phrase) {
                    $words = explode(' ', strip_tags($phrase));
                    array_walk($words, function(&$w) {
                        $w = trim(preg_replace('/\s+/', ' ', strip_tags($w)));
                    });
                    $phrase = preg_replace('/\s+/', ' ', implode(' ', $words));
                });
                $phrases = array_merge($phrases, $matches[$commands['find_pos']]);
            }

            if (isset($commands['replace'])) {
                $html = preg_replace($commands['replace'], ' ', $html);
            }
        }

        // Remove the empty elements
        return array_filter($phrases, function($phrase) {return strlen(trim($phrase)) > 0; });
    }

    /**
     * Returns the first batch of urls the crawler will use
     * and it's settings in json format
     *
     * @param SS_HTTPRequest $request
     * @return string JSON with 'urls' (ID => AbsoluteLink) and 'settings'
     */
    public function urlsAndSettings(SS_HTTPRequest $request) {
        Requirements::clear();
        return json_encode(array(
            // Live pages only; redirector and error pages make no sense to crawl
            'urls' => Versioned::get_by_stage('SiteTree', 'Live')
                ->exclude('ClassName', 'RedirectorPage')
                ->exclude('ClassName', 'ErrorPage')
                ->map('ID', 'AbsoluteLink')
                ->toArray(),

            'settings' => array(
                'ignore_paths' => $this->config()->get('ignore_paths'),
                'crawl_id'     => GlobalAutoLinkSettings::get_current()->CrawlID
            )
        ));
    }

    /**
     * Parses the data that we got from curling the crawl version of the page
     * and splits the html fields into an array
     *
     * Matches the [**[id]**[base64 payload]**] markers embedded in the crawl
     * response. Index 2 is decoded in place, and a synthetic index 3 is added
     * holding the tag-stripped, whitespace-collapsed text of each field.
     *
     * @param string $data
     * @return array preg_match_all style matches array (with extra index 3)
     */
    private function getHTMLFieldsData($data){
        preg_match_all('/\[\*\*\[(.*?)\]\*\*\[(.*?)\]\*\*\]/im', $data, $matches);
        foreach( $matches[2] as $key => $field_text ){
            $matches[2][$key] = base64_decode($field_text);
            $matches[3][$key] = preg_replace('/[\s]+/mu', ' ', strip_tags($matches[2][$key]));
        }
        return $matches;
    }

    /**
     * Setup a curl request
     *
     * @param string    $url        Path to fetch (appended to the curl domain)
     * @param string    $agent      User agent string to send
     * @param bool      $useCrawlID Send the X-Crawl-Id header so the site serves its crawl version
     *
     * @return resource Configured curl handle (caller must curl_close it)
     */
    public function setupCurl($url, $agent, $useCrawlID = false){
        $ch = curl_init();
        curl_setopt( $ch, CURLOPT_URL, $this->getCurlURL($url) );
        curl_setopt( $ch, CURLOPT_HEADER, true );
        curl_setopt( $ch, CURLOPT_RETURNTRANSFER, true );
        curl_setopt( $ch, CURLOPT_FOLLOWLOCATION, true );
        curl_setopt( $ch, CURLOPT_MAXREDIRS, 10 );
        curl_setopt( $ch, CURLOPT_USERAGENT, $agent );
        curl_setopt( $ch, CURLOPT_CONNECTTIMEOUT, 10 );
        curl_setopt( $ch, CURLOPT_TIMEOUT, 30 );
        // NOTE(review): SSL peer verification is disabled so self-signed dev
        // certs work; this is insecure against MITM on production crawls.
        curl_setopt( $ch, CURLOPT_SSL_VERIFYPEER, false );
        if( $useCrawlID ){
            $crawl_id = GlobalAutoLinkSettings::get_current()->CrawlID;
            curl_setopt( $ch, CURLOPT_HTTPHEADER, array( 'X-Crawl-Id: '.$crawl_id ) );
        }
        return $ch;
    }

    /**
     * Return the domain to use to curl the page
     *
     * Falls back to the current absolute base URL when no
     * alternate_domain has been configured.
     *
     * @return string
     */
    public function getCurlDomain(){
        // $this->config() for consistency with the rest of the class;
        // loose truthiness check kept so an empty string also falls through.
        $alternate = $this->config()->get('alternate_domain');
        return ( $alternate != null )
            ? $alternate
            : Director::absoluteBaseURL();
    }

    /**
     * Return a url ready to be curled
     *
     * @param string $url
     * @return string Absolute url; avoids "//" when the domain has a trailing slash
     */
    public function getCurlURL($url){
        // Director::absoluteBaseURL() conventionally ends in '/'; normalise
        // both sides so we never emit "domain//path".
        $domain = rtrim($this->getCurlDomain(), '/');
        return $domain . '/' . ltrim($url, '/');
    }

    /**
     * Get the page headers from a curl response
     *
     * With CURLOPT_FOLLOWLOCATION there may be several header blocks
     * (one per redirect); this returns the last (final) one.
     *
     * @param resource  $ch
     * @param string    $data Raw response including headers
     * @return string|null Final header block, or null when none found
     */
    public function getPageHeaders($ch, $data){
        $header_size = curl_getinfo( $ch, CURLINFO_HEADER_SIZE );
        $header      = explode( "\r\n\r\n", substr( $data, 0, $header_size ) );
        array_pop( $header ); // Remove last element as it will always be empty
        return array_pop( $header );
    }

    /**
     * Get the body of a curl response
     *
     * @param resource  $ch
     * @param string    $data Raw response including headers
     * @return string
     */
    public function getPageBody($ch, $data){
        $header_size = curl_getinfo( $ch, CURLINFO_HEADER_SIZE );
        return substr( $data, $header_size );
    }

    /**
     * Curl the passed $url using the X-Crawl-ID header and parse the data
     * into an array
     *
     * @param string        $url
     * @param null|string   $agent User agent to send
     * @return array 'headers', 'body', 'field_data', 'url_fetched';
     *               'headers' and 'body' are false on failure / non-200
     */
    public function loadPage($url, $agent=null){
        $ch   = $this->setupCurl($url, $agent, true);
        $data = curl_exec($ch);

        // curl_exec() returns false on network failure — bail out before
        // string functions choke on a boolean.
        if ($data === false) {
            curl_close($ch);
            return array( 'headers' => false, 'body' => false );
        }

        $fetched = str_replace($this->getCurlDomain(), '', curl_getinfo($ch, CURLINFO_EFFECTIVE_URL));
        $header  = $this->getPageHeaders($ch, $data);
        $body    = preg_replace('/[\s]+/mu', ' ', $this->getPageBody($ch, $data));

        curl_close( $ch );

        // Only accept a final 200 status line. strpos() === false (not !strpos):
        // the truthiness form would also reject a match at position 0.
        if ( strpos( (string) $header, ' 200 ' ) === false ) {
            return array( 'headers' => false, 'body' => false );
        }

        // Replace the raw [**[..]**[..]**] field markers with their decoded text
        $field_data = $this->getHTMLFieldsData($body);
        $body = str_replace($field_data[0], $field_data[2], $body);

        return array( 'headers' => $header, 'body' => $body, 'field_data' => $field_data, 'url_fetched' => $fetched );
    }

    /**
     * If ErrorPage exists for Error Code 503 return it
     * else create it and return it
     *
     * @return ErrorPage
     */
    public static function getPermissionDeniedPage() {
        $page = ErrorPage::get()->find('ErrorCode', 503);
        if (!$page) {
            $page = ErrorPage::create(array(
                'ErrorCode' => 503,
                'Title'     => 'Permission Denied'
            ));
            $page->write();
        }

        return $page;
    }
}
313