Completed: push to develop (7852a2...d7f552), created by Dylan at 03:02

SEOTestSiteTreeController::getPageHeaders()    Rating: A

Complexity:   Conditions 1, Paths 1
Size:         Total Lines 6, Code Lines 5
Duplication:  Lines 0, Ratio 0 %
Importance:   Changes 0

Metric   Value
dl       0
loc      6
rs       9.4285
c        0
b        0
f        0
cc       1
eloc     5
nc       1
nop      2
<?php

class SEOTestSiteTreeController extends Controller {

    private static $desktop_user_agent  = 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36';
    private static $mobile_user_agent   = 'Mozilla/5.0 (Linux; <Android Version>; <Build Tag etc.>) AppleWebKit/<WebKit Rev> (KHTML, like Gecko) Chrome/<Chrome Rev> Mobile Safari/<WebKit Rev>';

    /**
     * Array of regular expressions used by the crawler.
     * If a URL we are about to crawl matches any pattern in this list, it is ignored.
     */
    private static $ignore_paths = array();

    private static $allowed_actions = array('urlsAndSettings', 'getPageData', 'getPage');

    /**
     * Check CMS permissions and load the bootstrap, crawler CSS and JS requirements.
     */
    public function init() {
        parent::init();

        Requirements::clear();

        if (!Member::currentUser() || !Permission::check('CMS_ACCESS_SEOToolboxAdmin')){
            return $this->redirect(Security::login_url().'?BackURL=/seotest');
        }

        Requirements::css(SEOTOOLBOX_DIR.'/third-party/bootstrap/css/bootstrap.min.css');
        Requirements::css(SEOTOOLBOX_DIR.'/third-party/bootstrap/css/bootstrap-theme.min.css');
        Requirements::combine_files('seotest.css', array(
            SEOTOOLBOX_DIR.'/css/fonts/lato/lato.css',
            SEOTOOLBOX_DIR.'/css/seotest.css'
        ));

        Requirements::combine_files('seotest.js', array(
            SEOTOOLBOX_DIR.'/third-party/jquery-1.12.0.js',
            SEOTOOLBOX_DIR.'/js/crawler_painter.js',
            SEOTOOLBOX_DIR.'/js/crawler.js',
            SEOTOOLBOX_DIR.'/js/crawler_file_tester.js',
            SEOTOOLBOX_DIR.'/js/default_tests.js',
            SEOTOOLBOX_DIR.'/js/crawler_init.js'
        ));
    }

    /**
     * Curl the passed url.
     *
     * This still runs on the same domain as the admin, but curl handles
     * redirects much better than ajax, giving a more accurate result. It also
     * collects all the important data into an array which is JSON encoded and
     * sent to the JS for parsing.
     *
     * Object contents:
     * obj['headers']     = Headers returned by the curl request
     * obj['body']        = HTML of the page
     * obj['phrases']     = A list of phrases extracted from the DOM
     * obj['field_data']  = A preg_match result with all the HTML fields visible on the page
     * obj['url_fetched'] = Path of the URL that was actually fetched (after redirects)
     *
     * @param SS_HTTPRequest $request
     * @return string
     */
    public function getPageData(SS_HTTPRequest $request) {
        $agent = ($request->getVar('agent') == 'mobile')
            ? $this->config()->get('mobile_user_agent')
            : $this->config()->get('desktop_user_agent');

        $curl = $this->loadPage($request->getVar('u'), $agent);
        $curl['phrases'] = $this->extractWords($curl['body']);

        Requirements::clear();
        return json_encode($curl);
    }
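
    /*
     * Illustrative example, not part of the original source: assuming the
     * controller is routed at /seotest (as the BackURL in init() suggests),
     * calling getPageData with ?u=<page url>&agent=desktop|mobile returns a
     * JSON object whose keys mirror loadPage() plus the extracted phrases:
     *
     *   {"headers": "...", "body": "...", "field_data": [...],
     *    "url_fetched": "...", "phrases": ["...", "..."]}
     *
     * If the curl did not come back with a 200 response, "headers" and "body"
     * are false, "phrases" is empty and the other keys are absent.
     */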

    /**
     * Get the page contents of the requested url.
     * This is used as a proxy so that users running the admin on a subdomain
     * still get the data from their main domain.
     *
     * @param SS_HTTPRequest $request
     * @return string
     */
    public function getPage(SS_HTTPRequest $request){
        $agent = ($request->getVar('agent') == 'mobile')
            ? $this->config()->get('mobile_user_agent')
            : $this->config()->get('desktop_user_agent');

        $ch = $this->setupCurl($request->getVar('u'), $agent);
        $data = curl_exec($ch);
        $body = $this->getPageBody($ch, $data);
        curl_close($ch);

        Requirements::clear();

        return $body;
    }

    /**
     * Breaks down the provided $html and returns all phrases that have SEO significance.
     *
     * @param string    $html
     * @return array
     */
    private function extractWords($html) {
        mb_internal_encoding('UTF-8');
        $html = preg_replace_callback(
            "/(&#[0-9]+;)/",
            function($m) {return mb_convert_encoding($m[1], "UTF-8", "HTML-ENTITIES"); },
            $html
        );
        $html = str_replace(array("\n", "\r"), ' ', mb_strtolower($html));
        $phrases = array();
        $regex_find_replace = array(
            array(
                'find'      => '/<meta(.*?)name="(.*?)description"(.*?)content="(.*?)"(.*?)[>]/m',
                'find_pos'  => 4,
                'replace'   => '/<meta(.*?)[>]/i'
            ),
            array(
                'find'      => '/<(img|a)[^<]*title=(\'|")(.*?)(\'|")[^<]*[>]/m',
                'find_pos'  => 3
            ),
            array(
                'find'      => '/<img[^<]*alt=(\'|")(.*?)(\'|")[^<]*[>]/m',
                'find_pos'  => 2,
                'replace'   => '/<img(.*?)[>]/i'
            ),
            array(
                'find'      => '/<(.*?)>(.*?)<\/[a-zA-Z0-9]++>/m',
                'find_pos'  => 2,
            )
        );

        foreach ($regex_find_replace as $commands) {
            if (isset($commands['find'])) {
                preg_match_all($commands['find'], $html, $matches);
                array_walk($matches[$commands['find_pos']], function(&$phrase) {
                    $words = explode(' ', strip_tags($phrase));
                    array_walk($words, function(&$w) {
                        $w = trim(preg_replace('/\s+/', ' ', strip_tags($w)));
                    });
                    $phrase = preg_replace('/\s+/', ' ', implode(' ', $words));
                });
                $phrases = array_merge($phrases, $matches[$commands['find_pos']]);
            }

            if (isset($commands['replace'])) {
                $html = preg_replace($commands['replace'], ' ', $html);
            }
        }

        // Remove the empty elements
        return array_filter($phrases, function($phrase) {return strlen(trim($phrase)) > 0; });
    }
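
    /*
     * Illustrative example, not part of the original source: given markup such as
     *
     *   <meta name="description" content="Best Widgets">
     *   <h1>Our Widgets</h1>
     *   <img src="w.jpg" alt="Widget photo">
     *
     * extractWords() would return roughly
     *
     *   array('best widgets', 'widget photo', 'our widgets')
     *
     * since the document is lower-cased before the patterns above pull phrases
     * out of meta descriptions, title/alt attributes and tag bodies.
     */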

    /**
     * Returns the first batch of urls the crawler will use
     * and its settings, in JSON format.
     *
     * @param SS_HTTPRequest $request
     * @return string
     */
    public function urlsAndSettings(SS_HTTPRequest $request) {
        Requirements::clear();
        return json_encode(array(
            'urls' => Versioned::get_by_stage('SiteTree', 'Live')
                ->exclude('ClassName', 'RedirectorPage')
                ->exclude('ClassName', 'ErrorPage')
                ->map('ID', 'AbsoluteLink')
                ->toArray(),

            'settings' => array(
                'ignore_paths' => $this->config()->get('ignore_paths'),
                'crawl_id'     => GlobalAutoLinkSettings::get_current()->CrawlID
            )
        ));
    }

    /**
     * Parses the data returned from curling the crawl version of the page
     * and splits the HTML fields it contains into an array.
     *
     * @param string $data
     * @return array
     */
    private function getHTMLFieldsData($data){
        preg_match_all('/\[\*\*\[(.*?)\]\*\*\[(.*?)\]\*\*\]/im', $data, $matches);
        foreach( $matches[2] as $key => $field_text ){
            $matches[2][$key] = base64_decode($field_text);
            // Add a third entry holding the decoded text with tags stripped and whitespace collapsed.
            $matches[3][$key] = preg_replace('/[\s]+/mu', ' ', strip_tags($matches[2][$key]));
        }
        return $matches;
    }
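
    /*
     * Illustrative example, not part of the original source: the crawl version
     * of a page is expected to wrap each HTML field in a placeholder of the form
     *
     *   [**[<identifier>]**[<base64-encoded field HTML>]**]
     *
     * so $matches[0] holds the full placeholders, $matches[1] the identifiers,
     * $matches[2] the decoded HTML, and the extra $matches[3] entry added in the
     * loop holds that HTML with tags stripped and whitespace collapsed.
     */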

    /**
     * Setup a curl request.
     *
     * @param string    $url
     * @param string    $agent
     * @param bool      $useCrawlID
     *
     * @return resource
     */
    public function setupCurl($url, $agent, $useCrawlID = false){
        $ch = curl_init();
        curl_setopt( $ch, CURLOPT_URL, Director::absoluteBaseURL().'/'.$url );
        curl_setopt( $ch, CURLOPT_HEADER, true );
        curl_setopt( $ch, CURLOPT_RETURNTRANSFER, true );
        curl_setopt( $ch, CURLOPT_FOLLOWLOCATION, true );
        curl_setopt( $ch, CURLOPT_MAXREDIRS, 10 );
        curl_setopt( $ch, CURLOPT_USERAGENT, $agent );
        curl_setopt( $ch, CURLOPT_CONNECTTIMEOUT, 10 );
        curl_setopt( $ch, CURLOPT_TIMEOUT, 30 );
        curl_setopt( $ch, CURLOPT_SSL_VERIFYPEER, false );
        if( $useCrawlID ){
            $crawl_id = GlobalAutoLinkSettings::get_current()->CrawlID;
            curl_setopt( $ch, CURLOPT_HTTPHEADER, array( 'X-Crawl-Id: '.$crawl_id ) );
        }
        return $ch;
    }

    /**
     * Get the page headers from a curl response.
     *
     * @param resource  $ch
     * @param string    $data
     * @return string
     */
    public function getPageHeaders($ch, $data){
        $header_size    = curl_getinfo( $ch, CURLINFO_HEADER_SIZE );
        $header         = explode( "\r\n\r\n", substr( $data, 0, $header_size ) );
        array_pop( $header ); // Remove the last element as it will always be empty
        return array_pop( $header ); // Return the header block of the final response (after any redirects)
    }

    /**
     * Get the body of a curl response.
     *
     * @param resource  $ch
     * @param string    $data
     * @return string
     */
    public function getPageBody($ch, $data){
        $header_size = curl_getinfo( $ch, CURLINFO_HEADER_SIZE );
        return substr( $data, $header_size );
    }

    /**
     * Curl the passed $url using the X-Crawl-ID header and parse the data
     * into an array.
     *
     * @param string        $url
     * @param null|string   $agent
     * @return array
     */
    public function loadPage($url, $agent=null){
        $ch         = $this->setupCurl($url, $agent, true);
        $data       = curl_exec($ch);
        $fetched    = parse_url(curl_getinfo($ch, CURLINFO_EFFECTIVE_URL), PHP_URL_PATH);
        $header     = $this->getPageHeaders($ch, $data);
        $body       = preg_replace('/[\s]+/mu', ' ', $this->getPageBody($ch, $data));

        curl_close( $ch );

        if( strpos( $header, ' 200 ' ) === false ) {
            return array( 'headers' => false, 'body' => false );
        }

        $field_data = $this->getHTMLFieldsData($body);
        $body = str_replace($field_data[0], $field_data[2], $body);

        return array( 'headers' => $header, 'body' => $body, 'field_data' => $field_data, 'url_fetched' => $fetched );
    }

    /**
     * If an ErrorPage for error code 503 exists, return it;
     * otherwise create it and return it.
     *
     * @return ErrorPage
     */
    public static function getPermissionDeniedPage() {
        $page = ErrorPage::get()->find('ErrorCode', 503);
        if (!$page) {
            $page = ErrorPage::create(array(
                'ErrorCode' => 503,
                'Title'     => 'Permission Denied'
            ));
            $page->write();
        }

        return $page;
    }
}
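
The user agent strings and $ignore_paths above are ordinary SilverStripe config statics, so they can be tuned per project without editing this controller. A minimal sketch, assuming SilverStripe 3's Config API and using purely hypothetical patterns and values, of what that could look like in a project's _config.php:

    // Hypothetical project-level overrides for the crawler settings (sketch only).
    Config::inst()->update('SEOTestSiteTreeController', 'ignore_paths', array(
        '/^\/admin\//'   // assumption: skip CMS URLs during the crawl
    ));
    Config::inst()->update(
        'SEOTestSiteTreeController',
        'desktop_user_agent',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0 Safari/537.36'
    );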