Completed
Push — master ( 090c72...ff33da )
by Dylan
04:20 queued 01:54
created

SEOTestSiteTreeController::getCurlDomain()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 5
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 5
rs 9.4285
c 0
b 0
f 0
cc 2
eloc 4
nc 2
nop 0
1
<?php
2
3
class SEOTestSiteTreeController extends Controller {

    /**
     * Optional domain to curl instead of the current one.
     * Useful when the CMS runs on a subdomain but the live site lives elsewhere.
     * (SilverStripe config value.)
     *
     * @var string|null
     */
    private static $alternate_domain    = null;

    /**
     * User agent string sent when crawling as a desktop browser. (config)
     *
     * @var string
     */
    private static $desktop_user_agent  = 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36';

    /**
     * User agent string sent when crawling as a mobile browser. (config)
     *
     * @var string
     */
    private static $mobile_user_agent   = 'Mozilla/5.0 (Linux; <Android Version>; <Build Tag etc.>) AppleWebKit/<WebKit Rev> (KHTML, like Gecko) Chrome/<Chrome Rev> Mobile Safari/<WebKit Rev>';

    /**
     * Array of regex that will be used by the crawler.
     * If the url we're going to crawl matches any filter in here, it will be ignored
     *
     * @var array
     */
    private static $ignore_paths = array();

    /**
     * Actions reachable over HTTP on this controller.
     *
     * @var array
     */
    private static $allowed_actions = array('urlsAndSettings', 'getPageData', 'getPage');

    /**
     * Gate access behind CMS_ACCESS_SEOToolboxAdmin and load the
     * crawler front-end assets (bootstrap, combined css/js bundles).
     */
    public function init() {
        parent::init();

        Requirements::clear();

        if (!Member::currentUser()  || !Permission::check('CMS_ACCESS_SEOToolboxAdmin')){
            return $this->redirect(Security::login_url().'?BackURL=/seotest');
        }

        Requirements::css(SEOTOOLBOX_DIR.'/third-party/bootstrap/css/bootstrap.min.css');
        Requirements::css(SEOTOOLBOX_DIR.'/third-party/bootstrap/css/bootstrap-theme.min.css');
        Requirements::combine_files('seotest.css', array(
            SEOTOOLBOX_DIR.'/css/fonts/lato/lato.css',
            SEOTOOLBOX_DIR.'/css/seotest.css'
        ));

        Requirements::combine_files('seotest.js', array(
            SEOTOOLBOX_DIR.'/third-party/jquery-1.12.0.js',
            SEOTOOLBOX_DIR.'/js/crawler_event_handler.js',
            SEOTOOLBOX_DIR.'/js/crawler_painter.js',
            SEOTOOLBOX_DIR.'/js/crawler.js',
            SEOTOOLBOX_DIR.'/js/crawler_file_tester.js',
            SEOTOOLBOX_DIR.'/js/default_tests.js',
            SEOTOOLBOX_DIR.'/js/crawler_init.js'
        ));
    }

    /**
     * Curl the passed url.
     *
     * This is still run on the same domain as where your admin domain is located
     * however curl can deal with redirects much better then ajax therefore giving a
     * more accurate result. Also it prepares all the important data into an array object
     * which is then encoded and sent to JS for parsing
     *
     * Object Contents
     * obj['headers'] = Headers that we got back from the curl
     * obj['body'] = HTML of the page
     * obj['phrases'] = A list of sentences as extracted from the DOM
     * obj['field_data'] = A result from preg_match with all the html fields visible on the page
     *
     * @param SS_HTTPRequest $request Expects GET vars 'u' (url) and 'agent' ('mobile' or anything else for desktop)
     * @return string JSON encoded result array
     */
    public function getPageData(SS_HTTPRequest $request) {
        $agent = ($request->getVar('agent') == 'mobile')
            ? $this->config()->get('mobile_user_agent')
            : $this->config()->get('desktop_user_agent');

        $curl = $this->loadPage($request->getVar('u'), $agent);
        // loadPage() returns 'body' => false on failure; extractWords() on false yields no phrases
        $curl['phrases'] = $this->extractWords($curl['body']);

        Requirements::clear();
        return json_encode($curl);
    }

    /**
     * Get the page contents of the requested url.
     * This is used as a proxy so that users running the admin on a subdomain
     * still get the data from their main domain
     *
     * @param SS_HTTPRequest $request Expects GET vars 'u' (url) and 'agent'
     * @return string Raw response body, or empty string when curl fails
     */
    public function getPage(SS_HTTPRequest $request){
        $agent = ($request->getVar('agent') == 'mobile')
            ? $this->config()->get('mobile_user_agent')
            : $this->config()->get('desktop_user_agent');

        $ch = $this->setupCurl($request->getVar('u'), $agent);
        $data = curl_exec($ch);
        // Guard against curl failure: substr(false, ...) would warn / TypeError on modern PHP
        $body = ($data === false) ? '' : $this->getPageBody($ch, $data);
        curl_close($ch);

        Requirements::clear();

        return $body;
    }

    /**
     * Break down the $html provided and returns all words that have an SEO significance
     *
     * Significant sources: meta description content, img/a title attributes,
     * img alt attributes, and the text content of any tag pair.
     *
     * @param string $html
     * @return array Non-empty, whitespace-normalised phrases (keys not re-indexed)
     */
    private function extractWords($html) {
        mb_internal_encoding('UTF-8');
        // Decode numeric HTML entities (&#nnn;) into real UTF-8 characters.
        // NOTE(review): "HTML-ENTITIES" is deprecated for mb_convert_encoding() as of
        // PHP 8.2 — consider html_entity_decode() when the minimum PHP version rises.
        $html = preg_replace_callback(
            "/(&#[0-9]+;)/",
            function($m) {return mb_convert_encoding($m[1], "UTF-8", "HTML-ENTITIES"); },
            $html
        );
        $html = str_replace(array("\n", "\r"), ' ', mb_strtolower($html));
        $phrases = array();
        // Each entry: 'find' captures the phrase at index 'find_pos';
        // optional 'replace' strips the consumed tags so later, broader
        // patterns don't re-match the same markup.
        $regex_find_replace = array(
            array(
                'find'      => '/<meta(.*?)name="(.*?)description"(.*?)content="(.*?)"(.*?)[>]/m',
                'find_pos'  => 4,
                'replace'   => '/<meta(.*?)[>]/i'
            ),
            array(
                'find'      => '/<(img|a)[^<]*title=(\'|")(.*?)(\'|")[^<]*[>]/m',
                'find_pos'  => 3
            ),
            array(
                'find'      => '/<img[^<]*alt=(\'|")(.*?)(\'|")[^<]*[>]/m',
                'find_pos'  => 2,
                'replace'   => '/<img(.*?)[>]/i'
            ),
            array(
                'find'      => '/<(.*?)>(.*?)<\/[a-zA-Z0-9]++>/m',
                'find_pos'  => 2,
            )
        );

        foreach ($regex_find_replace as $commands) {
            if (isset($commands['find'])) {
                preg_match_all($commands['find'], $html, $matches);
                // Strip tags and collapse all runs of whitespace inside each phrase
                array_walk($matches[$commands['find_pos']], function(&$phrase) {
                    $words = explode(' ', strip_tags($phrase));
                    array_walk($words, function(&$w) {
                        $w = trim(preg_replace('/\s+/', ' ', strip_tags($w)));
                    });
                    $phrase = preg_replace('/\s+/', ' ', implode(' ', $words));
                });
                $phrases = array_merge($phrases, $matches[$commands['find_pos']]);
            }

            if (isset($commands['replace'])) {
                $html = preg_replace($commands['replace'], ' ', $html);
            }
        }

        // Remove the empty elements
        return array_filter($phrases, function($phrase) {return strlen(trim($phrase)) > 0; });
    }

    /**
     * Returns the first batch of urls the crawler will use
     * and it's settings in json format
     *
     * @param SS_HTTPRequest $request
     * @return string JSON with 'urls' (ID => AbsoluteLink) and 'settings'
     */
    public function urlsAndSettings(SS_HTTPRequest $request) {
        Requirements::clear();
        return json_encode(array(
            // Live pages only; redirector and error pages make no sense to crawl
            'urls' => Versioned::get_by_stage('SiteTree', 'Live')
                ->exclude('ClassName', 'RedirectorPage')
                ->exclude('ClassName', 'ErrorPage')
                ->map('ID', 'AbsoluteLink')
                ->toArray(),

            'settings' => array(
                'ignore_paths' => $this->config()->get('ignore_paths'),
                'crawl_id'     => GlobalAutoLinkSettings::get_current()->CrawlID
            )
        ));
    }

    /**
     * Parses the data that we got from curling the crawl version of the page
     * and splits the html fields into an array
     *
     * Matches the [**[id]**[base64 payload]**] markers embedded in the crawl
     * response. Index 2 is decoded in place, and a synthetic index 3 is added
     * holding the tag-stripped, whitespace-collapsed text of each field.
     *
     * @param string $data
     * @return array preg_match_all style matches array (with extra index 3)
     */
    private function getHTMLFieldsData($data){
        preg_match_all('/\[\*\*\[(.*?)\]\*\*\[(.*?)\]\*\*\]/im', $data, $matches);
        foreach( $matches[2] as $key => $field_text ){
            $matches[2][$key] = base64_decode($field_text);
            $matches[3][$key] = preg_replace('/[\s]+/mu', ' ', strip_tags($matches[2][$key]));
        }
        return $matches;
    }

    /**
     * Setup a curl request
     *
     * @param string    $url        Path to fetch (appended to the curl domain)
     * @param string    $agent      User agent string to send
     * @param bool      $useCrawlID Send the X-Crawl-Id header so the site serves its crawl version
     *
     * @return resource Configured curl handle (caller must curl_close it)
     */
    public function setupCurl($url, $agent, $useCrawlID = false){
        $ch = curl_init();
        curl_setopt( $ch, CURLOPT_URL, $this->getCurlURL($url) );
        curl_setopt( $ch, CURLOPT_HEADER, true );
        curl_setopt( $ch, CURLOPT_RETURNTRANSFER, true );
        curl_setopt( $ch, CURLOPT_FOLLOWLOCATION, true );
        curl_setopt( $ch, CURLOPT_MAXREDIRS, 10 );
        curl_setopt( $ch, CURLOPT_USERAGENT, $agent );
        curl_setopt( $ch, CURLOPT_CONNECTTIMEOUT, 10 );
        curl_setopt( $ch, CURLOPT_TIMEOUT, 30 );
        // NOTE(review): SSL peer verification is disabled so self-signed dev
        // certs work; this is insecure against MITM on production crawls.
        curl_setopt( $ch, CURLOPT_SSL_VERIFYPEER, false );
        if( $useCrawlID ){
            $crawl_id = GlobalAutoLinkSettings::get_current()->CrawlID;
            curl_setopt( $ch, CURLOPT_HTTPHEADER, array( 'X-Crawl-Id: '.$crawl_id ) );
        }
        return $ch;
    }

    /**
     * Return the domain to use to curl the page
     *
     * Falls back to the current absolute base URL when no
     * alternate_domain has been configured.
     *
     * @return string
     */
    public function getCurlDomain(){
        // $this->config() for consistency with the rest of the class;
        // loose truthiness check kept so an empty string also falls through.
        $alternate = $this->config()->get('alternate_domain');
        return ( $alternate != null )
            ? $alternate
            : Director::absoluteBaseURL();
    }

    /**
     * Return a url ready to be curled
     *
     * @param string $url
     * @return string Absolute url; avoids "//" when the domain has a trailing slash
     */
    public function getCurlURL($url){
        // Director::absoluteBaseURL() conventionally ends in '/'; normalise
        // both sides so we never emit "domain//path".
        $domain = rtrim($this->getCurlDomain(), '/');
        return $domain . '/' . ltrim($url, '/');
    }

    /**
     * Get the page headers from a curl response
     *
     * With CURLOPT_FOLLOWLOCATION there may be several header blocks
     * (one per redirect); this returns the last (final) one.
     *
     * @param resource  $ch
     * @param string    $data Raw response including headers
     * @return string|null Final header block, or null when none found
     */
    public function getPageHeaders($ch, $data){
        $header_size = curl_getinfo( $ch, CURLINFO_HEADER_SIZE );
        $header      = explode( "\r\n\r\n", substr( $data, 0, $header_size ) );
        array_pop( $header ); // Remove last element as it will always be empty
        return array_pop( $header );
    }

    /**
     * Get the body of a curl response
     *
     * @param resource  $ch
     * @param string    $data Raw response including headers
     * @return string
     */
    public function getPageBody($ch, $data){
        $header_size = curl_getinfo( $ch, CURLINFO_HEADER_SIZE );
        return substr( $data, $header_size );
    }

    /**
     * Curl the passed $url using the X-Crawl-ID header and parse the data
     * into an array
     *
     * @param string        $url
     * @param null|string   $agent User agent to send
     * @return array 'headers', 'body', 'field_data', 'url_fetched';
     *               'headers' and 'body' are false on failure / non-200
     */
    public function loadPage($url, $agent=null){
        $ch   = $this->setupCurl($url, $agent, true);
        $data = curl_exec($ch);

        // curl_exec() returns false on network failure — bail out before
        // string functions choke on a boolean.
        if ($data === false) {
            curl_close($ch);
            return array( 'headers' => false, 'body' => false );
        }

        $fetched = str_replace($this->getCurlDomain(), '', curl_getinfo($ch, CURLINFO_EFFECTIVE_URL));
        $header  = $this->getPageHeaders($ch, $data);
        $body    = preg_replace('/[\s]+/mu', ' ', $this->getPageBody($ch, $data));

        curl_close( $ch );

        // Only accept a final 200 status line. strpos() === false (not !strpos):
        // the truthiness form would also reject a match at position 0.
        if ( strpos( (string) $header, ' 200 ' ) === false ) {
            return array( 'headers' => false, 'body' => false );
        }

        // Replace the raw [**[..]**[..]**] field markers with their decoded text
        $field_data = $this->getHTMLFieldsData($body);
        $body = str_replace($field_data[0], $field_data[2], $body);

        return array( 'headers' => $header, 'body' => $body, 'field_data' => $field_data, 'url_fetched' => $fetched );
    }

    /**
     * If ErrorPage exists for Error Code 503 return it
     * else create it and return it
     *
     * @return ErrorPage
     */
    public static function getPermissionDeniedPage() {
        $page = ErrorPage::get()->find('ErrorCode', 503);
        if (!$page) {
            $page = ErrorPage::create(array(
                'ErrorCode' => 503,
                'Title'     => 'Permission Denied'
            ));
            $page->write();
        }

        return $page;
    }
}
313