<?php

class SEOTestSiteTreeController extends Controller {

    private static $desktop_user_agent = 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36';

    private static $mobile_user_agent = 'Mozilla/5.0 (Linux; <Android Version>; <Build Tag etc.>) AppleWebKit/<WebKit Rev> (KHTML, like Gecko) Chrome/<Chrome Rev> Mobile Safari/<WebKit Rev>';

    /**
     * Array of regexes used by the crawler.
     * If a url we are about to crawl matches any pattern in here, it will be ignored.
     */
    private static $ignore_paths = array();
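
    // The private statics above are SilverStripe config properties, so they can be
    // overridden without touching this class. A minimal sketch (the file name and
    // patterns here are hypothetical, assuming the usual YAML config layout):
    //
    //   # mysite/_config/seotest.yml
    //   SEOTestSiteTreeController:
    //     ignore_paths:
    //       - '/^\/admin\//'
    //       - '/\?print=1$/'
    //     desktop_user_agent: 'MyCrawler/1.0'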

    private static $allowed_actions = array('urlsAndSettings', 'getPageData', 'getPage');

    public function init() {
        parent::init();

        Requirements::clear();

        if (!Member::currentUser() || !Permission::check('CMS_ACCESS_SEOToolboxAdmin')) {
            return $this->redirect(Security::login_url().'?BackURL=/seotest');
        }

        Requirements::css(SEOTOOLBOX_DIR.'/third-party/bootstrap/css/bootstrap.min.css');
        Requirements::css(SEOTOOLBOX_DIR.'/third-party/bootstrap/css/bootstrap-theme.min.css');
        Requirements::combine_files('seotest.css', array(
            SEOTOOLBOX_DIR.'/css/fonts/lato/lato.css',
            SEOTOOLBOX_DIR.'/css/seotest.css'
        ));

        Requirements::combine_files('seotest.js', array(
            SEOTOOLBOX_DIR.'/third-party/jquery-1.12.0.js',
            SEOTOOLBOX_DIR.'/js/crawler_painter.js',
            SEOTOOLBOX_DIR.'/js/crawler.js',
            SEOTOOLBOX_DIR.'/js/crawler_file_tester.js',
            SEOTOOLBOX_DIR.'/js/default_tests.js',
            SEOTOOLBOX_DIR.'/js/crawler_init.js'
        ));
    }

    /**
     * Curl the passed url.
     *
     * This still runs on the same domain as the admin, but curl handles
     * redirects much better than ajax, giving a more accurate result. It also
     * collects the important data into an array which is JSON-encoded and sent
     * to the JS crawler for parsing.
     *
     * Array contents:
     *  obj['headers']    = Headers returned by the curl request
     *  obj['body']       = HTML of the page
     *  obj['phrases']    = A list of sentences extracted from the DOM
     *  obj['field_data'] = preg_match result with all the html fields visible on the page
     *
     * @param SS_HTTPRequest $request
     * @return string
     */
    public function getPageData(SS_HTTPRequest $request) {
        $agent = ($request->getVar('agent') == 'mobile')
            ? $this->config()->get('mobile_user_agent')
            : $this->config()->get('desktop_user_agent');

        $curl = $this->loadPage($request->getVar('u'), $agent);
        $curl['phrases'] = $this->extractWords($curl['body']);

        Requirements::clear();
        return json_encode($curl);
    }
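
    // A sketch of how this endpoint is reached (the /seotest route is an
    // assumption based on the BackURL used in init() above):
    //   GET /seotest/getPageData?u=about-us/&agent=mobile
    // responds with JSON of the form
    //   {"headers":"...","body":"...","field_data":[...],"url_fetched":"/about-us/","phrases":["..."]}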

    /**
     * Get the page contents of the requested url.
     * This is used as a proxy so that users running the admin on a subdomain
     * still get the data from their main domain.
     *
     * @param SS_HTTPRequest $request
     * @return string
     */
    public function getPage(SS_HTTPRequest $request) {
        $agent = ($request->getVar('agent') == 'mobile')
            ? $this->config()->get('mobile_user_agent')
            : $this->config()->get('desktop_user_agent');

        $ch = $this->setupCurl($request->getVar('u'), $agent);
        $data = curl_exec($ch);
        $body = $this->getPageBody($ch, $data);
        curl_close($ch);

        Requirements::clear();

        return $body;
    }
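
    // Unlike getPageData(), this returns the raw HTML body only, e.g. (route
    // again assumed to be /seotest):
    //   GET /seotest/getPage?u=about-us/&agent=desktop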

    /**
     * Breaks down the $html provided and returns all words that have an SEO significance.
     *
     * @param string $html
     * @return array
     */
    private function extractWords($html) {
        mb_internal_encoding('UTF-8');
        // Convert numeric HTML entities (&#NNN;) into their UTF-8 characters
        $html = preg_replace_callback(
            "/(&#[0-9]+;)/",
            function($m) { return mb_convert_encoding($m[1], "UTF-8", "HTML-ENTITIES"); },
            $html
        );
        $html = str_replace(array("\n", "\r"), ' ', mb_strtolower($html));
        $phrases = array();
        $regex_find_replace = array(
            // Meta description content; the meta tags are stripped afterwards
            array(
                'find'     => '/<meta(.*?)name="(.*?)description"(.*?)content="(.*?)"(.*?)[>]/m',
                'find_pos' => 4,
                'replace'  => '/<meta(.*?)[>]/i'
            ),
            // title="" attributes on images and anchors
            array(
                'find'     => '/<(img|a)[^<]*title=(\'|")(.*?)(\'|")[^<]*[>]/m',
                'find_pos' => 3
            ),
            // alt="" attributes on images; the img tags are stripped afterwards
            array(
                'find'     => '/<img[^<]*alt=(\'|")(.*?)(\'|")[^<]*[>]/m',
                'find_pos' => 2,
                'replace'  => '/<img(.*?)[>]/i'
            ),
            // Text content of any remaining element
            array(
                'find'     => '/<(.*?)>(.*?)<\/[a-zA-Z0-9]++>/m',
                'find_pos' => 2,
            )
        );

        foreach ($regex_find_replace as $commands) {
            if (isset($commands['find'])) {
                preg_match_all($commands['find'], $html, $matches);
                array_walk($matches[$commands['find_pos']], function(&$phrase) {
                    $words = explode(' ', strip_tags($phrase));
                    array_walk($words, function(&$w) {
                        $w = trim(preg_replace('/\s+/', ' ', strip_tags($w)));
                    });
                    $phrase = preg_replace('/\s+/', ' ', implode(' ', $words));
                });
                $phrases = array_merge($phrases, $matches[$commands['find_pos']]);
            }

            if (isset($commands['replace'])) {
                $html = preg_replace($commands['replace'], ' ', $html);
            }
        }

        // Remove the empty elements
        return array_filter($phrases, function($phrase) { return strlen(trim($phrase)) > 0; });
    }
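
    // For example (a hypothetical snippet; input is lower-cased first):
    //   $this->extractWords('<h1>Hello World</h1><img alt="blue widget">')
    // returns phrases such as 'hello world' and 'blue widget'.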

    /**
     * Returns the first batch of urls the crawler will use
     * and its settings in json format.
     *
     * @param SS_HTTPRequest $request
     * @return string
     */
    public function urlsAndSettings(SS_HTTPRequest $request) {
        Requirements::clear();
        return json_encode(array(
            'urls' => Versioned::get_by_stage('SiteTree', 'Live')
                ->exclude('ClassName', 'RedirectorPage')
                ->exclude('ClassName', 'ErrorPage')
                ->map('ID', 'AbsoluteLink')
                ->toArray(),

            'settings' => array(
                'ignore_paths' => $this->config()->get('ignore_paths'),
                'crawl_id'     => GlobalAutoLinkSettings::get_current()->CrawlID
            )
        ));
    }
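
    // The JSON shape, roughly (IDs and links are illustrative):
    //   {"urls":{"1":"http://example.com/","2":"http://example.com/about-us/"},
    //    "settings":{"ignore_paths":[],"crawl_id":"..."}}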

    /**
     * Parses the data we got from curling the crawl version of the page
     * and splits the html fields into an array.
     *
     * @param string $data
     * @return array
     */
    private function getHTMLFieldsData($data) {
        preg_match_all('/\[\*\*\[(.*?)\]\*\*\[(.*?)\]\*\*\]/im', $data, $matches);
        foreach ($matches[2] as $key => $field_text) {
            // Decode the field HTML in place and keep a tag-stripped copy at index 3
            $matches[2][$key] = base64_decode($field_text);
            $matches[3][$key] = preg_replace('/[\s]+/mu', ' ', strip_tags($matches[2][$key]));
        }
        return $matches;
    }
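
    // The markers this parses look like [**[FieldName]**[base64(html)]**].
    // For a hypothetical marker [**[Content]**[PGgxPkhpPC9oMT4=]**]:
    //   $matches[1][0] = 'Content'
    //   $matches[2][0] = '<h1>Hi</h1>'   (decoded)
    //   $matches[3][0] = 'Hi'            (tags stripped)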

    /**
     * Setup a curl request.
     *
     * @param string $url
     * @param string $agent
     * @param bool $useCrawlID
     *
     * @return resource
     */
    public function setupCurl($url, $agent, $useCrawlID = false) {
        $ch = curl_init();
        // join_links avoids the double slash that absoluteBaseURL().'/'.$url would produce
        curl_setopt($ch, CURLOPT_URL, Controller::join_links(Director::absoluteBaseURL(), $url));
        curl_setopt($ch, CURLOPT_HEADER, true);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($ch, CURLOPT_MAXREDIRS, 10);
        curl_setopt($ch, CURLOPT_USERAGENT, $agent);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);
        // Note: peer verification is disabled, presumably for self-signed dev certificates
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        if ($useCrawlID) {
            $crawl_id = GlobalAutoLinkSettings::get_current()->CrawlID;
            curl_setopt($ch, CURLOPT_HTTPHEADER, array('X-Crawl-Id: '.$crawl_id));
        }
        return $ch;
    }
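
    // Usage sketch: passing true as the third argument tags the request with the
    // site-wide crawl id, e.g.
    //   $ch = $this->setupCurl('about-us/', $this->config()->get('desktop_user_agent'), true);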

    /**
     * Get the page headers from a curl response.
     *
     * Because CURLOPT_FOLLOWLOCATION is on, the header region may contain one
     * block per redirect hop; this returns the block of the final response.
     *
     * @param resource $ch
     * @param string $data
     * @return string
     */
    public function getPageHeaders($ch, $data) {
        $header_size = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
        $header = explode("\r\n\r\n", substr($data, 0, $header_size));
        array_pop($header); // Remove last element as it will always be empty
        return array_pop($header);
    }
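
    // For instance, a 301 -> 200 chain yields two header blocks:
    //   "HTTP/1.1 301 Moved Permanently\r\n...\r\n\r\nHTTP/1.1 200 OK\r\n...\r\n\r\n"
    // and this method returns the "HTTP/1.1 200 OK" block.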

    /**
     * Get the body of a curl response.
     *
     * @param resource $ch
     * @param string $data
     * @return string
     */
    public function getPageBody($ch, $data) {
        $header_size = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
        return substr($data, $header_size);
    }

    /**
     * Curl the passed $url using the X-Crawl-Id header and parse the data
     * into an array.
     *
     * @param string $url
     * @param string|null $agent
     * @return array
     */
    public function loadPage($url, $agent = null) {
        $ch = $this->setupCurl($url, $agent, true);
        $data = curl_exec($ch);
        $fetched = parse_url(curl_getinfo($ch, CURLINFO_EFFECTIVE_URL), PHP_URL_PATH);
        $header = $this->getPageHeaders($ch, $data);
        $body = preg_replace('/[\s]+/mu', ' ', $this->getPageBody($ch, $data));

        curl_close($ch);

        // getPageHeaders() returns the final response's headers, so this rejects
        // any request that did not end in a 200
        if (strpos($header, ' 200 ') === false) {
            return array('headers' => false, 'body' => false);
        }

        $field_data = $this->getHTMLFieldsData($body);
        $body = str_replace($field_data[0], $field_data[2], $body);

        return array('headers' => $header, 'body' => $body, 'field_data' => $field_data, 'url_fetched' => $fetched);
    }

    /**
     * Return the ErrorPage for error code 503, creating it first
     * if it does not exist yet.
     *
     * @return ErrorPage
     */
    public static function getPermissionDeniedPage() {
        $page = ErrorPage::get()->find('ErrorCode', 503);
        if (!$page) {
            $page = ErrorPage::create(array(
                'ErrorCode' => 503,
                'Title' => 'Permission Denied'
            ));
            $page->write();
        }

        return $page;
    }
}