Harvest - Code Metrics - Inspection of "remove slow slow donwloadOnly on header checks" - PiedWeb/UrlHarvester - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( 7cb44c...7cbff6 )

by Dev

created 2019-01-16 10:17 UTC

Harvest C

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	301
Duplicated Lines	0 %

Test Coverage

Coverage

85.71%

Importance

Changes

Metric	Value
eloc	102
dl	0
loc	301
ccs	96
cts	112
cp	0.8571
rs	6.4799
c	0
b	0
f	0
wmc	54

23 Methods

Rating	Name	Size	Complexity
A	getRedirection()	9	3
A	find()	3	1
A	amIRedirectToHttps()	11	5
A	getBreadCrumb()	16	3
A	getTag()	5	2
A	getDomain()	9	2
A	getBaseUrl()	12	5
A	getResponse()	3	1
A	isIndexable()	3	1
A	isCanonicalCorrect()	5	2
A	getDom()	8	2
A	getUniqueTag()	8	3
A	fromUrl()	12	2
A	__construct()	3	1
A	getRatioTxtCode()	6	2
A	findOne()	3	1
A	getMeta()	5	3
A	getCanonical()	5	3
A	getKws()	10	1
A	getRobotsTxt()	24	5
A	getDomainAndScheme()	7	2
A	setRobotsTxt()	5	3
A	getDomainAndSchemeFrom()	5	1

How to fix Complexity

<?php

namespace PiedWeb\UrlHarvester;

use PiedWeb\Curl\Response;
use PiedWeb\TextAnalyzer\Analyzer as TextAnalyzer;
use phpuri;
use simple_html_dom;
use Spatie\Robots\RobotsTxt;
use PiedWeb\Curl\Request as CurlRequest;

/**
 * Wraps an HTTP response and exposes helpers to inspect the harvested page:
 * DOM lookups, canonical/redirect checks, breadcrumb, keywords, robots.txt...
 */
class Harvest
{
    use HarvestLinksTrait;

    public const LINK_SELF = 1;
    public const LINK_INTERNAL = 2;
    public const LINK_SUB = 3;
    public const LINK_EXTERNAL = 4;

    /**
     * @var Response
     */
    protected $response;

    /**
     * Lazily built by getDom().
     *
     * @var simple_html_dom|null
     */
    protected $dom;

    /** @var string|null Lazily resolved by getBaseUrl() */
    protected $baseUrl;

    /** @var string|null Lazily resolved by getDomain() */
    protected $domain;

    /** @var RobotsTxt|string|null Null until loaded; empty string when there is no usable robots.txt */
    protected $robotsTxt;

    /** @var string|null Lazily resolved by getDomainAndScheme() */
    private $domainWithScheme;

    /**
     * Request an URL and wrap the response in a Harvest instance.
     *
     * @param string $url
     * @param string $userAgent
     * @param string $language  value forwarded to Request::make() as the language preference
     *
     * @return self|int a Harvest when Request::make() produced a Response, otherwise whatever
     *                  Request::make() returned (presumably an error code - confirm against Request)
     */
    public static function fromUrl(
        string $url,
        string $userAgent = 'SeoPocketCrawler - Open Source Bot for SEO Metrics',
        string $language = 'en,en-US;q=0.5'
    ) {
        $response = Request::make($url, $userAgent, '200;html', $language);

        if ($response instanceof Response) {
            return new self($response);
        }

        return $response;
    }

    /**
     * @param Response $response
     */
    public function __construct(Response $response)
    {
        $this->response = $response;
    }

    /**
     * @return Response
     */
    public function getResponse()
    {
        return $this->response;
    }

    /**
     * Response headers indexed by lower-cased name.
     *
     * Shared by the redirect helpers so both tolerate a response without usable headers
     * (anything that is not an array is treated as "no headers").
     *
     * @return array
     */
    private function getNormalizedHeaders(): array
    {
        $headers = $this->response->getHeaders();

        return array_change_key_case(is_array($headers) ? $headers : []);
    }

    /**
     * Resolve the `Location` header against the effective URL.
     *
     * @return mixed|false the value returned by phpUri's join(), or false when there is no `Location` header
     */
    public function getRedirection()
    {
        $headers = $this->getNormalizedHeaders();
        if (isset($headers['location'])) {
            return phpUri::parse($this->response->getEffectiveUrl())->join($headers['location']);
        }

        return false;
    }

    /**
     * Parse the response content once and memoize the resulting DOM.
     *
     * @return simple_html_dom
     */
    public function getDom()
    {
        if (null === $this->dom) {
            $this->dom = new simple_html_dom();
            $this->dom->load($this->response->getContent());
        }

        return $this->dom;
    }

    /**
     * Proxy to simple_html_dom::find() on the memoized DOM.
     */
    private function find($selector, $number = null)
    {
        return $this->getDom()->find($selector, $number);
    }

    /**
     * First element matching the selector (index 0).
     */
    private function findOne($selector)
    {
        return $this->find($selector, 0);
    }

    /**
     * Return the cleaned inner content of the first element matching a selector.
     *
     * @param string $selector
     *
     * @return string|null null when nothing matches
     */
    public function getTag($selector)
    {
        $found = $this->findOne($selector);

        return null !== $found ? Helper::clean($found->innertext) : null;
    }

    /**
     * Return the cleaned content of a tag expected to be unique in the page.
     *
     * @param string $selector
     *
     * @return string|null the content, a "<count> `<selector>` !!" warning when the tag appears
     *                     more than once, or null when it is absent
     */
    public function getUniqueTag($selector = 'title')
    {
        $found = $this->find($selector);
        if (!$found) {
            return null;
        }

        $count = count($found);
        if ($count > 1) {
            return $count.' `'.$selector.'` !!';
        }

        return Helper::clean($found[0]->innertext);
    }

    /**
     * Return the cleaned `content` attribute of a `<meta name="...">` tag.
     *
     * @param string $name
     *
     * @return string empty string when the meta or its content attribute is missing
     */
    public function getMeta(string $name)
    {
        $meta = $this->findOne('meta[name='.$name.']');

        if (null === $meta || !isset($meta->content)) {
            return '';
        }

        return Helper::clean($meta->content);
    }

    /**
     * Return the href attribute of the `link rel=canonical` tag.
     *
     * @return string|null the href (empty string when the attribute is missing),
     *                     or null when the tag does not exist
     */
    public function getCanonical()
    {
        $canonical = $this->findOne('link[rel=canonical]');

        if (null === $canonical) {
            return null;
        }

        return isset($canonical->href) ? $canonical->href : '';
    }

    /**
     * A page without a (non-empty) canonical is considered correct.
     *
     * @return bool
     */
    public function isCanonicalCorrect()
    {
        $canonical = $this->getCanonical();

        return $canonical ? $this->response->getEffectiveUrl() === $canonical : true;
    }

    /**
     * Extract the 10 first expressions found by the text analyzer on the page DOM.
     */
    public function getKws()
    {
        $kws = TextAnalyzer::get(
            $this->getDom(),
            true,   // only sentences
            1,      // no expression, just words
            0      // keep trail
        );

        return $kws->getExpressions(10);
    }

    /**
     * Ratio (in percent, rounded) between the plain text length and the cleaned HTML length.
     *
     * @return int 0 when the cleaned HTML is empty
     */
    public function getRatioTxtCode(): int
    {
        $textLength = strlen($this->getDom()->plaintext);
        $htmlLength = strlen(Helper::clean($this->response->getContent()));

        return (int) ($htmlLength > 0 ? round($textLength / $htmlLength * 100) : 0);
    }

    /**
     * Return the breadcrumb extracted from the page.
     *
     * @param string|null $separator when given and a breadcrumb array was found, the items'
     *                               clean names are joined with it and a string is returned
     *
     * @return array|string|null whatever ExtractBreadcrumb::get() returned (null if we didn't
     *                           find a breadcrumb), or the joined string when a separator is used
     */
    public function getBreadCrumb(?string $separator = null)
    {
        $breadcrumb = ExtractBreadcrumb::get(
            $this->response->getContent(),
            $this->getBaseUrl(),
            $this->response->getEffectiveUrl()
        );

        if (null !== $separator && is_array($breadcrumb)) {
            $names = array_map(static function ($item) {
                return $item->getCleanName();
            }, $breadcrumb);
            $breadcrumb = implode($separator, $names);
        }

        return $breadcrumb;
    }

    /**
     * Tell whether the `Location` header is exactly the requested URL with `http:` swapped for `https:`.
     *
     * @return string|false the https URL, or false when there is no such redirection
     */
    public function amIRedirectToHttps()
    {
        $headers = $this->getNormalizedHeaders();
        if (!isset($headers['location'])) {
            return false;
        }

        $httpsUrl = preg_replace('#^http:#', 'https:', $this->response->getUrl(), 1);

        // Strict comparison: preg_replace() yields null on failure and must never match.
        return null !== $httpsUrl && $httpsUrl === $headers['location'] ? $httpsUrl : false;
    }

    /**
     * Base URL used to resolve relative links: a valid `<base href>` when present,
     * otherwise the effective URL of the response.
     *
     * @return string
     */
    public function getBaseUrl()
    {
        if (!isset($this->baseUrl)) {
            $base = $this->findOne('base');
            $hasValidBase = null !== $base
                && isset($base->href)
                && filter_var($base->href, FILTER_VALIDATE_URL);

            $this->baseUrl = $hasValidBase ? $base->href : $this->response->getEffectiveUrl();
        }

        return $this->baseUrl;
    }

    /**
     * Registrable-looking domain of the effective URL (last two labels, or three when the
     * second-to-last is `co`/`com`, e.g. `example.co.uk`).
     *
     * @return string the matched domain; the raw host when the pattern does not match
     *                (e.g. `localhost`); an empty string when the URL has no host
     */
    public function getDomain()
    {
        if (!isset($this->domain)) {
            $host = parse_url($this->response->getEffectiveUrl(), PHP_URL_HOST);
            $host = is_string($host) ? $host : '';

            // Fall back on the host instead of reading an undefined $match[0].
            $this->domain = 1 === preg_match("/[^\.\/]+(\.com?)?\.[^\.\/]+$/", $host, $match)
                ? $match[0]
                : $host;
        }

        return $this->domain;
    }

    /**
     * @return int correspond to a const from Indexable
     */
    public function isIndexable(string $userAgent = 'googlebot')
    {
        return Indexable::isIndexable($this, $userAgent);
    }

    /**
     * Fetch (once) and return the robots.txt of the harvested host.
     *
     * @return RobotsTxt|string the parsed robots.txt, or an empty string when the request failed,
     *                          the content type is not text/plain or the file is empty
     */
    public function getRobotsTxt()
    {
        if (null === $this->robotsTxt) {
            $url = $this->getDomainAndScheme().'/robots.txt';

            $request = new CurlRequest($url);
            // NOTE(review): '0-500' looks like a download range; if it is a byte range,
            // longer robots.txt files are truncated before parsing - confirm against PiedWeb\Curl.
            $request
                ->setDefaultSpeedOptions()
                ->setDownloadOnly('0-500')
                ->setUserAgent($this->getResponse()->getRequest()->getUserAgent())
            ;
            $result = $request->exec();

            $robotsTxt = '';
            if ($result instanceof Response
                // Cast: a missing content type must not be handed to stripos() as null.
                && false !== stripos((string) $result->getContentType(), 'text/plain')
                && !empty(trim($result->getContent()))
            ) {
                $robotsTxt = new RobotsTxt($result->getContent());
            }

            $this->robotsTxt = $robotsTxt;
        }

        return $this->robotsTxt;
    }

    /**
     * Inject the robots.txt instead of letting getRobotsTxt() fetch it.
     *
     * @param RobotsTxt|string $robotsTxt a parsed instance, the raw file content,
     *                                    or an empty string for "no robots.txt"
     *
     * @return self
     */
    public function setRobotsTxt($robotsTxt)
    {
        if (is_string($robotsTxt)) {
            $robotsTxt = empty($robotsTxt) ? '' : new RobotsTxt($robotsTxt);
        }

        $this->robotsTxt = $robotsTxt;

        return $this;
    }

    /**
     * Memoized `scheme://host` of the effective URL.
     *
     * @return string
     */
    public function getDomainAndScheme()
    {
        if (null === $this->domainWithScheme) {
            $this->domainWithScheme = self::getDomainAndSchemeFrom($this->response->getEffectiveUrl());
        }

        return $this->domainWithScheme;
    }

    /**
     * Build `scheme://host` from an URL (port, path, query... are dropped).
     *
     * @param string $url
     *
     * @return string missing components (or an unparsable URL) are rendered as empty strings
     */
    public static function getDomainAndSchemeFrom(string $url)
    {
        $parts = parse_url($url);
        if (!is_array($parts)) {
            // parse_url() returns false on seriously malformed URLs.
            $parts = [];
        }

        return ($parts['scheme'] ?? '').'://'.($parts['host'] ?? '');
    }
}


1		<?php
2
3		namespace PiedWeb\UrlHarvester;
4
5		use PiedWeb\Curl\Response;
6		use PiedWeb\TextAnalyzer\Analyzer as TextAnalyzer;
7		use phpuri;
8		use simple_html_dom;
9		use Spatie\Robots\RobotsTxt;
10		use PiedWeb\Curl\Request as CurlRequest;
11
12		class Harvest
13		{
14		use HarvestLinksTrait;
15
16		const LINK_SELF = 1;
17		const LINK_INTERNAL = 2;
18		const LINK_SUB = 3;
19		const LINK_EXTERNAL = 4;
20
21		/**
22		* @var Response
23		*/
24		protected $response;
25
26		/**
27		* @var simple_html_dom
28		*/
29		protected $dom;
30
31		/** @var string */
32		protected $baseUrl;
33
34		/** @var string */
35		protected $domain;
36
37		/** @var RobotsTxt\|string (empty string) */
38		protected $robotsTxt;
39
40		/** @var string */
41		private $domainWithScheme;
42
43		/**
44		* @return self\|int
45		*/
46	12	public static function fromUrl(
47		string $url,
48		string $userAgent = 'SeoPocketCrawler - Open Source Bot for SEO Metrics',
49		string $language = 'en,en-US;q=0.5'
50		) {
51	12	$response = Request::make($url, $userAgent, '200;html', $language);
52
53	12	if ($response instanceof Response) {
54	12	return new self($response);
55		}
56
57		return $response;
58		}
59
60		/**
61		* @param Response $response
62		*/
63	12	public function __construct(Response $response)
64		{
65	12	$this->response = $response;
66	12	}
67
68	9	public function getResponse()
69		{
70	9	return $this->response;
71		}
72
73	3	public function getRedirection()
74		{
75	3	$headers = $this->response->getHeaders();
76	3	$headers = array_change_key_case($headers ? $headers : []);
77	3	if (isset($headers['location'])) {
78	3	return phpUri::parse($this->response->getEffectiveUrl())->join($headers['location']);
79		}
80
81		return false;
82		}
83
84	12	public function getDom()
85		{
86	12	if (null === $this->dom) {
87	6	$this->dom = new simple_html_dom();
88	6	$this->dom->load($this->response->getContent());
89		}
90
91	12	return $this->dom;
92		}
93
94	9	private function find($selector, $number = null)
95		{
96	9	return $this->getDom()->find($selector, $number);
97		}
98
99	9	private function findOne($selector)
100		{
101	9	return $this->find($selector, 0);
102		}
103
104		/**
105		* Return content inside a selector.
106		*
107		* @return string
108		*/
109	3	public function getTag($selector)
110		{
111	3	$found = $this->findOne($selector);
112
113	3	return null !== $found ? Helper::clean($found->innertext) : null;
114		}
115
116	3	public function getUniqueTag($selector = 'title')
117		{
118	3	$found = $this->find($selector);
119	3	if ($found) {
120	3	if (count($found) > 1) {
121		return count($found).' `'.$selector.'` !!';
122		} else {
123	3	return Helper::clean($found[0]->innertext);
124		}
125		}
126		}
127
128		/**
129		* Return content inside a meta.
130		*
131		* @return string from content attribute
132		*/
133	9	public function getMeta(string $name)
134		{
135	9	$meta = $this->findOne('meta[name='.$name.']');
136
137	9	return null !== $meta ? (isset($meta->content) ? Helper::clean($meta->content) : '') : '';
138		}
139
140		/**
141		* Renvoie le contenu de l'attribut href de la balise link rel=canonical.
142		*
143		* @return string le contenu de l'attribute href sinon NULL si la balise n'existe pas
144		*/
145	6	public function getCanonical()
146		{
147	6	$canonical = $this->findOne('link[rel=canonical]');
148
149	6	return null !== $canonical ? (isset($canonical->href) ? $canonical->href : '') : null;
150		}
151
152		/*
153		* @return bool
154		*/
155	6	public function isCanonicalCorrect()
156		{
157	6	$canonical = $this->getCanonical();
158
159	6	return $canonical ? $this->response->getEffectiveUrl() == $canonical : true;
160		}
161
162	3	public function getKws()
163		{
164	3	$kws = TextAnalyzer::get(
165	3	$this->getDom(),
166	3	true, // only sentences
167	3	1, // no expression, just words
168	3	0 // keep trail
169		);
170
171	3	return $kws->getExpressions(10);
172		}
173
174		/**
175		* @return int
176		*/
177	3	public function getRatioTxtCode(): int
178		{
179	3	$textLenght = strlen($this->getDom()->plaintext);
180	3	$htmlLenght = strlen(Helper::clean($this->response->getContent()));
181
182	3	return (int) ($htmlLenght > 0 ? round($textLenght / $htmlLenght * 100) : 0);
183		}
184
185		/**
186		* Return an array of object with two elements Link and anchor.
187		*
188		* @return array\|null if we didn't found breadcrumb
189		*/
190	3	public function getBreadCrumb(?string $separator = null)
191		{
192	3	$breadcrumb = ExtractBreadcrumb::get(
193	3	$this->response->getContent(),
194	3	$this->getBaseUrl(),
195	3	$this->response->getEffectiveUrl()
196		);
197
198	3	if (null !== $separator && is_array($breadcrumb)) {
199		$breadcrumb = array_map(function ($item) {
200		return $item->getCleanName();
201		}, $breadcrumb);
202		$breadcrumb = implode($separator, $breadcrumb);
203		}
204
205	3	return $breadcrumb;
206		}
207
208		/**
209		* @return string\|false
210		*/
211		public function amIRedirectToHttps()
212		{
213		$headers = $this->response->getHeaders();
214		$headers = array_change_key_case(null !== $headers ? $headers : []);
215		$redirUrl = isset($headers['location']) ? $headers['location'] : null;
216		$url = $this->response->getUrl();
217		if (null !== $redirUrl && ($httpsUrl = preg_replace('#^http:#', 'https:', $url, 1)) == $redirUrl) {
218		return $httpsUrl;
219		}
220
221		return false;
222		}
223
224	3	public function getBaseUrl()
225		{
226	3	if (!isset($this->baseUrl)) {
227	3	$base = $this->findOne('base');
228	3	if (null !== $base && isset($base->href) && filter_var($base->href, FILTER_VALIDATE_URL)) {
229		$this->baseUrl = $base->href;
230		} else {
231	3	$this->baseUrl = $this->response->getEffectiveUrl();
232		}
233		}
234
235	3	return $this->baseUrl;
236		}
237
238	6	public function getDomain()
239		{
240	6	if (!isset($this->domain)) {
241	6	$urlParsed = parse_url($this->response->getEffectiveUrl());
242	6	preg_match("/[^\.\/]+(\.com?)?\.[^\.\/]+$/", $urlParsed['host'], $match);
243	6	$this->domain = $match[0];
244		}
245
246	6	return $this->domain;
247		}
248
249		/**
250		* @return int correspond to a const from Indexable
251		*/
252	6	public function isIndexable(string $userAgent = 'googlebot')
253		{
254	6	return Indexable::isIndexable($this, $userAgent);
255		}
256
257		/**
258		* @return RobotsTxt\|string containing the current Robots.txt or NULL if an error occured
259		* or empty string if robots is empty file
260		*/
261	6	public function getRobotsTxt()
262		{
263	6	if (null === $this->robotsTxt) {
264	6	$url = $this->getDomainAndScheme().'/robots.txt';
265
266	6	$request = new CurlRequest($url);
267		$request
268	6	->setDefaultSpeedOptions()
269	6	->setDownloadOnly('0-500')
270	6	->setUserAgent($this->getResponse()->getRequest()->getUserAgent())
271		;
272	6	$result = $request->exec();
273
274	6	if (!$result instanceof \PiedWeb\Curl\Response
275	6	\|\| false === stripos($result->getContentType(), 'text/plain')
276	6	\|\| empty(trim($result->getContent()))
277		) {
278	3	$this->robotsTxt = '';
279		} else {
280	3	$this->robotsTxt = new RobotsTxt($result->getContent());
281		}
282		}
283
284	6	return $this->robotsTxt;
285		}
286
287		/**
288		* @param RobotsTxt\|string (empty)
		Bug (introduced 2019-01-16 06:47 UTC): The type `PiedWeb\UrlHarvester\empty` was not found. Maybe you did not declare it correctly or list all dependencies? The issue could also be caused by a filter entry in the build configuration. If the path has been excluded in your configuration, e.g. `excluded_paths: ["lib/*"]`, you can move it to the dependency path list as follows: `filter: dependency_paths: ["lib/*"]`. For further information see https://scrutinizer-ci.com/docs/tools/php/php-scrutinizer/#list-dependency-paths
289		*
290		* @return self
291		*/
292	3	public function setRobotsTxt($robotsTxt)
293		{
294	3	$this->robotsTxt = is_string($robotsTxt) ? (empty($robotsTxt) ? '' : new RobotsTxt($robotsTxt)) : $robotsTxt;
295
296	3	return $this;
297		}
298
299	9	public function getDomainAndScheme()
300		{
301	9	if (null === $this->domainWithScheme) {
302	9	$this->domainWithScheme = self::getDomainAndSchemeFrom($this->response->getEffectiveUrl());
303		}
304
305	9	return $this->domainWithScheme;
306		}
307
308	9	public static function getDomainAndSchemeFrom(string $url)
309		{
310	9	$url = parse_url($url);
311
312	9	return $url['scheme'].'://'.$url['host'];
313		}
314		}
315

PiedWeb / UrlHarvester

Push — master ( 7cb44c...7cbff6 )

Harvest C

Complexity

Size/Duplication

Test Coverage

Importance

23 Methods

How to fix Complexity

Complex Class

Duplication Side-by-Side

Filter issues like