Completed
Push — master ( 7cb44c...7cbff6 )
by Dev
02:31
created

Harvest   C

Complexity

Total Complexity 54

Size/Duplication

Total Lines 301
Duplicated Lines 0 %

Test Coverage

Coverage 85.71%

Importance

Changes 0
Metric Value
eloc 102
dl 0
loc 301
ccs 96
cts 112
cp 0.8571
rs 6.4799
c 0
b 0
f 0
wmc 54

23 Methods

Rating   Name   Duplication   Size   Complexity  
A getRedirection() 0 9 3
A find() 0 3 1
A amIRedirectToHttps() 0 11 5
A getBreadCrumb() 0 16 3
A getTag() 0 5 2
A getDomain() 0 9 2
A getBaseUrl() 0 12 5
A getResponse() 0 3 1
A isIndexable() 0 3 1
A isCanonicalCorrect() 0 5 2
A getDom() 0 8 2
A getUniqueTag() 0 8 3
A fromUrl() 0 12 2
A __construct() 0 3 1
A getRatioTxtCode() 0 6 2
A findOne() 0 3 1
A getMeta() 0 5 3
A getCanonical() 0 5 3
A getKws() 0 10 1
A getRobotsTxt() 0 24 5
A getDomainAndScheme() 0 7 2
A setRobotsTxt() 0 5 3
A getDomainAndSchemeFrom() 0 5 1

How to fix   Complexity   

Complex Class

Complex classes like Harvest often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use Harvest, and based on these observations, apply Extract Interface, too.

1
<?php
2
3
namespace PiedWeb\UrlHarvester;
4
5
use PiedWeb\Curl\Response;
6
use PiedWeb\TextAnalyzer\Analyzer as TextAnalyzer;
7
use phpuri;
8
use simple_html_dom;
9
use Spatie\Robots\RobotsTxt;
10
use PiedWeb\Curl\Request as CurlRequest;
11
12
class Harvest
13
{
14
    use HarvestLinksTrait;
15
16
    const LINK_SELF = 1;
17
    const LINK_INTERNAL = 2;
18
    const LINK_SUB = 3;
19
    const LINK_EXTERNAL = 4;
20
21
    /**
22
     * @var Response
23
     */
24
    protected $response;
25
26
    /**
27
     * @var simple_html_dom
28
     */
29
    protected $dom;
30
31
    /** @var string */
32
    protected $baseUrl;
33
34
    /** @var string */
35
    protected $domain;
36
37
    /** @var RobotsTxt|string (empty string) */
38
    protected $robotsTxt;
39
40
    /** @var string */
41
    private $domainWithScheme;
42
43
    /**
     * Request a URL and wrap the outcome in a Harvest.
     *
     * The request is delegated to `Request::make()` with the '200;html' filter string.
     *
     * @param string $url       URL to request
     * @param string $userAgent User-Agent handed to `Request::make()`
     * @param string $language  language string handed to `Request::make()`
     *
     * @return self|int a Harvest when `Request::make()` yields a Response; otherwise its
     *                  return value is passed through untouched (an int according to the
     *                  historical annotation — presumably an error code, to be confirmed)
     */
    public static function fromUrl(
        string $url,
        string $userAgent = 'SeoPocketCrawler - Open Source Bot for SEO Metrics',
        string $language = 'en,en-US;q=0.5'
    ) {
        $result = Request::make($url, $userAgent, '200;html', $language);

        return $result instanceof Response ? new self($result) : $result;
    }
59
60
    /**
     * Keep the response this harvest extracts its data from.
     *
     * @param Response $response response the other methods read headers, content and URLs from
     */
    public function __construct(Response $response)
    {
        $this->response = $response;
    }
67
68 9
    /**
     * Give access to the response received by the constructor.
     *
     * @return Response
     */
    public function getResponse()
    {
        return $this->response;
    }
72
73 3
    /**
     * Resolve the `Location` header of the response against its effective URL.
     *
     * @return string|false the result of joining the header value onto the effective URL
     *                      (presumably an absolute URL string — confirm against phpUri),
     *                      or false when the response carries no `Location` header
     */
    public function getRedirection()
    {
        // Header names are lower-cased so the lookup does not depend on their casing;
        // a falsy header set is treated as "no headers".
        $rawHeaders = $this->response->getHeaders();
        $headers = array_change_key_case($rawHeaders ?: []);

        if (!isset($headers['location'])) {
            return false;
        }

        $effectiveUrl = $this->response->getEffectiveUrl();

        return phpUri::parse($effectiveUrl)->join($headers['location']);
    }
83
84 12
    /**
     * Parsed DOM of the response content, built on the first call and reused afterwards.
     *
     * @return simple_html_dom
     */
    public function getDom()
    {
        if (null !== $this->dom) {
            return $this->dom;
        }

        $this->dom = new simple_html_dom();
        $this->dom->load($this->response->getContent());

        return $this->dom;
    }
93
94 9
    /**
     * Run a selector against the parsed DOM.
     *
     * @param string   $selector selector handed to `simple_html_dom::find()`
     * @param int|null $number   forwarded untouched as second argument of `simple_html_dom::find()`
     *
     * @return mixed whatever `simple_html_dom::find()` returns
     */
    private function find($selector, $number = null)
    {
        $dom = $this->getDom();

        return $dom->find($selector, $number);
    }
98
99 9
    /**
     * Shortcut for find() with 0 as second argument.
     *
     * @param string $selector
     *
     * @return mixed the result of find(); callers in this class treat null as "nothing found"
     */
    private function findOne($selector)
    {
        $firstIndex = 0;

        return $this->find($selector, $firstIndex);
    }
103
104
    /**
     * Return the cleaned content found inside a selector.
     *
     * @param string $selector
     *
     * @return string|null `Helper::clean()` applied to the `innertext` of the element
     *                     returned by findOne(), or null when findOne() returns null
     */
    public function getTag($selector)
    {
        $element = $this->findOne($selector);
        if (null === $element) {
            return null;
        }

        return Helper::clean($element->innertext);
    }
115
116 3
    /**
     * Content of a tag that is expected to appear only once in the document.
     *
     * @param string $selector
     *
     * @return string|null the cleaned `innertext` when a single element matches;
     *                     a "<count> `<selector>` !!" marker when several elements match;
     *                     null when the lookup result is empty
     */
    public function getUniqueTag($selector = 'title')
    {
        $matches = $this->find($selector);
        if (!$matches) {
            return null;
        }

        $total = count($matches);

        return $total > 1
            ? $total.' `'.$selector.'` !!'
            : Helper::clean($matches[0]->innertext);
    }
127
128
    /**
     * Return the cleaned `content` attribute of a `<meta name="...">` tag.
     *
     * @param string $name value of the `name` attribute to look for; it is inserted
     *                     as-is into the `meta[name=...]` selector
     *
     * @return string the cleaned `content` attribute, or an empty string when the tag
     *                is not found or has no `content` attribute
     */
    public function getMeta(string $name)
    {
        $meta = $this->findOne('meta[name='.$name.']');

        if (null === $meta || !isset($meta->content)) {
            return '';
        }

        return Helper::clean($meta->content);
    }
139
140
    /**
     * Return the `href` attribute of the `<link rel="canonical">` tag.
     *
     * @return string|null the `href` value, an empty string when the tag exists without
     *                     an `href` attribute, or null when the tag does not exist
     */
    public function getCanonical()
    {
        $link = $this->findOne('link[rel=canonical]');
        if (null === $link) {
            return null;
        }

        return isset($link->href) ? $link->href : '';
    }
151
152
    /**
     * Tell whether the canonical link, when present, designates the page itself.
     *
     * A page without canonical link, or with an empty one, is considered correct.
     * The check is a strict string comparison with the effective URL of the response:
     * nothing is normalised and a relative canonical is not resolved.
     *
     * @return bool
     */
    public function isCanonicalCorrect()
    {
        $canonical = $this->getCanonical();
        if (!$canonical) {
            return true;
        }

        // Strict comparison: with `==` a non-string href (e.g. a boolean, which the DOM
        // library may presumably yield for a value-less attribute) would equal any URL.
        return $this->response->getEffectiveUrl() === $canonical;
    }
161
162 3
    /**
     * Extract keywords from the page through the text analyzer.
     *
     * @return mixed the result of `getExpressions(10)` on the analysis
     *               (presumably the 10 top expressions — confirm against TextAnalyzer)
     */
    public function getKws()
    {
        // Arguments after the DOM, in order:
        //   true => only sentences
        //   1    => no expression, just words
        //   0    => keep trail
        $analysis = TextAnalyzer::get($this->getDom(), true, 1, 0);

        return $analysis->getExpressions(10);
    }
173
174
    /**
     * Text/code ratio of the page, as a rounded percentage.
     *
     * Both sizes are byte lengths (`strlen`): the DOM's `plaintext` on one side,
     * the cleaned response content on the other.
     *
     * @return int `round(text / html * 100)`, or 0 when the cleaned content is empty
     */
    public function getRatioTxtCode(): int
    {
        $textLength = strlen($this->getDom()->plaintext);
        $htmlLength = strlen(Helper::clean($this->response->getContent()));

        if ($htmlLength <= 0) {
            return 0;
        }

        return (int) round($textLength / $htmlLength * 100);
    }
184
185
    /**
     * Extract the breadcrumb of the page.
     *
     * Without separator, the value produced by `ExtractBreadcrumb::get()` is returned
     * untouched (per the historical documentation: an array of objects holding a link
     * and an anchor, or null when no breadcrumb was found).
     *
     * @param string|null $separator when given and the breadcrumb is an array, the
     *                               `getCleanName()` of every item is glued with it
     *
     * @return array|string|null the breadcrumb, its flattened string form when a
     *                           separator is given, or the non-array value returned by
     *                           the extractor
     */
    public function getBreadCrumb(?string $separator = null)
    {
        $breadcrumb = ExtractBreadcrumb::get(
            $this->response->getContent(),
            $this->getBaseUrl(),
            $this->response->getEffectiveUrl()
        );

        if (null === $separator || !is_array($breadcrumb)) {
            return $breadcrumb;
        }

        $names = [];
        foreach ($breadcrumb as $key => $item) {
            $names[$key] = $item->getCleanName();
        }

        return implode($separator, $names);
    }
207
208
    /**
     * Tell whether the response redirects to the HTTPS version of the requested URL.
     *
     * The requested URL gets its leading `http:` replaced by `https:`; the outcome is
     * then compared, as a strict string comparison, with the `Location` header.
     *
     * @return string|false the `https:` URL when the `Location` header is exactly that
     *                      URL, false otherwise (including when there is no such header)
     */
    public function amIRedirectToHttps()
    {
        // Same normalisation as getRedirection(): any falsy header set (not only null)
        // is treated as "no headers", and names are lower-cased for the lookup.
        $headers = array_change_key_case($this->response->getHeaders() ?: []);
        if (!isset($headers['location'])) {
            return false;
        }

        $httpsUrl = preg_replace('#^http:#', 'https:', $this->response->getUrl(), 1);

        return $httpsUrl === $headers['location'] ? $httpsUrl : false;
    }
223
224 3
    /**
     * Base URL of the document, computed once and then reused.
     *
     * @return string the `href` of the `<base>` tag when it exists and passes
     *                `FILTER_VALIDATE_URL`, otherwise the effective URL of the response
     */
    public function getBaseUrl()
    {
        if (isset($this->baseUrl)) {
            return $this->baseUrl;
        }

        $base = $this->findOne('base');
        $hasValidHref = null !== $base
            && isset($base->href)
            && filter_var($base->href, FILTER_VALIDATE_URL);

        $this->baseUrl = $hasValidHref ? $base->href : $this->response->getEffectiveUrl();

        return $this->baseUrl;
    }
237
238 6
    /**
     * Domain of the effective URL, computed once and then reused.
     *
     * The domain is the end of the host matched by the pattern below: the last two
     * labels, or the last three when the middle one is `.co` / `.com`
     * (e.g. `example.co.uk`). A host the pattern does not match (single label such as
     * `localhost`) is returned unchanged, and a URL without host gives an empty string.
     *
     * @return string
     */
    public function getDomain()
    {
        if (!isset($this->domain)) {
            // parse_url() yields null/false when the host is missing or the URL is malformed.
            $host = parse_url($this->response->getEffectiveUrl(), PHP_URL_HOST);
            $host = is_string($host) ? $host : '';

            $matched = preg_match("/[^\.\/]+(\.com?)?\.[^\.\/]+$/", $host, $match);

            // Fall back on the whole host instead of reading an undefined $match[0].
            $this->domain = 1 === $matched ? $match[0] : $host;
        }

        return $this->domain;
    }
248
249
    /**
     * Ask `Indexable` whether this page can be indexed by a given bot.
     *
     * @param string $userAgent bot name handed to `Indexable::isIndexable()`
     *
     * @return int corresponds to a constant from Indexable
     */
    public function isIndexable(string $userAgent = 'googlebot')
    {
        $verdict = Indexable::isIndexable($this, $userAgent);

        return $verdict;
    }
256
257
    /**
     * Robots.txt of the harvested host, downloaded on first call unless one was
     * already set.
     *
     * The file is requested at `<scheme>://<host>/robots.txt` with the User-Agent of
     * the original request. The result is kept, so later calls do not request it again.
     *
     * @return RobotsTxt|string a RobotsTxt built from the downloaded content, or an
     *                          empty string when the request did not yield a Response,
     *                          the content type does not contain `text/plain`, or the
     *                          trimmed content is empty
     */
    public function getRobotsTxt()
    {
        if (null !== $this->robotsTxt) {
            return $this->robotsTxt;
        }

        $robotsUrl = $this->getDomainAndScheme().'/robots.txt';

        $request = new CurlRequest($robotsUrl);
        $request
            ->setDefaultSpeedOptions()
            ->setDownloadOnly('0-500')
            ->setUserAgent($this->getResponse()->getRequest()->getUserAgent())
        ;
        $result = $request->exec();

        $usable = $result instanceof Response
            && false !== stripos($result->getContentType(), 'text/plain')
            && !empty(trim($result->getContent()));

        $this->robotsTxt = $usable ? new RobotsTxt($result->getContent()) : '';

        return $this->robotsTxt;
    }
286
287
    /**
     * Set the robots.txt to use, so that getRobotsTxt() returns it instead of
     * downloading one.
     *
     * @param RobotsTxt|string $robotsTxt a RobotsTxt instance (stored as-is), or the raw
     *                                    robots.txt content: a non-empty string is wrapped
     *                                    in a RobotsTxt, an empty one is stored as ''
     *
     * @return self
     */
    public function setRobotsTxt($robotsTxt)
    {
        if (is_string($robotsTxt)) {
            // Same emptiness rule as getRobotsTxt(): empty content means "no robots.txt".
            $robotsTxt = empty($robotsTxt) ? '' : new RobotsTxt($robotsTxt);
        }

        $this->robotsTxt = $robotsTxt;

        return $this;
    }
298
299 9
    /**
     * `<scheme>://<host>` of the effective URL, computed once and then reused.
     *
     * @return string
     */
    public function getDomainAndScheme()
    {
        if (null !== $this->domainWithScheme) {
            return $this->domainWithScheme;
        }

        $effectiveUrl = $this->response->getEffectiveUrl();
        $this->domainWithScheme = self::getDomainAndSchemeFrom($effectiveUrl);

        return $this->domainWithScheme;
    }
307
308 9
    /**
     * Reduce a URL to `<scheme>://<host>`.
     *
     * Only the `scheme` and `host` components of `parse_url()` are used: port, path,
     * query and fragment are dropped.
     *
     * @param string $url URL expected to contain both a scheme and a host
     *
     * @return string
     */
    public static function getDomainAndSchemeFrom(string $url)
    {
        $parts = parse_url($url);

        return sprintf('%s://%s', $parts['scheme'], $parts['host']);
    }
314
}
315