Completed
Push — master ( 39e215...dae30c )
by Dev
02:03
created

Harvest   C

Complexity

Total Complexity 54

Size/Duplication

Total Lines 287
Duplicated Lines 0 %

Test Coverage

Coverage 84.76%

Importance

Changes 0
Metric Value
wmc 54
eloc 94
dl 0
loc 287
ccs 89
cts 105
cp 0.8476
rs 6.4799
c 0
b 0
f 0

24 Methods

Rating   Name   Duplication   Size   Complexity  
A getRedirection() 0 9 3
A getTextAnalysis() 0 12 2
A find() 0 3 1
A mayFollow() 0 8 4
A amIRedirectToHttps() 0 11 5
A getBreadCrumb() 0 16 3
A getTag() 0 5 2
A getDomain() 0 9 2
A getBaseUrl() 0 12 5
A getResponse() 0 3 1
A isIndexable() 0 3 1
A isCanonicalCorrect() 0 5 2
A metaAuthorizeToFollow() 0 3 2
A getDom() 0 8 2
A getDomainAndScheme() 0 7 2
A getUniqueTag() 0 8 3
A fromUrl() 0 13 2
A __construct() 0 3 1
A getRatioTxtCode() 0 6 2
A findOne() 0 3 1
A getDomainAndSchemeFrom() 0 5 1
A getCanonical() 0 5 3
A getMeta() 0 5 3
A getKws() 0 3 1

How to fix   Complexity   

Complex Class

Complex classes like Harvest often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use Harvest, and based on these observations, apply Extract Interface, too.

1
<?php
2
3
namespace PiedWeb\UrlHarvester;
4
5
use PiedWeb\Curl\Response;
6
use PiedWeb\TextAnalyzer\Analyzer as TextAnalyzer;
7
use phpuri;
8
use simple_html_dom;
9
use PiedWeb\Curl\Request as CurlRequest;
10
use Spatie\Robots\RobotsHeaders;
11
12
class Harvest
13
{
14
    use HarvestLinksTrait, RobotsTxtTrait;
15
16
    const LINK_SELF = 1;
17
    const LINK_INTERNAL = 2;
18
    const LINK_SUB = 3;
19
    const LINK_EXTERNAL = 4;
20
21
    const DEFAULT_USER_AGENT = 'SeoPocketCrawler - Open Source Bot for SEO Metrics';
22
23
    /**
24
     * @var Response
25
     */
26
    protected $response;
27
28
    /**
29
     * @var simple_html_dom
30
     */
31
    protected $dom;
32
33
    /** @var string */
34
    protected $baseUrl;
35
36
    /** @var string */
37
    protected $domain;
38
39
    /** @var bool */
40
    protected $follow;
41
42
    /** @var string */
43
    private $domainWithScheme;
44
45
    /** @var \PiedWeb\TextAnalyzer\Analysis */
46
    private $textAnalysis;
47
48
    /**
     * Request a URL and wrap the result in a Harvest.
     *
     * The request itself is delegated to Request::makeFromRequest().
     *
     * @param string           $url             URL to fetch
     * @param string           $userAgent       forwarded to Request::makeFromRequest()
     * @param string           $language        forwarded to Request::makeFromRequest()
     * @param CurlRequest|null $previousRequest forwarded to Request::makeFromRequest()
     *
     * @return self|int a Harvest when a Response was obtained, otherwise the value
     *                  returned by Request::makeFromRequest() unchanged
     */
    public static function fromUrl(
        string $url,
        string $userAgent = self::DEFAULT_USER_AGENT,
        string $language = 'en,en-US;q=0.5',
        ?CurlRequest $previousRequest = null
    ) {
        $result = Request::makeFromRequest($previousRequest, $url, $userAgent, $language);

        return $result instanceof Response ? new self($result) : $result;
    }
65
66
    /**
     * Build a Harvest around an already fetched response.
     *
     * @param Response $response response every other method of this class reads from
     */
    public function __construct(Response $response)
    {
        $this->response = $response;
    }
73
74 15
    /**
     * Return the response given to the constructor.
     *
     * @return Response
     */
    public function getResponse()
    {
        $response = $this->response;

        return $response;
    }
78
79 3
    /**
     * Return the redirection target announced by the `Location` header.
     *
     * Header names are lower-cased before the lookup, and the header value is
     * joined onto the effective URL with phpUri.
     *
     * @return string|false the result of phpUri::join() (assumed to be a string — confirm),
     *                      or false when there is no `Location` header
     */
    public function getRedirection()
    {
        $headers = array_change_key_case($this->response->getHeaders() ?: []);

        if (!isset($headers['location'])) {
            return false;
        }

        return phpUri::parse($this->response->getEffectiveUrl())->join($headers['location']);
    }
89
90 21
    /**
     * Return the DOM built from the response content.
     *
     * The content is loaded on the first call; later calls return the same instance.
     *
     * @return simple_html_dom
     */
    public function getDom()
    {
        if (null !== $this->dom) {
            return $this->dom;
        }

        $this->dom = new simple_html_dom();
        $this->dom->load($this->response->getContent());

        return $this->dom;
    }
99
100 18
    /**
     * Run a selector against the DOM.
     *
     * @param string   $selector passed to simple_html_dom::find()
     * @param int|null $number   passed unchanged as second argument of simple_html_dom::find()
     *
     * @return mixed whatever simple_html_dom::find() returns
     */
    private function find($selector, $number = null)
    {
        $dom = $this->getDom();

        return $dom->find($selector, $number);
    }
104
105 15
    /**
     * Shortcut for find() called with 0 as its second argument.
     *
     * @param string $selector
     *
     * @return mixed whatever find() returns for index 0
     */
    private function findOne($selector)
    {
        $firstIndex = 0;

        return $this->find($selector, $firstIndex);
    }
109
110
    /**
     * Return the cleaned inner text of the first element matching a selector.
     *
     * @param string $selector
     *
     * @return string|null the inner text passed through Helper::clean(), or null when
     *                     findOne() returns null
     */
    public function getTag($selector)
    {
        $element = $this->findOne($selector);

        if (null === $element) {
            return null;
        }

        return Helper::clean($element->innertext);
    }
121
122 6
    /**
     * Return the content of a tag that is expected to appear only once.
     *
     * @param string $selector
     *
     * @return string|null the cleaned inner text of the first match when there is a single
     *                     match; a "<count> `<selector>` !!" marker when there are several;
     *                     null when find() returns a falsy result
     */
    public function getUniqueTag($selector = 'title')
    {
        $matches = $this->find($selector);

        if (!$matches) {
            return null;
        }

        $total = count($matches);

        return $total > 1
            ? $total.' `'.$selector.'` !!'
            : Helper::clean($matches[0]->innertext);
    }
133
134
    /**
     * Return the cleaned `content` attribute of a named meta tag.
     *
     * @param string $name value of the meta `name` attribute to look for
     *
     * @return string the content passed through Helper::clean(), or '' when the meta tag
     *                or its content attribute is missing
     */
    public function getMeta(string $name)
    {
        $meta = $this->findOne('meta[name='.$name.']');

        if (null === $meta || !isset($meta->content)) {
            return '';
        }

        return Helper::clean($meta->content);
    }
145
146
    /**
     * Return the href attribute of the `link rel=canonical` tag.
     *
     * @return string|null the href value ('' when the tag has no href attribute),
     *                     or null when the tag does not exist
     */
    public function getCanonical()
    {
        $link = $this->findOne('link[rel=canonical]');

        if (null === $link) {
            return null;
        }

        return isset($link->href) ? $link->href : '';
    }
157
158
    /**
     * Tell whether the declared canonical, if any, equals the effective URL.
     *
     * @return bool true when getCanonical() is falsy (no tag, or empty href) or when it
     *              loosely equals the response's effective URL
     */
    public function isCanonicalCorrect()
    {
        $canonical = $this->getCanonical();

        if (!$canonical) {
            return true;
        }

        return $canonical == $this->response->getEffectiveUrl();
    }
167
168 3
    /**
     * Return the text analysis of the page, building it on first use.
     *
     * The three extra arguments passed to TextAnalyzer::get() are labelled after the
     * inline comments of the previous implementation; their exact meaning is defined
     * by TextAnalyzer — confirm there.
     *
     * @return \PiedWeb\TextAnalyzer\Analysis
     */
    public function getTextAnalysis()
    {
        if (null !== $this->textAnalysis) {
            return $this->textAnalysis;
        }

        $onlySentences = true;
        $wordsPerExpression = 1; // no expression, just words
        $keepTrail = 0;

        $this->textAnalysis = TextAnalyzer::get(
            $this->getDom(),
            $onlySentences,
            $wordsPerExpression,
            $keepTrail
        );

        return $this->textAnalysis;
    }
181
182 3
    /**
     * Return the expressions of the text analysis, asking getExpressions() for 10.
     *
     * @return mixed whatever Analysis::getExpressions(10) returns
     */
    public function getKws()
    {
        $analysis = $this->getTextAnalysis();

        return $analysis->getExpressions(10);
    }
186
187
    /**
     * Return the text-to-code ratio of the page as a rounded percentage.
     *
     * Compares the byte length of the DOM plain text with the byte length of the
     * response content passed through Helper::clean().
     *
     * @return int the rounded percentage, or 0 when the cleaned content is empty
     */
    public function getRatioTxtCode(): int
    {
        $textLength = strlen($this->getDom()->plaintext);
        $htmlLength = strlen(Helper::clean($this->response->getContent()));

        if ($htmlLength <= 0) {
            return 0;
        }

        return (int) round($textLength / $htmlLength * 100);
    }
197
198
    /**
     * Extract the breadcrumb of the page.
     *
     * @param string|null $separator when not null and a breadcrumb array was extracted, the
     *                               clean names of its items are joined with this separator
     *
     * @return array|string|null the value returned by ExtractBreadcrumb::get() (documented so
     *                           far as an array of items, or null when no breadcrumb was
     *                           found), or the joined names when a separator is given
     */
    public function getBreadCrumb(?string $separator = null)
    {
        $breadcrumb = ExtractBreadcrumb::get(
            $this->response->getContent(),
            $this->getBaseUrl(),
            $this->response->getEffectiveUrl()
        );

        if (null === $separator || !is_array($breadcrumb)) {
            return $breadcrumb;
        }

        $names = [];
        foreach ($breadcrumb as $key => $item) {
            $names[$key] = $item->getCleanName();
        }

        return implode($separator, $names);
    }
220
221
    /**
     * Tell whether the response redirects to the https version of the requested URL.
     *
     * @return string|false the https URL when the `Location` header loosely equals the
     *                      requested URL with its leading `http:` replaced by `https:`,
     *                      false otherwise
     */
    public function amIRedirectToHttps()
    {
        $headers = $this->response->getHeaders();
        $headers = array_change_key_case(null === $headers ? [] : $headers);
        $location = $headers['location'] ?? null;
        $url = $this->response->getUrl();

        if (null === $location) {
            return false;
        }

        $httpsUrl = preg_replace('#^http:#', 'https:', $url, 1);

        return $httpsUrl == $location ? $httpsUrl : false;
    }
236
237 3
    /**
     * Return the base URL of the page.
     *
     * Uses the href of the `base` tag when it exists and passes FILTER_VALIDATE_URL,
     * otherwise the response's effective URL. The value is stored in $this->baseUrl
     * and reused by later calls.
     *
     * @return string
     */
    public function getBaseUrl()
    {
        if (isset($this->baseUrl)) {
            return $this->baseUrl;
        }

        $base = $this->findOne('base');
        $hasValidHref = null !== $base && isset($base->href) && filter_var($base->href, FILTER_VALIDATE_URL);

        $this->baseUrl = $hasValidHref ? $base->href : $this->response->getEffectiveUrl();

        return $this->baseUrl;
    }
250
251 6
    /**
     * Return the domain of the effective URL, without its sub-domains.
     *
     * The host is reduced with a regex that keeps its last two labels, or its last
     * three when the second-to-last one is `co` or `com` (`www.example.co.uk` gives
     * `example.co.uk`). A host without any dot (e.g. `localhost`) cannot match the
     * regex: the host itself is then used instead of reading an undefined match.
     * The value is stored in $this->domain and reused by later calls.
     *
     * @return string the domain, or '' when the effective URL has no host
     */
    public function getDomain()
    {
        if (!isset($this->domain)) {
            // parse_url() gives null (no host) or false (malformed URL) instead of a string
            $host = parse_url($this->response->getEffectiveUrl(), PHP_URL_HOST);
            $host = is_string($host) ? $host : '';

            $matched = preg_match("/[^\.\/]+(\.com?)?\.[^\.\/]+$/", $host, $match);

            $this->domain = 1 === $matched ? $match[0] : $host;
        }

        return $this->domain;
    }
261
262
    /**
     * Check the indexability of this page for a bot, through Indexable::isIndexable().
     *
     * @param string $userAgent bot name handed to Indexable::isIndexable()
     *
     * @return int corresponds to a const from Indexable
     */
    public function isIndexable(string $userAgent = 'googlebot')
    {
        $status = Indexable::isIndexable($this, $userAgent);

        return $status;
    }
269
270 12
    /**
     * Return the `scheme://host` part of the effective URL.
     *
     * The value is computed on the first call and stored in $this->domainWithScheme.
     *
     * @return string
     */
    public function getDomainAndScheme()
    {
        if (null !== $this->domainWithScheme) {
            return $this->domainWithScheme;
        }

        $effectiveUrl = $this->response->getEffectiveUrl();
        $this->domainWithScheme = self::getDomainAndSchemeFrom($effectiveUrl);

        return $this->domainWithScheme;
    }
278
279 12
    /**
     * Return the `scheme://host` part of a URL.
     *
     * The URL is expected to contain both a scheme and a host: the two components
     * are read from parse_url() without checking that they exist.
     *
     * @param string $url absolute URL
     *
     * @return string
     */
    public static function getDomainAndSchemeFrom(string $url)
    {
        $parts = parse_url($url);

        return sprintf('%s://%s', $parts['scheme'], $parts['host']);
    }
285
286 3
    /**
     * Tell whether the robots meta tags of the page allow following its links.
     *
     * The `googlebot` meta is checked first, then the `robots` meta. The search result
     * is compared with false explicitly, so a content that starts with `nofollow`
     * (offset 0) is detected, and the search ignores case.
     *
     * @return bool false as soon as one of the two meta tags contains `nofollow`,
     *              true otherwise
     */
    protected function metaAuthorizeToFollow()
    {
        foreach (['googlebot', 'robots'] as $metaName) {
            if (false !== stripos($this->getMeta($metaName), 'nofollow')) {
                return false;
            }
        }

        return true;
    }
290
291 3
    /**
     * Tell whether the links of this page may be followed.
     *
     * Both the robots HTTP headers (through RobotsHeaders::mayFollow()) and the robots
     * meta tags (through metaAuthorizeToFollow()) must allow it. The answer is computed
     * on the first call and stored in $this->follow.
     *
     * @return bool
     */
    public function mayFollow()
    {
        if (null === $this->follow) {
            // getHeaders() can return null, while RobotsHeaders only accepts an array
            $robotsHeaders = new RobotsHeaders($this->response->getHeaders() ?? []);

            $this->follow = $robotsHeaders->mayFollow() && $this->metaAuthorizeToFollow();
        }

        return $this->follow;
    }
300
}
301