Completed
Push — master ( f11cb1...b0051f )
by Dev
04:43
created

Harvest   B

Complexity

Total Complexity 51

Size/Duplication

Total Lines 284
Duplicated Lines 0 %

Test Coverage

Coverage 85.19%

Importance

Changes 0
Metric Value
eloc 98
dl 0
loc 284
ccs 92
cts 108
cp 0.8519
rs 7.92
c 0
b 0
f 0
wmc 51

22 Methods

Rating   Name   Duplication   Size   Complexity  
A getResponse() 0 3 1
A getRedirection() 0 9 3
A find() 0 3 1
A getTag() 0 5 2
A isCanonicalCorrect() 0 5 2
A getDom() 0 8 2
A getUniqueTag() 0 8 3
A fromUrl() 0 12 2
A __construct() 0 3 1
A findOne() 0 3 1
A getMeta() 0 5 3
A getCanonical() 0 5 3
A amIRedirectToHttps() 0 11 5
A getBreadCrumb() 0 16 3
A getRobotsTxt() 0 21 5
A getDomain() 0 9 2
A getBaseUrl() 0 12 5
A isIndexable() 0 3 1
A getDomainAndScheme() 0 7 2
A getRatioTxtCode() 0 6 2
A getDomainAndSchemeFrom() 0 5 1
A getKws() 0 11 1

How to fix   Complexity   

Complex Class

Complex classes like Harvest often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use Harvest, and based on these observations, apply Extract Interface, too.

1
<?php
2
3
namespace PiedWeb\UrlHarvester;
4
5
use PiedWeb\Curl\Response;
6
use PiedWeb\TextAnalyzer\Analyzer as TextAnalyzer;
7
use phpuri;
8
use simple_html_dom;
9
use Spatie\Robots\RobotsTxt;
10
use PiedWeb\Curl\Request as CurlRequest;
11
12
class Harvest
{
    use HarvestLinksTrait;

    const LINK_SELF = 1;
    const LINK_INTERNAL = 2;
    const LINK_SUB = 3;
    const LINK_EXTERNAL = 4;

    /**
     * @var Response the HTTP response every accessor of this class reads from
     */
    protected $response;

    /**
     * @var simple_html_dom|null parsed document, built on first call to getDom()
     */
    protected $dom;

    /** @var string|null cached by getBaseUrl() */
    protected $baseUrl;

    /** @var string|null cached by getDomain() */
    protected $domain;

    /** @var RobotsTxt|string|null null until getRobotsTxt() ran, then a parser or an empty string */
    protected $robotsTxt;

    /** @var string|null cached by getDomainAndScheme() */
    private $domainWithScheme;

    /**
     * Request `$url` and wrap the result in a Harvest.
     *
     * The request is issued through Request::make() with the fixed filter
     * string '200;html'.
     *
     * @param string $url       address to request
     * @param string $userAgent forwarded to Request::make()
     * @param string $language  forwarded to Request::make()
     *
     * @return self|mixed a new instance when Request::make() yields a Response,
     *                    otherwise whatever Request::make() returned, untouched
     */
    public static function fromUrl(
        string $url,
        string $userAgent = 'Bot: Url Harvester',
        string $language = 'en,en-US;q=0.5'
    ) {
        $response = Request::make($url, $userAgent, '200;html', $language);

        return $response instanceof Response ? new self($response) : $response;
    }

    /**
     * @param Response $response
     */
    public function __construct(Response $response)
    {
        $this->response = $response;
    }

    /**
     * @return Response the response given to the constructor
     */
    public function getResponse()
    {
        return $this->response;
    }

    /**
     * Resolve the `Location` header against the effective URL.
     *
     * @return mixed|false result of phpUri::join() on the Location value,
     *                     or false when the response carries no Location header
     */
    public function getRedirection()
    {
        $headers = $this->getLowerCasedHeaders();
        if (!isset($headers['location'])) {
            return false;
        }

        return phpUri::parse($this->response->getEffectiveUrl())->join($headers['location']);
    }

    /**
     * Parse the response content once and hand back the same DOM afterwards.
     *
     * @return simple_html_dom
     */
    public function getDom()
    {
        if (null === $this->dom) {
            $this->dom = new simple_html_dom();
            $this->dom->load($this->response->getContent());
        }

        return $this->dom;
    }

    /**
     * Thin proxy to simple_html_dom::find() on the lazily built DOM.
     *
     * @param string   $selector
     * @param int|null $number   forwarded as second argument of find()
     */
    private function find($selector, $number = null)
    {
        return $this->getDom()->find($selector, $number);
    }

    /**
     * Same as find() with index 0, i.e. only the first match is requested.
     *
     * @param string $selector
     */
    private function findOne($selector)
    {
        return $this->find($selector, 0);
    }

    /**
     * Return content inside a selector.
     *
     * @param string $selector
     *
     * @return string|null inner text of the first match passed through
     *                     Helper::clean(), or null when nothing matches
     */
    public function getTag($selector)
    {
        $node = $this->findOne($selector);
        if (null === $node) {
            return null;
        }

        return Helper::clean($node->innertext);
    }

    /**
     * Return the content of a tag that is expected to appear exactly once.
     *
     * @param string $selector
     *
     * @return string|null the cleaned inner text when there is one match,
     *                     a warning string "<count> `<selector>` !!" when there
     *                     are several, null when there is none
     */
    public function getUniqueTag($selector = 'title')
    {
        $nodes = $this->find($selector);
        if (!$nodes) {
            return null;
        }

        $total = count($nodes);
        if ($total > 1) {
            return $total.' `'.$selector.'` !!';
        }

        return Helper::clean($nodes[0]->innertext);
    }

    /**
     * Return content inside a meta.
     *
     * @param string $name value of the meta `name` attribute to look for
     *
     * @return string the `content` attribute passed through Helper::clean(),
     *                or an empty string when the meta or its content is missing
     */
    public function getMeta(string $name)
    {
        $meta = $this->findOne('meta[name='.$name.']');
        if (null === $meta || !isset($meta->content)) {
            return '';
        }

        return Helper::clean($meta->content);
    }

    /**
     * Return the href attribute of the `link rel=canonical` tag.
     *
     * @return string|null the href value, an empty string when the tag has no
     *                     href, or null when the tag does not exist
     */
    public function getCanonical()
    {
        $link = $this->findOne('link[rel=canonical]');
        if (null === $link) {
            return null;
        }

        return isset($link->href) ? $link->href : '';
    }

    /**
     * Tell whether the canonical, if any, points to the effective URL.
     *
     * A missing or empty canonical is considered correct.
     *
     * @return bool
     */
    public function isCanonicalCorrect()
    {
        $canonical = $this->getCanonical();

        return $canonical ? $this->response->getEffectiveUrl() == $canonical : true;
    }

    /**
     * Run the text analyzer on the response content.
     *
     * @return mixed result of getExpressions(10) on the analyzer output
     */
    public function getKws()
    {
        $kws = TextAnalyzer::get(
            $this->response->getContent(),
            true,   // only sentences
            1,      // no expression, just words
            0      // keep trail
        );

        return $kws->getExpressions(10);
    }

    /**
     * Ratio between the plain text length and the cleaned HTML length.
     *
     * Both lengths are measured with strlen(), i.e. in bytes.
     *
     * @return int rounded percentage, 0 when the cleaned HTML is empty
     */
    public function getRatioTxtCode(): int
    {
        $textLength = strlen($this->getDom()->plaintext);
        $htmlLength = strlen(Helper::clean($this->response->getContent()));

        if ($htmlLength <= 0) {
            return 0;
        }

        return (int) round($textLength / $htmlLength * 100);
    }

    /**
     * Return the breadcrumb extracted by ExtractBreadcrumb::get().
     *
     * @param string|null $separator when given and an array was extracted, the
     *                               items' getCleanName() are joined with it
     *
     * @return array|string|null the extractor result as is (an array of link
     *                           objects, or NULL if no breadcrumb was found,
     *                           per the original note), or the joined string
     *                           when a separator is used
     */
    public function getBreadCrumb(?string $separator = null)
    {
        $breadcrumb = ExtractBreadcrumb::get(
            $this->response->getContent(),
            $this->getBaseUrl(),
            $this->response->getEffectiveUrl()
        );

        if (null === $separator || !is_array($breadcrumb)) {
            return $breadcrumb;
        }

        $names = array_map(static function ($item) {
            return $item->getCleanName();
        }, $breadcrumb);

        return implode($separator, $names);
    }

    /**
     * Tell whether the Location header is the https twin of the requested URL.
     *
     * @return string|false the requested URL with a leading `http:` replaced
     *                      by `https:` when it equals the Location header,
     *                      false otherwise
     */
    public function amIRedirectToHttps()
    {
        $headers = $this->getLowerCasedHeaders();
        if (!isset($headers['location'])) {
            return false;
        }

        $httpsUrl = preg_replace('#^http:#', 'https:', $this->response->getUrl(), 1);

        return $httpsUrl == $headers['location'] ? $httpsUrl : false;
    }

    /**
     * Return the URL relative links must be resolved against.
     *
     * @return string the href of the `base` tag when it is a valid URL,
     *                the effective URL of the response otherwise
     */
    public function getBaseUrl()
    {
        if (!isset($this->baseUrl)) {
            $base = $this->findOne('base');
            $hasValidBase = null !== $base
                && isset($base->href)
                && filter_var($base->href, FILTER_VALIDATE_URL);

            $this->baseUrl = $hasValidBase ? $base->href : $this->response->getEffectiveUrl();
        }

        return $this->baseUrl;
    }

    /**
     * Return the registrable-looking part of the effective URL's host.
     *
     * The pattern keeps the last two labels of the host, plus a `.co`/`.com`
     * label when it sits just before the last one.
     *
     * @return string the matched part; the host itself when the pattern does
     *                not match (e.g. a host without dot); an empty string when
     *                the effective URL has no host
     */
    public function getDomain()
    {
        if (!isset($this->domain)) {
            $urlParsed = parse_url($this->response->getEffectiveUrl());
            // parse_url() returns false on seriously malformed URLs and omits
            // absent components, so the host cannot be read unconditionally.
            $host = is_array($urlParsed) && isset($urlParsed['host']) ? $urlParsed['host'] : '';

            $this->domain = preg_match("/[^\.\/]+(\.com?)?\.[^\.\/]+$/", $host, $match)
                ? $match[0]
                : $host;
        }

        return $this->domain;
    }

    /**
     * @param string $userAgent forwarded to Indexable::isIndexable()
     *
     * @return int correspond to a const from Indexable
     */
    public function isIndexable(string $userAgent = 'googlebot')
    {
        return Indexable::isIndexable($this, $userAgent);
    }

    /**
     * Download and parse `/robots.txt` of the effective URL's host, once.
     *
     * The download is only accepted when a header line starting with
     * `content-type` mentions `text/plain`; the user agent of the original
     * request is reused.
     *
     * @return RobotsTxt|string a RobotsTxt built from the downloaded content,
     *                          or an empty string when the request did not
     *                          yield a Response or the file is empty
     */
    public function getRobotsTxt()
    {
        if (null === $this->robotsTxt) {
            $url = $this->getDomainAndScheme().'/robots.txt';

            $request = new CurlRequest($url);
            $request
                ->setDefaultSpeedOptions()
                ->setDownloadOnlyIf(static function ($line) {
                    return 0 === stripos(trim($line), 'content-type') && false !== stripos($line, 'text/plain');
                })
                ->setUserAgent($this->getResponse()->getRequest()->getUserAgent())
            ;
            $result = $request->exec();

            $noNeedToParse = !$result instanceof Response || empty(trim($result->getContent()));

            $this->robotsTxt = $noNeedToParse ? '' : new RobotsTxt($result->getContent());
        }

        return $this->robotsTxt;
    }

    /**
     * @return string `scheme://host` of the effective URL, computed once
     */
    public function getDomainAndScheme()
    {
        if (null === $this->domainWithScheme) {
            $this->domainWithScheme = self::getDomainAndSchemeFrom($this->response->getEffectiveUrl());
        }

        return $this->domainWithScheme;
    }

    /**
     * Build `scheme://host` from a URL.
     *
     * NOTE(review): both components are read straight from parse_url(), so the
     * URL is expected to be absolute — confirm callers never pass a relative one.
     *
     * @param string $url
     *
     * @return string
     */
    public static function getDomainAndSchemeFrom(string $url)
    {
        $parts = parse_url($url);

        return $parts['scheme'].'://'.$parts['host'];
    }

    /**
     * Response headers with lower-cased keys; anything that is not an array
     * (no headers available) is treated as an empty header list.
     */
    private function getLowerCasedHeaders(): array
    {
        $headers = $this->response->getHeaders();

        return array_change_key_case(is_array($headers) ? $headers : []);
    }
}
298