Completed
Push — master ( 2d7c5c...16dc59 )
by Dev
09:53
created

Harvest::isCanonicalCorrect()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 5
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 3
CRAP Score 2

Importance

Changes 0
Metric Value
cc 2
eloc 2
nc 2
nop 0
dl 0
loc 5
ccs 3
cts 3
cp 1
crap 2
rs 10
c 0
b 0
f 0
1
<?php
2
3
namespace PiedWeb\UrlHarvester;
4
5
use PiedWeb\Curl\Response;
6
use PiedWeb\TextAnalyzer\Analyzer as TextAnalyzer;
7
use phpuri;
8
use simple_html_dom;
9
use Spatie\Robots\RobotsTxt;
10
use PiedWeb\Curl\Request as CurlRequest;
11
12
class Harvest
13
{
14
    use HarvestLinksTrait;
15
16
    /*
     * Link classification codes.
     * NOTE(review): presumably consumed by HarvestLinksTrait, which is not
     * visible in this file - confirm.
     */
    public const LINK_SELF = 1;
    public const LINK_INTERNAL = 2;
    public const LINK_SUB = 3;
    public const LINK_EXTERNAL = 4;

    /**
     * Response handed to the constructor.
     *
     * @var Response
     */
    protected $response;

    /**
     * Parsed document; stays null until getDom() builds it.
     *
     * @var simple_html_dom
     */
    protected $dom;

    /**
     * Value memoized by getBaseUrl().
     *
     * @var string
     */
    protected $baseUrl;

    /**
     * Value memoized by getDomain().
     *
     * @var string
     */
    protected $domain;

    /**
     * Value memoized by getRobotsTxt(); an empty string when nothing was parsed.
     *
     * @var RobotsTxt|string
     */
    protected $robotsTxt;

    /**
     * Value memoized by getDomainAndScheme().
     *
     * @var string
     */
    private $domainWithScheme;
42
43 15
    /**
     * Request $url and wrap the outcome in a Harvest.
     *
     * @param string $url       address to request
     * @param string $userAgent user agent forwarded to Request::make()
     * @param string $language  language preference forwarded to Request::make()
     * @param bool   $tryHttps  flag forwarded to Request::make()
     *
     * @return self|mixed a Harvest when the request yields a Response, otherwise
     *                    the value reported by the underlying request's getError()
     */
    public static function fromUrl(
        string $url,
        string $userAgent = 'Bot: Url Harvester',
        string $language = 'en,en-US;q=0.5',
        bool   $tryHttps = false
    ) {
        $request = Request::make($url, $userAgent, '200;html', $language, $tryHttps);
        $outcome = $request->getResponse();

        // NOTE(review): static analysis reports that the outcome is always a
        // Response, which would make this error branch unreachable - confirm
        // against Request::getResponse().
        if (!$outcome instanceof Response) {
            return $request->get()->getError();
        }

        return new self($outcome);
    }
58
59
    /**
     * Wrap an already obtained response.
     *
     * @param Response $response response stored on the instance for later inspection
     */
    public function __construct(Response $response)
    {
        $this->response = $response;
    }
66
67 6
    /**
     * Expose the response given to the constructor.
     *
     * @return Response
     */
    public function getResponse()
    {
        return $this->response;
    }
71
72 3
    /**
     * Resolve the `location` response header against the effective URL.
     *
     * @return mixed|false the result of joining the header value onto the
     *                     effective URL, or false when no such header exists
     */
    public function getRedirection()
    {
        $rawHeaders = $this->response->getHeaders();

        // Lower-case the header names so the lookup below is case-insensitive;
        // a falsy header set is treated as an empty one.
        $normalized = array_change_key_case($rawHeaders ?: []);

        if (!isset($normalized['location'])) {
            return false;
        }

        $origin = phpUri::parse($this->response->getEffectiveUrl());

        return $origin->join($normalized['location']);
    }
82
83 6
    /**
     * Return the parsed document, loading it from the response content on first use.
     *
     * @return simple_html_dom
     */
    public function getDom()
    {
        if (null !== $this->dom) {
            return $this->dom;
        }

        $this->dom = new simple_html_dom();
        $this->dom->load($this->response->getContent());

        return $this->dom;
    }
92
93 3
    /**
     * Forward a selector query to the parsed document.
     *
     * @param mixed $selector selector passed through to the document's find()
     * @param mixed $number   second argument passed through unchanged (null by default)
     *
     * @return mixed whatever the document's find() returns
     */
    private function find($selector, $number = null)
    {
        $document = $this->getDom();

        return $document->find($selector, $number);
    }
97
98 3
    /**
     * Query the document for $selector, asking for index 0.
     *
     * @param mixed $selector selector passed through to find()
     *
     * @return mixed whatever find() returns for index 0
     */
    private function findOne($selector)
    {
        $firstIndex = 0;

        return $this->find($selector, $firstIndex);
    }
102
103
    /**
     * Return the cleaned inner text of the element found for a selector.
     *
     * @param mixed $selector selector to look up
     *
     * @return mixed|null output of Helper::clean(), or null when nothing was found
     */
    public function getTag($selector)
    {
        $node = $this->findOne($selector);
        if (null === $node) {
            return null;
        }

        return Helper::clean($node->innertext);
    }
114
115 3
    /**
     * Return the cleaned inner text of an element expected to occur exactly once.
     *
     * @param mixed $selector selector to look up ('title' by default)
     *
     * @return mixed|null null when nothing matches; a "<count> `<selector>` !!"
     *                    message when several elements match; otherwise the
     *                    output of Helper::clean() for the only match
     */
    public function getUniqueTag($selector = 'title')
    {
        $matches = $this->find($selector);
        if (!$matches) {
            return null;
        }

        $total = count($matches);
        if ($total > 1) {
            return $total.' `'.$selector.'` !!';
        }

        return Helper::clean($matches[0]->innertext);
    }
126
127
    /**
     * Return the cleaned `content` attribute of the meta element with the given name.
     *
     * @param string $name value matched against the meta element's name attribute
     *
     * @return mixed output of Helper::clean(), or an empty string when the
     *               element or its content attribute is missing
     */
    public function getMeta(string $name)
    {
        $node = $this->findOne('meta[name='.$name.']');

        if (null === $node || !isset($node->content)) {
            return '';
        }

        return Helper::clean($node->content);
    }
138
139
    /**
     * Return the href attribute of the `link rel=canonical` element.
     *
     * @return string|null the href value, an empty string when the element has
     *                     no href, or null when the element does not exist
     */
    public function getCanonical()
    {
        $link = $this->findOne('link[rel=canonical]');
        if (null === $link) {
            return null;
        }

        if (isset($link->href)) {
            return $link->href;
        }

        return '';
    }
150
151
    /**
     * Tell whether the declared canonical URL matches the effective URL.
     *
     * A missing or empty canonical is treated as correct, since there is
     * nothing to contradict the effective URL.
     *
     * @return bool
     */
    public function isCanonicalCorrect()
    {
        $canonical = $this->getCanonical();

        if (!$canonical) {
            return true;
        }

        // Strict comparison: both sides are URLs and must match character for
        // character, without PHP's loose-comparison type juggling.
        return $this->response->getEffectiveUrl() === $canonical;
    }
160
161 3
    /**
     * Run the text analyzer over the response content and return its expressions.
     *
     * @return mixed result of getExpressions(10) on the analyzer output
     */
    public function getKws()
    {
        $content = $this->response->getContent();

        // Analyzer arguments, in order: only sentences (true),
        // no expression, just words (1), keep trail (0).
        $analysis = TextAnalyzer::get($content, true, 1, 0);

        return $analysis->getExpressions(10);
    }
172
173
    /**
     * Compute the share of plain text in the cleaned response content, as a percentage.
     *
     * Both lengths are byte lengths (strlen).
     *
     * @return int rounded percentage, or 0 when the cleaned content is empty
     */
    public function getRatioTxtCode(): int
    {
        $textLength = strlen($this->getDom()->plaintext);
        $markupLength = strlen(Helper::clean($this->response->getContent()));

        // Guard against dividing by zero on empty content.
        if ($markupLength <= 0) {
            return 0;
        }

        return (int) round($textLength / $markupLength * 100);
    }
183
184
    /**
     * Delegate breadcrumb extraction to ExtractBreadcrumb::get().
     *
     * It is given the response content, the base URL and the effective URL.
     *
     * @return array|null per the original contract: an array of objects holding
     *                    a link and an anchor, or null when no breadcrumb is found
     */
    public function getBreadCrumb()
    {
        $content = $this->response->getContent();
        $baseUrl = $this->getBaseUrl();
        $currentUrl = $this->response->getEffectiveUrl();

        return ExtractBreadcrumb::get($content, $baseUrl, $currentUrl);
    }
197
198 3
    /**
     * Return the base URL of the document, memoized after the first call.
     *
     * The href of a `base` element is used when it passes FILTER_VALIDATE_URL;
     * otherwise the effective URL of the response is used.
     *
     * @return string
     */
    public function getBaseUrl()
    {
        if (isset($this->baseUrl)) {
            return $this->baseUrl;
        }

        $base = $this->findOne('base');
        $hasUsableHref = null !== $base
            && isset($base->href)
            && filter_var($base->href, FILTER_VALIDATE_URL);

        $this->baseUrl = $hasUsableHref ? $base->href : $this->response->getEffectiveUrl();

        return $this->baseUrl;
    }
211
212 9
    /**
     * Return the domain part of the effective URL's host, memoized after the first call.
     *
     * The pattern keeps the last two host labels, plus an optional
     * `.com` / `.co` label between them.
     *
     * @return string the matched domain; the whole host when the pattern does
     *                not match (for example a host without a dot); an empty
     *                string when the effective URL has no host
     */
    public function getDomain()
    {
        if (!isset($this->domain)) {
            // Ask parse_url for the host only: it yields null when the host is
            // missing and false on a malformed URL, so normalize both to ''.
            $host = parse_url($this->response->getEffectiveUrl(), PHP_URL_HOST);
            $host = is_string($host) ? $host : '';

            $matched = preg_match("/[^\.\/]+(\.com?)?\.[^\.\/]+$/", $host, $match);

            // Fall back to the host itself instead of reading an undefined
            // $match[0], which previously left the domain null and uncached.
            $this->domain = 1 === $matched ? $match[0] : $host;
        }

        return $this->domain;
    }
222
223
    /**
     * Ask Indexable whether this harvest is indexable for a given user agent.
     *
     * @param string|null $userAgent user agent to check for; null falls back to
     *                               the default 'googlebot'
     *
     * @return int corresponds to a const from Indexable
     */
    public function isIndexable(?string $userAgent = 'googlebot')
    {
        // Static analysis reports that Indexable::isIndexable() accepts only a
        // string here, so a null argument is replaced by the declared default.
        return Indexable::isIndexable($this, $userAgent ?? 'googlebot');
    }
230
231
232
    /**
     * Fetch and parse robots.txt for the harvested site, memoized after the first call.
     *
     * The file is requested from "<scheme>://<host>/robots.txt" with the same
     * user agent as the original request.
     *
     * @return RobotsTxt|string a RobotsTxt built from the downloaded content, or
     *                          an empty string when the request did not produce
     *                          a Response or the content is empty
     */
    public function getRobotsTxt()
    {
        if (null !== $this->robotsTxt) {
            return $this->robotsTxt;
        }

        // Callback for setDownloadOnlyIf(): true for a line that starts with
        // "content-type" and mentions "text/plain".
        $isPlainTextHeader = function ($line) {
            return 0 === stripos(trim($line), 'content-type') && false !== stripos($line, 'text/plain');
        };

        $request = new CurlRequest($this->getDomainAndScheme().'/robots.txt');
        $request
            ->setDefaultSpeedOptions()
            ->setDownloadOnlyIf($isPlainTextHeader)
            ->setUserAgent($this->getResponse()->getRequest()->getUserAgent())
        ;
        $result = $request->exec();

        // NOTE(review): static analysis reports that $result is always a
        // Response, which would make the instanceof check redundant - confirm
        // against the request's exec().
        if (!$result instanceof \PiedWeb\Curl\Response || empty(trim($result->getContent()))) {
            $this->robotsTxt = '';
        } else {
            $this->robotsTxt = new RobotsTxt($result->getContent());
        }

        return $this->robotsTxt;
    }
258
259 6
    /**
     * Return "<scheme>://<host>" for the effective URL, memoized after the first call.
     *
     * @return string
     */
    public function getDomainAndScheme()
    {
        if (null !== $this->domainWithScheme) {
            return $this->domainWithScheme;
        }

        $effectiveUrl = $this->response->getEffectiveUrl();
        $this->domainWithScheme = self::getDomainAndSchemeFrom($effectiveUrl);

        return $this->domainWithScheme;
    }
267
268 6
    /**
     * Reduce a URL to "<scheme>://<host>".
     *
     * NOTE(review): any port in $url is not carried over - confirm that this is
     * intended for callers that build request URLs from the result.
     *
     * @param string $url absolute URL to reduce
     *
     * @return string
     */
    public static function getDomainAndSchemeFrom(string $url)
    {
        $parts = parse_url($url);
        $scheme = $parts['scheme'];
        $host = $parts['host'];

        return $scheme.'://'.$host;
    }
274
275
}
276