Completed
Push — master ( 7e536e...5c7ae7 )
by Dev
24:11 queued 09:15
created

Harvest::getRatioTxtCode()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 6
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 1
CRAP Score 2

Importance

Changes 0
Metric Value
eloc 3
dl 0
loc 6
rs 10
c 0
b 0
f 0
ccs 1
cts 1
cp 1
cc 2
nc 2
nop 0
crap 2
1
<?php
2
3
namespace PiedWeb\UrlHarvester;
4
5
use PiedWeb\Curl\Response;
6
use PiedWeb\TextAnalyzer\Analyzer as TextAnalyzer;
7
use phpuri;
8
use simple_html_dom;
9
use PiedWeb\Curl\Request as CurlRequest;
10
11
/**
 * Wraps a fetched HTTP response and exposes helpers to read data from it:
 * tags, meta, canonical link, redirection target, breadcrumb, keywords,
 * base URL and domain.
 *
 * The HTML content is parsed lazily with simple_html_dom on first DOM access,
 * and derived values (base URL, domain, scheme + host) are memoised.
 */
class Harvest
{
    use HarvestLinksTrait, RobotsTxtTrait;

    // Link type identifiers (presumably consumed by HarvestLinksTrait — confirm).
    public const LINK_SELF = 1;
    public const LINK_INTERNAL = 2;
    public const LINK_SUB = 3;
    public const LINK_EXTERNAL = 4;

    /**
     * @var Response response this harvest is built on
     */
    protected $response;

    /**
     * @var simple_html_dom|null parsed document, built on first call to getDom()
     */
    protected $dom;

    /** @var string|null memoised result of getBaseUrl() */
    protected $baseUrl;

    /** @var string|null memoised result of getDomain() */
    protected $domain;

    /** @var string|null memoised result of getDomainAndScheme() */
    private $domainWithScheme;

    /**
     * Request a URL and wrap the resulting response in a Harvest.
     *
     * @param string           $url             URL to request
     * @param string           $userAgent       user agent forwarded to the request
     * @param string           $language        language string forwarded to the request
     *                                          (looks like an Accept-Language value — confirm)
     * @param CurlRequest|null $previousRequest previous request forwarded to Request::makeFromRequest()
     *
     * @return self|int a Harvest when the request produced a Response, otherwise whatever
     *                  Request::makeFromRequest() returned (presumably an error code — confirm)
     */
    public static function fromUrl(
        string $url,
        string $userAgent = 'SeoPocketCrawler - Open Source Bot for SEO Metrics',
        string $language = 'en,en-US;q=0.5',
        ?CurlRequest $previousRequest = null
    ) {
        $response = Request::makeFromRequest($previousRequest, $url, $userAgent, $language);

        return $response instanceof Response ? new self($response) : $response;
    }

    /**
     * @param Response $response
     */
    public function __construct(Response $response)
    {
        $this->response = $response;
    }

    /**
     * @return Response the response given to the constructor
     */
    public function getResponse()
    {
        return $this->response;
    }

    /**
     * Resolve the `Location` header against the effective URL.
     *
     * @return mixed|false the value produced by phpUri's join(), or false when there is no Location header
     */
    public function getRedirection()
    {
        $headers = $this->getLowerCasedHeaders();
        if (!isset($headers['location'])) {
            return false;
        }

        return phpUri::parse($this->response->getEffectiveUrl())->join($headers['location']);
    }

    /**
     * Parse the response content once and return the resulting DOM.
     *
     * @return simple_html_dom
     */
    public function getDom()
    {
        if (null === $this->dom) {
            $dom = new simple_html_dom();
            $dom->load($this->response->getContent());
            $this->dom = $dom;
        }

        return $this->dom;
    }

    /**
     * Proxy to simple_html_dom::find() on the lazily built DOM.
     *
     * @param string   $selector
     * @param int|null $number   index of the match to return, or null to get every match
     */
    private function find($selector, $number = null)
    {
        return $this->getDom()->find($selector, $number);
    }

    /**
     * Return the first node matching the selector (index 0).
     *
     * @param string $selector
     */
    private function findOne($selector)
    {
        return $this->find($selector, 0);
    }

    /**
     * Return the cleaned inner text of the first node matching a selector.
     *
     * @param string $selector
     *
     * @return string|null null when no node matches
     */
    public function getTag($selector)
    {
        $node = $this->findOne($selector);

        return null !== $node ? Helper::clean($node->innertext) : null;
    }

    /**
     * Return the cleaned inner text of a tag that is expected to appear only once.
     *
     * @param string $selector
     *
     * @return string|null the cleaned content when exactly one node matches, a warning string
     *                     ("<count> `<selector>` !!") when several match, null when none does
     */
    public function getUniqueTag($selector = 'title')
    {
        $nodes = $this->find($selector);
        if (!$nodes) {
            return null;
        }

        $count = count($nodes);
        if ($count > 1) {
            return $count.' `'.$selector.'` !!';
        }

        return Helper::clean($nodes[0]->innertext);
    }

    /**
     * Return the cleaned `content` attribute of the first `<meta name="...">` matching $name.
     *
     * @return string empty string when the meta tag or its content attribute is missing
     */
    public function getMeta(string $name)
    {
        $meta = $this->findOne('meta[name='.$name.']');
        if (null === $meta || !isset($meta->content)) {
            return '';
        }

        return Helper::clean($meta->content);
    }

    /**
     * Return the href attribute of the `link rel=canonical` tag.
     *
     * @return string|null the href value (empty string when the tag has no href),
     *                     or null when the tag does not exist
     */
    public function getCanonical()
    {
        $canonical = $this->findOne('link[rel=canonical]');
        if (null === $canonical) {
            return null;
        }

        return isset($canonical->href) ? $canonical->href : '';
    }

    /**
     * Tell whether the canonical link, when there is one, equals the effective URL.
     *
     * @return bool true when there is no (or an empty) canonical, or when it matches exactly
     */
    public function isCanonicalCorrect()
    {
        $canonical = $this->getCanonical();
        if (!$canonical) {
            return true;
        }

        // Strict comparison: both sides are URLs, loose string juggling is never wanted here.
        return $this->response->getEffectiveUrl() === $canonical;
    }

    /**
     * Run the text analyzer on the DOM and return up to 10 expressions.
     */
    public function getKws()
    {
        $kws = TextAnalyzer::get(
            $this->getDom(),
            true,   // only sentences
            1,      // no expression, just words
            0      // keep trail
        );

        return $kws->getExpressions(10);
    }

    /**
     * Ratio, in percent, between the plain text length and the cleaned content length.
     *
     * Lengths are measured in bytes (strlen).
     *
     * @return int rounded percentage, 0 when the cleaned content is empty
     */
    public function getRatioTxtCode(): int
    {
        $textLength = strlen($this->getDom()->plaintext);
        $htmlLength = strlen(Helper::clean($this->response->getContent()));

        // Guard the division: an empty document has no meaningful ratio.
        return (int) ($htmlLength > 0 ? round($textLength / $htmlLength * 100) : 0);
    }

    /**
     * Extract the breadcrumb from the response content.
     *
     * @param string|null $separator when given and a breadcrumb array was found, the items'
     *                               clean names are joined with it and a string is returned
     *
     * @return array|string|null whatever ExtractBreadcrumb::get() returned (documented upstream as
     *                           an array, or null if no breadcrumb was found), or the joined string
     */
    public function getBreadCrumb(?string $separator = null)
    {
        $breadcrumb = ExtractBreadcrumb::get(
            $this->response->getContent(),
            $this->getBaseUrl(),
            $this->response->getEffectiveUrl()
        );

        if (null === $separator || !is_array($breadcrumb)) {
            return $breadcrumb;
        }

        $names = array_map(static function ($item) {
            return $item->getCleanName();
        }, $breadcrumb);

        return implode($separator, $names);
    }

    /**
     * Tell whether the Location header points to the https version of the requested URL.
     *
     * @return string|false the https URL when the Location header equals the requested URL
     *                      with a leading `http:` swapped for `https:`, false otherwise
     */
    public function amIRedirectToHttps()
    {
        $headers = $this->getLowerCasedHeaders();
        if (!isset($headers['location'])) {
            return false;
        }

        $httpsUrl = preg_replace('#^http:#', 'https:', $this->response->getUrl(), 1);

        // Strict comparison so a failed preg_replace (null) never matches an empty header.
        return $httpsUrl === $headers['location'] ? $httpsUrl : false;
    }

    /**
     * Return the document base URL: the `<base href>` when it is a valid URL,
     * otherwise the response effective URL. The result is memoised.
     *
     * @return string
     */
    public function getBaseUrl()
    {
        if (null === $this->baseUrl) {
            $base = $this->findOne('base');
            $hasValidBase = null !== $base
                && isset($base->href)
                && filter_var($base->href, FILTER_VALIDATE_URL);

            $this->baseUrl = $hasValidBase ? $base->href : $this->response->getEffectiveUrl();
        }

        return $this->baseUrl;
    }

    /**
     * Return the domain extracted from the effective URL host. The result is memoised.
     *
     * The pattern keeps the last two host labels, plus one more when the
     * second-to-last label is `co` or `com` (e.g. `example.co.uk`).
     *
     * @return string the matched domain, or the raw host when the pattern does not match
     *                (dotless hosts such as `localhost`; empty string when the URL has no host)
     */
    public function getDomain()
    {
        if (null === $this->domain) {
            // parse_url() yields null/false when the host is missing or the URL is malformed.
            $host = (string) parse_url((string) $this->response->getEffectiveUrl(), PHP_URL_HOST);

            $this->domain = preg_match("/[^\.\/]+(\.com?)?\.[^\.\/]+$/", $host, $match)
                ? $match[0]
                : $host;
        }

        return $this->domain;
    }

    /**
     * @return int correspond to a const from Indexable
     */
    public function isIndexable(string $userAgent = 'googlebot')
    {
        return Indexable::isIndexable($this, $userAgent);
    }

    /**
     * Return `scheme://host` for the effective URL. The result is memoised.
     *
     * @return string
     */
    public function getDomainAndScheme()
    {
        if (null === $this->domainWithScheme) {
            $this->domainWithScheme = self::getDomainAndSchemeFrom($this->response->getEffectiveUrl());
        }

        return $this->domainWithScheme;
    }

    /**
     * Build `scheme://host` from a URL (port, path and query are dropped).
     *
     * @return string missing parts are rendered as empty strings, so an unparsable
     *                URL gives `://`
     */
    public static function getDomainAndSchemeFrom(string $url)
    {
        // parse_url() may return false or omit keys; `??` keeps the output shape without warnings.
        $parts = parse_url($url);

        return ($parts['scheme'] ?? '').'://'.($parts['host'] ?? '');
    }

    /**
     * Return the response headers with lower-cased keys.
     *
     * @return array empty when the response exposes no header array
     */
    private function getLowerCasedHeaders(): array
    {
        $headers = $this->response->getHeaders();

        return is_array($headers) ? array_change_key_case($headers) : [];
    }
}
270