Completed
Push — master ( acb1e6...d7f53c )
by Dev
08:59 queued 05:50
created

Harvest::getRedirection()   A

Complexity

Conditions 3
Paths 2

Size

Total Lines 9
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 5
CRAP Score 3.0416

Importance

Changes 0
Metric Value
cc 3
eloc 5
nc 2
nop 0
dl 0
loc 9
ccs 5
cts 6
cp 0.8333
crap 3.0416
rs 10
c 0
b 0
f 0
1
<?php
2
3
namespace PiedWeb\UrlHarvester;
4
5
use PiedWeb\Curl\Response;
6
use PiedWeb\TextAnalyzer\Analyzer as TextAnalyzer;
7
use phpuri;
8
use simple_html_dom;
9
use PiedWeb\Curl\Request as CurlRequest;
10
11
class Harvest
12
{
13
    use HarvestLinksTrait, RobotsTxtTrait;
14
15
    const LINK_SELF = 1;
16
    const LINK_INTERNAL = 2;
17
    const LINK_SUB = 3;
18
    const LINK_EXTERNAL = 4;
19
20
    const DEFAULT_USER_AGENT = 'SeoPocketCrawler - Open Source Bot for SEO Metrics';
21
22
    /**
23
     * @var Response
24
     */
25
    protected $response;
26
27
    /**
28
     * @var simple_html_dom
29
     */
30
    protected $dom;
31
32
    /** @var string */
33
    protected $baseUrl;
34
35
    /** @var string */
36
    protected $domain;
37
38
    /** @var string */
39
    private $domainWithScheme;
40
41
    /**
     * Request `$url` and wrap the outcome in a Harvest instance.
     *
     * The request itself is delegated to Request::makeFromRequest(). When that
     * call does not yield a Response object, its return value is handed back
     * untouched so the caller can inspect it.
     *
     * @param string           $url
     * @param string           $userAgent       defaults to self::DEFAULT_USER_AGENT
     * @param string           $language        NOTE(review): presumably an Accept-Language value, confirm in Request
     * @param CurlRequest|null $previousRequest forwarded as-is to Request::makeFromRequest()
     *
     * @return self|int
     */
    public static function fromUrl(
        string $url,
        string $userAgent = self::DEFAULT_USER_AGENT,
        string $language = 'en,en-US;q=0.5',
        ?CurlRequest $previousRequest = null
    ) {
        $result = Request::makeFromRequest($previousRequest, $url, $userAgent, $language);

        return $result instanceof Response ? new self($result) : $result;
    }
58
59
    /**
     * Build a harvest around an already fetched response.
     *
     * @param Response $response response every other method of this class reads from
     */
    public function __construct(Response $response)
    {
        $this->response = $response;
    }
66
67 15
    /**
     * Give access to the response this harvest was built from.
     *
     * @return Response
     */
    public function getResponse()
    {
        return $this->response;
    }
71
72 3
    /**
     * Resolve the `Location` response header against the effective URL.
     *
     * Header names are lower-cased before the lookup, so the header is found
     * whatever its original casing.
     *
     * @return mixed|false the value returned by phpUri's join() (presumably the
     *                     absolute target URL, confirm against phpUri), or
     *                     false when the response carries no location header
     */
    public function getRedirection()
    {
        $rawHeaders = $this->response->getHeaders();
        $headers = array_change_key_case($rawHeaders ?: []);

        if (!isset($headers['location'])) {
            return false;
        }

        $effectiveUrl = phpUri::parse($this->response->getEffectiveUrl());

        return $effectiveUrl->join($headers['location']);
    }
82
83 18
    /**
     * Return the parsed DOM of the response content.
     *
     * The DOM is built on the first call and reused afterwards.
     *
     * @return simple_html_dom
     */
    public function getDom()
    {
        if (null !== $this->dom) {
            return $this->dom;
        }

        // Assign before loading, exactly like the first-call path always did.
        $this->dom = new simple_html_dom();
        $this->dom->load($this->response->getContent());

        return $this->dom;
    }
92
93 15
    /**
     * Run a selector against the lazily built DOM.
     *
     * @param string   $selector
     * @param int|null $number   forwarded to simple_html_dom::find(); null is the default
     *
     * @return mixed whatever simple_html_dom::find() returns for these arguments
     */
    private function find($selector, $number = null)
    {
        $dom = $this->getDom();

        return $dom->find($selector, $number);
    }
97
98 12
    /**
     * Shortcut for find() asking for index 0 only.
     *
     * @param string $selector
     *
     * @return mixed result of find($selector, 0); callers in this class treat null as "not found"
     */
    private function findOne($selector)
    {
        $firstIndex = 0;

        return $this->find($selector, $firstIndex);
    }
102
103
    /**
     * Return the cleaned inner content of the first element matching a selector.
     *
     * @param string $selector
     *
     * @return string|null result of Helper::clean() on the element's innertext,
     *                     null when nothing matches
     */
    public function getTag($selector)
    {
        $node = $this->findOne($selector);
        if (null === $node) {
            return null;
        }

        return Helper::clean($node->innertext);
    }
114
115 6
    /**
     * Return the content of a tag that is expected to appear only once.
     *
     * @param string $selector
     *
     * @return string|null - null when the selector matches nothing,
     *                     - a "<count> `<selector>` !!" marker when it matches more than once,
     *                     - otherwise Helper::clean() applied to the single match's innertext
     */
    public function getUniqueTag($selector = 'title')
    {
        $nodes = $this->find($selector);
        if (!$nodes) {
            return null;
        }

        $total = count($nodes);
        if ($total > 1) {
            return $total.' `'.$selector.'` !!';
        }

        return Helper::clean($nodes[0]->innertext);
    }
126
127
    /**
     * Return the content attribute of the first `<meta name="...">` tag.
     *
     * @param string $name value of the meta `name` attribute to look for
     *
     * @return string Helper::clean() applied to the content attribute, or an
     *                empty string when the tag or its content attribute is missing
     */
    public function getMeta(string $name)
    {
        $meta = $this->findOne('meta[name='.$name.']');

        if (null === $meta || !isset($meta->content)) {
            return '';
        }

        return Helper::clean($meta->content);
    }
138
139
    /**
     * Return the href attribute of the `<link rel="canonical">` tag.
     *
     * @return string|null the href value, an empty string when the tag has no
     *                     href attribute, null when the tag does not exist
     */
    public function getCanonical()
    {
        $link = $this->findOne('link[rel=canonical]');
        if (null === $link) {
            return null;
        }

        return isset($link->href) ? $link->href : '';
    }
150
151
    /**
     * Tell whether the canonical URL agrees with the effective URL.
     *
     * A missing or empty canonical is considered correct.
     *
     * @return bool
     */
    public function isCanonicalCorrect()
    {
        $canonical = $this->getCanonical();
        if (!$canonical) {
            return true;
        }

        return $this->response->getEffectiveUrl() == $canonical;
    }
160
161 3
    /**
     * Extract the main keywords of the document.
     *
     * The DOM is passed to TextAnalyzer::get() and 10 is passed to
     * getExpressions() (presumably a limit on the number of results, confirm
     * against TextAnalyzer).
     *
     * @return mixed whatever getExpressions(10) returns
     */
    public function getKws()
    {
        // Argument meanings below come from the original inline notes.
        $onlySentences = true;
        $wordsPerExpression = 1; // no expression, just words
        $keepTrail = 0;

        $analysis = TextAnalyzer::get($this->getDom(), $onlySentences, $wordsPerExpression, $keepTrail);

        return $analysis->getExpressions(10);
    }
172
173
    /**
     * Compute the text-to-code ratio of the page as a rounded percentage.
     *
     * Both lengths are byte lengths (strlen): the DOM plaintext on one side,
     * the Helper::clean()ed response content on the other.
     *
     * @return int 0 when the cleaned content is empty
     */
    public function getRatioTxtCode(): int
    {
        $textLength = strlen($this->getDom()->plaintext);
        $htmlLength = strlen(Helper::clean($this->response->getContent()));

        if ($htmlLength <= 0) {
            return 0;
        }

        return (int) round($textLength / $htmlLength * 100);
    }
183
184
    /**
     * Extract the breadcrumb of the page.
     *
     * Without a separator the value produced by ExtractBreadcrumb::get() is
     * returned unchanged. With a separator, and when that value is an array,
     * the getCleanName() of each item is joined with the separator.
     *
     * @param string|null $separator glue used to flatten the breadcrumb into a string
     *
     * @return array|string|null null presumably means no breadcrumb was found
     *                           (per the original doc, confirm in ExtractBreadcrumb)
     */
    public function getBreadCrumb(?string $separator = null)
    {
        $breadcrumb = ExtractBreadcrumb::get(
            $this->response->getContent(),
            $this->getBaseUrl(),
            $this->response->getEffectiveUrl()
        );

        if (null === $separator || !is_array($breadcrumb)) {
            return $breadcrumb;
        }

        $names = [];
        foreach ($breadcrumb as $key => $item) {
            $names[$key] = $item->getCleanName();
        }

        return implode($separator, $names);
    }
206
207
    /**
     * Tell whether the response redirects the requested URL to its https twin.
     *
     * The requested URL has its leading `http:` replaced by `https:`; the
     * result must be identical to the `Location` header (looked up
     * case-insensitively on the header name).
     *
     * @return string|false the https URL when the response redirects to it,
     *                      false otherwise (no location header, URL not
     *                      starting with `http:`, or a different target)
     */
    public function amIRedirectToHttps()
    {
        $headers = $this->response->getHeaders();
        // Anything that is not an array (null, false, ...) is treated as "no headers".
        $headers = array_change_key_case(is_array($headers) ? $headers : []);

        if (!isset($headers['location'])) {
            return false;
        }

        $httpsUrl = preg_replace('#^http:#', 'https:', $this->response->getUrl(), 1, $replaced);

        // $replaced stays at 0 for a URL that is already https (or uses another
        // scheme): a self-redirect must not be reported as an upgrade to https.
        if (1 === $replaced && $httpsUrl === $headers['location']) {
            return $httpsUrl;
        }

        return false;
    }
222
223 3
    /**
     * Return the base URL of the document.
     *
     * The href of the first `<base>` tag is used when it passes
     * FILTER_VALIDATE_URL; otherwise the effective URL of the response is
     * used. The value is cached in $this->baseUrl.
     *
     * @return string
     */
    public function getBaseUrl()
    {
        if (isset($this->baseUrl)) {
            return $this->baseUrl;
        }

        $base = $this->findOne('base');
        $hasUsableHref = null !== $base
            && isset($base->href)
            && filter_var($base->href, FILTER_VALIDATE_URL);

        $this->baseUrl = $hasUsableHref ? $base->href : $this->response->getEffectiveUrl();

        return $this->baseUrl;
    }
236
237 6
    /**
     * Return the registrable-looking domain of the effective URL.
     *
     * The pattern keeps the last two host labels, or the last three when the
     * second-to-last one is `co` or `com` (e.g. `www.example.co.uk` gives
     * `example.co.uk`). Hosts the pattern cannot match, such as a single-label
     * host like `localhost`, are returned as-is instead of reading an
     * undefined match. The value is cached in $this->domain.
     *
     * @return string empty string when the effective URL has no host
     */
    public function getDomain()
    {
        if (!isset($this->domain)) {
            $host = parse_url($this->response->getEffectiveUrl(), PHP_URL_HOST);
            // parse_url() gives null (no host) or false (malformed URL) here.
            $host = is_string($host) ? $host : '';

            $matched = preg_match("/[^\.\/]+(\.com?)?\.[^\.\/]+$/", $host, $match);
            $this->domain = 1 === $matched ? $match[0] : $host;
        }

        return $this->domain;
    }
247
248
    /**
     * Ask Indexable whether this harvest is indexable for a given bot.
     *
     * @param string $userAgent bot name handed to Indexable::isIndexable()
     *
     * @return int corresponds to a const from Indexable
     */
    public function isIndexable(string $userAgent = 'googlebot')
    {
        $status = Indexable::isIndexable($this, $userAgent);

        return $status;
    }
255
256 12
    /**
     * Return `scheme://host` for the effective URL of the response.
     *
     * Computed once through getDomainAndSchemeFrom() and cached in
     * $this->domainWithScheme.
     *
     * @return string
     */
    public function getDomainAndScheme()
    {
        if (null !== $this->domainWithScheme) {
            return $this->domainWithScheme;
        }

        $effectiveUrl = $this->response->getEffectiveUrl();
        $this->domainWithScheme = self::getDomainAndSchemeFrom($effectiveUrl);

        return $this->domainWithScheme;
    }
264
265 12
    /**
     * Reduce a URL to its `scheme://host` part.
     *
     * Port, path, query and fragment are dropped. The URL is expected to
     * contain both a scheme and a host; no check is made for their presence.
     *
     * @param string $url
     *
     * @return string
     */
    public static function getDomainAndSchemeFrom(string $url)
    {
        $parts = parse_url($url);

        return sprintf('%s://%s', $parts['scheme'], $parts['host']);
    }
271
}
272