Passed
Push — master ( f60a2c...2d7c5c )
by Dev
10:34 queued 19s
created

Harvest::getDomain()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 9
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 6
CRAP Score 2

Importance

Changes 0
Metric Value
cc 2
eloc 5
nc 2
nop 0
dl 0
loc 9
ccs 6
cts 6
cp 1
crap 2
rs 10
c 0
b 0
f 0
1
<?php
2
3
namespace PiedWeb\UrlHarvester;
4
5
use PiedWeb\Curl\Response;
6
use PiedWeb\TextAnalyzer\Analyzer as TextAnalyzer;
7
use phpuri;
8
use simple_html_dom;
9
10
class Harvest
11
{
12
    use HarvestLinksTrait;
13
14
    /**
15
     * @var Response
16
     */
17
    protected $response;
18
19
    /**
20
     * @var simple_html_dom
21
     */
22
    protected $dom;
23
24
    /** @var string */
25
    protected $baseUrl;
26
27
    /** @var string */
28
    protected $domain;
29
30 9
    /**
     * Request a URL and wrap the response in a Harvest.
     *
     * @param string $url       URL handed to Request::make()
     * @param string $userAgent user agent handed to Request::make()
     * @param string $language  language preference handed to Request::make()
     * @param bool   $tryHttps  forwarded unchanged to Request::make()
     *
     * @return self|mixed a new Harvest when the request produced a Response,
     *                    otherwise whatever the request's getError() returns
     */
    public static function fromUrl(
        string $url,
        string $userAgent = 'Bot: Url Harvester',
        string $language = 'en,en-US;q=0.5',
        bool   $tryHttps = false
    ) {
        $request = Request::make($url, $userAgent, 'text/html', $language, $tryHttps);
        $result = $request->getResponse();

        // NOTE(review): static analysis flagged this check as always true according to the
        // declared type of getResponse(); kept until that declaration is confirmed.
        if (!($result instanceof Response)) {
            return $request->getError();
        }

        return new self($result);
    }
45
46
    /**
     * Store the response that every other method of this object reads from.
     *
     * @param Response $response response to harvest
     */
    public function __construct(Response $response)
    {
        $this->response = $response;
    }
53
54 3
    /**
     * Give access to the wrapped response.
     *
     * @return Response the response passed to the constructor
     */
    public function getResponse()
    {
        $wrapped = $this->response;

        return $wrapped;
    }
58
59
    /**
     * Resolve the target announced by a `Location` response header.
     *
     * @return mixed the Location value joined onto the effective URL by phpUri,
     *               or false when the response has no Location header
     */
    public function getRedirection()
    {
        $rawHeaders = $this->response->getHeaders();
        // Lower-case the header names so the lookup below ignores their case;
        // a falsy header set is treated as an empty array.
        $lowered = array_change_key_case($rawHeaders ?: []);

        if (!isset($lowered['location'])) {
            return false;
        }

        $currentUrl = $this->response->getEffectiveUrl();

        return phpUri::parse($currentUrl)->join($lowered['location']);
    }
69
70 6
    /**
     * Return the parsed DOM of the response content, building it on first use.
     *
     * @return simple_html_dom
     */
    public function getDom()
    {
        if (null !== $this->dom) {
            return $this->dom;
        }

        // The parser is stored before loading, so later calls reuse this instance.
        $this->dom = new simple_html_dom();
        $content = $this->response->getContent();
        $this->dom->load($content);

        return $this->dom;
    }
79
80 3
    /**
     * Run a selector against the parsed DOM.
     *
     * @param string   $selector selector passed to the DOM's find()
     * @param int|null $number   index passed to the DOM's find(); null by default
     *
     * @return mixed whatever the DOM's find() returns
     */
    private function find($selector, $number = null)
    {
        $dom = $this->getDom();

        return $dom->find($selector, $number);
    }
84
85 3
    /**
     * Run a selector against the parsed DOM asking for index 0 only.
     *
     * @param string $selector selector passed to the DOM's find()
     *
     * @return mixed whatever the DOM's find() returns for index 0
     */
    private function findOne($selector)
    {
        $firstIndex = 0;

        return $this->getDom()->find($selector, $firstIndex);
    }
89
90
    /**
     * Return the content inside the first element matching a selector.
     *
     * @param string $selector selector used to look the element up
     *
     * @return string|null the element's innertext passed through Helper::clean(),
     *                     or null when the lookup returns null
     */
    public function getTag($selector)
    {
        $element = $this->findOne($selector);

        if (null === $element) {
            return null;
        }

        return Helper::clean($element->innertext);
    }
101
102
    /**
     * Return the content of a tag that is expected to appear only once.
     *
     * @param string $selector selector used to look the elements up; `title` by default
     *
     * @return string|null the single element's innertext passed through Helper::clean(),
     *                     a "<count> `<selector>` !!" marker when several elements match,
     *                     or null when nothing matches
     */
    public function getUniqueTag($selector = 'title')
    {
        $matches = $this->find($selector);

        if (!$matches) {
            return null;
        }

        $total = count($matches);
        if ($total > 1) {
            // Several matches: report how many were found instead of picking one.
            return $total.' `'.$selector.'` !!';
        }

        return Helper::clean($matches[0]->innertext);
    }
113
114
    /**
     * Return the content attribute of a named meta tag.
     *
     * @param string $name value of the meta tag's name attribute
     *
     * @return string|null the content attribute passed through Helper::clean(),
     *                     an empty string when the tag has no content attribute,
     *                     or null when the lookup returns null
     */
    public function getMeta(string $name)
    {
        $tag = $this->findOne('meta[name='.$name.']');

        if (null === $tag) {
            return null;
        }

        if (!isset($tag->content)) {
            return '';
        }

        return Helper::clean($tag->content);
    }
125
126
    /**
     * Return the href attribute of the `link rel=canonical` tag.
     *
     * @return string|null the href attribute, an empty string when the tag has no href,
     *                     or null when the tag does not exist
     */
    public function getCanonical()
    {
        $link = $this->findOne('link[rel=canonical]');

        if (null === $link) {
            return null;
        }

        if (isset($link->href)) {
            return $link->href;
        }

        return '';
    }
137
138
    /**
     * Tell whether the canonical href is exactly the response's effective URL.
     *
     * The comparison is strict: a missing canonical tag (null) never equals an
     * effective URL, even an empty one, and no numeric-string juggling applies.
     *
     * @return bool
     */
    public function isCanonicalCorrect()
    {
        return $this->response->getEffectiveUrl() === $this->getCanonical();
    }
145
146 3
    /**
     * Analyse the response content and return expressions found in it.
     *
     * @return mixed whatever the analysis' getExpressions(10) returns
     *               (presumably the 10 main expressions — confirm against TextAnalyzer)
     */
    public function getKws()
    {
        $content = $this->response->getContent();

        $analysis = TextAnalyzer::get(
            $content,
            true, // only sentences
            1,    // no expression, just words
            0     // keep trail
        );

        return $analysis->getExpressions(10);
    }
157
158
    /**
     * Compute the text-to-code ratio of the page as a rounded percentage.
     *
     * The byte length of the DOM's plain text is divided by the byte length of
     * the response content after Helper::clean().
     *
     * @return int the rounded percentage, or 0 when the cleaned content is empty
     */
    public function getRatioTxtCode(): int
    {
        $textLength = strlen($this->getDom()->plaintext);
        $htmlLength = strlen(Helper::clean($this->response->getContent()));

        // Nothing to divide by: report a zero ratio.
        if ($htmlLength <= 0) {
            return 0;
        }

        return (int) round($textLength / $htmlLength * 100);
    }
168
169
    /**
     * Extract the page's breadcrumb through ExtractBreadcrumb::get().
     *
     * @return array|null per the original documentation: an array of objects holding
     *                    a link and an anchor, or null when no breadcrumb was found
     */
    public function getBreadCrumb()
    {
        $content = $this->response->getContent();
        $baseUrl = $this->getBaseUrl();
        $currentUrl = $this->response->getEffectiveUrl();

        return ExtractBreadcrumb::get($content, $baseUrl, $currentUrl);
    }
182
183 3
    /**
     * Return the base URL of the page, computing it on first use.
     *
     * The href of the `base` tag is used when it is present and passes
     * FILTER_VALIDATE_URL; otherwise the response's effective URL is used.
     *
     * @return string
     */
    public function getBaseUrl()
    {
        if (isset($this->baseUrl)) {
            return $this->baseUrl;
        }

        $baseTag = $this->findOne('base');
        $hasValidHref = null !== $baseTag
            && isset($baseTag->href)
            && filter_var($baseTag->href, FILTER_VALIDATE_URL);

        $this->baseUrl = $hasValidHref ? $baseTag->href : $this->response->getEffectiveUrl();

        return $this->baseUrl;
    }
196
197 9
    /**
     * Return the domain of the response's effective URL, computing it on first use.
     *
     * The host is reduced to its last two labels, or to its last three when the
     * second-to-last label is `co` or `com` (`www.example.co.uk` gives
     * `example.co.uk`). A host the pattern cannot reduce, such as the
     * single-label `localhost`, is returned unchanged.
     *
     * @return string|null the domain, or null when the effective URL has no host
     */
    public function getDomain()
    {
        if (isset($this->domain)) {
            return $this->domain;
        }

        // Ask parse_url() for the host only: it yields null or false instead of an
        // array with a missing key when the URL is relative or malformed.
        $host = parse_url((string) $this->response->getEffectiveUrl(), PHP_URL_HOST);
        if (!is_string($host) || '' === $host) {
            // Nothing is cached here, so a later call evaluates the URL again.
            return null;
        }

        $found = preg_match("/[^\.\/]+(\.com?)?\.[^\.\/]+$/", $host, $match);

        // Fall back to the full host instead of reading a non-existent match.
        $this->domain = 1 === $found ? $match[0] : $host;

        return $this->domain;
    }
207
}
208