Passed
Push — master ( 0cac9c...933430 )
by Dev
02:38
created

Harvest::getTextAnalysis()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 12
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 8
CRAP Score 2

Importance

Changes 0
Metric Value
eloc 5
dl 0
loc 12
ccs 8
cts 8
cp 1
rs 10
c 0
b 0
f 0
cc 2
nc 2
nop 0
crap 2
1
<?php
2
3
namespace PiedWeb\UrlHarvester;
4
5
use PiedWeb\Curl\Response;
6
use PiedWeb\TextAnalyzer\Analyzer as TextAnalyzer;
7
use phpuri;
8
use simple_html_dom;
9
use PiedWeb\Curl\Request as CurlRequest;
10
11
class Harvest
12
{
13
    use HarvestLinksTrait, RobotsTxtTrait;
14
15
    // Link-type identifiers. None of them is referenced by the methods visible
    // in this class body — presumably consumed by HarvestLinksTrait (TODO confirm).
    const LINK_SELF = 1;
16
    const LINK_INTERNAL = 2;
17
    const LINK_SUB = 3;
18
    const LINK_EXTERNAL = 4;
19
20
    // Default User-Agent argument of fromUrl() when the caller does not supply one.
    const DEFAULT_USER_AGENT = 'SeoPocketCrawler - Open Source Bot for SEO Metrics';
21
22
    /**
23
     * Response wrapped by this instance; assigned in the constructor.
     *
     * @var Response
24
     */
25
    protected $response;
26
27
    /**
28
     * Parsed document, built lazily by getDom() from the response content.
     *
     * @var simple_html_dom
29
     */
30
    protected $dom;
31
32
    /** @var string cached by getBaseUrl() on first call */
33
    protected $baseUrl;
34
35
    /** @var string cached by getDomain() on first call */
36
    protected $domain;
37
38
    /** @var string cached by getDomainAndScheme() on first call */
39
    private $domainWithScheme;
40
41
    /** @var \PiedWeb\TextAnalyzer\Analysis cached by getTextAnalysis() on first call */
42
    private $textAnalysis;
43
44
    /**
     * Request `$url` and wrap the result in a Harvest.
     *
     * The request itself is delegated to Request::makeFromRequest(). Whatever
     * it returns that is not a Response is handed back to the caller untouched.
     *
     * @param string           $url             URL to request
     * @param string           $userAgent       User-Agent forwarded to the request
     * @param string           $language        language string forwarded to the request
     * @param CurlRequest|null $previousRequest earlier request forwarded to Request::makeFromRequest()
     *
     * @return self|int a Harvest when a Response was obtained, otherwise the raw value returned by the request
     */
    public static function fromUrl(
        string $url,
        string $userAgent = self::DEFAULT_USER_AGENT,
        string $language = 'en,en-US;q=0.5',
        ?CurlRequest $previousRequest = null
    ) {
        $result = Request::makeFromRequest($previousRequest, $url, $userAgent, $language);

        // Not a Response: pass the value through so the caller can inspect it.
        if (!$result instanceof Response) {
            return $result;
        }

        return new self($result);
    }
61
62
    /**
     * Store the response to harvest; the constructor does nothing else.
     *
     * @param Response $response response the accessors of this class read from
     */
    public function __construct(Response $response)
    {
        $this->response = $response;
    }
69
70 15
    /**
     * Give access to the wrapped response.
     *
     * @return Response the instance passed to the constructor
     */
    public function getResponse()
    {
        return $this->response;
    }
74
75 3
    /**
     * Resolve the `Location` header against the effective URL.
     *
     * Header names are lower-cased before the lookup, so the header is found
     * whatever its original casing.
     *
     * @return mixed the value produced by phpUri's join() when a Location header is present, false otherwise
     */
    public function getRedirection()
    {
        $rawHeaders = $this->response->getHeaders();
        $headers = array_change_key_case($rawHeaders ?: []);

        if (!isset($headers['location'])) {
            return false;
        }

        $effectiveUri = phpUri::parse($this->response->getEffectiveUrl());

        return $effectiveUri->join($headers['location']);
    }
85
86 18
    /**
     * Return the parsed document, building it from the response content on first use.
     *
     * @return simple_html_dom the same instance on every call
     */
    public function getDom()
    {
        if (null !== $this->dom) {
            return $this->dom;
        }

        $this->dom = new simple_html_dom();
        $this->dom->load($this->response->getContent());

        return $this->dom;
    }
95
96 15
    /**
     * Forward a selector query to the parsed document.
     *
     * @param string   $selector selector passed to simple_html_dom::find()
     * @param int|null $number   index passed through as-is; null when omitted
     *
     * @return mixed whatever simple_html_dom::find() returns for these arguments
     */
    private function find($selector, $number = null)
    {
        $dom = $this->getDom();

        return $dom->find($selector, $number);
    }
100
101 12
    /**
     * Query the document for the element at index 0 for `$selector`.
     *
     * @param string $selector selector passed to find()
     *
     * @return mixed result of find() with index 0 (callers in this class compare it against null)
     */
    private function findOne($selector)
    {
        $firstIndex = 0;

        return $this->find($selector, $firstIndex);
    }
105
106
    /**
     * Return the cleaned inner content of the first element matching a selector.
     *
     * @param string $selector selector of the element to read
     *
     * @return string|null the `innertext` passed through Helper::clean(), or null when nothing matches
     */
    public function getTag($selector)
    {
        $element = $this->findOne($selector);

        if (null === $element) {
            return null;
        }

        return Helper::clean($element->innertext);
    }
117
118 6
    /**
     * Return the content of a tag that is expected to appear once.
     *
     * @param string $selector selector of the tag, `title` by default
     *
     * @return string|null the cleaned `innertext` when exactly one element matches;
     *                     a "<count> `<selector>` !!" marker when several match;
     *                     null when none matches
     */
    public function getUniqueTag($selector = 'title')
    {
        $matches = $this->find($selector);

        if (!$matches) {
            return null;
        }

        $total = count($matches);

        // Several matches: report the count instead of picking one.
        if ($total > 1) {
            return $total.' `'.$selector.'` !!';
        }

        return Helper::clean($matches[0]->innertext);
    }
129
130
    /**
     * Return the cleaned `content` attribute of the first `<meta name="...">` element.
     *
     * @param string $name value of the meta `name` attribute to look up
     *
     * @return string the content passed through Helper::clean(); an empty string
     *                when the meta or its content attribute is missing
     */
    public function getMeta(string $name)
    {
        $element = $this->findOne('meta[name='.$name.']');

        if (null === $element || !isset($element->content)) {
            return '';
        }

        return Helper::clean($element->content);
    }
141
142
    /**
     * Return the `href` attribute of the `link rel=canonical` tag.
     *
     * @return string|null the href value, an empty string when the tag has no
     *                     href attribute, or null when the tag does not exist
     */
    public function getCanonical()
    {
        $link = $this->findOne('link[rel=canonical]');

        if (null === $link) {
            return null;
        }

        if (isset($link->href)) {
            return $link->href;
        }

        return '';
    }
153
154
    /**
     * Tell whether the canonical URL agrees with the effective URL.
     *
     * A missing or empty canonical counts as correct.
     *
     * @return bool true when there is no canonical or when it equals the effective URL
     */
    public function isCanonicalCorrect()
    {
        $canonical = $this->getCanonical();

        if (!$canonical) {
            return true;
        }

        return $this->response->getEffectiveUrl() == $canonical;
    }
163
164 3
    /**
     * Return the text analysis of the document, computing it on first call only.
     *
     * @return \PiedWeb\TextAnalyzer\Analysis the cached result of TextAnalyzer::get()
     */
    public function getTextAnalysis()
    {
        if (null !== $this->textAnalysis) {
            return $this->textAnalysis;
        }

        // Argument meanings below follow the original author's notes.
        $onlySentences = true; // only sentences
        $expressionSize = 1;   // no expression, just words
        $keepTrail = 0;        // keep trail

        $this->textAnalysis = TextAnalyzer::get($this->getDom(), $onlySentences, $expressionSize, $keepTrail);

        return $this->textAnalysis;
    }
177
178 3
    /**
     * Return the expressions of the text analysis, asking for 10.
     *
     * @return mixed whatever Analysis::getExpressions(10) returns
     */
    public function getKws()
    {
        $analysis = $this->getTextAnalysis();

        return $analysis->getExpressions(10);
    }
182
183
    /**
     * Compute the text-to-code ratio of the page as a rounded percentage.
     *
     * Both sizes are byte lengths: the document's `plaintext` against the
     * response content after Helper::clean().
     *
     * @return int rounded percentage, 0 when the cleaned content is empty
     */
    public function getRatioTxtCode(): int
    {
        $textLength = strlen($this->getDom()->plaintext);
        $htmlLength = strlen(Helper::clean($this->response->getContent()));

        // Avoid dividing by zero on an empty document.
        if ($htmlLength <= 0) {
            return 0;
        }

        return (int) round($textLength / $htmlLength * 100);
    }
193
194
    /**
     * Extract the page breadcrumb.
     *
     * Without a separator the value from ExtractBreadcrumb::get() is returned
     * as-is. With a separator, an array result is flattened into a string of
     * each item's getCleanName() joined by that separator.
     *
     * @param string|null $separator glue used to flatten the breadcrumb, null to keep the raw result
     *
     * @return array|string|null raw extraction result, or the joined string when a separator is
     *                           given and the result is an array
     */
    public function getBreadCrumb(?string $separator = null)
    {
        $breadcrumb = ExtractBreadcrumb::get(
            $this->response->getContent(),
            $this->getBaseUrl(),
            $this->response->getEffectiveUrl()
        );

        if (null === $separator || !is_array($breadcrumb)) {
            return $breadcrumb;
        }

        $names = [];
        foreach ($breadcrumb as $key => $item) {
            $names[$key] = $item->getCleanName();
        }

        return implode($separator, $names);
    }
216
217
    /**
     * Tell whether the response redirects the requested URL to its https:// twin.
     *
     * The requested URL has a leading `http:` swapped for `https:`; the result
     * is compared with the `Location` header (looked up case-insensitively).
     *
     * @return string|false the https URL when the Location header equals it, false otherwise
     */
    public function amIRedirectToHttps()
    {
        $headers = $this->response->getHeaders();
        // Same guard as getRedirection(): any empty/falsy header set (not only
        // null) is treated as "no headers" so array_change_key_case() always
        // receives an array.
        $headers = array_change_key_case($headers ? $headers : []);

        if (!isset($headers['location'])) {
            return false;
        }

        $httpsUrl = preg_replace('#^http:#', 'https:', $this->response->getUrl(), 1);

        // preg_replace() yields null on failure; never report that as a match.
        if (null !== $httpsUrl && $httpsUrl === $headers['location']) {
            return $httpsUrl;
        }

        return false;
    }
232
233 3
    /**
     * Return the base URL of the document, resolved once and then cached.
     *
     * The `href` of the first `<base>` element is used when it passes
     * FILTER_VALIDATE_URL; otherwise the effective URL of the response is used.
     *
     * @return string
     */
    public function getBaseUrl()
    {
        if (isset($this->baseUrl)) {
            return $this->baseUrl;
        }

        $base = $this->findOne('base');
        $hasUsableHref = null !== $base
            && isset($base->href)
            && filter_var($base->href, FILTER_VALIDATE_URL);

        $this->baseUrl = $hasUsableHref ? $base->href : $this->response->getEffectiveUrl();

        return $this->baseUrl;
    }
246
247 6
    /**
     * Return the registrable-looking domain of the effective URL, cached after first use.
     *
     * The host is matched against a pattern keeping its last two labels (three
     * when the middle one is `co`/`com`, e.g. `example.co.uk`). Hosts the
     * pattern cannot match, such as dot-less names like `localhost`, are
     * returned unchanged instead of triggering an undefined-offset warning.
     *
     * @return string|null the domain, or null when the effective URL has no host
     */
    public function getDomain()
    {
        if (!isset($this->domain)) {
            $urlParsed = parse_url($this->response->getEffectiveUrl());
            // parse_url() returns false on seriously malformed URLs and omits
            // `host` for relative ones.
            $host = is_array($urlParsed) && isset($urlParsed['host']) ? $urlParsed['host'] : null;

            if (null !== $host && preg_match("/[^\.\/]+(\.com?)?\.[^\.\/]+$/", $host, $match)) {
                $this->domain = $match[0];
            } else {
                // No match (e.g. "localhost") or no host at all: fall back to the host itself.
                $this->domain = $host;
            }
        }

        return $this->domain;
    }
257
258
    /**
     * Ask Indexable whether this page can be indexed by the given bot.
     *
     * @param string $userAgent bot name forwarded to Indexable::isIndexable(), `googlebot` by default
     *
     * @return int one of the constants declared on Indexable
     */
    public function isIndexable(string $userAgent = 'googlebot')
    {
        $status = Indexable::isIndexable($this, $userAgent);

        return $status;
    }
265
266 12
    /**
     * Return `scheme://host` of the effective URL, computed once and then cached.
     *
     * @return string value built by getDomainAndSchemeFrom()
     */
    public function getDomainAndScheme()
    {
        if (null !== $this->domainWithScheme) {
            return $this->domainWithScheme;
        }

        $effectiveUrl = $this->response->getEffectiveUrl();
        $this->domainWithScheme = self::getDomainAndSchemeFrom($effectiveUrl);

        return $this->domainWithScheme;
    }
274
275 12
    /**
     * Build `scheme://host` from a URL.
     *
     * Missing components (relative URL, or parse_url() failing on a malformed
     * one) are rendered as empty strings rather than raising undefined-index
     * warnings; the resulting string is the same as before for every input.
     *
     * @param string $url URL to split
     *
     * @return string the scheme and host joined by `://`
     */
    public static function getDomainAndSchemeFrom(string $url)
    {
        $parts = parse_url($url);

        // parse_url() returns false on seriously malformed URLs.
        if (!is_array($parts)) {
            $parts = [];
        }

        $scheme = isset($parts['scheme']) ? $parts['scheme'] : '';
        $host = isset($parts['host']) ? $parts['host'] : '';

        return $scheme.'://'.$host;
    }
281
}
282