Passed
Push — master ( d8bc96...81959d )
by Dev
15:46 queued 01:29
created

Link::mayFollow()   A

Complexity

Conditions 6
Paths 4

Size

Total Lines 15
Code Lines 6

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 42

Importance

Changes 1
Bugs 0 Features 0
Metric Value
eloc 6
c 1
b 0
f 0
dl 0
loc 15
ccs 0
cts 0
cp 0
rs 9.2222
cc 6
nc 4
nop 0
crap 42
1
<?php
2
3
/**
4
 * Entity.
5
 */
6
7
namespace PiedWeb\UrlHarvester;
8
9
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
10
11
/**
 * A link found in (or produced by) a harvested document.
 *
 * Holds the target URL, the document it was found in, the DOM element that
 * carried it (when there is one) and a short anchor text extracted from
 * that element.
 */
class Link
{
    /** @var Url Target of the link */
    protected $url;

    /** @var string|null Cached anchor text; only set when a DOM element was given */
    protected $anchor;

    /** @var \DOMElement|null Element the link was extracted from, if any */
    protected $element;

    /** @var Harvest Document in which the link was found */
    protected $parentDoc;

    /** @var int|null One of the "wrapper related" LINK_* constants */
    protected $wrapper;

    /** @var int Not assigned anywhere in this class; see getType() for the computed value */
    protected $type;

    // Would describe where the link sits (in a list, within a sentence...).
    // Not assigned anywhere in this class.
    protected $context;

    // wrapper related
    public const LINK_A = 1;
    public const LINK_SRC = 4;
    public const LINK_3XX = 2;
    public const LINK_301 = 3;

    // type related
    public const LINK_SELF = 1;
    public const LINK_INTERNAL = 2;
    public const LINK_SUB = 3;
    public const LINK_EXTERNAL = 4;

    /**
     * Trim the URL and add a trailing slash when it has no path.
     *
     * Eg: 'https://piedweb.com' => 'https://piedweb.com/' and '/test ' => '/test'.
     * Note that an empty (or whitespace-only) string becomes '/'.
     */
    public static function normalizeUrl(string $url): string
    {
        $url = trim($url);

        // Strip "scheme:/[/]host"; if nothing is left, the URL has no path at all.
        $withoutOrigin = (string) preg_replace('@(.*\://?([^\/]+))@', '', $url);

        return '' === $withoutOrigin ? $url.'/' : $url;
    }

    /**
     * Guess the wrapper constant from the element carrying the link.
     *
     * @return int|null LINK_A for an <a> with a non-empty href, LINK_SRC for any
     *                  element with a non-empty src, null otherwise
     */
    protected static function getWrapperFromElement(\DOMElement $element): ?int
    {
        if ('a' == $element->tagName && $element->getAttribute('href')) {
            return self::LINK_A;
        }

        if ($element->getAttribute('src')) {
            return self::LINK_SRC;
        }

        return null;
    }

    /**
     * Always submit an absolute URL!
     *
     * @param string           $url     Absolute URL of the link target
     * @param Harvest          $parent  Document in which the link was found
     * @param \DOMElement|null $element Element carrying the link, when there is one
     * @param int|null         $wrapper Wrapper constant; guessed from $element when omitted
     */
    public function __construct(string $url, Harvest $parent, ?\DOMElement $element = null, ?int $wrapper = null)
    {
        $this->url = new Url(self::normalizeUrl($url));
        $this->parentDoc = $parent;
        $this->element = $element;

        if (null !== $element) {
            $this->setAnchor($element);
        }

        $this->wrapper = $wrapper ?? (null !== $element ? self::getWrapperFromElement($element) : null);
    }

    /**
     * Build a link that represents a redirection (no DOM element involved).
     *
     * @param int|null $redirType Wrapper constant to use; defaults to LINK_3XX
     */
    public static function createRedirection(string $url, Harvest $parent, ?int $redirType = null): self
    {
        return new self($url, $parent, null, $redirType ?? self::LINK_3XX);
    }

    public function getWrapper(): ?int
    {
        return $this->wrapper;
    }

    /**
     * Extract and cache the anchor text from the element.
     *
     * @return $this
     */
    protected function setAnchor(\DOMElement $element)
    {
        // Get classic text anchor
        $anchor = $element->textContent;

        // If there is no visible text, fall back to the first alternative text
        // found (eg: img). Compare on the trimmed string rather than empty():
        // a literal "0" is a valid anchor, and whitespace-only text (markup
        // indentation around an <img>) must not block the fallback.
        if ('' === trim($anchor)) {
            $alt = (new DomCrawler($element))->filter('*[alt]');
            if ($alt->count() > 0) {
                $anchor = $alt->eq(0)->attr('alt') ?? '';
            }
        }

        // Keep at most 99 characters (totally subjective). DOM strings are UTF-8,
        // so cut on characters, not bytes, to avoid splitting a multi-byte one.
        $this->anchor = mb_substr(Helper::clean($anchor), 0, 99, 'UTF-8');

        return $this;
    }

    public function getUrl(): Url
    {
        return $this->url;
    }

    /**
     * URL of the targeted document as given by Url::getDocumentUrl()
     * (presumably the URL without its #fragment; confirm in Url).
     */
    public function getPageUrl(): string
    {
        return $this->url->getDocumentUrl();
    }

    /**
     * URL of the document in which the link was found.
     */
    public function getParentUrl(): Url
    {
        return $this->parentDoc->getUrl();
    }

    /**
     * @return string|null Anchor text, or null when the link has no element
     */
    public function getAnchor()
    {
        return $this->anchor;
    }

    /**
     * @return \DOMElement|null
     */
    public function getElement()
    {
        return $this->element;
    }

    /**
     * Tell whether a crawler is allowed to follow this link.
     *
     * @return bool false when the parent document forbids following, or when the
     *              element's rel attribute contains nofollow, sponsored or ugc
     */
    public function mayFollow()
    {
        // check meta robots and headers
        if (null !== $this->parentDoc && !$this->parentDoc->mayFollow()) {
            return false;
        }

        // check "wrapper" rel; getAttribute() yields '' when the attribute is missing
        $rel = null !== $this->element ? $this->element->getAttribute('rel') : '';

        // rel keywords are case-insensitive: "NoFollow" must be honoured too
        if ('' !== $rel && 1 === preg_match('/nofollow|sponsored|ugc/i', $rel)) {
            return false;
        }

        return true;
    }

    /**
     * @return string|null Raw rel attribute ('' when absent), or null when the
     *                     link has no element
     */
    public function getRelAttribute(): ?string
    {
        return null !== $this->element ? $this->element->getAttribute('rel') : null;
    }

    /**
     * True when the link and its parent document share the same origin.
     */
    public function isInternalLink(): bool
    {
        return $this->url->getOrigin() == $this->getParentUrl()->getOrigin();
    }

    /**
     * True when the link is not internal but shares the parent's registrable domain.
     */
    public function isSubLink(): bool
    {
        return !$this->isInternalLink()
            && $this->url->getRegistrableDomain() == $this->getParentUrl()->getRegistrableDomain();
    }

    /**
     * True when the link is internal and targets the parent document itself.
     */
    public function isSelfLink(): bool
    {
        return $this->isInternalLink()
            && $this->url->getDocumentUrl() == $this->getParentUrl()->getDocumentUrl();
    }

    /**
     * Classify the link relative to its parent document.
     *
     * @return int One of LINK_SELF, LINK_INTERNAL, LINK_SUB or LINK_EXTERNAL
     */
    public function getType()
    {
        if ($this->isSelfLink()) {
            return self::LINK_SELF;
        }

        if ($this->isInternalLink()) {
            return self::LINK_INTERNAL;
        }

        if ($this->isSubLink()) {
            return self::LINK_SUB;
        }

        return self::LINK_EXTERNAL;
    }

    /**
     * Link URL with the parent's origin removed from its start.
     *
     * todo useless ?!
     *
     * @return string|null null when the link is not internal
     */
    public function getAbsoluteInternalLink()
    {
        if (!$this->isInternalLink()) {
            return null;
        }

        // Relies on Url being convertible to string (the original passed the
        // object straight to substr()) - TODO confirm Url::__toString exists.
        return substr((string) $this->url, strlen($this->getParentUrl()->getOrigin()));
    }
}
217