Passed
Push — master ( 92084b...91cbe9 )
by Dev
13:57 queued 12:21
created

Link::getWrapper()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 2

Importance

Changes 0
Metric Value
eloc 1
c 0
b 0
f 0
dl 0
loc 3
ccs 0
cts 2
cp 0
rs 10
cc 1
nc 1
nop 0
crap 2
1
<?php
2
3
/**
4
 * Entity.
5
 */
6
7
namespace PiedWeb\UrlHarvester;
8
9
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
10
11
/**
 * A link found while harvesting a document: its target URL, the element that
 * carried it (when there is one), and the document it was found in.
 */
class Link
{
    /** @var Url Target of the link, built from the normalized URL given to the constructor */
    protected $url;

    /** @var string|null Cached anchor text; only set when a DOM element was supplied */
    protected $anchor;

    /** @var \DOMElement|null Element the link was extracted from, if any */
    protected $element;

    /** @var Harvest Document in which the link was found */
    protected $parentDoc;

    /** @var int|null One of the "wrapper related" LINK_* constants, or null when undetermined */
    protected $wrapper;

    /** @var int Not written by this class; getType() computes the type on demand */
    protected $type;

    // Would describe where the link sits: in a list, in a sentence... (not assigned in this class)
    protected $context;

    // Wrapper related: how the link was expressed in the parent document.
    public const LINK_A = 1;   // <a> element with an href attribute
    public const LINK_SRC = 4; // any element with a src attribute
    public const LINK_3XX = 2; // redirection (default used by createRedirection())
    public const LINK_301 = 3;

    // Type related: relation between the link target and the parent document (see getType()).
    public const LINK_SELF = 1;
    public const LINK_INTERNAL = 2;
    public const LINK_SUB = 3;
    public const LINK_EXTERNAL = 4;

    /**
     * Trim the URL and add a trailing slash when nothing follows the host.
     *
     * Eg: 'https://piedweb.com' => 'https://piedweb.com/' and '/test ' => '/test'.
     *
     * @param string $url Raw URL, possibly surrounded by whitespace
     *
     * @return string The normalized URL
     */
    public static function normalizeUrl(string $url): string
    {
        $url = trim($url);

        // Strip everything up to and including "scheme://host"; if nothing is left,
        // the URL has no path and gets the root slash appended.
        $afterHost = preg_replace('@(.*\://?([^\/]+))@', '', $url);
        if ('' === $afterHost) {
            $url .= '/';
        }

        return $url;
    }

    /**
     * Deduce the wrapper constant from the element carrying the link.
     *
     * @param \DOMElement $element Element the link was read from
     *
     * @return int|null self::LINK_A for <a href>, self::LINK_SRC for an element with
     *                  a non-empty src attribute, null otherwise
     */
    protected static function getWrapperFromElement(\DOMElement $element): ?int
    {
        // getAttribute() returns '' when the attribute is missing; compare against ''
        // explicitly so that a value such as "0" is not mistaken for "absent".
        if ('a' === $element->tagName && '' !== $element->getAttribute('href')) {
            return self::LINK_A;
        }

        if ('' !== $element->getAttribute('src')) {
            return self::LINK_SRC;
        }

        return null;
    }

    /**
     * Always submit an absolute URL!
     *
     * @param string           $url     Absolute URL of the link target (normalized here)
     * @param Harvest          $parent  Document in which the link was found
     * @param \DOMElement|null $element Element carrying the link; used for the anchor text
     *                                  and, when $wrapper is null, to deduce the wrapper
     * @param int|null         $wrapper One of the "wrapper related" LINK_* constants
     */
    public function __construct(string $url, Harvest $parent, ?\DOMElement $element = null, ?int $wrapper = null)
    {
        $this->url = new Url(self::normalizeUrl($url));
        $this->parentDoc = $parent;
        $this->element = $element;

        if (null !== $element) {
            $this->setAnchor($element);
        }

        // An explicit wrapper wins; otherwise deduce it from the element when there is one.
        $this->wrapper = $wrapper ?? (null !== $element ? self::getWrapperFromElement($element) : null);
    }

    /**
     * Build a link that represents a redirection (no DOM element involved).
     *
     * @param string   $url       Absolute URL of the redirection target
     * @param Harvest  $parent    Document that redirects
     * @param int|null $redirType Wrapper constant to use; defaults to self::LINK_3XX
     *
     * @return self
     */
    public static function createRedirection(string $url, Harvest $parent, ?int $redirType = null): self
    {
        return new self($url, $parent, null, $redirType ?? self::LINK_3XX);
    }

    /**
     * @return int|null The wrapper constant set at construction, or null when undetermined
     */
    public function getWrapper(): ?int
    {
        return $this->wrapper;
    }

    /**
     * Compute and cache the anchor text for the given element.
     *
     * Uses the element's text content, falling back to the first `alt` attribute
     * found through the DomCrawler `*[alt]` filter (eg: a linked image) when the
     * text is blank. The cleaned result is limited to 99 characters.
     *
     * @param \DOMElement $element Element carrying the link
     *
     * @return $this
     */
    protected function setAnchor(\DOMElement $element)
    {
        // Get classic text anchor
        $this->anchor = $element->textContent;

        // If we get nothing (blank text), maybe there is an alternative text (eg: img).
        // A strict comparison is used rather than empty() so that an anchor of "0" is kept.
        if ('' === trim($this->anchor)) {
            $alt = (new DomCrawler($element))->filter('*[alt]');
            if ($alt->count() > 0) {
                $this->anchor = $alt->eq(0)->attr('alt') ?? '';
            }
        }

        // Length limit is totally subjective. Cut on characters, not bytes, whenever
        // mbstring is available so a multi-byte UTF-8 character is never split.
        $cleaned = Helper::clean($this->anchor);
        $this->anchor = \function_exists('mb_substr')
            ? \mb_substr($cleaned, 0, 99, 'UTF-8')
            : substr($cleaned, 0, 99);

        return $this;
    }

    /**
     * @return Url The link target
     */
    public function getUrl(): Url
    {
        return $this->url;
    }

    /**
     * @return string Whatever Url::getDocumentUrl() returns for the target
     *                (presumably the URL without its fragment - confirm against Url)
     */
    public function getPageUrl(): string
    {
        return $this->url->getDocumentUrl();
    }

    /**
     * @return Url URL of the document in which the link was found
     */
    public function getParentUrl(): Url
    {
        return $this->parentDoc->getUrl();
    }

    /**
     * @return string|null Cached anchor text; null when no element was supplied
     */
    public function getAnchor()
    {
        return $this->anchor;
    }

    /**
     * @return \DOMElement|null Element carrying the link, if any
     */
    public function getElement()
    {
        return $this->element;
    }

    /**
     * Tell whether the link may be followed.
     *
     * @return bool false when the parent document forbids following, or when the
     *              element's rel attribute contains nofollow, sponsored or ugc
     *              (compared case-insensitively); true otherwise
     */
    public function mayFollow()
    {
        // Document-level rule, delegated to the parent (the original note mentions
        // meta robots and headers).
        if (null !== $this->parentDoc && !$this->parentDoc->mayFollow()) {
            return false;
        }

        // No element (eg: a redirection): nothing more to check.
        if (null === $this->element) {
            return true;
        }

        // Link-level rule: rel keywords are ASCII case-insensitive in HTML, hence the i flag.
        return 1 !== preg_match('/nofollow|sponsored|ugc/i', $this->element->getAttribute('rel'));
    }

    /**
     * @return string|null The element's rel attribute ('' when the attribute is
     *                     missing), or null when the link has no element
     */
    public function getRelAttribute(): ?string
    {
        return null !== $this->element ? $this->element->getAttribute('rel') : null;
    }

    /**
     * @return bool true when the target and the parent document share the same origin
     */
    public function isInternalLink(): bool
    {
        return $this->url->getOrigin() == $this->getParentUrl()->getOrigin();
    }

    /**
     * @return bool true when the origins differ but the registrable domain is the same
     */
    public function isSubLink(): bool
    {
        return !$this->isInternalLink()
            && $this->url->getRegistrableDomain() == $this->getParentUrl()->getRegistrableDomain();
    }

    /**
     * @return bool true when the link is internal and points to the parent's own document URL
     */
    public function isSelfLink(): bool
    {
        return $this->isInternalLink()
            && $this->url->getDocumentUrl() == $this->getParentUrl()->getDocumentUrl();
    }

    /**
     * Classify the link relative to its parent document.
     *
     * @return int One of the "type related" constants, checked in this order:
     *             LINK_SELF, LINK_INTERNAL, LINK_SUB, LINK_EXTERNAL
     */
    public function getType()
    {
        if ($this->isSelfLink()) {
            return self::LINK_SELF;
        }

        if ($this->isInternalLink()) {
            return self::LINK_INTERNAL;
        }

        if ($this->isSubLink()) {
            return self::LINK_SUB;
        }

        return self::LINK_EXTERNAL;
    }

    /**
     * Return the target URL with the parent's origin stripped from its start.
     *
     * todo useless ?!
     *
     * NOTE(review): relies on Url being castable to string - confirm it implements __toString().
     *
     * @return string|null The remainder after the origin for an internal link, null otherwise
     */
    public function getAbsoluteInternalLink()
    {
        if (!$this->isInternalLink()) {
            return null;
        }

        return substr((string) $this->url, strlen($this->getParentUrl()->getOrigin()));
    }
}
217