Link::__construct()   A
last analyzed

Complexity

Conditions 3
Paths 4

Size

Total Lines 13
Code Lines 6

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 7
CRAP Score 3

Importance

Changes 2
Bugs 1 Features 0
Metric Value
eloc 6
c 2
b 1
f 0
dl 0
loc 13
ccs 7
cts 7
cp 1
rs 10
cc 3
nc 4
nop 4
crap 3
1
<?php
2
3
/**
4
 * Entity.
5
 */
6
7
namespace PiedWeb\UrlHarvester;
8
9
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
10
11
/**
 * A link (anchor, resource reference or redirection) found in a harvested document.
 *
 * Holds the target Url, the document it was found in, the DOM element that
 * carried it (when there is one) and the kind of "wrapper" it came from.
 */
class Link
{
    /** @var Url Target of the link, built from the normalized URL string. */
    protected $url;

    /** @var string|null Cached anchor text; only computed when a DOM element is given. */
    protected $anchor;

    /** @var \DOMElement|null Element the link was extracted from (null for redirections). */
    protected $element;

    /** @var Harvest Document in which the link was found. */
    protected $parentDoc;

    /** @var int|null One of the wrapper related LINK_* constants, or null when unknown. */
    protected $wrapper;

    /** @var int */
    protected $type;

    // Would describe where the link sits: in a list, in a sentence...
    protected $context;

    // wrapper related
    public const LINK_A = 1;

    public const LINK_SRC = 4;

    public const LINK_3XX = 2;

    public const LINK_301 = 3;

    // type related
    public const LINK_SELF = 1;

    public const LINK_INTERNAL = 2;

    public const LINK_SUB = 3;

    public const LINK_EXTERNAL = 4;

    /**
     * Trim the URL and add the missing trailing slash after a bare authority.
     *
     * Eg: 'https://piedweb.com' => 'https://piedweb.com/',
     *     'https://piedweb.com?a=1' => 'https://piedweb.com/?a=1'
     *     and '/test ' => '/test'.
     *
     * @param string $url URL as found in the document
     *
     * @return string the normalized URL
     */
    public static function normalizeUrl(string $url): string
    {
        $url = trim($url);

        // Anchored on purpose: only "scheme:/host" or "scheme://host" without any path
        // qualifies. The slash is inserted right after the authority so that a query
        // string or a fragment following the host is left intact.
        $normalized = preg_replace('@^([^:/?#]+://?[^/?#]+)(?=\z|[?#])@', '$1/', $url, 1);

        // preg_replace() returns null on PCRE failure; fall back to the trimmed input.
        return $normalized ?? $url;
    }

    /**
     * Guess the wrapper type from the element carrying the link.
     *
     * @return int|null self::LINK_A for an <a> with a non-empty href, self::LINK_SRC for
     *                  any element with a non-empty src, null otherwise
     */
    protected static function getWrapperFromElement(\DOMElement $element): ?int
    {
        // getAttribute() returns '' when the attribute is missing. Compare against ''
        // rather than relying on truthiness, otherwise the valid relative URL "0" is lost.
        if ('a' === $element->tagName && '' !== $element->getAttribute('href')) {
            return self::LINK_A;
        }

        if ('' !== $element->getAttribute('src')) {
            return self::LINK_SRC;
        }

        return null;
    }

    /**
     * Always submit an absolute URL!
     *
     * @param string           $url     absolute URL the link points to
     * @param Harvest          $parent  document in which the link was found
     * @param \DOMElement|null $element element carrying the link, if any
     * @param int|null         $wrapper wrapper related LINK_* constant; when null it is
     *                                  guessed from $element (and stays null without one)
     */
    public function __construct(string $url, Harvest $parent, ?\DOMElement $element = null, ?int $wrapper = null)
    {
        $this->url = new Url(self::normalizeUrl($url));

        $this->parentDoc = $parent;

        if (null !== $element) {
            $this->setAnchor($element);
        }

        $this->element = $element;

        if (null === $wrapper && null !== $element) {
            $wrapper = self::getWrapperFromElement($element);
        }

        $this->wrapper = $wrapper;
    }

    /**
     * Build a link that represents a redirection (there is no DOM element in that case).
     *
     * @param int|null $redirType wrapper related LINK_* constant, defaults to self::LINK_3XX
     */
    public static function createRedirection(string $url, Harvest $parent, ?int $redirType = null): self
    {
        return new self($url, $parent, null, $redirType ?? self::LINK_3XX);
    }

    /**
     * @return int|null wrapper related LINK_* constant, null when it could not be determined
     */
    public function getWrapper(): ?int
    {
        return $this->wrapper;
    }

    /**
     * Compute and cache the anchor text of the given element.
     *
     * @return $this
     */
    protected function setAnchor(\DOMElement $element)
    {
        // Get classic text anchor
        $anchor = (string) $element->textContent;

        // If there is no visible text (whitespace only counts as nothing), then maybe
        // we can get an alternative text (eg: img)
        if ('' === trim($anchor)) {
            $alt = (new DomCrawler($element))->filter('*[alt]');
            if ($alt->count() > 0) {
                $anchor = $alt->eq(0)->attr('alt') ?? '';
            }
        }

        // Keep at most 99 characters (totally subjective limit).
        // Prefer mb_substr() so a multi-byte UTF-8 character is never cut in half;
        // fall back to the byte-wise substr() when mbstring is not available.
        $anchor = Helper::clean($anchor);
        $this->anchor = \function_exists('mb_substr')
            ? mb_substr($anchor, 0, 99, 'UTF-8')
            : substr($anchor, 0, 99);

        return $this;
    }

    public function getUrl(): Url
    {
        return $this->url;
    }

    /**
     * @return string whatever Url::getDocumentUrl() returns for the target
     *                (presumably the URL without its fragment - confirm in Url)
     */
    public function getPageUrl(): string
    {
        return $this->url->getDocumentUrl();
    }

    /**
     * @return Url URL of the document in which the link was found
     */
    public function getParentUrl(): Url
    {
        return $this->parentDoc->getUrl();
    }

    /**
     * @return string|null cached anchor text, null when the link has no DOM element
     */
    public function getAnchor()
    {
        return $this->anchor;
    }

    /**
     * @return \DOMElement|null
     */
    public function getElement()
    {
        return $this->element;
    }

    /**
     * Tell whether the link may be followed.
     *
     * @return bool false when the parent document forbids following its links or when
     *              the element's rel attribute contains nofollow, sponsored or ugc
     */
    public function mayFollow()
    {
        // check meta robots and headers
        if (null !== $this->parentDoc && ! $this->parentDoc->mayFollow()) {
            return false;
        }

        if (null === $this->element) {
            return true;
        }

        // check "wrapper" rel; rel keywords are case-insensitive, hence the "i" flag
        $rel = $this->element->getAttribute('rel');

        return '' === $rel || 1 !== preg_match('/nofollow|sponsored|ugc/i', $rel);
    }

    /**
     * @return string|null value of the rel attribute ('' when the element has none),
     *                     null when the link has no DOM element
     */
    public function getRelAttribute(): ?string
    {
        if (null === $this->element) {
            return null;
        }

        return $this->element->getAttribute('rel');
    }

    /**
     * @return bool true when the target and the parent document share the same origin
     */
    public function isInternalLink(): bool
    {
        return $this->url->getOrigin() == $this->getParentUrl()->getOrigin();
    }

    /**
     * @return bool true when the origin differs from the parent's but the registrable
     *              domain is the same
     */
    public function isSubLink(): bool
    {
        return ! $this->isInternalLink()
            && $this->url->getRegistrableDomain() == $this->getParentUrl()->getRegistrableDomain();
    }

    /**
     * @return bool true when the link is internal and both document URLs are equal
     */
    public function isSelfLink(): bool
    {
        return $this->isInternalLink()
            && $this->url->getDocumentUrl() == $this->getParentUrl()->getDocumentUrl();
    }

    /**
     * Classify the link relatively to its parent document.
     *
     * @return int one of self::LINK_SELF, self::LINK_INTERNAL, self::LINK_SUB
     *             or self::LINK_EXTERNAL
     */
    public function getType()
    {
        // Order matters: a self link is also an internal one.
        if ($this->isSelfLink()) {
            return self::LINK_SELF;
        }

        if ($this->isInternalLink()) {
            return self::LINK_INTERNAL;
        }

        if ($this->isSubLink()) {
            return self::LINK_SUB;
        }

        return self::LINK_EXTERNAL;
    }
}
215