Passed
Push — master ( 9abeb8...36c770 )
by Dev
13:09
created

Link::getPageUrl()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 2

Importance

Changes 0
Metric Value
eloc 1
c 0
b 0
f 0
dl 0
loc 3
ccs 0
cts 0
cp 0
rs 10
cc 1
nc 1
nop 0
crap 2
1
<?php
2
3
/**
4
 * Entity.
5
 */
6
7
namespace PiedWeb\UrlHarvester;
8
9
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
10
11
/**
 * A link (anchor, embedded resource or HTTP redirection) found while harvesting a document.
 *
 * Holds the normalized target Url, the Harvest it was found in and, when the
 * link comes from markup, the DOM element that carries it.
 */
class Link
{
    /** @var Url Normalized target of the link */
    protected $url;

    /** @var string|null Cleaned anchor text, filled by setAnchor() when an element is given */
    protected $anchor;

    /** @var \DOMElement|null Element the link was extracted from (null for redirections) */
    protected $element;

    /** @var Harvest Document the link was found in */
    protected $parentDoc;

    /** @var int|null One of the "wrapper related" constants below, null when unknown */
    protected $wrapper;

    /** @var int Not read or written by this class (getType() computes its result on each call) */
    protected $type;

    // Meant to describe where the link sits (in a list, in a sentence...).
    // Not read or written by this class.
    protected $context;

    // wrapper related
    public const LINK_A = 1;
    public const LINK_SRC = 4;
    public const LINK_3XX = 2;
    public const LINK_301 = 3;

    // type related
    public const LINK_SELF = 1;
    public const LINK_INTERNAL = 2;
    public const LINK_SUB = 3;
    public const LINK_EXTERNAL = 4;

    /**
     * Trim the URL and give a bare origin an explicit root path.
     *
     * Eg: `https://piedweb.com` => `https://piedweb.com/`, `'/test '` => `'/test'`
     * and `https://piedweb.com?q=1` => `https://piedweb.com/?q=1`.
     *
     * The slash is inserted right after the authority, never after a query
     * string or fragment, and URLs that already have a path are left alone
     * even when their path or query embeds another URL.
     *
     * @param string $url URL to normalize
     *
     * @return string the trimmed URL, with `/` added when the path was empty
     */
    public static function normalizeUrl(string $url): string
    {
        $url = trim($url);

        // Historical behaviour kept for callers: an empty input becomes the root path.
        if ('' === $url) {
            return '/';
        }

        // Group 1: "<scheme>:/" or "<scheme>://" followed by an authority and no path.
        // Group 2: optional query string and/or fragment.
        if (1 === preg_match('@^([^/?#]*:/{1,2}[^/?#]+)([?#].*)?$@s', $url, $parts)) {
            return $parts[1].'/'.($parts[2] ?? '');
        }

        return $url;
    }

    /**
     * Guess the wrapper kind from the element's tag and attributes.
     *
     * @param \DomElement $element element to inspect
     *
     * @return int|null LINK_A for an `a` tag with a non-empty href, LINK_SRC for any
     *                  element with a non-empty src, null otherwise
     */
    protected static function getWrapperFromElement(\DomElement $element): ?int
    {
        // getAttribute() returns '' for a missing attribute; compare against ''
        // rather than relying on truthiness so that a value of "0" is accepted.
        if ('a' === $element->tagName && '' !== $element->getAttribute('href')) {
            return self::LINK_A;
        }

        if ('' !== $element->getAttribute('src')) {
            return self::LINK_SRC;
        }

        return null;
    }

    /**
     * Always submit an absolute URL!
     *
     * @param string           $url     absolute URL the link points to
     * @param Harvest          $parent  document the link was found in
     * @param \DOMElement|null $element element carrying the link, if any
     * @param int|null         $wrapper wrapper constant; deduced from $element when null
     */
    public function __construct(string $url, Harvest $parent, ?\DOMElement $element = null, ?int $wrapper = null)
    {
        $this->url = new Url(self::normalizeUrl($url));
        $this->parentDoc = $parent;

        if (null !== $element) {
            $this->setAnchor($element);
        }

        $this->element = $element;

        $this->wrapper = $wrapper ?? (null !== $element ? self::getWrapperFromElement($element) : null);
    }

    /**
     * Build a link that represents an HTTP redirection (no DOM element).
     *
     * @param string   $url       absolute redirection target
     * @param Harvest  $parent    document that issued the redirection
     * @param int|null $redirType wrapper constant, LINK_3XX when null
     */
    public static function createRedirection(string $url, Harvest $parent, ?int $redirType = null): self
    {
        return new self($url, $parent, null, $redirType ?? self::LINK_3XX);
    }

    /**
     * @return int|null one of the wrapper constants, or null when it could not be determined
     */
    public function getWrapper(): ?int
    {
        return $this->wrapper;
    }

    /**
     * Compute and store the anchor text for the given element.
     *
     * Uses the element's text content; when that is blank, falls back to the
     * `alt` attribute of the first descendant that has one (eg: an image).
     *
     * @param \DomElement $element element carrying the link
     *
     * @return $this
     */
    protected function setAnchor(\DomElement $element)
    {
        // Get classic text anchor
        $anchor = $element->textContent;

        // Whitespace-only text (typically indentation around an <img>) counts as
        // "no text"; a literal "0" is real text and must be kept.
        if ('' === trim($anchor)) {
            $alt = (new DomCrawler($element))->filter('*[alt]');
            if ($alt->count() > 0) {
                $anchor = $alt->eq(0)->attr('alt') ?? '';
            }
        }

        // Keep at most 99 characters (totally subjective). The cut is
        // character-wise so a multibyte UTF-8 sequence is never split.
        $this->anchor = mb_substr(Helper::clean($anchor), 0, 99, 'UTF-8');

        return $this;
    }

    /**
     * @param bool $string unused; kept so existing callers that pass an argument keep working
     *
     * @return Url the normalized target of the link
     */
    public function getUrl(/** @scrutinizer ignore-unused */ $string = false): Url
    {
        return $this->url;
    }

    /**
     * @return string the value of Url::getDocumentUrl() for the link target
     */
    public function getPageUrl(): string
    {
        return $this->url->getDocumentUrl();
    }

    /**
     * @return Url the URL of the document the link was found in
     */
    public function getParentUrl(): Url
    {
        return $this->parentDoc->getUrl();
    }

    /**
     * @return string|null cleaned anchor text, null when the link has no element
     */
    public function getAnchor()
    {
        return $this->anchor;
    }

    /**
     * @return \DOMElement|null element carrying the link, null for redirections
     */
    public function getElement()
    {
        return $this->element;
    }

    /**
     * Tell whether the link may be followed.
     *
     * @return bool false when the parent document forbids following or when the
     *              element's rel attribute contains nofollow, sponsored or ugc
     */
    public function mayFollow()
    {
        // check meta robots and headers
        if (null !== $this->parentDoc && !$this->parentDoc->mayFollow()) {
            return false;
        }

        // check "wrapper" rel
        if (null !== $this->element) {
            // '' when the attribute is missing, so no null check is needed.
            $rel = $this->element->getAttribute('rel');

            // rel keywords are case-insensitive, hence stripos().
            foreach (['nofollow', 'sponsored', 'ugc'] as $keyword) {
                if (false !== stripos($rel, $keyword)) {
                    return false;
                }
            }
        }

        return true;
    }

    /**
     * @return string|null the element's rel attribute ('' when the attribute is
     *                     missing), or null when the link has no element
     */
    public function getRelAttribute(): ?string
    {
        return null !== $this->element ? $this->element->getAttribute('rel') : null;
    }

    /**
     * @return bool true when the link target and the parent document share the same origin
     */
    public function isInternalLink(): bool
    {
        return $this->url->getOrigin() == $this->getParentUrl()->getOrigin();
    }

    /**
     * @return bool true when the origins differ but the registrable domains are equal
     */
    public function isSubLink(): bool
    {
        return !$this->isInternalLink()
            && $this->url->getRegistrableDomain() == $this->getParentUrl()->getRegistrableDomain();
    }

    /**
     * @return bool true when the link is internal and targets the parent's own document URL
     */
    public function isSelfLink(): bool
    {
        return $this->isInternalLink()
            && $this->url->getDocumentUrl() == $this->getParentUrl()->getDocumentUrl();
    }

    /**
     * Classify the link relative to its parent document.
     *
     * @return int LINK_SELF, LINK_INTERNAL, LINK_SUB or LINK_EXTERNAL (first match wins, in that order)
     */
    public function getType()
    {
        if ($this->isSelfLink()) {
            return self::LINK_SELF;
        }

        if ($this->isInternalLink()) {
            return self::LINK_INTERNAL;
        }

        if ($this->isSubLink()) {
            return self::LINK_SUB;
        }

        return self::LINK_EXTERNAL;
    }

    // todo useless ?!
    /**
     * Strip the parent's origin from the front of an internal link.
     *
     * @return string|null the remainder of the URL after the origin, or null when
     *                     the link is not internal
     */
    public function getAbsoluteInternalLink()
    {
        if (!$this->isInternalLink()) {
            return null;
        }

        // Explicit cast: the Url object is converted to its string form before slicing.
        return substr((string) $this->url, strlen($this->getParentUrl()->getOrigin()));
    }
}
220