Passed
Push — master ( b1c15a...560dd8 )
by Dev
19:05 queued 17:27
created

CrawlerUrl::harvestBreadcrumb()   A

Complexity

Conditions 3
Paths 2

Size

Total Lines 7
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 3
eloc 5
nc 2
nop 0
dl 0
loc 7
rs 10
c 1
b 0
f 0
1
<?php
2
3
namespace PiedWeb\SeoPocketCrawler;
4
5
use PiedWeb\UrlHarvester\Harvest;
6
use PiedWeb\UrlHarvester\Indexable;
7
use PiedWeb\UrlHarvester\Link;
8
9
/**
 * Crawls a single Url: fetches it through the harvester, then copies the
 * harvested data (indexability, mime type, links, word count, timings, title)
 * onto the Url object and keeps the internal links found on the page.
 *
 * The harvest is triggered from the constructor.
 */
class CrawlerUrl
{
    /**
     * Lazily populated harvest state:
     * - null  : nothing requested yet,
     * - false : the request did not produce a Harvest (e.g. curl error code),
     * - Harvest: the request succeeded.
     *
     * @var Harvest|false|null
     */
    protected $harvest;

    /** @var Url */
    protected $url;

    /** @var CrawlerConfig */
    protected $config;

    /** @var array internal links from the current Url */
    protected $links = [];

    /**
     * Stores the url and the configuration, then harvests immediately.
     */
    public function __construct(Url $url, CrawlerConfig $config)
    {
        $this->url = $url;
        $this->config = $config;

        $this->harvest();
    }

    /**
     * Runs the harvesting pipeline; stops early when the request failed or
     * when the response is a redirection (both flag the Url as not indexable).
     */
    protected function harvest()
    {
        // Order matters: isRedirection() dereferences the harvester, so the
        // network-error check must short-circuit first.
        if ($this->isNetworkError() || $this->isRedirection()) {
            return null;
        }

        $this->defaultHarvesting();
    }

    /**
     * Permits to easily extend and change what is harvested, for example adding:
     * $this->harvestBreadcrumb();
     * $this->url->setKws(','.implode(',', array_keys($this->getHarvester()->getKws())).','); // Slow ~20%
     * $this->url->setRatioTextCode($this->getHarvester()->getRatioTxtCode()); // Slow ~30%
     * $this->url->setH1($this->getHarvester()->getUniqueTag('h1') ?? '');
     */
    protected function defaultHarvesting()
    {
        $harvester = $this->getHarvester();

        $this->url->setIndexable($harvester->indexable()); // slow ~30%

        $this->url->setMimeType((string) $harvester->getResponse()->getMimeType());

        $this->harvestLinks();

        // Old way: $this->getHarvester()->getTextAnalysis()->getWordNumber();
        $this->url->setWordCount($harvester->getWordCount());

        $this->url->setLoadTime($harvester->getResponse()->getInfo('total_time'));

        $this->url->setSize($harvester->getResponse()->getInfo('size_download'));

        $this->url->setTitle($harvester->getUniqueTag('head title') ?? '');
    }

    /**
     * Flags the Url as a network error when no Harvest could be obtained;
     * otherwise hands the harvest to the recorder cache.
     *
     * @return bool true when the request failed
     */
    protected function isNetworkError()
    {
        $harvester = $this->getHarvester();

        // getHarvester() is nullable, so this check is meaningful even though
        // static analysis may report it as always true.
        if (!$harvester instanceof Harvest) {
            $this->url->setIndexable(Indexable::NOT_INDEXABLE_NETWORK_ERROR);

            return true;
        }

        $this->config->getRecorder()->cache($harvester, $this->url);

        return false;
    }

    /**
     * Flags the Url as a 3XX when the harvester reports a redirection link.
     *
     * @return bool true when the response is a redirection
     */
    protected function isRedirection()
    {
        $redirection = $this->getHarvester()->getRedirectionLink();

        if (!$redirection) {
            return false;
        }

        if ($redirection->isInternalLink()) { // add to $links to permits to update counter & co
            $this->links[] = $redirection;
        }

        $this->url->setIndexable(Indexable::NOT_INDEXABLE_3XX);

        return true;
    }

    /**
     * Records the outbound links, stores the per-type link counters on the Url
     * and keeps the internal links for getLinks().
     */
    protected function harvestLinks()
    {
        $harvester = $this->getHarvester();

        $allLinks = $harvester->getLinks();
        $this->config->getRecorder()->recordOutboundLink($this->url, $allLinks); // ~10%

        $internalLinks = $harvester->getLinks(Link::LINK_INTERNAL);

        $this->url->links = count($allLinks);
        $this->url->links_duplicate = $harvester->getNbrDuplicateLinks();
        $this->url->links_internal = count($internalLinks);
        $this->url->links_self = count($harvester->getLinks(Link::LINK_SELF));
        $this->url->links_sub = count($harvester->getLinks(Link::LINK_SUB));
        $this->url->links_external = count($harvester->getLinks(Link::LINK_EXTERNAL));

        $this->links = $internalLinks;
    }

    /**
     * Returns the harvest for the current Url, requesting it on first call.
     *
     * The outcome (success or failure) is memoized in $this->harvest, so the
     * request is issued at most once per instance.
     *
     * @return Harvest|null null when the request did not produce a Harvest
     */
    public function getHarvester(): ?Harvest
    {
        if (null === $this->harvest) {
            $harvest = Harvest::fromUrl(
                $this->config->getBase().$this->url->getUri(),
                $this->config->getUserAgent(),
                'en,en-US;q=0.5',
                $this->config->getRequestCached()
            );

            if ($harvest instanceof Harvest) {
                $robotsTxt = $this->config->getRobotsTxtCached();
                if (null !== $robotsTxt) {
                    $harvest->setRobotsTxt($robotsTxt);
                }

                $this->harvest = $harvest;
            } else {
                // Anything else (could be an int corresponding to curl error)
                // is remembered as a failure so we neither retry nor leak a
                // non-Harvest value to callers.
                $this->harvest = false;
            }
        }

        return false === $this->harvest ? null : $this->harvest;
    }

    /**
     * @return array the internal links collected while harvesting (plus the
     *               redirection target when it is an internal link)
     */
    public function getLinks()
    {
        return $this->links;
    }

    /*
     * Disabled example of an extra harvesting step (see defaultHarvesting()).
     * Kept commented out, as in the original file.
     *
    protected function harvestBreadcrumb()
    {
        $breadcrumb = $this->getHarvester()->getBreadCrumb();
        if (is_array($breadcrumb)) {
            $this->url->setBreadcrumbLevel(count($breadcrumb));
            $this->url->setBreadcrumbFirst(isset($breadcrumb[1]) ? $breadcrumb[1]->getCleanName() : '');
            $this->url->setBreadcrumbText($this->getHarvester()->getBreadCrumb('//'));
        }
    }
     */
}
147