CrawlerUrl   A
last analyzed

Complexity

Total Complexity 17

Size/Duplication

Total Lines 126
Duplicated Lines 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
eloc 52
c 1
b 0
f 0
dl 0
loc 126
rs 10
wmc 17

8 Methods

Rating   Name   Duplication   Size   Complexity  
A __construct() 0 6 1
A defaultHarvesting() 0 16 1
A harvest() 0 11 3
A getLinks() 0 3 1
A getHarvester() 0 18 4
A isRedirection() 0 12 3
A harvestLinks() 0 10 1
A isNetworkError() 0 14 3
1
<?php
2
3
namespace PiedWeb\SeoPocketCrawler;
4
5
use PiedWeb\UrlHarvester\Harvest;
6
use PiedWeb\UrlHarvester\Indexable;
7
use PiedWeb\UrlHarvester\Link;
8
9
/**
 * Crawls a single Url.
 *
 * The constructor immediately runs harvest(): the page is fetched through
 * Harvest::fromUrl() and, unless the fetch failed or the page is a redirection,
 * the harvested metrics are copied onto the Url and its internal links are kept
 * for getLinks().
 */
class CrawlerUrl
{
    /** @var int|Harvest|null Harvest of the current Url, or an int cURL error code; null until first requested */
    protected $harvest;

    /** @var Url */
    protected $url;

    /** @var CrawlerConfig */
    protected $config;

    /** @var array internal links from the current Url */
    protected $links = [];

    /**
     * Stores the Url and the crawler configuration, then harvests the Url right away.
     *
     * @param Url           $url    the Url to crawl; it is updated with the harvested values
     * @param CrawlerConfig $config provides the base, user agent, recorder and cached request/robots.txt
     */
    public function __construct(Url $url, CrawlerConfig $config)
    {
        $this->url = $url;
        $this->config = $config;

        $this->harvest();
    }

    /**
     * Runs the default harvesting unless the Url could not be fetched or is a redirection.
     *
     * The network check runs first; the redirection check is only evaluated
     * when the fetch produced a Harvest.
     *
     * @return null
     */
    protected function harvest()
    {
        if ($this->isNetworkError() || $this->isRedirection()) {
            return null;
        }

        $this->defaultHarvesting();
    }

    /**
     * Copies the harvested values onto the Url.
     *
     * Override to change what is harvested, for example by adding:
     * $this->harvestBreadcrumb();
     * $this->url->setKws(','.implode(',', array_keys($this->getHarvester()->getKws())).','); // Slow ~20%
     * $this->url->setRatioTextCode($this->getHarvester()->getRatioTxtCode()); // Slow ~30%
     * $this->url->setH1($this->getHarvester()->getUniqueTag('h1') ?? '');
     */
    protected function defaultHarvesting()
    {
        $harvester = $this->getHarvester();

        $this->url->setIndexable($harvester->indexable()); // slow ~30%

        $response = $harvester->getResponse();
        $this->url->setMimeType((string) $response->getMimeType());

        $this->harvestLinks();

        // Old way: $this->getHarvester()->getTextAnalysis()->getWordNumber();
        $this->url->setWordCount($harvester->getWordCount());

        $this->url->setLoadTime($response->getInfo('total_time'));
        $this->url->setSize($response->getInfo('size_download'));

        $this->url->setTitle($harvester->getUniqueTag('head title') ?? '');
    }

    /**
     * Tells whether fetching the Url failed, and flags the Url as not indexable when it did.
     *
     * Whatever the outcome, the harvester result is handed to the recorder's cache.
     *
     * @return bool true when getHarvester() did not return a Harvest
     */
    protected function isNetworkError()
    {
        $harvester = $this->getHarvester();
        $failed = ! $harvester instanceof Harvest;

        if ($failed) {
            // 42 is libcurl's CURLE_ABORTED_BY_CALLBACK; it is reported as "too big" here,
            // presumably because the harvester aborts oversized downloads (to confirm).
            $this->url->setIndexable(
                42 === $harvester ? Indexable::NOT_INDEXABLE_TOO_BIG : Indexable::NOT_INDEXABLE_NETWORK_ERROR
            );
        }

        // Cached after the indexable state has been set, as before.
        $this->config->getRecorder()->cache($harvester, $this->url);

        return $failed;
    }

    /**
     * Tells whether the Url is a redirection, and flags it as NOT_INDEXABLE_3XX when it is.
     *
     * @return bool true when the harvester exposes a redirection link
     */
    protected function isRedirection()
    {
        $redirection = $this->getHarvester()->getRedirectionLink();
        if (! $redirection) {
            return false;
        }

        if ($redirection->isInternalLink()) {
            // Kept in $links so the redirection target is returned by getLinks()
            // (permits to update counters & co).
            $this->links[] = $redirection;
        }

        $this->url->setIndexable(Indexable::NOT_INDEXABLE_3XX);

        return true;
    }

    /**
     * Records the outbound links, stores the per-type link counters on the Url
     * and keeps the internal links for getLinks().
     */
    protected function harvestLinks()
    {
        $harvester = $this->getHarvester();

        $allLinks = $harvester->getLinks();
        $this->config->getRecorder()->recordOutboundLink($this->url, $allLinks); // ~10%
        $this->url->links = count($allLinks);

        $this->url->links_duplicate = $harvester->getNbrDuplicateLinks();

        $internalLinks = $harvester->getLinks(Link::LINK_INTERNAL);
        $this->url->links_internal = count($internalLinks);

        $this->url->links_self = count($harvester->getLinks(Link::LINK_SELF));
        $this->url->links_sub = count($harvester->getLinks(Link::LINK_SUB));
        $this->url->links_external = count($harvester->getLinks(Link::LINK_EXTERNAL));

        $this->links = $internalLinks;
    }

    /**
     * Returns the harvest of the current Url, fetching it on the first call only.
     *
     * When the fetch yields a Harvest and the configuration holds a cached
     * robots.txt, that robots.txt is attached to the Harvest.
     *
     * @return int|Harvest a Harvest on success, otherwise an int corresponding to the cURL error
     */
    public function getHarvester()
    {
        if (null !== $this->harvest) {
            return $this->harvest;
        }

        $this->harvest = Harvest::fromUrl(
            $this->config->getBase().$this->url->getUri(),
            $this->config->getUserAgent(),
            'en,en-US;q=0.5',
            $this->config->getRequestCached()
        );

        if ($this->harvest instanceof Harvest) {
            $robotsTxt = $this->config->getRobotsTxtCached();
            if (null !== $robotsTxt) {
                $this->harvest->setRobotsTxt($robotsTxt);
            }
        }

        // Return the stored value directly: calling getHarvester() again here would
        // recurse without end if the fetch ever produced null.
        return $this->harvest;
    }

    /**
     * @return array the internal links collected while harvesting (for a redirection,
     *               the redirection target when it is internal)
     */
    public function getLinks()
    {
        return $this->links;
    }

    /*
    protected function harvestBreadcrumb()
    {
        $breadcrumb = $this->getHarvester()->getBreadCrumb();
        if (is_array($breadcrumb)) {
            $this->url->setBreadcrumbLevel(count($breadcrumb));
            $this->url->setBreadcrumbFirst(isset($breadcrumb[1]) ? $breadcrumb[1]->getCleanName() : '');
            $this->url->setBreadcrumbText($this->getHarvester()->getBreadCrumb('//'));
        }
    }/**/
}
148