Completed
Pull Request — master (#150)
by Brent
01:40
created

CrawlRequestFulfilled::mayFollow()   A

Complexity

Conditions 4
Paths 4

Size

Total Lines 16
Code Lines 8

Duplication

Lines 16
Ratio 100 %

Importance

Changes 0
Metric Value
c 0
b 0
f 0
dl 16
loc 16
rs 9.2
cc 4
eloc 8
nc 4
nop 2
1
<?php
2
3
namespace Spatie\Crawler\Handlers;
4
5
use Psr\Http\Message\ResponseInterface;
6
use Psr\Http\Message\StreamInterface;
7
use Psr\Http\Message\UriInterface;
8
use Spatie\Crawler\Crawler;
9
use Spatie\Crawler\CrawlProfile;
10
use Spatie\Crawler\CrawlQueue\CrawlQueue;
11
use Spatie\Crawler\CrawlSubdomains;
12
use Spatie\Crawler\CrawlUrl;
13
use Spatie\Robots\RobotsHeaders;
14
use Spatie\Robots\RobotsMeta;
15
16
/**
 * Invokable handler that processes the response received for a queued crawl URL.
 *
 * It reports the response to the registered crawl observers and hands the response
 * body to the crawler so the links it contains can be added to the crawl queue.
 */
class CrawlRequestFulfilled
{
    /** @var \Spatie\Crawler\Crawler Crawler that receives the body for link extraction. */
    protected $crawler;

    /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue Queue used to look up the URL a response belongs to. */
    protected $crawlQueue;

    /** @var \Spatie\Crawler\CrawlProfile Profile; only checked here for being a CrawlSubdomains instance. */
    protected $crawlProfile;

    /** @var \Spatie\Crawler\CrawlObserver[] Observers whose crawled() method is called for each response. */
    protected $crawlObservers;

    /** @var \Psr\Http\Message\UriInterface URL whose host is compared with the host of each crawled URL. */
    protected $baseUrl;

    /** @var int Maximum number of bytes read from a response body. */
    protected $maximumResponseSize;

    /** @var bool Whether the robots checks in mayIndex() and mayFollow() are applied. */
    protected $respectRobots;

    /**
     * @param \Spatie\Crawler\Crawler $crawler
     * @param \Psr\Http\Message\UriInterface $baseUrl
     * @param \Spatie\Crawler\CrawlQueue\CrawlQueue $crawlQueue
     * @param \Spatie\Crawler\CrawlProfile $crawlProfile
     * @param \Spatie\Crawler\CrawlObserver[] $crawlObservers
     * @param int $maximumResponseSize Maximum number of body bytes to read per response.
     * @param bool $respectRobots When false, mayIndex() and mayFollow() always return true.
     */
    public function __construct(
        Crawler $crawler,
        UriInterface $baseUrl,
        CrawlQueue $crawlQueue,
        CrawlProfile $crawlProfile,
        array $crawlObservers,
        int $maximumResponseSize,
        bool $respectRobots
    ) {
        $this->crawler = $crawler;
        $this->baseUrl = $baseUrl;
        $this->crawlQueue = $crawlQueue;
        $this->crawlProfile = $crawlProfile;
        $this->crawlObservers = $crawlObservers;
        $this->maximumResponseSize = $maximumResponseSize;
        $this->respectRobots = $respectRobots;
    }

    /**
     * Handle the response for the queued URL identified by $index.
     *
     * Nothing happens when mayIndex() rejects the response. Otherwise the observers are
     * notified, and the links in the body are queued only if the URL is on the base host
     * (or the profile is a CrawlSubdomains instance) and mayFollow() allows it.
     *
     * @param \Psr\Http\Message\ResponseInterface $response
     * @param mixed $index Identifier passed to CrawlQueue::getUrlById().
     */
    public function __invoke(ResponseInterface $response, $index)
    {
        $crawlUrl = $this->crawlQueue->getUrlById($index);

        $body = $this->convertBodyToString($response->getBody(), $this->maximumResponseSize);

        $robotsHeaders = RobotsHeaders::create($response->getHeaders());

        $robotsMeta = RobotsMeta::create($body);

        if (! $this->mayIndex($robotsHeaders, $robotsMeta)) {
            return;
        }

        // Observers are notified before the host check below: a URL on another host is
        // still reported, only its links are not queued.
        $this->handleCrawled($response, $crawlUrl);

        $isOnOtherHost = $crawlUrl->url->getHost() !== $this->baseUrl->getHost();

        if (! ($this->crawlProfile instanceof CrawlSubdomains) && $isOnOtherHost) {
            return;
        }

        if (! $this->mayFollow($robotsHeaders, $robotsMeta)) {
            return;
        }

        $this->crawler->addAllLinksToCrawlQueue(
            $body,
            $crawlUrl->url
        );
    }

    /**
     * Read at most $readMaximumBytes bytes from the stream and return them as a string.
     *
     * The stream is rewound first when it is seekable; a non-seekable stream is read from
     * its current position instead of failing on rewind(). Reading is repeated because a
     * single StreamInterface::read() call may return fewer bytes than requested.
     *
     * @param \Psr\Http\Message\StreamInterface $bodyStream
     * @param int $readMaximumBytes Upper bound on the number of bytes returned.
     *
     * @return string
     */
    protected function convertBodyToString(StreamInterface $bodyStream, $readMaximumBytes = 1024 * 1024 * 2): string
    {
        if ($bodyStream->isSeekable()) {
            $bodyStream->rewind();
        }

        $body = '';
        $remainingBytes = $readMaximumBytes;

        while ($remainingBytes > 0 && ! $bodyStream->eof()) {
            $chunk = $bodyStream->read($remainingBytes);

            // An empty read without EOF would otherwise loop forever.
            if ($chunk === '') {
                break;
            }

            $body .= $chunk;
            $remainingBytes -= strlen($chunk);
        }

        return $body;
    }

    /**
     * Call crawled() on every registered observer with the URL, the response and the URL
     * it was found on.
     *
     * @param \Psr\Http\Message\ResponseInterface $response
     * @param \Spatie\Crawler\CrawlUrl $crawlUrl
     */
    protected function handleCrawled(ResponseInterface $response, CrawlUrl $crawlUrl)
    {
        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->crawled(
                $crawlUrl->url,
                $response,
                $crawlUrl->foundOnUrl
            );
        }
    }

    /**
     * Tell whether the response may be indexed.
     *
     * Always true when robots are not respected; otherwise true only when both the robots
     * headers and the robots meta information allow indexing. The headers are checked
     * first and the meta information is only consulted when the headers allow it.
     *
     * @param \Spatie\Robots\RobotsHeaders $robotsHeaders
     * @param \Spatie\Robots\RobotsMeta $robotsMeta
     *
     * @return bool
     */
    protected function mayIndex(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool
    {
        return ! $this->respectRobots
            || ($robotsHeaders->mayIndex() && $robotsMeta->mayIndex());
    }

    /**
     * Tell whether the links of the response may be followed.
     *
     * Always true when robots are not respected; otherwise true only when both the robots
     * headers and the robots meta information allow following. The headers are checked
     * first and the meta information is only consulted when the headers allow it.
     *
     * @param \Spatie\Robots\RobotsHeaders $robotsHeaders
     * @param \Spatie\Robots\RobotsMeta $robotsMeta
     *
     * @return bool
     */
    protected function mayFollow(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool
    {
        return ! $this->respectRobots
            || ($robotsHeaders->mayFollow() && $robotsMeta->mayFollow());
    }
}
143