Completed
Push — master ( 505018...16cf89 )
by D.
13s
created

Crawler::setCollectors()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 10
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 1
Metric Value
c 1
b 0
f 1
dl 0
loc 10
rs 9.4285
cc 2
eloc 3
nc 2
nop 1
1
<?php
2
/**
3
 * This file is part of sitemap-common.
4
 *
5
 * (c) 2016 Daniele Moraschi
6
 *
7
 * For the full copyright and license information, please view the LICENSE
8
 * file that was distributed with this source code.
9
 */
10
11
namespace SiteMap;
12
13
14
use GuzzleHttp\ClientInterface;
15
use SiteMap\Collect\Collector;
16
use SiteMap\Http\HttpResource;
17
use SiteMap\Http\WebResource;
18
use SiteMap\Http\Url;
19
use SiteMap\Parse\LinkParser;
20
use SiteMap\Policy\Policy;
21
22
class Crawler
{
    /**
     * URL the crawl starts from.
     *
     * @var Url
     */
    private $baseUrl;

    /**
     * Parser used to extract the links of every fetched page.
     *
     * @var LinkParser
     */
    private $parser;

    /**
     * HTTP client handed to every WebResource created while crawling.
     *
     * @var ClientInterface
     */
    private $httpClient;

    /**
     * Registered policies, indexed by their (string) key.
     *
     * @var Policy[]
     */
    private $policies = [];

    /**
     * Registered collectors, indexed by their (string) key.
     *
     * @var Collector[]
     */
    private $collectors = [];

    /**
     * Crawler constructor.
     *
     * @param Url $baseUrl
     * @param LinkParser $parser
     * @param ClientInterface $httpClient
     */
    public function __construct(Url $baseUrl, LinkParser $parser, ClientInterface $httpClient)
    {
        $this->baseUrl = $baseUrl;
        $this->parser = $parser;
        $this->httpClient = $httpClient;
    }

    /**
     * Add a new crawler policy.
     *
     * A policy registered under an already used key replaces the previous one.
     *
     * @param mixed $key identifier of the policy; it is cast to string
     * @param Policy $policy
     */
    public function setPolicy($key, Policy $policy)
    {
        $this->policies[(string)$key] = $policy;
    }

    /**
     * Set crawler policies to follow the URLs
     * of a webpage.
     *
     * Every entry is registered through setPolicy(), so previously registered
     * policies are kept unless one of the given keys overrides them.
     *
     * @param array $policies map of key => Policy
     */
    public function setPolicies(array $policies)
    {
        /**
         * @var string $key
         * @var Policy $policy
         */
        foreach ($policies as $key => $policy) {
            $this->setPolicy($key, $policy);
        }
    }

    /**
     * Set a crawler collector.
     *
     * A collector registered under an already used key replaces the previous one.
     *
     * @param mixed $key identifier of the collector; it is cast to string
     * @param Collector $collector
     */
    public function setCollector($key, Collector $collector)
    {
        $this->collectors[(string)$key] = $collector;
    }

    /**
     * Return a previously set crawler collector.
     *
     * @param mixed $key identifier used when the collector was registered
     * @return Collector|null the collector, or null when the key is unknown
     */
    public function getCollector($key)
    {
        $key = (string)$key;

        return isset($this->collectors[$key])
            ? $this->collectors[$key]
            : null;
    }

    /**
     * Set crawler collectors.
     *
     * Every entry is registered through setCollector(), so previously registered
     * collectors are kept unless one of the given keys overrides them.
     *
     * @param array $collectors map of key => Collector
     */
    public function setCollectors(array $collectors)
    {
        /**
         * @var string $key
         * @var Collector $collector
         */
        foreach ($collectors as $key => $collector) {
            $this->setCollector($key, $collector);
        }
    }

    /**
     * Tell whether the URL passed as argument should be visited by the
     * crawler based upon the registered policies.
     *
     * The URL is rejected as soon as one policy refuses it; with no policy
     * registered every URL is accepted.
     *
     * @param Url $url
     * @return bool
     */
    public function shouldVisit(Url $url)
    {
        /** @var Policy $policy */
        foreach ($this->policies as $policy) {
            if (! $policy->shouldVisit($url)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Hand the content of a page to every registered collector and
     * let each of them collect its data.
     *
     * @param Url $url URL the content was fetched from
     * @param mixed $content content of the page
     */
    public function shouldCollect(Url $url, $content)
    {
        /** @var Collector $collector */
        foreach ($this->collectors as $collector) {
            $collector->setContent($url, $content);
            $collector->collect();
        }
    }

    /**
     * Visit a webpage: fetch its content, extract its links and feed
     * the content to the collectors.
     *
     * @TODO handle the exception
     * @param HttpResource $httpResource
     * @return array the links found by the parser, or an empty array when
     *               the content could not be fetched
     */
    private function visitAndCollect(HttpResource $httpResource)
    {
        try {
            $webPage = $httpResource->getContent();
        } catch (\Exception $e) {
            // Best effort: a page that cannot be fetched contributes no links
            // and is not passed to the collectors.
            return [];
        }

        $uri = $httpResource->getURI();

        $this->parser->setContent($uri, $webPage);
        $links = $this->parser->findLinks();

        $this->shouldCollect($uri, $webPage);

        return $links;
    }

    /**
     * Crawl the website starting from the base URL and return every URL
     * discovered along the way.
     *
     * Level 0 holds the base URL only; level N holds the links found on the
     * pages of level N-1 that the policies allowed to visit. The result
     * contains all levels, so it also includes URLs that were discovered but
     * never fetched (links of the last level and links rejected by a policy).
     *
     * @param int $maxDeep number of levels to follow; the absolute value of
     *                     its integer cast is used
     * @return Url[] the distinct discovered URLs
     */
    public function crawl($maxDeep = 1)
    {
        $maxDeep = abs((int)$maxDeep);
        $linksCollection = array_fill(0, $maxDeep + 1, []);

        $linksCollection[0] = [$this->baseUrl->getWebUrl()];

        // URLs already processed, keyed by their string form, so that a page
        // linked from several places is fetched and collected only once.
        // The entries are already compared as strings by array_unique() in
        // getUrlArray(), hence the string cast here.
        $processed = [];

        for ($deepness = 1; $deepness <= $maxDeep; $deepness++) {
            foreach ($linksCollection[$deepness - 1] as $webUrl) {
                $urlKey = (string)$webUrl;
                if (isset($processed[$urlKey])) {
                    continue;
                }
                $processed[$urlKey] = true;

                $url = new Url($webUrl);
                if (! $this->shouldVisit($url)) {
                    continue;
                }

                $found = $this->visitAndCollect(
                    new WebResource($url, $this->httpClient)
                );

                // Append instead of using the array union operator: "+" keeps
                // the left-hand entry whenever a key already exists, which
                // silently drops links as soon as two pages return arrays
                // sharing the same keys (e.g. numerically indexed lists).
                $linksCollection[$deepness] = array_merge(
                    $linksCollection[$deepness],
                    array_values($found)
                );
            }
        }

        $linksCollection = call_user_func_array('array_merge', $linksCollection);
        return $this->getUrlArray($linksCollection);
    }

    /**
     * Turn a list of web URLs into Url objects, dropping the duplicates.
     *
     * Keys of the first occurrences are preserved, so the returned array may
     * have gaps in its keys.
     *
     * @param array $links
     * @return Url[]
     */
    protected function getUrlArray(array $links = [])
    {
        return array_map(function ($webUrl) {
            return new Url($webUrl);
        }, array_unique($links));
    }
}