Completed
Push — master ( 5b29fc...dd86b9 )
by Bruno
02:14
created

Crawl::getCrawl()   B

Complexity

Conditions 5
Paths 4

Size

Total Lines 19
Code Lines 13

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 30

Importance

Changes 0
Metric Value
dl 0
loc 19
ccs 0
cts 13
cp 0
rs 8.8571
c 0
b 0
f 0
cc 5
eloc 13
nc 4
nop 0
crap 30
1
<?php
2
3
namespace Swader\Diffbot\Api;
4
5
use \InvalidArgumentException;
6
use Swader\Diffbot\Entity\EntityIterator;
7
use Swader\Diffbot\Entity\JobCrawl;
8
use Swader\Diffbot\Exceptions\DiffbotException;
9
use Swader\Diffbot\Interfaces\Api;
10
use Swader\Diffbot\Traits\DiffbotAware;
11
12
/**
13
 * Class Crawl
14
 * @see https://www.diffbot.com/dev/docs/crawl/
15
 * @package Swader\Diffbot\Api
16
 */
17
class Crawl
18
{
19
    use DiffbotAware;
20
21
    /** @var string API URL to which to send the request */
22
    protected $apiUrl = 'https://api.diffbot.com/v3/crawl';
23
24
    /** @var string */
25
    protected $name;
26
27
    /** @var Api Api which should be used to process the pages */
28
    protected $api;
29
30
    /** @var array Options to set while initiating the API call */
31
    protected $otherOptions = [];
32
33
    /** @var array Array of seed URLs to crawl */
34
    protected $seeds = [];
35
36
    /**
     * Creates a crawl job wrapper, optionally bound to a job name and an API.
     *
     * The API is only registered when a name is given as well; if $name is
     * null the $api argument is ignored.
     *
     * @see getName
     * @param string|null $name Unique name of the crawl job
     * @param null|Api $api Api which should be used to process the pages
     */
    public function __construct($name = null, Api $api = null)
    {
        // Nothing to initialise without a job name.
        if ($name === null) {
            return;
        }

        $this->name = $name;

        if (!$api) {
            return;
        }

        $this->setApi($api);
    }
50
51
    /**
     * Gives back the unique name identifying this crawl job.
     *
     * The same name is used later on to download datasets or to modify the
     * job. It is null when the instance was constructed without a name.
     *
     * @return string|null
     */
    public function getName()
    {
        return $this->name;
    }
61
62
    /**
     * Registers the API that should process the crawled pages.
     *
     * Pass a fully formed instance of any other API; its built URL is encoded
     * and sent along with the crawl request. To satisfy the required $url
     * param of the API classes, use the string 'crawl', which prepares the
     * API for Crawlbot consumption internally.
     *
     * @see https://www.diffbot.com/dev/docs/crawl/api.jsp ApiUrl docs
     * @param Api $api
     * @return $this
     */
    public function setApi(Api $api)
    {
        $this->api = $api;

        return $this;
    }
80
81
    /**
     * Sets the seed URLs from which the crawl looks for matching links.
     *
     * By default Crawlbot will restrict spidering to the entire domain
     * ("http://blog.diffbot.com" will include URLs at "http://www.diffbot.com").
     *
     * Every seed must pass FILTER_VALIDATE_URL; if any of them does not, no
     * seed is stored and all offending values are reported in the exception.
     *
     * @param array $seeds
     * @return $this
     * @throws \InvalidArgumentException When at least one seed is not a valid URL
     */
    public function setSeeds(array $seeds)
    {
        // Collect every seed that fails URL validation.
        $rejected = array_filter($seeds, function ($candidate) {
            return !filter_var($candidate, FILTER_VALIDATE_URL);
        });

        if (count($rejected) > 0) {
            throw new \InvalidArgumentException(
                'Some seeds were invalid: ' . implode(',', $rejected)
            );
        }

        $this->seeds = $seeds;

        return $this;
    }
108
109
    /**
     * Limits the pages crawled to those whose URLs contain any of the given
     * strings.
     *
     * You can use the exclamation point to specify a negative string, e.g.
     * !product to exclude URLs containing the string "product," and the ^ and
     * $ characters to limit matches to the beginning or end of the URL.
     *
     * The use of a urlCrawlPattern will allow Crawlbot to spider outside of
     * the seed domain; it will follow all matching URLs regardless of domain.
     *
     * Each entry is URL-encoded and the entries are joined with "||".
     * Passing null stores a null value for the option.
     *
     * @param array|null $pattern
     * @return $this
     */
    public function setUrlCrawlPatterns(array $pattern = null)
    {
        if ($pattern === null) {
            $this->otherOptions['urlCrawlPattern'] = null;

            return $this;
        }

        $encoded = array_map('urlencode', $pattern);
        $this->otherOptions['urlCrawlPattern'] = implode('||', $encoded);

        return $this;
    }
132
133
    /**
     * Specify a regular expression to limit pages crawled to those URLs that
     * match your expression. This will override any urlCrawlPattern value.
     *
     * The use of a urlCrawlRegEx will allow Crawlbot to spider outside of the
     * seed domain; it will follow all matching URLs regardless of domain.
     *
     * The expression is URL-encoded before it is stored: option values are
     * appended verbatim to a form-urlencoded request body, where raw "+",
     * "&", "=" or "%" characters would otherwise alter the expression.
     * Pass the plain, unencoded expression.
     *
     * @param string|null $regex Plain regular expression; null stores null
     * @return $this
     */
    public function setUrlCrawlRegEx($regex)
    {
        $this->otherOptions['urlCrawlRegEx'] = ($regex === null)
            ? null
            : urlencode($regex);

        return $this;
    }
149
150
    /**
     * Limits the pages processed to those whose URLs contain any of the given
     * strings. The strings are sent as one ||-separated value.
     *
     * You can use the exclamation point to specify a negative string, e.g.
     * !/category to exclude URLs containing the string "/category," and the ^
     * and $ characters to limit matches to the beginning or end of the URL.
     *
     * Each entry is URL-encoded before joining. Passing null stores a null
     * value for the option.
     *
     * @param array|null $pattern
     * @return $this
     */
    public function setUrlProcessPatterns(array $pattern = null)
    {
        if ($pattern === null) {
            $this->otherOptions['urlProcessPattern'] = null;

            return $this;
        }

        $encoded = array_map('urlencode', $pattern);
        $this->otherOptions['urlProcessPattern'] = implode('||', $encoded);

        return $this;
    }
170
171
    /**
     * Specify a regular expression to limit pages processed to those URLs that
     * match your expression. This will override any urlProcessPattern value.
     *
     * The expression is URL-encoded before it is stored: option values are
     * appended verbatim to a form-urlencoded request body, where raw "+",
     * "&", "=" or "%" characters would otherwise alter the expression.
     * Pass the plain, unencoded expression.
     *
     * @param string|null $regex Plain regular expression; null stores null
     * @return $this
     */
    public function setUrlProcessRegEx($regex)
    {
        $this->otherOptions['urlProcessRegEx'] = ($regex === null)
            ? null
            : urlencode($regex);

        return $this;
    }
185
186
    /**
     * Limits the pages processed to those whose HTML contains any of the
     * given strings. The strings are sent as one ||-separated value.
     *
     * Each entry is URL-encoded before the entries are joined.
     *
     * @param array $pattern
     * @return $this
     */
    public function setPageProcessPatterns(array $pattern)
    {
        $encoded = array_map('urlencode', $pattern);
        $this->otherOptions['pageProcessPattern'] = implode('||', $encoded);

        return $this;
    }
202
203
    /**
     * Sets the depth of the crawl.
     *
     * A maxHops=0 will limit processing to the seed URL(s) only -- no other
     * links will be processed; maxHops=1 will process all (otherwise
     * matching) pages whose links appear on seed URL(s); maxHops=2 will
     * process pages whose links appear on those pages; and so on.
     *
     * By default, Crawlbot will crawl and process links at any depth.
     *
     * The input is cast to an integer; anything below -1 is stored as -1.
     *
     * @param int $input
     * @return $this
     */
    public function setMaxHops($input = -1)
    {
        // Clamp to the lowest accepted value of -1.
        $this->otherOptions['maxHops'] = max(-1, (int)$input);

        return $this;
    }
223
224
    /**
     * Sets the maximum number of pages to spider. Default: 100,000.
     *
     * The input is cast to an integer; anything below 1 is stored as 1.
     *
     * @param int $input
     * @return $this
     */
    public function setMaxToCrawl($input = 100000)
    {
        // At least one page must be allowed.
        $this->otherOptions['maxToCrawl'] = max(1, (int)$input);

        return $this;
    }
239
240
    /**
     * Sets the maximum number of pages to process through Diffbot APIs.
     * Default: 100,000.
     *
     * The input is cast to an integer; anything below 1 is stored as 1.
     *
     * @param int $input
     * @return $this
     */
    public function setMaxToProcess($input = 100000)
    {
        // At least one page must be allowed.
        $this->otherOptions['maxToProcess'] = max(1, (int)$input);

        return $this;
    }
255
256
    /**
     * If input is an email address, send a message to this email address
     * when the crawl hits the maxToCrawl or maxToProcess limit, or when the
     * crawl completes.
     *
     * If input is a URL, you will receive a POST with X-Crawl-Name and
     * X-Crawl-Status in the headers, and the full JSON response in the
     * POST body.
     *
     * Both kinds of value are URL-encoded before being stored, because option
     * values are appended verbatim to a form-urlencoded request body (a raw
     * "+" in an address would otherwise be read as a space).
     *
     * @param string $string Email address or webhook URL
     * @return $this
     * @throws InvalidArgumentException When the input is neither a valid
     *                                  email address nor a valid URL
     */
    public function notify($string)
    {
        // The email check runs first; a value passing it is never treated as a URL.
        if (filter_var($string, FILTER_VALIDATE_EMAIL)) {
            $this->otherOptions['notifyEmail'] = urlencode($string);

            return $this;
        }

        if (filter_var($string, FILTER_VALIDATE_URL)) {
            $this->otherOptions['notifyWebhook'] = urlencode($string);

            return $this;
        }

        throw new InvalidArgumentException(
            'Only valid email or URL accepted! You provided: ' . $string
        );
    }
286
287
    /**
     * Sets the number of seconds to wait between each URL crawled from a
     * single IP address, as an integer or floating-point number.
     *
     * Negative input falls back to 0.25; the stored value is always a float.
     *
     * @param float $input
     * @return $this
     * @throws InvalidArgumentException When the input is not numeric
     */
    public function setCrawlDelay($input = 0.25)
    {
        if (!is_numeric($input)) {
            throw new InvalidArgumentException('Input must be numeric.');
        }

        if ($input < 0) {
            $delay = 0.25;
        } else {
            $delay = (float)$input;
        }

        $this->otherOptions['crawlDelay'] = $delay;

        return $this;
    }
305
306
    /**
     * Specify the number of days as a floating-point (e.g. repeat=7.0) to
     * repeat this crawl. By default crawls will not be repeated.
     *
     * Only strictly positive numeric input is accepted; zero, negative
     * numbers and non-numeric values are rejected. The stored value is a float.
     *
     * @param int|float $input
     * @return $this
     * @throws \InvalidArgumentException When the input is not a positive number
     */
    public function setRepeat($input)
    {
        // Compare the numeric value: a plain truthiness check would let
        // negative numbers and strings such as "0.0" through.
        if (!is_numeric($input) || (float)$input <= 0) {
            throw new \InvalidArgumentException('Only positive numbers allowed.');
        }

        $this->otherOptions['repeat'] = (float)$input;

        return $this;
    }
323
324
    /**
     * By default repeat crawls will only process new (previously unprocessed)
     * pages. Set to 0 to process all content on repeat crawls.
     *
     * The input is reduced to its truthiness and stored as 1 or 0.
     *
     * @param int $int
     * @return $this
     */
    public function setOnlyProcessIfNew($int = 1)
    {
        $this->otherOptions['onlyProcessIfNew'] = $int ? 1 : 0;

        return $this;
    }
337
338
    /**
     * Sets the maximum number of crawl repeats. By default (maxRounds=0)
     * repeating crawls will continue indefinitely.
     *
     * The input is cast to an integer; anything below -1 is stored as -1.
     *
     * @param int $input
     * @return $this
     */
    public function setMaxRounds($input = 0)
    {
        // Clamp to the lowest accepted value of -1.
        $this->otherOptions['maxRounds'] = max(-1, (int)$input);

        return $this;
    }
355
356
    /**
     * Controls whether robots.txt is obeyed; it is ignored if set to 0/false.
     *
     * The input is reduced to its truthiness and stored as 1 or 0.
     *
     * @param bool $bool
     * @return $this
     */
    public function setObeyRobots($bool = true)
    {
        $this->otherOptions['obeyRobots'] = $bool ? 1 : 0;

        return $this;
    }
368
369
    /**
     * Set value to 1/true to force the use of proxy IPs for the crawl.
     *
     * The input is reduced to its truthiness and stored as 1 or 0.
     *
     * @param bool $bool
     * @return $this
     */
    public function setUseProxies($bool = true)
    {
        $this->otherOptions['useProxies'] = $bool ? 1 : 0;

        return $this;
    }
381
382
    /**
     * Force the start of a new crawl "round" (manually repeat the crawl).
     * If onlyProcessIfNew is set to 1 (default), only newly-created pages will
     * be processed.
     *
     * All previously set options are replaced by the single roundStart flag.
     *
     * @param bool $commit When true the request is sent right away via call()
     * @return EntityIterator|mixed|$this Result of call() when committing,
     *                                    otherwise this instance
     * @throws DiffbotException
     */
    public function roundStart($commit = true)
    {
        $this->otherOptions = ['roundStart' => 1];

        if ($commit) {
            return $this->call();
        }

        return $this;
    }
397
398
    /**
     * Pause a crawl.
     *
     * All previously set options are replaced by the single pause=1 flag.
     *
     * @param bool $commit When true the request is sent right away via call()
     * @return EntityIterator|mixed|$this Result of call() when committing,
     *                                    otherwise this instance
     * @throws DiffbotException
     */
    public function pause($commit = true)
    {
        $this->otherOptions = ['pause' => 1];

        if ($commit) {
            return $this->call();
        }

        return $this;
    }
411
412
    /**
     * Unpause (resume) a crawl.
     *
     * All previously set options are replaced by the single pause=0 flag.
     *
     * @param bool $commit When true the request is sent right away via call()
     * @return EntityIterator|mixed|$this Result of call() when committing,
     *                                    otherwise this instance
     * @throws DiffbotException
     */
    public function unpause($commit = true)
    {
        $this->otherOptions = ['pause' => 0];

        if ($commit) {
            return $this->call();
        }

        return $this;
    }
425
426
    /**
     * Restart removes all crawled data while maintaining crawl settings.
     *
     * All previously set options on this instance are replaced by the single
     * restart flag.
     *
     * @param bool $commit When true the request is sent right away via call()
     * @return EntityIterator|mixed|$this Result of call() when committing,
     *                                    otherwise this instance
     * @throws DiffbotException
     */
    public function restart($commit = true)
    {
        $this->otherOptions = ['restart' => 1];

        if ($commit) {
            return $this->call();
        }

        return $this;
    }
439
440
    /**
     * Delete a crawl, and all associated data, completely.
     *
     * All previously set options on this instance are replaced by the single
     * delete flag.
     *
     * @param bool $commit When true the request is sent right away via call()
     * @return EntityIterator|mixed|$this Result of call() when committing,
     *                                    otherwise this instance
     * @throws DiffbotException
     */
    public function delete($commit = true)
    {
        $this->otherOptions = ['delete' => 1];

        if ($commit) {
            return $this->call();
        }

        return $this;
    }
453
    /**
     * Requests the crawl endpoint via GET with this instance's token and job
     * name and converts the decoded JSON reply.
     *
     * - A reply with a "jobs" key yields an EntityIterator of JobCrawl entities.
     * - Otherwise a reply with a "response" key yields that value as is.
     * - Anything else is treated as a failure.
     *
     * @return EntityIterator|mixed
     * @throws DiffbotException With the reply's "error" value when present,
     *                          otherwise with a generic message
     */
    public function getCrawl()
    {
        // Encode both values so characters such as "&" or spaces in the job
        // name cannot break the query string.
        $theUrl = $this->apiUrl
            . '?token=' . urlencode((string)$this->diffbot->getToken())
            . '&name=' . urlencode((string)$this->name);
        $response = $this->diffbot->getHttpClient()->get($theUrl);

        $array = json_decode($response->getBody(), true);

        if (isset($array['jobs'])) {
            $jobs = [];
            foreach ($array['jobs'] as $job) {
                $jobs[] = new JobCrawl($job);
            }

            return new EntityIterator($jobs, $response);
        }

        if (isset($array['response'])) {
            return $array['response'];
        }

        // The reply may be undecodable or lack an "error" key, so never read
        // that key unguarded.
        $message = isset($array['error'])
            ? $array['error']
            : 'It appears something went wrong - no data was returned. Did you use the correct token / job name?';

        throw new DiffbotException($message);
    }
472
473 7
    /**
     * POSTs the string produced by buildUrl() to the crawl endpoint as a
     * form-urlencoded body and converts the decoded JSON reply.
     *
     * - A reply with a "jobs" key yields an EntityIterator of JobCrawl entities.
     * - Otherwise a reply with a "response" key yields that value as is.
     * - Anything else is treated as a failure.
     *
     * @return EntityIterator|mixed
     * @throws DiffbotException When the reply has neither "jobs" nor "response"
     */
    public function call()
    {
        $headers = ["content-type" => "application/x-www-form-urlencoded; charset=UTF-8"];
        $client = $this->diffbot->getHttpClient();
        $response = $client->post($this->apiUrl, $headers, $this->buildUrl());

        $decoded = json_decode($response->getBody(), true);

        if (isset($decoded['jobs'])) {
            $entities = [];
            foreach ($decoded['jobs'] as $jobData) {
                $entities[] = new JobCrawl($jobData);
            }

            return new EntityIterator($entities, $response);
        }

        if (isset($decoded['response'])) {
            return $decoded['response'];
        }

        throw new DiffbotException('It appears something went wrong - no data was returned. Did you use the correct token / job name?');
    }
494
495
    /**
     * Builds out the parameter string that gets sent once `call()` is called.
     *
     * The string always starts with the token. Name, seeds, all other options
     * and the encoded API URL are appended only when a job name is set.
     *
     * Side effect: when a urlProcessRegEx / urlCrawlRegEx option is set and
     * non-empty, the corresponding urlProcessPattern / urlCrawlPattern option
     * is removed from this instance.
     *
     * @return string
     */
    public function buildUrl()
    {
        // A regular expression takes precedence over the pattern of the same kind.
        if (!empty($this->otherOptions['urlProcessRegEx'])) {
            unset($this->otherOptions['urlProcessPattern']);
        }

        if (!empty($this->otherOptions['urlCrawlRegEx'])) {
            unset($this->otherOptions['urlCrawlPattern']);
        }

        // Add token
        $url = 'token=' . urlencode((string)$this->diffbot->getToken());

        $name = $this->getName();
        if (!$name) {
            return $url;
        }

        // Add name; encoded so that characters such as "&", "=" or spaces
        // cannot break the parameter string.
        $url .= '&name=' . urlencode($name);

        // Add seeds, each encoded and separated by an encoded space
        if (!empty($this->seeds)) {
            $url .= '&seeds=' . implode('%20', array_map('urlencode', $this->seeds));
        }

        // Add other options. Values are appended verbatim, so any encoding
        // has to be done by whoever stored them.
        foreach ($this->otherOptions as $option => $value) {
            $url .= '&' . $option . '=' . $value;
        }

        // Add API link
        $url .= '&apiUrl=' . $this->getApiString();

        return $url;
    }
543
544
    /**
     * Builds the URL of the URL Report for this crawl job, used for
     * understanding diagnostic data of URLs.
     *
     * The request type "urls" (and, if given, the result count) is stored in
     * this instance's options, so both remain set after the call. When a job
     * name is set, the name and every stored option are appended to the URL.
     *
     * @param int|null $num Optional number of results; ignored unless it is
     *                      non-empty and numeric, and cast to an integer
     * @return string URL of the crawl "data" endpoint including query string
     */
    public function getUrlReportUrl($num = null)
    {
        $this->otherOptions['type'] = 'urls';

        if (!empty($num) && is_numeric($num)) {
            // Cast so numeric strings with whitespace or fractions cannot
            // end up verbatim in the URL.
            $this->otherOptions['num'] = (int)$num;
        }

        // Setup data endpoint
        $url = $this->apiUrl . '/data';

        // Add token
        $url .= '?token=' . urlencode((string)$this->diffbot->getToken());

        $name = $this->getName();
        if (!$name) {
            return $url;
        }

        // Add name; encoded so that characters such as "&", "=" or spaces
        // cannot break the query string.
        $url .= '&name=' . urlencode($name);

        // Add other options; values are appended verbatim.
        foreach ($this->otherOptions as $option => $value) {
            $url .= '&' . $option . '=' . $value;
        }

        return $url;
    }
579
580
    /**
     * Returns the URL-encoded URL built by the API that processes the pages.
     *
     * When no API has been set, an Analyze API for 'crawl' is created through
     * the Diffbot instance, switched to mode 'auto' and kept on this instance.
     *
     * @return string
     */
    protected function getApiString()
    {
        if ($this->api) {
            return urlencode($this->api->buildUrl());
        }

        // Fall back to a lazily created Analyze API.
        $this->api = $this->diffbot->createAnalyzeAPI('crawl');
        $this->api->setMode('auto');

        return urlencode($this->api->buildUrl());
    }
592
}
593