Crawl::getCrawl() - Code Metrics - Inspection of "Merge pull request #60 from shinoshi/fix-crawl-pos..." - Swader/diffbot-php-client - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( 5b29fc...dd86b9 )

by Bruno

created 2017-12-15 00:31 UTC

Crawl::getCrawl() B

↳ Parent: Crawl

Complexity

Conditions	5
Paths	4

Size

Total Lines	19
Code Lines	13

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	0
CRAP Score	30

Importance

Changes

Metric	Value
dl	0
loc	19
ccs	0
cts	13
cp	0
rs	8.8571
c	0
b	0
f	0
cc	5
eloc	13
nc	4
nop	0
crap	30

<?php

namespace Swader\Diffbot\Api;

use \InvalidArgumentException;
use Swader\Diffbot\Entity\EntityIterator;
use Swader\Diffbot\Entity\JobCrawl;
use Swader\Diffbot\Exceptions\DiffbotException;
use Swader\Diffbot\Interfaces\Api;
use Swader\Diffbot\Traits\DiffbotAware;

/**
 * Class Crawl
 * @see https://www.diffbot.com/dev/docs/crawl/
 * @package Swader\Diffbot\Api
 */
class Crawl
{
    use DiffbotAware;

    /** @var string API URL to which to send the request */
    protected $apiUrl = 'https://api.diffbot.com/v3/crawl';

    /** @var string */
    protected $name;

    /** @var Api Api which should be used to process the pages */
    protected $api;

    /** @var array Options to set while initiating the API call */
    protected $otherOptions = [];

    /** @var array Array of seed URLs to crawl */
    protected $seeds = [];

    /**
     * Creates a crawl handle, optionally bound to a named crawljob.
     *
     * The processing API is only stored when a job name is supplied; without
     * a name the $api argument is ignored.
     *
     * @see getName
     * @param string|null $name Unique name of the crawljob
     * @param null|Api $api Api which should be used to process the pages
     */
    public function __construct($name = null, Api $api = null)
    {
        // Nothing to configure for an unnamed instance
        if ($name === null) {
            return;
        }

        $this->name = $name;

        if ($api) {
            $this->setApi($api);
        }
    }

    /**
     * Returns the unique name of the crawljob
     * This name is later used to download datasets, or to modify the job
     *
     * Within this class the name is only assigned by the constructor, so it
     * stays null when the instance was created without a name.
     *
     * @return string|null
     */
    public function getName()
    {
        return $this->name;
    }

    /**
     * API which should be used to process the pages
     *
     * Accepts a fully formed instance of any other API. Will use it to build
     * and auto-encode the URL. To satisfy the required $url param of the API
     * classes, use the string 'crawl' which prepares the API for Crawlbot
     * consumption internally.
     *
     * Replaces any API set earlier. When no API is ever set, getApiString()
     * falls back to an Analyze API created with 'crawl' and mode 'auto'.
     *
     * @see https://www.diffbot.com/dev/docs/crawl/api.jsp ApiUrl docs
     * @param Api $api
     * @return $this
     */
    public function setApi(Api $api)
    {
        $this->api = $api;

        return $this;
    }

    /**
     * Sets the seed URLs from which the crawl looks for matching links
     *
     * Every seed must pass FILTER_VALIDATE_URL. If at least one does not,
     * nothing is stored and all offending seeds are listed in the exception.
     *
     * By default Crawlbot will restrict spidering to the entire domain
     * ("http://blog.diffbot.com" will include URLs at "http://www.diffbot.com").
     *
     * @param array $seeds
     * @return $this
     * @throws \InvalidArgumentException When one or more seeds are not valid URLs
     */
    public function setSeeds(array $seeds)
    {
        // Keep only the entries that fail URL validation
        $rejected = array_filter($seeds, function ($candidate) {
            return !filter_var($candidate, FILTER_VALIDATE_URL);
        });

        if ($rejected) {
            throw new \InvalidArgumentException(
                'Some seeds were invalid: ' . implode(',', $rejected)
            );
        }

        $this->seeds = $seeds;

        return $this;
    }

    /**
     * Limits the pages crawled to those whose URLs contain any of the given
     * strings.
     *
     * You can use the exclamation point to specify a negative string, e.g.
     * !product to exclude URLs containing the string "product," and the ^ and
     * $ characters to limit matches to the beginning or end of the URL.
     *
     * The use of a urlCrawlPattern will allow Crawlbot to spider outside of
     * the seed domain; it will follow all matching URLs regardless of domain.
     *
     * Each entry is URL-encoded and the entries are joined with "||".
     * Passing null stores a null value for the option.
     *
     * @param array|null $pattern
     * @return $this
     */
    public function setUrlCrawlPatterns(array $pattern = null)
    {
        $joined = null;
        if ($pattern !== null) {
            $joined = implode("||", array_map('urlencode', $pattern));
        }

        $this->otherOptions['urlCrawlPattern'] = $joined;

        return $this;
    }

    /**
     * Specify a regular expression to limit pages crawled to those URLs that
     * match your expression. This will override any urlCrawlPattern value.
     *
     * The use of a urlCrawlRegEx will allow Crawlbot to spider outside of the
     * seed domain; it will follow all matching URLs regardless of domain.
     *
     * Pass the expression unencoded: it is URL-encoded here, like the pattern
     * options, because buildUrl() appends option values verbatim to a
     * form-encoded request body. Without encoding, characters such as "+",
     * "&" or "#" would be misinterpreted by the receiver.
     *
     * @param string|null $regex Raw regular expression; null clears the value
     * @return $this
     */
    public function setUrlCrawlRegEx($regex)
    {
        // Keep null as null so an unset expression stays distinguishable
        $this->otherOptions['urlCrawlRegEx'] = ($regex === null)
            ? null
            : urlencode($regex);

        return $this;
    }

    /**
     * Limits the pages processed to those whose URLs contain any of the given
     * strings.
     *
     * You can use the exclamation point to specify a negative string, e.g.
     * !/category to exclude URLs containing the string "/category," and the ^
     * and $ characters to limit matches to the beginning or end of the URL.
     *
     * Each entry is URL-encoded and the entries are joined with "||".
     * Passing null stores a null value for the option.
     *
     * @param array|null $pattern
     * @return $this
     */
    public function setUrlProcessPatterns(array $pattern = null)
    {
        $joined = null;
        if ($pattern !== null) {
            $joined = implode("||", array_map('urlencode', $pattern));
        }

        $this->otherOptions['urlProcessPattern'] = $joined;

        return $this;
    }

    /**
     * Specify a regular expression to limit pages processed to those URLs that
     * match your expression. This will override any urlProcessPattern value.
     *
     * Pass the expression unencoded: it is URL-encoded here, like the pattern
     * options, because buildUrl() appends option values verbatim to a
     * form-encoded request body. Without encoding, characters such as "+",
     * "&" or "#" would be misinterpreted by the receiver.
     *
     * @param string|null $regex Raw regular expression; null clears the value
     * @return $this
     */
    public function setUrlProcessRegEx($regex)
    {
        // Keep null as null so an unset expression stays distinguishable
        $this->otherOptions['urlProcessRegEx'] = ($regex === null)
            ? null
            : urlencode($regex);

        return $this;
    }

    /**
     * Limits the pages processed to those whose HTML contains any of the
     * given strings.
     *
     * Each entry is URL-encoded and the entries are joined with "||".
     *
     * @param array $pattern
     * @return $this
     */
    public function setPageProcessPatterns(array $pattern)
    {
        $encoded = array_map('urlencode', $pattern);

        $this->otherOptions['pageProcessPattern'] = implode("||", $encoded);

        return $this;
    }

    /**
     * Specify the depth of your crawl. A maxHops=0 will limit processing to
     * the seed URL(s) only -- no other links will be processed; maxHops=1 will
     * process all (otherwise matching) pages whose links appear on seed URL(s);
     * maxHops=2 will process pages whose links appear on those pages; and so on
     *
     * By default, Crawlbot will crawl and process links at any depth.
     *
     * The input is cast to int; anything below -1 is stored as -1.
     *
     * @param int $input
     * @return $this
     */
    public function setMaxHops($input = -1)
    {
        $hops = (int)$input;

        // -1 is the lowest value that is passed on
        $this->otherOptions['maxHops'] = max(-1, $hops);

        return $this;
    }

    /**
     * Specify max pages to spider. Default: 100,000.
     *
     * The input is cast to int; anything below 1 is stored as 1.
     *
     * @param int $input
     * @return $this
     */
    public function setMaxToCrawl($input = 100000)
    {
        $limit = (int)$input;

        // At least one page has to be allowed
        $this->otherOptions['maxToCrawl'] = max(1, $limit);

        return $this;
    }

    /**
     * Specify max pages to process through Diffbot APIs. Default: 100,000.
     *
     * The input is cast to int; anything below 1 is stored as 1.
     *
     * @param int $input
     * @return $this
     */
    public function setMaxToProcess($input = 100000)
    {
        $limit = (int)$input;

        // At least one page has to be allowed
        $this->otherOptions['maxToProcess'] = max(1, $limit);

        return $this;
    }

    /**
     * If input is email address, send a message to this email address when the
     * crawl hits the maxToCrawl or maxToProcess limit, or when the crawl
     * completes.
     *
     * If input is URL, you will receive a POST with X-Crawl-Name and
     * X-Crawl-Status in the headers, and the full JSON response in the
     * POST body.
     *
     * Both kinds of value are stored URL-encoded, because buildUrl() appends
     * option values verbatim to a form-encoded request body.
     *
     * @param string $string Email address or webhook URL
     * @return $this
     * @throws InvalidArgumentException When the input is neither a valid email
     *                                  address nor a valid URL
     */
    public function notify($string)
    {
        if (filter_var($string, FILTER_VALIDATE_EMAIL)) {
            // Valid addresses may contain "+" or "&", which would be misread
            // once the value is concatenated into the request string
            $this->otherOptions['notifyEmail'] = urlencode($string);

            return $this;
        }

        if (filter_var($string, FILTER_VALIDATE_URL)) {
            $this->otherOptions['notifyWebhook'] = urlencode($string);

            return $this;
        }

        throw new InvalidArgumentException(
            'Only valid email or URL accepted! You provided: ' . $string
        );
    }

    /**
     * Wait this many seconds between each URL crawled from a single IP address.
     * Specify the number of seconds as an integer or floating-point number.
     *
     * Negative values are replaced with 0.25; the value is stored as float.
     *
     * @param float $input
     * @return $this
     * @throws InvalidArgumentException When the input is not numeric
     */
    public function setCrawlDelay($input = 0.25)
    {
        if (!is_numeric($input)) {
            throw new InvalidArgumentException('Input must be numeric.');
        }

        $delay = (float)$input;
        if ($delay < 0) {
            // A negative delay makes no sense, use 0.25 instead
            $delay = 0.25;
        }

        $this->otherOptions['crawlDelay'] = $delay;

        return $this;
    }

    /**
     * Specify the number of days as a floating-point (e.g. repeat=7.0) to
     * repeat this crawl. By default crawls will not be repeated.
     *
     * Only numeric values strictly greater than zero are accepted; the value
     * is stored as float.
     *
     * @param int|float $input
     * @return $this
     * @throws \InvalidArgumentException When the input is not a positive number
     */
    public function setRepeat($input)
    {
        // Compare numerically: a plain truthiness check would let negative
        // numbers and strings such as "0.0" through
        if (!is_numeric($input) || (float)$input <= 0) {
            throw new \InvalidArgumentException('Only positive numbers allowed.');
        }

        $this->otherOptions['repeat'] = (float)$input;

        return $this;
    }

    /**
     * By default repeat crawls will only process new (previously unprocessed)
     * pages. Set to 0 to process all content on repeat crawls.
     *
     * Any truthy input is stored as 1, any falsy input as 0.
     *
     * @param int $int
     * @return $this
     */
    public function setOnlyProcessIfNew($int = 1)
    {
        $flag = $int ? 1 : 0;

        $this->otherOptions['onlyProcessIfNew'] = $flag;

        return $this;
    }

    /**
     * Specify the maximum number of crawl repeats. By default (maxRounds=0)
     * repeating crawls will continue indefinitely.
     *
     * The input is cast to int; anything below -1 is stored as -1.
     *
     * @param int $input
     * @return $this
     */
    public function setMaxRounds($input = 0)
    {
        $rounds = (int)$input;

        // -1 is the lowest value that is passed on
        $this->otherOptions['maxRounds'] = max(-1, $rounds);

        return $this;
    }

    /**
     * Ignores robots.txt if set to 0/false
     *
     * Any truthy input is stored as 1, any falsy input as 0.
     *
     * @param bool $bool
     * @return $this
     */
    public function setObeyRobots($bool = true)
    {
        $flag = $bool ? 1 : 0;

        $this->otherOptions['obeyRobots'] = $flag;

        return $this;
    }

    /**
     * Set value to 1 to force the use of proxy IPs for the crawl.
     *
     * Any truthy input is stored as 1, any falsy input as 0.
     *
     * @param bool $bool
     * @return $this
     */
    public function setUseProxies($bool = true)
    {
        $flag = $bool ? 1 : 0;

        $this->otherOptions['useProxies'] = $flag;

        return $this;
    }

    /**
     * Force the start of a new crawl "round" (manually repeat the crawl).
     * If onlyProcessIfNew is set to 1 (default), only newly-created pages will
     * be processed.
     *
     * Replaces every option previously set on this instance with
     * roundStart=1.
     *
     * @param bool $commit Send the request right away when true
     * @return EntityIterator|mixed|$this Result of call() when committed,
     *                                    this instance otherwise
     * @throws DiffbotException
     */
    public function roundStart($commit = true)
    {
        $this->otherOptions = ['roundStart' => 1];

        if ($commit) {
            return $this->call();
        }

        return $this;
    }

    /**
     * Pause a crawl.
     *
     * Replaces every option previously set on this instance with pause=1.
     *
     * @param bool $commit Send the request right away when true
     * @return EntityIterator|mixed|$this Result of call() when committed,
     *                                    this instance otherwise
     * @throws DiffbotException
     */
    public function pause($commit = true)
    {
        $this->otherOptions = ['pause' => 1];

        if ($commit) {
            return $this->call();
        }

        return $this;
    }

    /**
     * Resume a paused crawl.
     *
     * Replaces every option previously set on this instance with pause=0.
     *
     * @param bool $commit Send the request right away when true
     * @return EntityIterator|mixed|$this Result of call() when committed,
     *                                    this instance otherwise
     * @throws DiffbotException
     */
    public function unpause($commit = true)
    {
        $this->otherOptions = ['pause' => 0];

        if ($commit) {
            return $this->call();
        }

        return $this;
    }

    /**
     * Restart removes all crawled data while maintaining crawl settings.
     *
     * Replaces every option previously set on this instance with restart=1.
     *
     * @param bool $commit Send the request right away when true
     * @return EntityIterator|mixed|$this Result of call() when committed,
     *                                    this instance otherwise
     * @throws DiffbotException
     */
    public function restart($commit = true)
    {
        $this->otherOptions = ['restart' => 1];

        if ($commit) {
            return $this->call();
        }

        return $this;
    }

    /**
     * Delete a crawl, and all associated data, completely.
     *
     * Replaces every option previously set on this instance with delete=1.
     *
     * @param bool $commit Send the request right away when true
     * @return EntityIterator|mixed|$this Result of call() when committed,
     *                                    this instance otherwise
     * @throws DiffbotException
     */
    public function delete($commit = true)
    {
        $this->otherOptions = ['delete' => 1];

        if ($commit) {
            return $this->call();
        }

        return $this;
    }
    /**
     * Fetches crawl job information with a GET request built from the token
     * and the name of this instance.
     *
     * @return EntityIterator|mixed An iterator of JobCrawl entities when the
     *                              reply contains "jobs", otherwise the value
     *                              of the reply's "response" key
     * @throws DiffbotException When the reply has neither "jobs" nor
     *                          "response"; carries the API's "error" text
     *                          when one was returned
     */
    public function getCrawl()
    {
        $theUrl = $this->apiUrl . "?token=" . $this->diffbot->getToken()
            . "&name=" . $this->name;
        $response = $this->diffbot->getHttpClient()->get($theUrl);

        $array = json_decode($response->getBody(), true);

        if (isset($array['jobs'])) {
            $jobs = [];
            foreach ($array['jobs'] as $job) {
                $jobs[] = new JobCrawl($job);
            }

            return new EntityIterator($jobs, $response);
        }

        if (isset($array['response'])) {
            return $array['response'];
        }

        // The body may be undecodable or lack an "error" entry; fall back to
        // a generic message instead of reading a missing key
        $message = (isset($array['error']) && is_string($array['error']))
            ? $array['error']
            : 'It appears something went wrong - no data was returned. Did you use the correct token / job name?';

        throw new DiffbotException($message);
    }

    /**
     * Sends the request string produced by buildUrl() to the API URL as a
     * form-encoded POST and interprets the JSON reply.
     *
     * @return EntityIterator|mixed An iterator of JobCrawl entities when the
     *                              reply contains "jobs", otherwise the value
     *                              of the reply's "response" key
     * @throws DiffbotException When the reply has neither "jobs" nor "response"
     */
    public function call()
    {
        $headers = ["content-type" => "application/x-www-form-urlencoded; charset=UTF-8"];
        $response = $this->diffbot->getHttpClient()->post(
            $this->apiUrl,
            $headers,
            $this->buildUrl()
        );

        $decoded = json_decode($response->getBody(), true);

        if (isset($decoded['jobs'])) {
            $entities = [];
            foreach ($decoded['jobs'] as $jobData) {
                $entities[] = new JobCrawl($jobData);
            }

            return new EntityIterator($entities, $response);
        }

        if (isset($decoded['response'])) {
            return $decoded['response'];
        }

        throw new DiffbotException('It appears something went wrong - no data was returned. Did you use the correct token / job name?');
    }

    /**
     * Builds out the URL string that gets requested once `call()` is called
     *
     * The string always starts with the token. Name, seeds, the remaining
     * options and the encoded API URL are only appended when a job name is
     * set. Option values are appended as stored, without further encoding.
     *
     * Side effect: when a non-empty regex option is present, the matching
     * pattern option is removed from the stored options.
     *
     * @return string
     */
    public function buildUrl()
    {
        // A regex filter supersedes the pattern filter of the same kind
        $supersededBy = [
            'urlProcessRegEx' => 'urlProcessPattern',
            'urlCrawlRegEx' => 'urlCrawlPattern',
        ];
        foreach ($supersededBy as $regexKey => $patternKey) {
            if (!empty($this->otherOptions[$regexKey])) {
                unset($this->otherOptions[$patternKey]);
            }
        }

        // Add token
        $url = 'token=' . $this->diffbot->getToken();

        $jobName = $this->getName();
        if (!$jobName) {
            return $url;
        }

        // Add name
        $url .= '&name=' . $jobName;

        // Add seeds
        if (!empty($this->seeds)) {
            $encodedSeeds = array_map('urlencode', $this->seeds);
            $url .= '&seeds=' . implode('%20', $encodedSeeds);
        }

        // Add other options
        foreach ($this->otherOptions as $option => $value) {
            $url .= '&' . $option . '=' . $value;
        }

        // Add API link
        $url .= '&apiUrl=' . $this->getApiString();

        return $url;
    }

    /**
     * Builds the URL of the data endpoint with the request type set to "urls",
     * used to retrieve the URL Report for understanding diagnostic data of
     * URLs
     *
     * Side effect: "type" (and "num", when given) are written into the stored
     * options and therefore remain set for later requests from this instance.
     * Name and options are only appended when a job name is set.
     *
     * @param int|string|null $num Stored as the "num" option when non-empty
     *                             and numeric (presumably a row limit for the
     *                             report; confirm against the Diffbot docs)
     * @return string
     */
    public function getUrlReportUrl($num = null)
    {
        $this->otherOptions['type'] = 'urls';

        if (is_numeric($num) && !empty($num)) {
            $this->otherOptions['num'] = $num;
        }

        // Data endpoint plus token
        $url = $this->apiUrl . '/data?token=' . $this->diffbot->getToken();

        $jobName = $this->getName();
        if (!$jobName) {
            return $url;
        }

        // Add name
        $url .= '&name=' . $jobName;

        // Add other options
        foreach ($this->otherOptions as $option => $value) {
            $url .= '&' . $option . '=' . $value;
        }

        return $url;
    }

    /**
     * Returns the URL-encoded URL of the API that processes the crawled pages
     *
     * When no API has been set, an Analyze API is created with 'crawl',
     * switched to mode 'auto', and kept on the instance for later calls.
     *
     * @return string
     */
    protected function getApiString()
    {
        if (!$this->api) {
            // Fall back to the Analyze API in automatic mode
            $this->api = $this->diffbot->createAnalyzeAPI('crawl');
            $this->api->setMode('auto');
        }

        $processingUrl = $this->api->buildUrl();

        return urlencode($processingUrl);
    }
}


1		<?php
2
3		namespace Swader\Diffbot\Api;
4
5		use \InvalidArgumentException;
6		use Swader\Diffbot\Entity\EntityIterator;
7		use Swader\Diffbot\Entity\JobCrawl;
8		use Swader\Diffbot\Exceptions\DiffbotException;
9		use Swader\Diffbot\Interfaces\Api;
10		use Swader\Diffbot\Traits\DiffbotAware;
11
12		/**
13		* Class Crawl
14		* @see https://www.diffbot.com/dev/docs/crawl/
15		* @package Swader\Diffbot\Api
16		*/
17		class Crawl
18		{
19		use DiffbotAware;
20
21		/** @var string API URL to which to send the request */
22		protected $apiUrl = 'https://api.diffbot.com/v3/crawl';
23
24		/** @var string */
25		protected $name;
26
27		/** @var Api Api which should be used to process the pages */
28		protected $api;
29
30		/** @var array Options to set while initiating the API call */
31		protected $otherOptions = [];
32
33		/** @var array Array of seed URLs to crawl */
34		protected $seeds = [];
35
36		/**
37		* @see getName
38		* @param string\|null $name
39		* @param null\|Api $api
40		*/
41	61	public function __construct($name = null, Api $api = null)
42		{
43	61	if ($name !== null) {
44	60	$this->name = $name;
45	60	if ($api) {
46	1	$this->setApi($api);
47	1	}
48	60	}
49	61	}
50
51		/**
52		* Returns the unique name of the crawljob
53		* This name is later used to download datasets, or to modify the job
54		*
55		* @return string
56		*/
57	45	public function getName()
58		{
59	45	return $this->name;
60		}
61
62		/**
63		* API which should be used to process the pages
64		*
65		* Accepts a fully formed instance of any other API. Will use it to build
66		* and auto-encode the URL. To satisfy the required $url param of the API
67		* classes, use the string 'crawl' which prepares the API for Crawlbot
68		* consumption internally.
69		*
70		* @see https://www.diffbot.com/dev/docs/crawl/api.jsp ApiUrl docs
71		* @param Api $api
72		* @return $this
73		*/
74	1	public function setApi(Api $api)
75		{
76	1	$this->api = $api;
77
78	1	return $this;
79		}
80
81		/**
82		* An array of URLs (seeds) which to crawl for matching links
83		*
84		* By default Crawlbot will restrict spidering to the entire domain
85		* ("http://blog.diffbot.com" will include URLs at "http://www.diffbot.com").
86		*
87		* @param array $seeds
88		* @return $this
89		*/
90	52	public function setSeeds(array $seeds)
91		{
92	52	$invalidSeeds = [];
93	52	foreach ($seeds as $seed) {
94	52	if (!filter_var($seed, FILTER_VALIDATE_URL)) {
95	1	$invalidSeeds[] = $seed;
96	1	}
97	52	}
98	52	if (!empty($invalidSeeds)) {
99	1	throw new \InvalidArgumentException(
100	1	'Some seeds were invalid: ' . implode(',', $invalidSeeds)
101	1	);
102		}
103
104	51	$this->seeds = $seeds;
105
106	51	return $this;
107		}
108
109		/**
110		* Array of strings to limit pages crawled to those whose URLs
111		* contain any of the content strings.
112		*
113		* You can use the exclamation point to specify a negative string, e.g.
114		* !product to exclude URLs containing the string "product," and the ^ and
115		* $ characters to limit matches to the beginning or end of the URL.
116		*
117		* The use of a urlCrawlPattern will allow Crawlbot to spider outside of
118		* the seed domain; it will follow all matching URLs regardless of domain.
119		*
120		* @param array $pattern
121		* @return $this
122		*/
123	1	public function setUrlCrawlPatterns(array $pattern = null)
124		{
125	1	$this->otherOptions['urlCrawlPattern'] = ($pattern === null) ? null
126		: implode("\|\|", array_map(function ($item) {
127	1	return urlencode($item);
128	1	}, $pattern));
129
130	1	return $this;
131		}
132
133		/**
134		* Specify a regular expression to limit pages crawled to those URLs that
135		* match your expression. This will override any urlCrawlPattern value.
136		*
137		* The use of a urlCrawlRegEx will allow Crawlbot to spider outside of the
138		* seed domain; it will follow all matching URLs regardless of domain.
139		*
140		* @param string $regex
141		* @return $this
142		*/
143	1	public function setUrlCrawlRegEx($regex)
144		{
145	1	$this->otherOptions['urlCrawlRegEx'] = $regex;
146
147	1	return $this;
148		}
149
150		/**
151		* Specify \|\|-separated strings to limit pages processed to those whose
152		* URLs contain any of the content strings.
153		*
154		* You can use the exclamation point to specify a negative string, e.g.
155		* !/category to exclude URLs containing the string "/category," and the ^
156		* and $ characters to limit matches to the beginning or end of the URL.
157		*
158		* @param array $pattern
159		* @return $this
160		*/
161	1	public function setUrlProcessPatterns(array $pattern = null)
162		{
163	1	$this->otherOptions['urlProcessPattern'] = ($pattern === null) ? null
164		: implode("\|\|", array_map(function ($item) {
165	1	return urlencode($item);
166	1	}, $pattern));
167
168	1	return $this;
169		}
170
171		/**
172		* Specify a regular expression to limit pages processed to those URLs that
173		* match your expression. This will override any urlProcessPattern value.
174		*
175		* @param string $regex
176		* @return $this
177		*/
178	1	public function setUrlProcessRegEx($regex)
179		{
180	1	$this->otherOptions['urlProcessRegEx'] = $regex;
181
182	1	return $this;
183
184		}
185
186		/**
187		* Specify \|\|-separated strings to limit pages processed to those whose
188		* HTML contains any of the content strings.
189		*
190		* @param array $pattern
191		* @return $this
192		*/
193	1	public function setPageProcessPatterns(array $pattern)
194		{
195	1	$this->otherOptions['pageProcessPattern'] = implode("\|\|",
196		array_map(function ($item) {
197	1	return urlencode($item);
198	1	}, $pattern));
199
200	1	return $this;
201		}
202
203		/**
204		* Specify the depth of your crawl. A maxHops=0 will limit processing to
205		* the seed URL(s) only -- no other links will be processed; maxHops=1 will
206		* process all (otherwise matching) pages whose links appear on seed URL(s);
207		* maxHops=2 will process pages whose links appear on those pages; and so on
208		*
209		* By default, Crawlbot will crawl and process links at any depth.
210		*
211		* @param int $input
212		* @return $this
213		*/
214	6	public function setMaxHops($input = -1)
215		{
216	6	if ((int)$input < -1) {
217	1	$input = -1;
218	1	}
219	6	$this->otherOptions['maxHops'] = (int)$input;
220
221	6	return $this;
222		}
223
224		/**
225		* Specify max pages to spider. Default: 100,000.
226		*
227		* @param int $input
228		* @return $this
229		*/
230	6	public function setMaxToCrawl($input = 100000)
231		{
232	6	if ((int)$input < 1) {
233	2	$input = 1;
234	2	}
235	6	$this->otherOptions['maxToCrawl'] = (int)$input;
236
237	6	return $this;
238		}
239
240		/**
241		* Specify max pages to process through Diffbot APIs. Default: 100,000.
242		*
243		* @param int $input
244		* @return $this
245		*/
246	6	public function setMaxToProcess($input = 100000)
247		{
248	6	if ((int)$input < 1) {
249	2	$input = 1;
250	2	}
251	6	$this->otherOptions['maxToProcess'] = (int)$input;
252
253	6	return $this;
254		}
255
256		/**
257		* If input is email address, end a message to this email address when the
258		* crawl hits the maxToCrawl or maxToProcess limit, or when the crawl
259		* completes.
260		*
261		* If input is URL, you will receive a POST with X-Crawl-Name and
262		* X-Crawl-Status in the headers, and the full JSON response in the
263		* POST body.
264		*
265		* @param string $string
266		* @return $this
267		* @throws InvalidArgumentException
268		*/
269	6	public function notify($string)
270		{
271	6	if (filter_var($string, FILTER_VALIDATE_EMAIL)) {
272	2	$this->otherOptions['notifyEmail'] = $string;
273
274	2	return $this;
275		}
276	5	if (filter_var($string, FILTER_VALIDATE_URL)) {
277	2	$this->otherOptions['notifyWebhook'] = urlencode($string);
278
279	2	return $this;
280		}
281
282	3	throw new InvalidArgumentException(
283		'Only valid email or URL accepted! You provided: ' . $string
284	3	);
285		}
286
287		/**
288		* Wait this many seconds between each URL crawled from a single IP address.
289		* Specify the number of seconds as an integer or floating-point number.
290		*
291		* @param float $input
292		* @return $this
293		* @throws InvalidArgumentException
294		*/
295	14	public function setCrawlDelay($input = 0.25)
296		{
297	14	if (!is_numeric($input)) {
298	7	throw new InvalidArgumentException('Input must be numeric.');
299		}
300	7	$input = ($input < 0) ? 0.25 : $input;
301	7	$this->otherOptions['crawlDelay'] = (float)$input;
302
303	7	return $this;
304		}
305
306		/**
307		* Specify the number of days as a floating-point (e.g. repeat=7.0) to
308		* repeat this crawl. By default crawls will not be repeated.
309		*
310		* @param int\|float $input
311		* @return $this
312		* @throws \InvalidArgumentException
313		*/
314	7	public function setRepeat($input)
315		{
316	7	if (!is_numeric($input) \|\| !$input) {
317	4	throw new \InvalidArgumentException('Only positive numbers allowed.');
318		}
319	3	$this->otherOptions['repeat'] = (float)$input;
320
321	3	return $this;
322		}
323
324		/**
325		* By default repeat crawls will only process new (previously unprocessed)
326		* pages. Set to 0 to process all content on repeat crawls.
327		*
328		* @param int $int
329		* @return $this
330		*/
331	1	public function setOnlyProcessIfNew($int = 1)
332		{
333	1	$this->otherOptions['onlyProcessIfNew'] = (int)(bool)$int;
334
335	1	return $this;
336		}
337
338		/**
339		* Specify the maximum number of crawl repeats. By default (maxRounds=0)
340		* repeating crawls will continue indefinitely.
341		*
342		* @param int $input
343		* @return $this
344		*/
345	6	public function setMaxRounds($input = 0)
346		{
347	6	if ((int)$input < -1) {
348	1	$input = -1;
349	1	}
350
351	6	$this->otherOptions['maxRounds'] = (int)$input;
352
353	6	return $this;
354		}
355
356		/**
357		* Ignores robots.txt if set to 0/false
358		*
359		* @param bool $bool
360		* @return $this
361		*/
362	1	public function setObeyRobots($bool = true)
363		{
364	1	$this->otherOptions['obeyRobots'] = (int)(bool)$bool;
365
366	1	return $this;
367		}
368
369		/**
370		* Set value to 1 to force the use of proxy IPs for the crawl.
371		*
372		* @param bool $bool
373		* @return $this
374		*/
375		public function setUseProxies($bool = true)
376		{
377		$this->otherOptions['useProxies'] = (int)(bool)$bool;
378
379		return $this;
380		}
381
382		/**
383		* Force the start of a new crawl "round" (manually repeat the crawl).
384		* If onlyProcessIfNew is set to 1 (default), only newly-created pages will
385		* be processed.
386		*
387		* @param bool $commit
388		* @return EntityIterator
389		* @throws DiffbotException
390		*/
391	1	public function roundStart($commit = true)
392		{
393	1	$this->otherOptions = ['roundStart' => 1];
394
395	1	return ($commit) ? $this->call() : $this;
396		}
397
398		/**
399		* Pause a crawl.
400		*
401		* @param bool $commit
402		* @return EntityIterator
403		* @throws DiffbotException
404		*/
405	1	public function pause($commit = true)
406		{
407	1	$this->otherOptions = ['pause' => 1];
408
409	1	return ($commit) ? $this->call() : $this;
410		}
411
412		/**
413		* Pause a crawl.
414		*
415		* @param bool $commit
416		* @return EntityIterator
417		* @throws DiffbotException
418		*/
419	1	public function unpause($commit = true)
420		{
421	1	$this->otherOptions = ['pause' => 0];
422
423	1	return ($commit) ? $this->call() : $this;
424		}
425
426		/**
427		* Restart removes all crawled data while maintaining crawl settings.
428		*
429		* @param bool $commit
430		* @return EntityIterator
431		* @throws DiffbotException
432		*/
433	1	public function restart($commit = true)
434		{
435	1	$this->otherOptions = ['restart' => 1];
436
437	1	return ($commit) ? $this->call() : $this;
438		}
439
440		/**
441		* Delete a crawl, and all associated data, completely.
442		*
443		* @param bool $commit
444		* @return EntityIterator
445		* @throws DiffbotException
446		*/
447	1	public function delete($commit = true)
448		{
449	1	$this->otherOptions = ['delete' => 1];
450
451	1	return ($commit) ? $this->call() : $this;
452		}
453		public function getCrawl()
454		{
455		$theUrl = $this->apiUrl ."?token=" . $this->diffbot->getToken() . "&name=" . $this->name;
456		$response = $this->diffbot->getHttpClient()->get($theUrl);
457
458		$array = json_decode($response->getBody(), true);
459
460		if (isset($array['jobs'])) {
461		$jobs = [];
462		foreach ($array['jobs'] as $job) {
463		$jobs[] = new JobCrawl($job);
464		}
465		return new EntityIterator($jobs, $response);
466		} elseif (!isset($array['jobs']) && isset($array['response'])) {
467		return $array['response'];
468		} else {
469		throw new DiffbotException($array["error"]);
470		}
471		}
472
473	7	public function call()
474		{
475	7	$theHeader=["content-type"=>"application/x-www-form-urlencoded; charset=UTF-8"];
476	7	$response = $this->diffbot->getHttpClient()->post($this->apiUrl, $theHeader, $this->buildUrl());
477
478
479	6	$array = json_decode($response->getBody(), true);
480
481	6	if (isset($array['jobs'])) {
482	4	$jobs = [];
483	4	foreach ($array['jobs'] as $job) {
484	4	$jobs[] = new JobCrawl($job);
485	4	}
486
487	4	return new EntityIterator($jobs, $response);
488	2	} elseif (!isset($array['jobs']) && isset($array['response'])) {
489	1	return $array['response'];
490		} else {
491	1	throw new DiffbotException('It appears something went wrong - no data was returned. Did you use the correct token / job name?');
492		}
493		}
494
495		/**
496		* Builds out the URL string that gets requested once `call()` is called
497		*
498		* @return string
499		*/
500	45	public function buildUrl()
501		{
502
503	45	if (isset($this->otherOptions['urlProcessRegEx'])
504	45	&& !empty($this->otherOptions['urlProcessRegEx'])
505	45	) {
506	1	unset($this->otherOptions['urlProcessPattern']);
507	1	}
508
509	45	if (isset($this->otherOptions['urlCrawlRegEx'])
510	45	&& !empty($this->otherOptions['urlCrawlRegEx'])
511	45	) {
512	1	unset($this->otherOptions['urlCrawlPattern']);
513	1	}
514
515
516		// Add token
517	45	$url = 'token=' . $this->diffbot->getToken();
518
519	45	if ($this->getName()) {
520		// Add name
521	44	$url .= '&name=' . $this->getName();
522
523		// Add seeds
524	44	if (!empty($this->seeds)) {
525	37	$url .= '&seeds=' . implode('%20', array_map(function ($item) {
526	37	return urlencode($item);
527	37	}, $this->seeds));
528	37	}
529
530		// Add other options
531	44	if (!empty($this->otherOptions)) {
532	40	foreach ($this->otherOptions as $option => $value) {
533	40	$url .= '&' . $option . '=' . $value;
534	40	}
535	40	}
536
537		// Add API link
538	44	$url .= '&apiUrl=' . $this->getApiString();
539	44	}
540
541	45	return $url;
542		}
543
544		/**
545		* Sets the request type to "urls" to retrieve the URL Report
546		* URL for understanding diagnostic data of URLs
547		*
548		* @return $this
549		*/
550		public function getUrlReportUrl($num = null)
551		{
552		$this->otherOptions['type'] = 'urls';
553
554		if (!empty($num) && is_numeric($num)) {
555		$this->otherOptions['num'] = $num;
556		}
557
558		// Setup data endpoint
559		$url = $this->apiUrl . '/data';
560
561		// Add token
562		$url .= '?token=' . $this->diffbot->getToken();
563
564		if ($this->getName()) {
565		// Add name
566		$url .= '&name=' . $this->getName();
567
568		// Add other options
569		if (!empty($this->otherOptions)) {
570		foreach ($this->otherOptions as $option => $value) {
571		$url .= '&' . $option . '=' . $value;
572		}
573		}
574		}
575
576		return $url;
577
578		}
579
580		/**
581		* @return string
582		*/
583	44	protected function getApiString()
584		{
585	44	if (!$this->api) {
586	43	$this->api = $this->diffbot->createAnalyzeAPI('crawl');
587	43	$this->api->setMode('auto');
588	43	}
589
590	44	return urlencode($this->api->buildUrl());
591		}
592		}
593

Swader / diffbot-php-client

Push — master ( 5b29fc...dd86b9 )

Crawl::getCrawl() B

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like