1 | <?php |
||
7 | abstract class Job extends Entity |
||
8 | { |
||
9 | |||
10 | /** |
||
11 | * Returns the name of the crawl job |
||
12 | * @return string |
||
13 | */ |
||
14 | 1 | public function getName() |
|
18 | |||
19 | /** |
||
20 | * Should always return either "crawl" or "bulk" |
||
21 | * @return string |
||
22 | */ |
||
23 | 2 | public function getType() |
|
27 | |||
28 | /** |
||
29 | * Timestamp of job creation |
||
30 | * |
||
31 | * @return int |
||
32 | */ |
||
33 | 1 | public function getJobCreationTimeUTC() |
|
38 | |||
39 | /** |
||
40 | * Timestamp of job completion |
||
41 | * |
||
42 | * @return int |
||
43 | */ |
||
44 | 1 | public function getJobCompletionTimeUTC() |
|
49 | |||
50 | /** |
||
51 | * Returns the current status of the job. Possible status codes: |
||
52 | * |
||
53 | * 0 Job is initializing |
||
54 | * 1 Job has reached maxRounds limit |
||
55 | * 2 Job has reached maxToCrawl limit |
||
56 | * 3 Job has reached maxToProcess limit |
||
57 | * 4 Next round to start in _____ seconds |
||
58 | * 5 No URLs were added to the crawl |
||
59 | * 6 Job paused |
||
60 | * 7 Job in progress |
||
61 | * 8 All crawling temporarily paused by root administrator for maintenance. |
||
62 | * 9 Job has completed and no repeat is scheduled |
||
63 | * |
||
64 | * @return array |
||
65 | */ |
||
66 | 3 | public function getJobStatus() |
|
71 | |||
72 | /** |
||
73 | * True or false, depending on whether "job complete" notification was sent |
||
74 | * |
||
75 | * @return bool |
||
76 | */ |
||
77 | 1 | public function getNotificationSent() |
|
81 | |||
82 | /** |
||
83 | * Number of objects found |
||
84 | * |
||
85 | * @return int |
||
86 | */ |
||
87 | 2 | public function getObjectsFound() |
|
91 | |||
92 | /** |
||
93 | * Number of URLs harvested |
||
94 | * |
||
95 | * @return int |
||
96 | */ |
||
97 | 2 | public function getUrlsHarvested() |
|
101 | |||
102 | /** |
||
103 | * Returns an array with information about crawls - total attempts, |
||
104 | * successes, and successes this round |
||
105 | * |
||
106 | * @return array |
||
107 | */ |
||
108 | 3 | public function getPageCrawlInfo() |
|
116 | |||
117 | /** |
||
118 | * Returns an array with information about page processing - total attempts, |
||
119 | * successes, and successes this round |
||
120 | * |
||
121 | * @return array |
||
122 | */ |
||
123 | 2 | public function getPageProcessInfo() |
|
131 | |||
132 | /** |
||
133 | * The maximum number of crawl repeats. By default (maxRounds=0) repeating |
||
134 | * crawls will continue indefinitely. |
||
135 | * |
||
136 | * @return int |
||
137 | */ |
||
138 | 1 | public function getMaxRounds() |
|
142 | |||
143 | /** |
||
144 | * The number of days as a floating-point (e.g. repeat=7.0) to repeat this |
||
145 | * crawl. By default crawls will not be repeated. |
||
146 | * |
||
147 | * @return float |
||
148 | */ |
||
149 | 1 | public function getRepeat() |
|
153 | |||
154 | /** |
||
155 | * Wait this many seconds between each URL crawled from a single IP address. |
||
156 | * Number of seconds as an integer or floating-point number |
||
157 | * (e.g., crawlDelay=0.25). |
||
158 | * |
||
159 | * @return float |
||
160 | */ |
||
161 | 1 | public function getCrawlDelay() |
|
165 | |||
166 | /** |
||
167 | * Whether or not the job was set to respect robots.txt |
||
168 | * |
||
169 | * @return bool |
||
170 | */ |
||
171 | 1 | public function getObeyRobots() |
|
175 | |||
176 | /** |
||
177 | * How many rounds the job has completed so far |
||
178 | * |
||
179 | * @return int |
||
180 | */ |
||
181 | 1 | public function getRoundsCompleted() |
|
185 | |||
186 | /** |
||
187 | * Returns timestamp of when next crawl round is about to start or 0 if none |
||
188 | * |
||
189 | * @return int |
||
190 | */ |
||
191 | 2 | public function getRoundStartTime() |
|
195 | |||
196 | /** |
||
197 | * Returns timestamp of current time |
||
198 | * |
||
199 | * @return int |
||
200 | */ |
||
201 | 2 | public function getCurrentTime() |
|
205 | |||
206 | /** |
||
207 | * Returns timestamp of current time, UTC. |
||
208 | * Should be the same as getCurrentTime |
||
209 | * |
||
210 | * @return int |
||
211 | */ |
||
212 | 1 | public function getCurrentTimeUTC() |
|
216 | |||
217 | /** |
||
218 | * The API URL is the URL of the API used to process pages found in the |
||
219 | * crawl. If the job was created with this Diffbot lib, then it was |
||
220 | * automatically built from a pre-configured API instance |
||
221 | * |
||
222 | * The returned API URL is URL-decoded, whereas it is submitted URL-encoded. |
||
223 | * |
||
224 | * @return string |
||
225 | */ |
||
226 | 1 | public function getApiUrl() |
|
230 | |||
231 | /** |
||
232 | * @see \Swader\Diffbot\Api\Crawl::setUrlCrawlPattern |
||
233 | * @return string |
||
234 | */ |
||
235 | 1 | public function getUrlCrawlPattern() |
|
239 | |||
240 | /** |
||
241 | * @see \Swader\Diffbot\Api\Crawl::setUrlProcessPattern |
||
242 | * @return string |
||
243 | */ |
||
244 | 1 | public function getUrlProcessPattern() |
|
248 | |||
249 | /** |
||
250 | * @see \Swader\Diffbot\Api\Crawl::setPageProcessPattern |
||
251 | * @return string |
||
252 | */ |
||
253 | 1 | public function getPageProcessPattern() |
|
257 | |||
258 | /** |
||
259 | * @see \Swader\Diffbot\Api\Crawl::setUrlCrawlRegex |
||
260 | * |
||
261 | * @return string |
||
262 | */ |
||
263 | 1 | public function getUrlCrawlRegex() |
|
267 | |||
268 | /** |
||
269 | * @see \Swader\Diffbot\Api\Crawl::setUrlProcessRegex |
||
270 | * |
||
271 | * @return string |
||
272 | */ |
||
273 | 1 | public function getUrlProcessRegex() |
|
277 | |||
278 | /** |
||
279 | * @see \Swader\Diffbot\Api\Crawl::setMaxHops |
||
280 | * |
||
281 | * @return int |
||
282 | */ |
||
283 | 1 | public function getMaxHops() |
|
287 | |||
288 | /** |
||
289 | * Returns the link to the dataset the job produced. |
||
290 | * |
||
291 | * Accepted arguments are: "json", "csv" and "debug". |
||
292 | * It is important to be aware of the difference between the types. |
||
293 | * See "Retrieving Bulk Data" at the link below. |
||
294 | * |
||
295 | * @see https://www.diffbot.com/dev/docs/crawl/api.jsp |
||
296 | * |
||
297 | * @param string $type |
||
298 | * @return string |
||
299 | * @throws DiffbotException |
||
300 | */ |
||
301 | 2 | public function getDownloadUrl($type = "json") |
|
318 | |||
319 | /** |
||
320 | * Returns the email that was set to be notified after job's completion |
||
321 | * |
||
322 | * @return string |
||
323 | */ |
||
324 | 1 | public function getNotifyEmail() |
|
328 | |||
329 | /** |
||
330 | * Returns the webhook that was set to be pinged after job's completion |
||
331 | * |
||
332 | * @return string |
||
333 | */ |
||
334 | 1 | public function getNotifyWebhook() |
|
338 | } |