Passed
Push — master ( aff4dc...32e217 )
by David
01:39
created

WebClient::exec()   B

Complexity

Conditions 8
Paths 6

Size

Total Lines 34
Code Lines 12

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 8
eloc 12
nc 6
nop 1
dl 0
loc 34
rs 8.4444
c 0
b 0
f 0
1
<?php
2
3
namespace Vaites\ApacheTika\Clients;
4
5
use Exception;
6
7
use Vaites\ApacheTika\Client;
8
9
/**
10
 * Apache Tika web client
11
 *
12
 * @author  David Martínez <[email protected]>
13
 * @link    http://wiki.apache.org/tika/TikaJAXRS
14
 */
15
class WebClient extends Client
16
{
17
    const MODE = 'web';
18
19
    /**
20
     * Cached responses to avoid multiple requests for the same file
21
     *
22
     * @var array
23
     */
24
    protected $cache = [];
25
26
    /**
27
     * Apache Tika server host
28
     *
29
     * @var string
30
     */
31
    protected $host = null;
32
33
    /**
34
     * Apache Tika server port
35
     *
36
     * @var int
37
     */
38
    protected $port = null;
39
40
    /**
41
     * Number of retries on server error
42
     *
43
     * @var int
44
     */
45
    protected $retries = 3;
46
47
    /**
48
     * Default cURL options
49
     *
50
     * @var array
51
     */
52
    protected $options =
53
    [
54
        CURLINFO_HEADER_OUT    => true,
55
        CURLOPT_HTTPHEADER     => [],
56
        CURLOPT_PUT            => true,
57
        CURLOPT_RETURNTRANSFER => true,
58
        CURLOPT_TIMEOUT        => 5
59
    ];
60
61
    /**
     * Build the client, apply the given connection settings and optionally
     * verify that the server answers
     *
     * When $host is a string that passes FILTER_VALIDATE_URL it is handed to
     * setUrl(); any other truthy value is handed to setHost() as is.
     *
     * @param   string  $host       host name or full URL of the server
     * @param   int     $port       server port, applied only when numeric
     * @param   array   $options    extra cURL options, applied through setOptions()
     * @param   bool    $check      run check() at the end when strictly true
     * @throws  \Exception
     */
    public function __construct($host = null, $port = null, $options = [], $check = true)
    {
        parent::__construct();

        // a full URL carries the host (and maybe the port), anything else is a bare host
        $isUrl = is_string($host) && filter_var($host, FILTER_VALIDATE_URL);

        if($isUrl)
        {
            $this->setUrl($host);
        }
        elseif($host)
        {
            $this->setHost($host);
        }

        if(is_numeric($port))
        {
            $this->setPort($port);
        }

        if($options)
        {
            $this->setOptions($options);
        }

        $this->setDownloadRemote(true);

        if($check === true)
        {
            $this->check();
        }
    }
100
101
    /**
     * Build the base URL of the server from the configured host and port
     *
     * The scheme is always http and the port falls back to 9998 when none
     * (or a falsy one) is configured.
     *
     * @return string
     */
    public function getUrl()
    {
        $port = $this->port ?: 9998;

        return sprintf('http://%s:%d', $this->host, $port);
    }
110
111
    /**
     * Set the host and, when present, the port from a URL
     *
     * Only the host and port components are used; the port is left untouched
     * when the URL does not carry one.
     *
     * @param   string  $url
     * @return  $this
     * @throws  \Exception  if the URL can't be parsed or has no host component
     */
    public function setUrl($url)
    {
        $parts = parse_url($url);

        // parse_url() returns false on seriously malformed URLs and omits the
        // "host" key for URLs such as "mailto:..." or "file:///..."
        if(!is_array($parts) || !isset($parts['host']) || $parts['host'] === '')
        {
            throw new Exception("Invalid URL $url: host is missing");
        }

        $this->setHost($parts['host']);

        if(isset($parts['port']))
        {
            $this->setPort($parts['port']);
        }

        return $this;
    }
130
131
    /**
     * Retrieve the configured server host
     *
     * @return  null|string     null until a host has been set
     */
    public function getHost()
    {
        return $this->host;
    }
140
141
    /**
     * Store the server host
     *
     * The value is stored as given, without validation.
     *
     * @param   string  $host
     * @return  $this
     */
    public function setHost($host)
    {
        $this->host = $host;

        return $this;
    }
153
154
    /**
     * Retrieve the configured server port
     *
     * @return  null|int    null until a port has been set
     */
    public function getPort()
    {
        return $this->port;
    }
163
164
    /**
     * Store the server port
     *
     * The value is stored as given, without validation or casting.
     *
     * @param   int     $port
     * @return  $this
     */
    public function setPort($port)
    {
        $this->port = $port;

        return $this;
    }
176
177
    /**
     * Retrieve how many times a request is retried on server error
     *
     * @return  int
     */
    public function getRetries()
    {
        return $this->retries;
    }
186
187
    /**
     * Store how many times a request is retried on server error
     *
     * @param   int     $retries
     * @return  $this
     */
    public function setRetries($retries)
    {
        $this->retries = $retries;

        return $this;
    }
199
200
    /**
     * Retrieve the whole set of cURL options held by the client
     *
     * @return  null|array  options indexed by cURL option constant
     */
    public function getOptions()
    {
        return $this->options;
    }
209
210
    /**
     * Retrieve a single cURL option
     *
     * @param   string  $key    cURL option constant
     * @return  mixed           the stored value, or null when the option is unset or null
     */
    public function getOption($key)
    {
        if(isset($this->options[$key]))
        {
            return $this->options[$key];
        }

        return null;
    }
220
221
    /**
     * Store a cURL option that will later be applied with curl_setopt()
     *
     * CURLINFO_HEADER_OUT, CURLOPT_PUT and CURLOPT_RETURNTRANSFER are managed
     * by the client itself and are rejected.
     *
     * @link    http://php.net/manual/en/curl.constants.php
     * @link    http://php.net/manual/en/function.curl-setopt.php
     * @param   string  $key
     * @param   mixed   $value
     * @return  $this
     * @throws  \Exception  if the option is one of the protected ones
     */
    public function setOption($key, $value)
    {
        $protected = [CURLINFO_HEADER_OUT, CURLOPT_PUT, CURLOPT_RETURNTRANSFER];

        if(in_array($key, $protected))
        {
            throw new Exception("Value for cURL option $key cannot be modified", 3);
        }

        $this->options[$key] = $value;

        return $this;
    }
242
243
    /**
     * Store several cURL options at once
     *
     * Every entry goes through setOption(), so protected options are
     * rejected the same way.
     *
     * @param   array   $options    values indexed by cURL option constant
     * @return  $this
     * @throws  \Exception
     */
    public function setOptions($options)
    {
        foreach($options as $option => $setting)
        {
            $this->setOption($option, $setting);
        }

        return $this;
    }
259
260
    /**
     * Retrieve the cURL timeout (the CURLOPT_TIMEOUT option)
     *
     * @return  int
     */
    public function getTimeout()
    {
        $timeout = $this->getOption(CURLOPT_TIMEOUT);

        return $timeout;
    }
269
270
    /**
     * Store the cURL timeout (the CURLOPT_TIMEOUT option)
     *
     * @param   int     $value  cast to integer before being stored
     * @return  $this
     * @throws  \Exception
     */
    public function setTimeout($value)
    {
        $timeout = (int) $value;

        $this->setOption(CURLOPT_TIMEOUT, $timeout);

        return $this;
    }
283
284
    /**
     * Verify once that the server is reachable by requesting its version
     *
     * @throws \Exception   if the version request fails
     */
    public function check()
    {
        // nothing to do once the check has been performed
        if($this->isChecked() !== false)
        {
            return;
        }

        // flag first: request() calls check() again and must not loop back here
        $this->setChecked(true);

        $this->request('version');
    }
299
300
    /**
     * Configure, make a request and return its results
     *
     * Cached responses are returned without contacting the server. A response
     * with status 500 is retried up to the configured number of retries; the
     * retry budget is local to each call, so a failed call never affects the
     * next one.
     *
     * @param   string  $type   request type, as accepted by getParameters()
     * @param   string  $file   local path or remote URL, if the type needs one
     * @return  string|null     response body, or null when the server answers 204
     * @throws  \Exception      on cURL failure or on an error status code
     */
    public function request($type, $file = null)
    {
        // check if not checked
        $this->check();

        // check if is cached
        if($this->isCached($type, $file))
        {
            return $this->getCachedResponse($type, $file);
        }

        // parameters for cURL request
        list($resource, $headers) = $this->getParameters($type, $file);

        // check the request
        $file = parent::checkRequest($type, $file);

        // cURL options
        $options = $this->getCurlOptions($type, $file);

        // sets headers
        foreach($headers as $header)
        {
            $options[CURLOPT_HTTPHEADER][] = $header;
        }

        // request URL
        $options[CURLOPT_URL] = $this->getUrl() . "/$resource";

        // handle of the file being uploaded, if any
        $input = null;

        if(isset($options[CURLOPT_INFILE]) && is_resource($options[CURLOPT_INFILE]))
        {
            $input = $options[CURLOPT_INFILE];
        }

        // retries left for this call only
        $attempts = (int) $this->retries;

        try
        {
            // get the response and the HTTP status code
            list($response, $status) = $this->exec($options);

            // retry on request failed with error 500
            while($status == 500 && $attempts-- > 0)
            {
                // the previous attempt consumed the file, start over
                if($input !== null)
                {
                    rewind($input);
                }

                list($response, $status) = $this->exec($options);
            }
        }
        catch(Exception $exception)
        {
            // don't leak the file handle when cURL fails
            if($input !== null)
            {
                fclose($input);
            }

            throw $exception;
        }

        // reduce memory usage closing the uploaded file
        if($input !== null)
        {
            fclose($input);
        }

        // request completed successfully
        if($status == 200)
        {
            // cache certain responses
            if($this->isCacheable($type))
            {
                $this->cacheResponse($type, $response, $file);
            }
        }
        // request completed successfully but result is empty
        elseif($status == 204)
        {
            $response = null;
        }
        // other status code (including 500 after the last retry) is an error
        else
        {
            $this->error($status, $resource);
        }

        return $response;
    }
379
380
    /**
     * Make a request to Apache Tika Server
     *
     * Applies the given options to a fresh cURL handle, runs the transfer and
     * stores the body in $this->response. When a callback is set the body is
     * not taken from curl_exec(); the response starts empty and is expected to
     * be filled by the write function present in the options.
     *
     * @param   array   $options    cURL options indexed by option constant
     * @return  array               trimmed response body and HTTP status code, in that order
     * @throws  \Exception          if the handle can't be created or the transfer fails
     */
    protected function exec(array $options = [])
    {
        // cURL init
        $curl = curl_init();

        // curl_init() yields a resource (PHP < 8) or a CurlHandle object (PHP >= 8),
        // so failure can only be detected by comparing against false
        if($curl === false)
        {
            throw new Exception('Unexpected error', 0);
        }

        // we avoid curl_setopt_array($curl, $options) because of strange Windows behaviour (issue #8)
        foreach($options as $option => $value)
        {
            curl_setopt($curl, $option, $value);
        }

        // make the request
        if(is_null($this->callback))
        {
            $result = curl_exec($curl);

            // curl_exec() returns false on failure, keep the response a string
            $this->response = is_string($result) ? $result : '';
        }
        else
        {
            $this->response = '';

            curl_exec($curl);
        }

        // exception if cURL fails, carrying the real cURL message and code
        if(curl_errno($curl))
        {
            throw new Exception(curl_error($curl), curl_errno($curl));
        }

        // return the response and the status code
        return [trim($this->response), curl_getinfo($curl, CURLINFO_HTTP_CODE)];
    }
422
423
    /**
     * Translate an error status code into an exception
     *
     * This method always throws: known status codes get a dedicated message
     * and keep their value as exception code, anything else is reported as
     * unexpected with code 501.
     *
     * @codeCoverageIgnore
     *
     * @param   int       $status       HTTP status code of the response
     * @param   string    $resource     requested resource, used in the fallback message
     * @throws  \Exception
     */
    protected function error($status, $resource)
    {
        switch($status)
        {
            //  method not allowed
            case 405:
                $message = 'Method not allowed';
                $code = 405;
                break;

            //  unsupported media type
            case 415:
                $message = 'Unsupported media type';
                $code = 415;
                break;

            //  unprocessable entity
            case 422:
                $message = 'Unprocessable document';
                $code = 422;
                break;

            // server error
            case 500:
                $message = 'Error while processing document';
                $code = 500;
                break;

            // unexpected
            default:
                $message = "Unexpected response for /$resource ($status)";
                $code = 501;
        }

        throw new Exception($message, $code);
    }
461
462
    /**
     * Resolve the server resource and the extra HTTP headers for a request type
     *
     * A file whose name starts with "http" is announced to the server through
     * a fileUrl header, which always comes before the type specific header.
     *
     * @link    https://wiki.apache.org/tika/TikaJAXRS#Specifying_a_URL_Instead_of_Putting_Bytes
     * @param   string  $type
     * @param   string  $file
     * @return  array   resource path and list of headers, in that order
     * @throws  \Exception  if the type is unknown
     */
    protected function getParameters($type, $file = null)
    {
        $headers = [];

        $isRemote = !empty($file) && preg_match('/^http/', $file);

        if($isRemote)
        {
            $headers[] = "fileUrl:$file";
        }

        // type specific header, if any
        $extra = null;

        switch($type)
        {
            case 'html':
                $resource = 'tika';
                $extra = 'Accept: text/html';
                break;

            case 'lang':
                $resource = 'language/stream';
                break;

            case 'mime':
                $name = basename($file);
                $resource = 'detect/stream';
                $extra = "Content-Disposition: attachment, filename=$name";
                break;

            // resources named after the type that answer with JSON
            case 'meta':
            case 'rmeta/html':
            case 'rmeta/ignore':
            case 'rmeta/text':
                $resource = $type;
                $extra = 'Accept: application/json';
                break;

            case 'text':
                $resource = 'tika';
                $extra = 'Accept: text/plain';
                break;

            case 'text-main':
                $resource = 'tika/main';
                $extra = 'Accept: text/plain';
                break;

            // resources named after the type that need no extra header
            case 'detectors':
            case 'parsers':
            case 'mime-types':
            case 'version':
                $resource = $type;
                break;

            default:
                throw new Exception("Unknown type $type");
        }

        if($extra !== null)
        {
            $headers[] = $extra;
        }

        return [$resource, $headers];
    }
528
529
    /**
     * Build the cURL options for a request
     *
     * Starts from the options held by the client and then, depending on the
     * arguments: adds a write function when a callback is set, attaches a
     * readable local file as upload input, or disables PUT for the
     * informational request types.
     *
     * @param   string  $type
     * @param   string  $file
     * @return  array
     * @throws  \Exception  if a file is required but can't be read
     */
    protected function getCurlOptions($type, $file = null)
    {
        // base options
        $options = $this->options;

        // callback: collect the response chunk by chunk and forward each chunk
        if(!is_null($this->callback))
        {
            $callback = $this->callback;

            $options[CURLOPT_WRITEFUNCTION] = function($handler, $data) use($callback)
            {
                $this->response .= $data;

                $callback($data);

                // safe because cURL must receive the number of *bytes* written
                return strlen($data);
            };
        }

        // remote files need no additional options
        $isRemote = $file && preg_match('/^http/', $file);

        if($isRemote)
        {
            return $options;
        }

        // local files are sent as the body of the request
        if($file && file_exists($file) && is_readable($file))
        {
            $options[CURLOPT_INFILE] = fopen($file, 'r');
            $options[CURLOPT_INFILESIZE] = filesize($file);

            return $options;
        }

        // requests that send no file at all
        $informational = ['detectors', 'mime-types', 'parsers', 'version'];

        if(in_array($type, $informational))
        {
            $options[CURLOPT_PUT] = false;

            return $options;
        }

        // file not accessible
        throw new Exception("File $file can't be opened");
    }
582
}
583