Passed
Push — master ( 5914ad...7768f9 )
by David
01:17
created

WebClient::isCacheable()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 1
nc 1
nop 1
dl 0
loc 3
rs 10
c 0
b 0
f 0
1
<?php
2
3
namespace Vaites\ApacheTika\Clients;
4
5
use Exception;
6
7
use Vaites\ApacheTika\Client;
8
use Vaites\ApacheTika\Metadata\Metadata;
9
10
/**
11
 * Apache Tika web client
12
 *
13
 * @author  David Martínez <[email protected]>
14
 * @link    http://wiki.apache.org/tika/TikaJAXRS
15
 * @link    https://tika.apache.org/1.12/formats.html
16
 */
17
class WebClient extends Client
18
{
19
    const MODE = 'web';
20
21
    /**
22
     * Responses already received from the server, kept so the same file is not
     * requested twice; request() indexes them by sha1() of the file and by request type
23
     *
24
     * @var array
25
     */
26
    protected $cache = [];
27
28
    /**
29
     * Apache Tika server host; null until setHost() or setUrl() assigns a value
30
     *
31
     * @var string|null
32
     */
33
    protected $host = null;
34
35
    /**
36
     * Apache Tika server port; while it is null getUrl() falls back to 9998
37
     *
38
     * @var int|null
39
     */
40
    protected $port = null;
41
42
    /**
43
     * Number of retries on server error (request() retries responses with status 500)
44
     *
45
     * @var int
46
     */
47
    protected $retries = 3;
48
49
    /**
50
     * Default cURL options, indexed by cURL option constant
51
     *
     * setOption() refuses to change CURLINFO_HEADER_OUT, CURLOPT_PUT and
     * CURLOPT_RETURNTRANSFER; the remaining entries can be overridden
     *
52
     * @var array
53
     */
54
    protected $options =
55
    [
56
        CURLINFO_HEADER_OUT    => true,
57
        CURLOPT_HTTPHEADER     => [],
58
        CURLOPT_PUT            => true,
59
        CURLOPT_RETURNTRANSFER => true,
60
        CURLOPT_TIMEOUT        => 5
61
    ];
62
63
    /**
     * Set up the client and, when checks are enabled, verify the server answers
     *
     * $host may be a plain host name or a complete URL. A URL is split into host
     * and port by setUrl(); an explicit numeric $port is applied afterwards and
     * therefore takes precedence over a port contained in the URL.
     *
     * @param   string  $host       host name or URL of the server
     * @param   int     $port       server port
     * @param   array   $options    cURL options handed to setOptions()
     * @throws  \Exception
     */
    public function __construct($host = null, $port = null, $options = [])
    {
        parent::__construct();

        $isUrl = is_string($host) && filter_var($host, FILTER_VALIDATE_URL);

        if($isUrl)
        {
            $this->setUrl($host);
        }
        elseif($host)
        {
            $this->setHost($host);
        }

        if(is_numeric($port))
        {
            $this->setPort($port);
        }

        if($options)
        {
            $this->setOptions($options);
        }

        $this->setDownloadRemote(true);

        // the static $check flag decides whether the server is contacted right away
        if(self::$check === true)
        {
            $this->check();
        }
    }
101
102
    /**
     * Build the base URL of the server from the configured host and port
     *
     * The scheme is always "http" and port 9998 is used while no port is set.
     *
     * @return string
     */
    public function getUrl()
    {
        $port = $this->port ?: 9998;

        return sprintf('http://%s:%d', $this->host, $port);
    }
111
112
    /**
     * Set the host and port using an URL
     *
     * Only the host and, when present, the port are taken from the URL; any
     * other component is ignored.
     *
     * @param   string  $url
     * @return  $this
     * @throws  \Exception  when no host can be extracted from the URL
     */
    public function setUrl($url)
    {
        $parts = parse_url($url);

        // parse_url() returns false for seriously malformed URLs and leaves out
        // the "host" key when the string has none (a bare path, for example)
        if(!is_array($parts) || !isset($parts['host']))
        {
            throw new Exception("Invalid URL $url");
        }

        $this->setHost($parts['host']);

        if(isset($parts['port']))
        {
            $this->setPort($parts['port']);
        }

        return $this;
    }
131
132
    /**
     * Return the configured server host
     *
     * @return  null|string     null while no host has been set
     */
    public function getHost()
    {
        return $this->host;
    }
141
142
    /**
     * Store the server host; the value is kept as given, without validation
     *
     * @param   string  $host
     * @return  $this
     */
    public function setHost($host)
    {
        $this->host = $host;

        return $this;
    }
154
155
    /**
     * Return the configured server port
     *
     * @return  null|int    null while no port has been set
     */
    public function getPort()
    {
        return $this->port;
    }
164
165
    /**
     * Store the server port; the value is kept as given, without casting
     *
     * @param   int     $port
     * @return  $this
     */
    public function setPort($port)
    {
        $this->port = $port;

        return $this;
    }
177
178
    /**
     * Return how many times a request is retried on server error
     *
     * @return  int
     */
    public function getRetries()
    {
        return $this->retries;
    }
187
188
    /**
     * Store how many times a request is retried on server error
     *
     * @param   int     $retries
     * @return  $this
     */
    public function setRetries($retries)
    {
        $this->retries = $retries;

        return $this;
    }
200
201
    /**
     * Return every cURL option currently configured, indexed by option constant
     *
     * @return  null|array
     */
    public function getOptions()
    {
        return $this->options;
    }
210
211
    /**
     * Return the value of a single cURL option
     *
     * @param   string  $key    cURL option constant used as index
     * @return  mixed           null when the option is not set or its value is null
     */
    public function getOption($key)
    {
        if(isset($this->options[$key]))
        {
            return $this->options[$key];
        }

        return null;
    }
221
222
    /**
     * Store a cURL option that will later be applied with curl_setopt()
     *
     * CURLINFO_HEADER_OUT, CURLOPT_PUT and CURLOPT_RETURNTRANSFER are managed by
     * this class and are rejected.
     *
     * @link    http://php.net/manual/en/curl.constants.php
     * @link    http://php.net/manual/en/function.curl-setopt.php
     * @param   string  $key
     * @param   mixed   $value
     * @return  $this
     * @throws  \Exception  when $key is one of the protected options
     */
    public function setOption($key, $value)
    {
        $protected = [CURLINFO_HEADER_OUT, CURLOPT_PUT, CURLOPT_RETURNTRANSFER];

        // loose comparison: a numeric string naming a protected option is rejected too
        if(in_array($key, $protected))
        {
            throw new Exception("Value for cURL option $key cannot be modified", 3);
        }

        $this->options[$key] = $value;

        return $this;
    }
243
244
    /**
     * Store several cURL options at once
     *
     * Every entry goes through setOption(), so the first protected option found
     * aborts the loop with an exception; entries before it stay applied.
     *
     * @param   array   $options    values indexed by cURL option constant
     * @return  $this
     * @throws  \Exception
     */
    public function setOptions($options)
    {
        foreach($options as $option => $setting)
        {
            $this->setOption($option, $setting);
        }

        return $this;
    }
260
261
    /**
     * Return the cURL timeout, that is the value stored for CURLOPT_TIMEOUT
     *
     * @return  int
     */
    public function getTimeout()
    {
        $timeout = $this->getOption(CURLOPT_TIMEOUT);

        return $timeout;
    }
270
271
    /**
     * Store the cURL timeout as CURLOPT_TIMEOUT
     *
     * @param   int     $value  cast to integer before being stored
     * @return  $this
     * @throws  \Exception
     */
    public function setTimeout($value)
    {
        $timeout = (int) $value;

        $this->setOption(CURLOPT_TIMEOUT, $timeout);

        return $this;
    }
284
285
    /**
     * Verify once that the server is running by requesting its version
     *
     * @throws \Exception
     */
    public function check()
    {
        if(self::isChecked() !== false)
        {
            return;
        }

        // flag first: request() calls check() again and must find it already set
        self::setChecked(true);

        // throws an exception if server is unreachable or can't connect
        $this->request('version');
    }
300
301
    /**
     * Configure, make a request and return its results
     *
     * A response with status 500 is sent again up to $this->retries times before
     * it is reported as an error. Responses of cacheable types are stored and
     * returned directly by later calls for the same file and type.
     *
     * @param   string  $type
     * @param   string  $file
     * @return  mixed   response body as string, the value returned by Metadata::make()
     *                  for "meta" requests, or null when the server answers 204
     * @throws  \Exception
     */
    public function request($type, $file = null)
    {
        // check if not checked
        $this->check();

        // one key for reading and writing the cache: checkRequest() below may
        // hand back a different value for $file
        $key = sha1($file);

        // check if is cached
        if(isset($this->cache[$key][$type]))
        {
            return $this->cache[$key][$type];
        }

        // parameters for cURL request
        list($resource, $headers) = $this->getParameters($type, $file);

        // check the request
        $file = parent::checkRequest($type, $file);

        // the counter lives in this call only, so it always starts from the
        // configured value and can never drop below zero and retry forever
        $retries = $this->retries;

        do
        {
            // cURL options are rebuilt for every attempt because the input
            // stream of the previous attempt has been closed
            $options = $this->getCurlOptions($type, $file);

            // sets headers
            foreach($headers as $header)
            {
                $options[CURLOPT_HTTPHEADER][] = $header;
            }

            $options[CURLOPT_URL] = $this->getUrl() . "/$resource";

            // get the response and the HTTP status code, postponing any failure
            // until the input stream has been released
            $failure = null;

            try
            {
                list($response, $status) = $this->exec($options);
            }
            catch(Exception $exception)
            {
                $failure = $exception;
            }

            // reduce memory usage closing the input stream
            if(isset($options[CURLOPT_INFILE]) && is_resource($options[CURLOPT_INFILE]))
            {
                fclose($options[CURLOPT_INFILE]);
            }

            if($failure !== null)
            {
                throw $failure;
            }
        }
        // retry on request failed with error 500
        while($status == 500 && $retries-- > 0);

        // request completed successfully
        if($status == 200)
        {
            if($type == 'meta')
            {
                $response = Metadata::make($response, $file);
            }

            // cache certain responses
            if($this->isCacheable($type))
            {
                $this->cache[$key][$type] = $response;
            }
        }
        // request completed successfully but result is empty
        elseif($status == 204)
        {
            $response = null;
        }
        // other status code (including a 500 that survived every retry) is an error
        else
        {
            $this->error($status, $resource);
        }

        return $response;
    }
385
386
    /**
     * Make a request to Apache Tika Server
     *
     * @param   array   $options    cURL options indexed by option constant
     * @return  array               trimmed response body followed by the HTTP status code
     * @throws  \Exception          when cURL can't be initialized or the transfer fails
     */
    protected function exec(array $options = [])
    {
        // cURL init and options
        $curl = curl_init();

        // curl_init() returns false on failure and the calls below need a valid handle
        if($curl === false)
        {
            throw new Exception('Unable to initialize cURL');
        }

        // we avoid curl_setopt_array($curl, $options) because of strange Windows behaviour (issue #8)
        foreach($options as $option => $value)
        {
            curl_setopt($curl, $option, $value);
        }

        // make the request
        if(is_null($this->callback))
        {
            $result = curl_exec($curl);

            // curl_exec() returns a boolean when no body is available (false on
            // failure), the response property must remain a string
            $this->response = is_string($result) ? $result : '';
        }
        else
        {
            // the write function installed by getCurlOptions() appends every chunk to the response
            $this->response = '';
            curl_exec($curl);
        }

        // exception if cURL fails
        if(curl_errno($curl))
        {
            throw new Exception(curl_error($curl), curl_errno($curl));
        }

        // return the response and the status code
        return [trim($this->response), curl_getinfo($curl, CURLINFO_HTTP_CODE)];
    }
424
425
    /**
     * Turn an error status code into an exception; this method always throws
     *
     * @codeCoverageIgnore
     *
     * @param   int       $status       HTTP status code received
     * @param   string    $resource     requested resource, used in the fallback message
     * @throws  \Exception
     */
    protected function error($status, $resource)
    {
        // status codes with a dedicated message, also used as exception code
        $messages =
        [
            405 => 'Method not allowed',
            415 => 'Unsupported media type',
            422 => 'Unprocessable document',
            500 => 'Error while processing document'
        ];

        foreach($messages as $code => $message)
        {
            if($status == $code)
            {
                throw new Exception($message, $code);
            }
        }

        // unexpected
        throw new Exception("Unexpected response for /$resource ($status)", 501);
    }
463
464
    /**
     * Get the parameters to make the request
     *
     * Maps the request type to the server resource and the headers it needs.
     * When $file starts with "http" a "fileUrl" header carrying it is added.
     *
     * @link    https://wiki.apache.org/tika/TikaJAXRS#Specifying_a_URL_Instead_of_Putting_Bytes
     * @param   string  $type
     * @param   string  $file
     * @return  array   resource name followed by the list of headers
     * @throws  \Exception  when the type is unknown
     */
    protected function getParameters($type, $file = null)
    {
        $headers = [];

        if(!empty($file) && preg_match('/^http/', $file))
        {
            $headers[] = "fileUrl:$file";
        }

        switch($type)
        {
            case 'html':
                $resource = 'tika';
                $headers[] = 'Accept: text/html';
                break;

            case 'lang':
                $resource = 'language/stream';
                break;

            case 'mime':
                $name = basename($file);
                $resource = 'detect/stream';
                // header parameters follow the disposition type after a semicolon (RFC 6266)
                $headers[] = "Content-Disposition: attachment; filename=$name";
                break;

            case 'meta':
                $resource = 'meta';
                $headers[] = 'Accept: application/json';
                break;

            case 'text':
                $resource = 'tika';
                $headers[] = 'Accept: text/plain';
                break;

            case 'text-main':
                $resource = 'tika/main';
                $headers[] = 'Accept: text/plain';
                break;

            // these resources are named exactly like the request type
            case 'detectors':
            case 'parsers':
            case 'mime-types':
            case 'version':
                $resource = $type;
                break;

            default:
                throw new Exception("Unknown type $type");
        }

        return [$resource, $headers];
    }
527
528
    /**
     * Get the cURL options
     *
     * Starts from the configured options and adds the ones required by the
     * request: a write function when a callback is set and an input stream for
     * readable local files. The stream stored in CURLOPT_INFILE is left open.
     *
     * @param   string  $type
     * @param   string  $file
     * @return  array
     * @throws  \Exception  when a file is required but can't be opened
     */
    protected function getCurlOptions($type, $file = null)
    {
        // base options
        $options = $this->options;

        // callback
        if(!is_null($this->callback))
        {
            $callback = $this->callback;

            $options[CURLOPT_WRITEFUNCTION] = function($handler, $data) use($callback)
            {
                $this->response .= $data;

                $callback($data);

                // safe because cURL must receive the number of *bytes* written
                return strlen($data);
            };
        }

        // remote file options
        if($file && preg_match('/^http/', $file))
        {
            // nothing to add here, the URL travels in the header built by getParameters()
        }
        // local file options
        elseif($file && file_exists($file) && is_readable($file))
        {
            $stream = fopen($file, 'r');

            // the file may have become unavailable after the checks above
            if($stream === false)
            {
                throw new Exception("File $file can't be opened");
            }

            $options[CURLOPT_INFILE] = $stream;
            $options[CURLOPT_INFILESIZE] = filesize($file);
        }
        // other options for specific requests
        elseif(in_array($type,  ['detectors', 'mime-types', 'parsers', 'version']))
        {
            $options[CURLOPT_PUT] = false;
        }
        // file not accessible
        else
        {
            throw new Exception("File $file can't be opened");
        }

        return $options;
    }
581
582
    /**
     * Tell whether responses of a request type are kept in the cache
     *
     * Only "lang" and "meta" responses are cached.
     *
     * @param   string  $type
     * @return  bool
     */
    protected function isCacheable($type)
    {
        return $type == 'lang' || $type == 'meta';
    }
592
}
593