Passed
Push — master ( d8574a...8cadc1 )
by David
02:55
created

WebClient::exec()   A

Complexity

Conditions 6
Paths 21

Size

Total Lines 37
Code Lines 14

Duplication

Lines 0
Ratio 0 %

Importance

Changes 9
Bugs 1 Features 0
Metric Value
eloc 14
c 9
b 1
f 0
dl 0
loc 37
rs 9.2222
cc 6
nc 21
nop 1
1
<?php
2
3
namespace Vaites\ApacheTika\Clients;
4
5
use Exception;
6
7
use Vaites\ApacheTika\Client;
8
9
/**
10
 * Apache Tika web client
11
 *
12
 * @author  David Martínez <[email protected]>
13
 * @link    https://cwiki.apache.org/confluence/display/TIKA/TikaServer
14
 */
15
class WebClient extends Client
16
{
17
    protected const MODE = 'web';
18
19
    /**
20
     * Responses already obtained, kept to avoid multiple requests for the same file
21
     *
22
     * @var array
23
     */
24
    protected $cache = [];
25
26
    /**
27
     * Apache Tika server host (null until set by the constructor, setHost() or setUrl())
28
     *
29
     * @var string|null
30
     */
31
    protected $host = null;
32
33
    /**
34
     * Apache Tika server port (null until set; getUrl() then falls back to 9998)
35
     *
36
     * @var int|null
37
     */
38
    protected $port = null;
39
40
    /**
41
     * Number of retries on server error (HTTP status 500)
42
     *
43
     * @var int
44
     */
45
    protected $retries = 3;
46
47
    /**
48
     * Default cURL options, indexed by cURL option constant
49
     *
50
     * @var array
51
     */
52
    protected $options =
53
    [
54
        CURLINFO_HEADER_OUT     => true,
55
        CURLOPT_HTTPHEADER      => [],
56
        CURLOPT_PUT             => true,
57
        CURLOPT_RETURNTRANSFER  => true,
58
        CURLOPT_TIMEOUT         => 5
59
    ];
60
61
    /**
     * Configure the client and, optionally, test if the server is running
     *
     * @param string|null $host    server host name, or a full URL (host and port are then taken from it)
     * @param int|null    $port    server port; applied after $host, so it overrides a port given in a URL
     * @param array       $options cURL options forwarded to setOptions()
     * @param bool        $check   when true, check() is called once the client is configured
     *
     * @throws \Exception
     */
    public function __construct(?string $host = null, ?int $port = null, array $options = [], bool $check = true)
    {
        parent::__construct();

        // a full URL carries host and port, anything else is treated as a plain host name
        if($host !== null && filter_var($host, FILTER_VALIDATE_URL))
        {
            $this->setUrl($host);
        }
        elseif($host)
        {
            $this->setHost($host);
        }

        if($port !== null)
        {
            $this->setPort($port);
        }

        if(!empty($options))
        {
            $this->setOptions($options);
        }

        $this->setDownloadRemote(true);

        if($check === true)
        {
            $this->check();
        }
    }
96
97
    /**
     * Build the base URL of the server
     *
     * The scheme is always http and the port falls back to 9998 when none has been set.
     */
    public function getUrl(): string
    {
        $port = $this->port ?: 9998;

        return sprintf('http://%s:%d', $this->host, $port);
    }
104
105
    /**
     * Set the host and port using an URL
     *
     * Only the host and, when present, the port are taken from the URL.
     *
     * @throws \Exception if the URL can't be parsed or has no host
     */
    public function setUrl(string $url): self
    {
        $parts = parse_url($url);

        // parse_url() returns false on seriously malformed URLs and omits 'host' on relative ones
        if(!is_array($parts) || !isset($parts['host']) || $parts['host'] === '')
        {
            throw new Exception("Invalid URL $url");
        }

        $this->setHost($parts['host']);

        if(isset($parts['port']))
        {
            $this->setPort($parts['port']);
        }

        return $this;
    }
121
122
    /**
     * Get the configured server host, or null when none has been set
     */
    public function getHost(): ?string
    {
        return $this->host;
    }
129
130
    /**
     * Store the server host and return the client for chaining
     */
    public function setHost(string $host): self
    {
        $this->host = $host;

        return $this;
    }
139
140
    /**
     * Get the configured server port, or null when none has been set
     */
    public function getPort(): ?int
    {
        return $this->port;
    }
147
148
    /**
     * Store the server port and return the client for chaining
     */
    public function setPort(int $port): self
    {
        $this->port = $port;

        return $this;
    }
157
158
    /**
     * Get the number of retries performed on server error
     */
    public function getRetries(): int
    {
        return $this->retries;
    }
165
166
    /**
     * Set the number of retries performed on server error
     *
     * A negative retry budget is meaningless and is stored as 0 (no retries).
     */
    public function setRetries(int $retries): self
    {
        $this->retries = max(0, $retries);

        return $this;
    }
175
176
    /**
     * Get every configured cURL option, indexed by cURL option constant
     */
    public function getOptions(): array
    {
        return $this->options;
    }
183
184
    /**
     * Get the value of a single cURL option
     *
     * @return  mixed   the stored value, or null when the option is not set (or is set to null)
     */
    public function getOption(int $key)
    {
        if(isset($this->options[$key]))
        {
            return $this->options[$key];
        }

        return null;
    }
193
194
    /**
     * Set a cURL option to be set with curl_setopt()
     *
     * CURLINFO_HEADER_OUT, CURLOPT_PUT and CURLOPT_RETURNTRANSFER are reserved and can't be changed.
     *
     * @link http://php.net/manual/en/curl.constants.php
     * @link http://php.net/manual/en/function.curl-setopt.php
     *
     * @param int   $key   cURL option constant
     * @param mixed $value value for the option
     *
     * @throws \Exception if the option is one of the reserved ones
     */
    public function setOption(int $key, $value): self
    {
        $reserved = [CURLINFO_HEADER_OUT, CURLOPT_PUT, CURLOPT_RETURNTRANSFER];

        if(in_array($key, $reserved, true))
        {
            throw new Exception("Value for cURL option $key cannot be modified", 3);
        }

        $this->options[$key] = $value;

        return $this;
    }
212
213
    /**
     * Set several cURL options at once
     *
     * Every entry goes through setOption(), so reserved options are rejected here too.
     *
     * @throws \Exception
     */
    public function setOptions(array $options): self
    {
        foreach(array_keys($options) as $option)
        {
            $this->setOption($option, $options[$option]);
        }

        return $this;
    }
227
228
    /**
     * Get the timeout value for cURL
     *
     * @return int the configured timeout, or 0 when the option is unset or not numeric
     */
    public function getTimeout(): int
    {
        $timeout = $this->getOption(CURLOPT_TIMEOUT);

        // getOption() yields null for a missing option, which must not reach the int return type
        return is_numeric($timeout) ? (int) $timeout : 0;
    }
235
236
    /**
     * Set the timeout value for cURL
     *
     * Stored as the CURLOPT_TIMEOUT option.
     *
     * @throws \Exception
     */
    public function setTimeout(int $value): self
    {
        $this->setOption(CURLOPT_TIMEOUT, $value);

        return $this;
    }
247
248
    /**
     * Returns the supported MIME types
     *
     * @return array decoded "mime-types" response, sorted by key
     *
     * @throws \Exception if the request fails or the response is not a JSON object/array
     */
    public function getSupportedMIMETypes(): array
    {
        $mimeTypes = json_decode($this->request('mime-types'), true);

        // json_decode() gives null on invalid JSON and scalars for scalar documents
        if(!is_array($mimeTypes))
        {
            throw new Exception('Unexpected response for /mime-types');
        }

        ksort($mimeTypes);

        return $mimeTypes;
    }
261
262
    /**
     * Returns the available detectors
     *
     * The decoded response is returned indexed by the name of the root detector and its direct
     * children are re-indexed by their own name.
     *
     * @throws \Exception if the request fails or the response is not a named JSON object
     */
    public function getAvailableDetectors(): array
    {
        $root = json_decode($this->request('detectors'), true);

        if(!is_array($root) || !isset($root['name']))
        {
            throw new Exception('Unexpected response for /detectors');
        }

        if(isset($root['children']))
        {
            // built in a separate array so a name equal to a numeric index can't be lost
            $children = [];

            foreach($root['children'] as $child)
            {
                $children[$child['name']] = $child;
            }

            $root['children'] = $children;
        }

        return [$root['name'] => $root];
    }
290
291
    /**
     * Returns the available parsers
     *
     * The decoded response is returned indexed by the name of the root parser and its direct
     * children are re-indexed by their own name.
     *
     * @throws \Exception if the request fails or the response is not a named JSON object
     */
    public function getAvailableParsers(): array
    {
        $root = json_decode($this->request('parsers'), true);

        if(!is_array($root) || !isset($root['name']))
        {
            throw new Exception('Unexpected response for /parsers');
        }

        if(isset($root['children']))
        {
            // built in a separate array so a name equal to a numeric index can't be lost
            $children = [];

            foreach($root['children'] as $child)
            {
                $children[$child['name']] = $child;
            }

            $root['children'] = $children;
        }

        return [$root['name'] => $root];
    }
319
320
    /**
     * Check if server is running
     *
     * Does nothing once the client is flagged as checked. The flag is raised before the
     * request is sent because request() itself calls check().
     *
     * @throws \Exception
     */
    public function check(): void
    {
        if($this->isChecked() !== false)
        {
            return;
        }

        $this->setChecked(true);

        // throws an exception if server is unreachable or can't connect
        $this->request('version');
    }
335
336
    /**
     * Configure, make a request and return its results
     *
     * A cached response is returned when one exists for the given type and file. Otherwise the
     * request is sent and, while the server answers with status 500, repeated up to the configured
     * number of retries.
     *
     * @param string      $type request type, as accepted by getParameters()
     * @param string|null $file path or URL of the document, null for requests without a document
     *
     * @return string response body, an empty string when the server answers 204
     *
     * @throws \Exception
     */
    public function request(string $type, ?string $file = null): string
    {
        // check if not checked
        $this->check();

        // check if is cached
        if($file !== null && $this->isCached($type, $file))
        {
            return $this->getCachedResponse($type, $file) ?? '';
        }

        // the retry budget is local to the call: a counter kept between calls can end below zero
        // after a failure and would then never stop the retries of a later call
        $retries = max(0, $this->retries);

        do
        {
            // parameters for cURL request
            [$resource, $headers] = $this->getParameters($type, $file);

            // check the request
            $file = $this->checkRequest($type, $file);

            // cURL options
            $options = $this->getCurlOptions($type, $file);

            // sets headers
            foreach($headers as $header)
            {
                $options[CURLOPT_HTTPHEADER][] = $header;
            }

            $options[CURLOPT_URL] = $this->getUrl() . "/$resource";

            try
            {
                // get the response and the HTTP status code
                [$response, $status] = $this->exec($options);
            }
            finally
            {
                // reduce memory usage closing the file handle, also when exec() throws
                if(isset($options[CURLOPT_INFILE]) && is_resource($options[CURLOPT_INFILE]))
                {
                    fclose($options[CURLOPT_INFILE]);
                }
            }
        }
        while($status == 500 && $retries-- > 0);

        // request completed successfully
        if($status == 200)
        {
            // cache certain responses (only possible when there is a file to index them by)
            if($file !== null && $this->isCacheable($type))
            {
                $this->cacheResponse($type, $response, $file);
            }
        } // request completed successfully but result is empty
        elseif($status == 204)
        {
            $response = '';
        } // other status code is an error, including a 500 once the retries are exhausted
        else
        {
            $this->error($status, $resource, $file);
        }

        return $response;
    }
409
410
    /**
     * Make a request to Apache Tika Server
     *
     * @param array $options cURL options, indexed by cURL option constant
     *
     * @return array two items: the trimmed response body and the HTTP status code
     *
     * @throws \Exception "Unexpected error", with the underlying exception as previous
     */
    protected function exec(array $options = []): array
    {
        try
        {
            // cURL init and options
            $curl = curl_init();

            if($curl === false)
            {
                throw new Exception('Unable to initialize cURL');
            }

            // we avoid curl_setopt_array($curl, $options) because strange Windows behaviour (issue #8)
            foreach($options as $option => $value)
            {
                curl_setopt($curl, $option, $value);
            }

            // make the request directly
            if(is_null($this->callback))
            {
                $result = curl_exec($curl);

                // curl_exec() returns false on failure and true when the transfer is not returned
                $this->response = is_string($result) ? $result : '';
            } // with a callback, the response is appended on each block inside the callback
            else
            {
                $this->response = '';
                curl_exec($curl);
            }

            // exception if cURL fails
            if(curl_errno($curl))
            {
                throw new Exception(curl_error($curl), curl_errno($curl));
            }

            $status = (int) curl_getinfo($curl, CURLINFO_HTTP_CODE);
        }
        catch(Exception $exception)
        {
            throw new Exception('Unexpected error', 0, $exception);
        }

        // return the response and the status code
        return [trim($this->response), $status];
    }
453
454
    /**
     * Throws an exception for an error status code
     *
     * @param int         $status   HTTP status code, used as exception code (501 for unexpected ones)
     * @param string      $resource requested resource, shown in the message for unexpected codes
     * @param string|null $file     requested file, used to add a hint to the message for remote files
     *
     * @throws \Exception always
     */
    protected function error(int $status, string $resource, ?string $file = null): void
    {
        switch($status)
        {
            //  method not allowed
            case 405:
                $message = 'Method not allowed';
                break;

            //  unsupported media type
            case 415:
                $message = 'Unsupported media type';
                break;

            //  unprocessable entity
            case 422:
                $message = 'Unprocessable document';

                // using remote files require Tika server to be launched with specific options
                if(!$this->downloadRemote && $file !== null && preg_match('/^http/', $file))
                {
                    $message .= ' (is server launched using "-enableUnsecureFeatures -enableFileUrl" arguments?)';
                }

                break;

            // server error
            case 500:
                $message = 'Error while processing document';
                break;

            // unexpected
            default:
                $message = "Unexpected response for /$resource ($status)";
                $status = 501;
        }

        throw new Exception($message, $status);
    }
498
499
    /**
     * Get the parameters to make the request
     *
     * @link https://wiki.apache.org/tika/TikaJAXRS#Specifying_a_URL_Instead_of_Putting_Bytes
     *
     * @param string      $type request type
     * @param string|null $file path or URL of the document; an URL is sent through the fileUrl header
     *
     * @return array three items: the resource to request, the headers to send and a callback
     *               decoding JSON responses (null for non JSON types)
     *
     * @throws \Exception if the type is unknown
     */
    protected function getParameters(string $type, ?string $file = null): array
    {
        $headers = [];
        $callback = null;

        if(!empty($file) && preg_match('/^http/', $file))
        {
            $headers[] = "fileUrl:$file";
        }

        switch($type)
        {
            case 'html':
                $resource = 'tika';
                $headers[] = 'Accept: text/html';
                break;

            case 'lang':
                $resource = 'language/stream';
                break;

            case 'mime':
                // cast keeps the call valid when no file is given
                $name = basename((string) $file);
                $resource = 'detect/stream';
                $headers[] = "Content-Disposition: attachment, filename=$name";
                break;

            case 'detectors':
            case 'parsers':
            case 'meta':
            case 'mime-types':
            case 'rmeta/html':
            case 'rmeta/ignore':
            case 'rmeta/text':
                $resource = $type;
                $headers[] = 'Accept: application/json';
                $callback = static function($response)
                {
                    return json_decode($response, true);
                };
                break;

            case 'text':
                $resource = 'tika';
                $headers[] = 'Accept: text/plain';
                break;

            case 'text-main':
                $resource = 'tika/main';
                $headers[] = 'Accept: text/plain';
                break;

            case 'version':
                $resource = $type;
                break;

            default:
                throw new Exception("Unknown type $type");
        }

        return [$resource, $headers, $callback];
    }
567
568
    /**
     * Get the cURL options
     *
     * Starts from the configured options and adds the write callback (when a callback is set)
     * and the upload options for local files.
     *
     * @param string      $type request type
     * @param string|null $file path or URL of the document, null for requests without a document
     *
     * @return array cURL options, indexed by cURL option constant
     *
     * @throws \Exception if a file is required and can't be opened
     */
    protected function getCurlOptions(string $type, ?string $file = null): array
    {
        // base options
        $options = $this->options;

        // callback
        if(!is_null($this->callback))
        {
            $callback = $this->callback;

            $options[CURLOPT_WRITEFUNCTION] = function($handler, $data) use ($callback)
            {
                if($this->callbackAppend === true)
                {
                    $this->response .= $data;
                }

                $callback($data);

                // safe because cURL must receive the number of *bytes* written
                return strlen($data);
            };
        }

        // remote file options
        if($file && preg_match('/^http/', $file))
        {
            // nothing to add: no request body is configured for URLs
        } // local file options
        elseif($file && file_exists($file) && is_readable($file))
        {
            $handle = fopen($file, 'r');

            // the file can still fail to open after the checks above (removed, locked...)
            if($handle === false)
            {
                throw new Exception("File $file can't be opened");
            }

            $options[CURLOPT_INFILE] = $handle;
            $options[CURLOPT_INFILESIZE] = filesize($file);
        } // other options for specific requests
        elseif(in_array($type, ['detectors', 'mime-types', 'parsers', 'version'], true))
        {
            $options[CURLOPT_PUT] = false;
        } // file not accesible
        else
        {
            throw new Exception("File $file can't be opened");
        }

        return $options;
    }
618
}
619