Passed
Push — master ( dd145e...782d7e )
by David
03:49 queued 12s
created

WebClient::setScheme()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 5
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 2
c 0
b 0
f 0
dl 0
loc 5
rs 10
cc 1
nc 1
nop 1
1
<?php
2
3
namespace Vaites\ApacheTika\Clients;
4
5
use Exception;
6
7
use Vaites\ApacheTika\Client;
8
9
/**
10
 * Apache Tika web client
11
 *
12
 * @author  David Martínez <[email protected]>
13
 * @link    https://cwiki.apache.org/confluence/display/TIKA/TikaServer
14
 */
15
class WebClient extends Client
16
{
17
    protected const MODE = 'web';
18
19
    /**
20
     * Cached responses to avoid multiple requests for the same file
21
     *
22
     * @var array
23
     */
24
    protected $cache = [];
25
26
    /**
27
     * Apache Tika server host
28
     *
29
     * @var string|null
30
     */
31
    protected $host = null;
32
33
    /**
34
     * Apache Tika server port
35
     *
36
     * @var int|null
37
     */
38
    protected $port = null;
39
40
    /**
41
     * Apache Tika server connection scheme
42
     *
43
     * @var string
44
     */
45
    protected $scheme = 'http';
46
47
    /**
48
     * Number of retries on server error
49
     *
50
     * @var int
51
     */
52
    protected $retries = 3;
53
54
    /**
55
     * Default cURL options
56
     *
57
     * @var array
58
     */
59
    protected $options =
60
    [
61
        CURLINFO_HEADER_OUT     => true,
62
        CURLOPT_HTTPHEADER      => [],
63
        CURLOPT_PUT             => true,
64
        CURLOPT_RETURNTRANSFER  => true,
65
        CURLOPT_TIMEOUT         => 5
66
    ];
67
68
    /**
     * Configure the client and, optionally, test that the server is running
     *
     * @param string|null $host    Host name, or a full URL (scheme, host and port are then read from it)
     * @param int|null    $port    Server port; applied after $host, so it overrides a port read from a URL
     * @param array       $options cURL options, applied through setOptions()
     * @param bool        $check   Whether to call check() once the client is configured
     *
     * @throws \Exception
     */
    public function __construct(?string $host = null, ?int $port = null, array $options = [], bool $check = true)
    {
        parent::__construct();

        if($host !== null && filter_var($host, FILTER_VALIDATE_URL))
        {
            $this->setUrl($host);
        }
        elseif($host)
        {
            $this->setHost($host);
        }

        // the parameter is typed, so a null check is all that is needed
        if($port !== null)
        {
            $this->setPort($port);
        }

        if(!empty($options))
        {
            $this->setOptions($options);
        }

        $this->setDownloadRemote(true);

        if($check === true)
        {
            $this->check();
        }
    }
103
104
    /**
     * Build the base URL of the server as scheme://host:port
     *
     * Falls back to "http" when no scheme is set and to 9998 when no port is set.
     */
    public function getUrl(): string
    {
        $scheme = $this->scheme ?: 'http';
        $port = $this->port ?: 9998;

        return sprintf('%s://%s:%d', $scheme, $this->host, $port);
    }
111
112
    /**
     * Set scheme, host and port from an URL
     *
     * Components that are missing (or empty) in the URL leave the current value untouched.
     */
    public function setUrl(string $url): self
    {
        $parts = parse_url($url);

        // nothing can be extracted from a malformed URL
        if(!is_array($parts))
        {
            return $this;
        }

        if(!empty($parts['scheme']))
        {
            $this->setScheme((string) $parts['scheme']);
        }

        if(!empty($parts['host']))
        {
            $this->setHost((string) $parts['host']);
        }

        if(!empty($parts['port']))
        {
            $this->setPort((int) $parts['port']);
        }

        return $this;
    }
138
139
    /**
     * Get the configured host, or null when none has been set
     */
    public function getHost(): ?string
    {
        return $this->host;
    }
146
147
    /**
     * Store the host used to build the base URL
     */
    public function setHost(string $host): self
    {
        $this->host = $host;

        return $this;
    }
156
157
    /**
     * Get the configured port, or null when none has been set
     */
    public function getPort(): ?int
    {
        return $this->port;
    }
164
165
    /**
     * Store the port used to build the base URL
     */
    public function setPort(int $port): self
    {
        $this->port = $port;

        return $this;
    }
174
175
    /**
     * Get the connection scheme
     */
    public function getScheme(): string
    {
        return $this->scheme;
    }
182
183
    /**
     * Store the connection scheme used to build the base URL
     */
    public function setScheme(string $scheme): self
    {
        $this->scheme = $scheme;

        return $this;
    }
192
193
    /**
     * Get how many times a request is retried on server error
     */
    public function getRetries(): int
    {
        return $this->retries;
    }
200
201
    /**
     * Set how many times a request is retried on server error
     */
    public function setRetries(int $retries): self
    {
        $this->retries = $retries;

        return $this;
    }
210
211
    /**
     * Get every cURL option currently configured, keyed by cURL constant
     */
    public function getOptions(): array
    {
        return $this->options;
    }
218
219
    /**
     * Get a single cURL option
     *
     * @return  mixed   the stored value, or null when the option is not set
     */
    public function getOption(int $key)
    {
        return isset($this->options[$key]) ? $this->options[$key] : null;
    }
228
229
    /**
     * Store a cURL option that will later be applied with curl_setopt()
     *
     * @link http://php.net/manual/en/curl.constants.php
     * @link http://php.net/manual/en/function.curl-setopt.php
     * @param mixed $value
     * @throws \Exception   when trying to change an option managed by the client
     */
    public function setOption(int $key, $value): self
    {
        // options with a fixed value that callers are not allowed to override
        $locked = [CURLINFO_HEADER_OUT, CURLOPT_PUT, CURLOPT_RETURNTRANSFER];

        if(in_array($key, $locked, true))
        {
            throw new Exception("Value for cURL option $key cannot be modified", 3);
        }

        $this->options[$key] = $value;

        return $this;
    }
248
249
    /**
     * Store several cURL options at once, each one through setOption()
     *
     * @throws \Exception
     */
    public function setOptions(array $options): self
    {
        foreach(array_keys($options) as $option)
        {
            $this->setOption($option, $options[$option]);
        }

        return $this;
    }
263
264
    /**
     * Get the HTTP headers stored under the CURLOPT_HTTPHEADER option
     */
    public function getHeaders(): array
    {
        $headers = $this->options[CURLOPT_HTTPHEADER];

        return $headers;
    }
271
272
    /**
     * Get the value of a configured HTTP header
     *
     * The name is matched case-insensitively against the stored "Name: value" lines.
     *
     * @return string|null  the value of the first matching header, or null when it is not set
     */
    public function getHeader(string $name): ?string
    {
        // quote the name so characters with a meaning in a regex are matched literally,
        // and anchor it so "Foo" does not match inside "X-Foo"
        $pattern = '/^\s*' . preg_quote($name, '/') . '\s*:\s*(.+)/i';

        foreach($this->options[CURLOPT_HTTPHEADER] ?? [] as $header)
        {
            if(preg_match($pattern, $header, $match))
            {
                return $match[1];
            }
        }

        // null (and not an empty array) is what the declared return type allows
        return null;
    }
290
291
    /**
     * Set an HTTP header to be sent with every request
     *
     * A header that was already set with the same name (case-insensitive) is replaced,
     * so the header is never sent twice and getHeader() returns the latest value.
     *
     * @param mixed $value  any value that can be converted to string
     */
    public function setHeader(string $name, $value): self
    {
        $pattern = '/^\s*' . preg_quote($name, '/') . '\s*:/i';
        $headers = [];

        // keep every header except previous values of the one being set
        foreach($this->options[CURLOPT_HTTPHEADER] ?? [] as $header)
        {
            if(!preg_match($pattern, $header))
            {
                $headers[] = $header;
            }
        }

        $headers[] = "$name: $value";

        $this->options[CURLOPT_HTTPHEADER] = $headers;

        return $this;
    }
303
304
    /**
     * Set several HTTP headers at once from a name => value map
     *
     * @throws \Exception
     */
    public function setHeaders(array $headers): self
    {
        foreach(array_keys($headers) as $header)
        {
            $this->setHeader($header, $headers[$header]);
        }

        return $this;
    }
318
319
    /**
     * Get the accepted OCR languages
     *
     * @return string[] the languages of the X-Tika-OCRLanguage header, or an empty array when it is not set
     */
    public function getOCRLanguages(): array
    {
        $header = $this->getHeader('X-Tika-OCRLanguage');

        // explode() on an empty string would return [''] instead of an empty list
        if($header === null || $header === '')
        {
            return [];
        }

        return explode('+', $header);
    }
326
327
    /**
     * Set a single accepted OCR language through the X-Tika-OCRLanguage header
     *
     * @throws \Exception
     */
    public function setOCRLanguage(string $language): self
    {
        $header = 'X-Tika-OCRLanguage';

        $this->setHeader($header, $language);

        return $this;
    }
338
339
    /**
     * Set several accepted OCR languages, sent joined by "+" in the X-Tika-OCRLanguage header
     *
     * @throws \Exception
     */
    public function setOCRLanguages(array $languages): self
    {
        $joined = implode('+', $languages);

        $this->setHeader('X-Tika-OCRLanguage', $joined);

        return $this;
    }
350
351
    /**
     * Get the timeout value for cURL
     *
     * @return int  the CURLOPT_TIMEOUT option, or 0 (the value cURL treats as "no limit") when it is not set
     */
    public function getTimeout(): int
    {
        $timeout = $this->getOption(CURLOPT_TIMEOUT);

        // getOption() returns null for a missing option, which the return type does not allow
        return $timeout === null ? 0 : (int) $timeout;
    }
358
359
    /**
     * Set the timeout value for cURL (stored as the CURLOPT_TIMEOUT option)
     *
     * @throws \Exception
     */
    public function setTimeout(int $value): self
    {
        // no cast needed, the parameter is already typed as int
        $this->setOption(CURLOPT_TIMEOUT, $value);

        return $this;
    }
370
371
    /**
     * Returns the supported MIME types, sorted by key
     *
     * @throws \Exception   when the request fails or the response is not a JSON object/array
     */
    public function getSupportedMIMETypes(): array
    {
        $mimeTypes = json_decode($this->request('mime-types'), true);

        // an empty or malformed response decodes to a non-array, which can't be sorted or returned
        if(!is_array($mimeTypes))
        {
            throw new Exception('Invalid JSON response for /mime-types');
        }

        ksort($mimeTypes);

        return $mimeTypes;
    }
384
385
    /**
     * Returns the available detectors
     *
     * The response is a single root detector; it is returned indexed by its name, with
     * its children (when present) indexed by their names too.
     *
     * @throws \Exception   when the request fails or the response is not the expected JSON
     */
    public function getAvailableDetectors(): array
    {
        $root = json_decode($this->request('detectors'), true);

        if(!is_array($root) || !isset($root['name']))
        {
            throw new Exception('Invalid JSON response for /detectors');
        }

        if(isset($root['children']) && is_array($root['children']))
        {
            $children = [];

            // replace numeric indexes by detector names
            foreach($root['children'] as $child)
            {
                $children[$child['name']] = $child;
            }

            $root['children'] = $children;
        }

        return [$root['name'] => $root];
    }
413
414
    /**
     * Returns the available parsers
     *
     * The response is a single root parser; it is returned indexed by its name, with
     * its children (when present) indexed by their names too.
     *
     * @throws \Exception   when the request fails or the response is not the expected JSON
     */
    public function getAvailableParsers(): array
    {
        $root = json_decode($this->request('parsers'), true);

        if(!is_array($root) || !isset($root['name']))
        {
            throw new Exception('Invalid JSON response for /parsers');
        }

        if(isset($root['children']) && is_array($root['children']))
        {
            $children = [];

            // replace numeric indexes by parser names
            foreach($root['children'] as $child)
            {
                $children[$child['name']] = $child;
            }

            $root['children'] = $children;
        }

        return [$root['name'] => $root];
    }
442
443
    /**
     * Check if server is running, only the first time it is called
     *
     * @throws \Exception
     */
    public function check(): void
    {
        if($this->isChecked() !== false)
        {
            return;
        }

        // request() calls check() again, so the flag is raised before the request is made
        $this->setChecked(true);

        // throws an exception if server is unreachable or can't connect
        $this->request('version');
    }
458
459
    /**
     * Configure, make a request and return its results
     *
     * A cached response for the file is returned when there is one. A request that sends
     * a file and fails with status 500 is repeated up to the configured number of retries
     * before the error is raised.
     *
     * @param string      $type request type, as accepted by getParameters()
     * @param string|null $file local path or URL of the document, for the requests that need one
     *
     * @throws \Exception
     */
    public function request(string $type, ?string $file = null): string
    {
        // check if not checked
        $this->check();

        // check if is cached
        if($file !== null && $this->isCached($type, $file))
        {
            $cached = $this->getCachedResponse($type, $file);

            // a missing entry falls through to a regular request instead of returning null
            if($cached !== null)
            {
                return $cached;
            }
        }

        // parameters for cURL request
        [$resource, $headers] = $this->getParameters($type, $file);

        // check the request
        $file = $this->checkRequest($type, $file);

        // retries are counted per call (no state shared between calls or instances)
        // and only apply to requests with a file; a negative setting means no retries
        $retries = $file !== null ? max(0, $this->retries) : 0;

        do
        {
            // cURL options, built on every attempt because the input file is closed below
            $options = $this->getCurlOptions($type, $file);

            // sets headers
            foreach($headers as $header)
            {
                $options[CURLOPT_HTTPHEADER][] = $header;
            }

            // cURL init and options
            $options[CURLOPT_URL] = $this->getUrl() . "/$resource";

            try
            {
                // get the response and the HTTP status code
                [$response, $status] = $this->exec($options);
            }
            finally
            {
                // close the input file even when the request throws
                if(isset($options[CURLOPT_INFILE]) && is_resource($options[CURLOPT_INFILE]))
                {
                    fclose($options[CURLOPT_INFILE]);
                }
            }
        } // retry on request failed with error 500
        while($status == 500 && $retries-- > 0);

        // request completed successfully
        if($status == 200)
        {
            // cache certain responses
            if($file !== null && $this->isCacheable($type))
            {
                $this->cacheResponse($type, $response, $file);
            }
        } // request completed successfully but result is empty
        elseif($status == 204)
        {
            $response = '';
        } // other status code is an error
        else
        {
            $this->error($status, $resource, $file);
        }

        return $this->filterResponse($response);
    }
532
533
    /**
     * Run a cURL request against Apache Tika Server
     *
     * @return array    the trimmed response body and the HTTP status code, in that order
     *
     * @throws \Exception   with the cURL error message and number when the transfer fails
     */
    protected function exec(array $options = []): array
    {
        $handle = curl_init();

        // options are set one by one: curl_setopt_array($curl, $options) is avoided
        // because of a strange behaviour on Windows (issue #8)
        foreach($options as $name => $setting)
        {
            curl_setopt($handle, $name, $setting);
        }

        if($this->callback !== null)
        {
            // with a callback, the response is appended on each block inside the callback
            $this->response = '';
            curl_exec($handle);
        }
        else
        {
            // without a callback the request is made directly
            $this->response = (string) curl_exec($handle);
        }

        // exception if cURL fails
        $errno = curl_errno($handle);

        if($errno)
        {
            throw new Exception(curl_error($handle), $errno);
        }

        $status = curl_getinfo($handle, CURLINFO_HTTP_CODE);

        return [trim($this->response), $status];
    }
570
571
    /**
     * Throws an exception for an error status code
     *
     * Known statuses (405, 415, 422 and 500) are thrown with their own message and the
     * status as exception code; any other status is thrown with code 501.
     *
     * @param int         $status   HTTP status code of the response
     * @param string      $resource requested resource, used in the message for unexpected statuses
     * @param string|null $file     file sent with the request, if any
     *
     * @throws \Exception   always
     */
    protected function error(int $status, string $resource, ?string $file = null): void
    {
        $messages =
        [
            405 => 'Method not allowed',
            415 => 'Unsupported media type',
            422 => 'Unprocessable document',
            500 => 'Error while processing document'
        ];

        // unexpected
        if(!isset($messages[$status]))
        {
            throw new Exception("Unexpected response for /$resource ($status)", 501);
        }

        $message = $messages[$status];

        // using remote files require Tika server to be launched with specific options
        if($status === 422 && $this->downloadRemote === false && $file !== null && preg_match('/^http/', $file))
        {
            $message .= ' (is server launched using "-enableUnsecureFeatures -enableFileUrl" arguments?)';
        }

        throw new Exception($message, $status);
    }
615
616
    /**
     * Get the parameters to make the request
     *
     * @link https://wiki.apache.org/tika/TikaJAXRS#Specifying_a_URL_Instead_of_Putting_Bytes
     *
     * @param string      $type request type
     * @param string|null $file local path or URL of the document, if any
     *
     * @return array    the server resource, the extra HTTP headers and an optional callback
     *                  to decode the response, in that order
     *
     * @throws \Exception   for the unsupported "xhtml" type and for unknown types
     */
    protected function getParameters(string $type, ?string $file = null): array
    {
        $headers = [];
        $callback = null;

        // remote files are passed to the server by URL
        if(!empty($file) && preg_match('/^http/', $file))
        {
            $headers[] = "fileUrl:$file";
        }

        switch($type)
        {
            case 'html':
                $resource = 'tika';
                $headers[] = 'Accept: text/html';
                break;

            case 'lang':
                $resource = 'language/stream';
                break;

            case 'mime':
                $resource = 'detect/stream';

                if($file !== null)
                {
                    $name = basename($file);

                    // parameters of this header are separated from the type by a semicolon
                    $headers[] = "Content-Disposition: attachment; filename=$name";
                }
                break;

            case 'detectors':
            case 'parsers':
            case 'meta':
            case 'mime-types':
            case 'rmeta/html':
            case 'rmeta/ignore':
            case 'rmeta/text':
                $resource = $type;
                $headers[] = 'Accept: application/json';
                $callback = function($response)
                {
                    return json_decode($response, true);
                };
                break;

            case 'text':
                $resource = 'tika';
                $headers[] = 'Accept: text/plain';
                break;

            case 'text-main':
                $resource = 'tika/main';
                $headers[] = 'Accept: text/plain';
                break;

            case 'version':
                $resource = $type;
                break;

            case 'xhtml':
                throw new Exception("Tika Server does not support XHTML output");

            default:
                throw new Exception("Unknown type $type");
        }

        return [$resource, $headers, $callback];
    }
691
692
    /**
     * Get the cURL options
     *
     * Starts from the configured options and adds the write callback (when a callback is
     * set) and the input file (for readable local files).
     *
     * @param string      $type request type
     * @param string|null $file local path or URL of the document, if any
     *
     * @throws \Exception   when a file is required but can't be opened
     */
    protected function getCurlOptions(string $type, ?string $file = null): array
    {
        // base options
        $options = $this->options;

        // callback
        if($this->callback !== null)
        {
            $callback = $this->callback;

            $options[CURLOPT_WRITEFUNCTION] = function($handler, $data) use ($callback)
            {
                if($this->callbackAppend === true)
                {
                    $this->response .= $data;
                }

                $callback($data);

                // safe because cURL must receive the number of *bytes* written
                return strlen($data);
            };
        }

        // remote file options
        if($file && preg_match('/^http/', $file))
        {
            //
        }
        // local file options
        elseif($file && file_exists($file) && is_readable($file))
        {
            $handle = fopen($file, 'r');

            // fopen() can still fail on a file that looked readable
            if($handle === false)
            {
                throw new Exception("File $file can't be opened");
            }

            $options[CURLOPT_INFILE] = $handle;
            $options[CURLOPT_INFILESIZE] = filesize($file);
        } // other options for specific requests
        elseif(in_array($type, ['detectors', 'mime-types', 'parsers', 'version'], true))
        {
            $options[CURLOPT_PUT] = false;
        } // file not accesible
        else
        {
            throw new Exception("File $file can't be opened");
        }

        return $options;
    }
743
}
744