Passed
Push — master ( 18bd11...cbba9b )
by David
02:13 queued 14s
created

WebClient::getFetcherName()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 1
c 0
b 0
f 0
dl 0
loc 3
rs 10
cc 1
nc 1
nop 0
1
<?php
2
3
namespace Vaites\ApacheTika\Clients;
4
5
use Exception;
6
7
use Vaites\ApacheTika\Client;
8
9
/**
 * Apache Tika web client
 *
 * Sends requests to an Apache Tika server using cURL.
 *
 * @author  David Martínez
 * @link    https://cwiki.apache.org/confluence/display/TIKA/TikaServer
 */
class WebClient extends Client
{
    protected const MODE = 'web';

    /**
     * Cached responses to avoid multiple requests for the same file
     *
     * @var array
     */
    protected $cache = [];

    /**
     * Apache Tika server host
     *
     * @var string|null
     */
    protected $host = null;

    /**
     * Apache Tika server port (getUrl() falls back to 9998 when unset)
     *
     * @var int|null
     */
    protected $port = null;

    /**
     * Apache Tika server connection scheme
     *
     * @var string
     */
    protected $scheme = 'http';

    /**
     * Number of retries on server error
     *
     * @var int
     */
    protected $retries = 3;

    /**
     * Name of the fetcher to be used (for Tika >= 2.0.0 only)
     *
     * @var string|null
     */
    protected $fetcherName = null;

    /**
     * Default cURL options
     *
     * @var array
     */
    protected $options =
    [
        CURLINFO_HEADER_OUT     => true,
        CURLOPT_HTTPHEADER      => [],
        CURLOPT_PUT             => true,
        CURLOPT_RETURNTRANSFER  => true,
        CURLOPT_TIMEOUT         => 5
    ];

    /**
     * Configure class and test if server is running
     *
     * @param string|null $host    host name, or a full URL (scheme, host and port are then taken from it)
     * @param int|null    $port    server port
     * @param array       $options cURL options, applied through setOptions()
     * @param bool        $check   when true, check() is called before returning
     *
     * @throws \Exception
     */
    public function __construct(?string $host = null, ?int $port = null, array $options = [], bool $check = true)
    {
        parent::__construct();

        if(is_string($host) && filter_var($host, FILTER_VALIDATE_URL))
        {
            $this->setUrl($host);
        }
        elseif($host)
        {
            $this->setHost($host);
        }

        if($port !== null)
        {
            $this->setPort($port);
        }

        if(!empty($options))
        {
            $this->setOptions($options);
        }

        $this->setDownloadRemote(true);

        if($check === true)
        {
            $this->check();
        }
    }

    /**
     * Get the base URL
     *
     * Built as scheme://host:port, using "http" and 9998 when scheme or port are empty.
     * The host is interpolated as-is, so it must be set before calling this method.
     */
    public function getUrl(): string
    {
        $scheme = $this->scheme ?: 'http';
        $port = $this->port ?: 9998;

        return sprintf('%s://%s:%d', $scheme, $this->host, $port);
    }

    /**
     * Set the scheme, host and port using an URL
     *
     * Components missing from the URL leave the current values untouched.
     */
    public function setUrl(string $url): self
    {
        // parse once; parse_url() returns false on seriously malformed URLs
        $parts = parse_url($url);

        if(!is_array($parts))
        {
            return $this;
        }

        if(!empty($parts['scheme']))
        {
            $this->setScheme((string) $parts['scheme']);
        }

        if(!empty($parts['host']))
        {
            $this->setHost((string) $parts['host']);
        }

        if(!empty($parts['port']))
        {
            $this->setPort((int) $parts['port']);
        }

        return $this;
    }

    /**
     * Get the host
     */
    public function getHost(): ?string
    {
        return $this->host;
    }

    /**
     * Set the host
     */
    public function setHost(string $host): self
    {
        $this->host = $host;

        return $this;
    }

    /**
     * Get the port
     */
    public function getPort(): ?int
    {
        return $this->port;
    }

    /**
     * Set the port
     */
    public function setPort(int $port): self
    {
        $this->port = $port;

        return $this;
    }

    /**
     * Get the scheme
     */
    public function getScheme(): string
    {
        return $this->scheme;
    }

    /**
     * Set the scheme
     */
    public function setScheme(string $scheme): self
    {
        $this->scheme = $scheme;

        return $this;
    }

    /**
     * Get the number of retries
     */
    public function getRetries(): int
    {
        return $this->retries;
    }

    /**
     * Set the number of retries
     *
     * Negative values are stored as 0 (no retries).
     */
    public function setRetries(int $retries): self
    {
        $this->retries = max(0, $retries);

        return $this;
    }

    /**
     * Get the name of the fetcher to be used (for Tika >= 2.0.0 only)
     *
     * @return string|null
     */
    public function getFetcherName(): ?string
    {
        return $this->fetcherName;
    }

    /**
     * Set the name of the fetcher to be used (for Tika >= 2.0.0 only)
     *
     * @link https://cwiki.apache.org/confluence/display/TIKA/tika-pipes
     * @throws \Exception if the name is not one of the known fetchers
     */
    public function setFetcherName(string $fetcherName): self
    {
        $known = ['FileSystemFetcher', 'HttpFetcher', 'S3Fetcher', 'GCSFetcher', 'SolrFetcher'];

        if(!in_array($fetcherName, $known, true))
        {
            throw new Exception("Fetcher name $fetcherName is invalid, see https://cwiki.apache.org/confluence/display/TIKA/tika-pipes");
        }

        $this->fetcherName = $fetcherName;

        return $this;
    }

    /**
     * Get all the options
     */
    public function getOptions(): array
    {
        return $this->options;
    }

    /**
     * Get an specified option
     *
     * @return  mixed   the option value, or null when the option is not set
     */
    public function getOption(int $key)
    {
        return $this->options[$key] ?? null;
    }

    /**
     * Set a cURL option to be set with curl_setopt()
     *
     * @link http://php.net/manual/en/curl.constants.php
     * @link http://php.net/manual/en/function.curl-setopt.php
     * @param mixed $value
     * @throws \Exception if the option is one of the options this client manages itself
     */
    public function setOption(int $key, $value): self
    {
        if(in_array($key, [CURLINFO_HEADER_OUT, CURLOPT_PUT, CURLOPT_RETURNTRANSFER], true))
        {
            throw new Exception("Value for cURL option $key cannot be modified", 3);
        }

        $this->options[$key] = $value;

        return $this;
    }

    /**
     * Set the cURL options
     *
     * @param array $options cURL option constant => value
     * @throws \Exception
     */
    public function setOptions(array $options): self
    {
        foreach($options as $key => $value)
        {
            $this->setOption($key, $value);
        }

        return $this;
    }

    /**
     * Get all the HTTP headers
     */
    public function getHeaders(): array
    {
        return $this->options[CURLOPT_HTTPHEADER] ?? [];
    }

    /**
     * Get an specified HTTP header
     *
     * The name is matched case-insensitively against the beginning of each stored header.
     *
     * @return string|null the value of the first matching header, or null when there is none
     */
    public function getHeader(string $name): ?string
    {
        // quote the name and anchor it so "Name" does not match "X-Name" or regex metacharacters
        $pattern = '/^' . preg_quote($name, '/') . ':\s*(.+)$/i';

        foreach($this->getHeaders() as $header)
        {
            if(preg_match($pattern, (string) $header, $match))
            {
                return $match[1];
            }
        }

        return null;
    }

    /**
     * Set a cURL header to be set with curl_setopt()
     *
     * An existing header with the same name (case-insensitive) is replaced, so
     * getHeader() always returns the most recent value.
     *
     * @param mixed $value
     * @throws \Exception
     */
    public function setHeader(string $name, $value): self
    {
        $prefix = strtolower($name) . ':';

        $headers = array_filter((array) $this->getHeaders(), static function($header) use ($prefix)
        {
            return strpos(strtolower((string) $header), $prefix) !== 0;
        });

        $headers[] = "$name: $value";

        $this->options[CURLOPT_HTTPHEADER] = array_values($headers);

        return $this;
    }

    /**
     * Set the HTTP headers
     *
     * @param array $headers header name => value
     * @throws \Exception
     */
    public function setHeaders(array $headers): self
    {
        foreach($headers as $name => $value)
        {
            $this->setHeader($name, $value);
        }

        return $this;
    }

    /**
     * Get the accepted OCR languages
     *
     * @return array list of languages, empty when the X-Tika-OCRLanguage header is not set
     */
    public function getOCRLanguages(): array
    {
        $header = $this->getHeader('X-Tika-OCRLanguage');

        // explode() on an empty string would yield [''] instead of an empty list
        if($header === null || $header === '')
        {
            return [];
        }

        return explode('+', $header);
    }

    /**
     * Set the accepted OCR language
     *
     * @throws \Exception
     */
    public function setOCRLanguage(string $language): self
    {
        $this->setHeader('X-Tika-OCRLanguage', $language);

        return $this;
    }

    /**
     * Set the accepted OCR languages
     *
     * The languages are joined with "+" into a single X-Tika-OCRLanguage header.
     *
     * @throws \Exception
     */
    public function setOCRLanguages(array $languages): self
    {
        $this->setHeader('X-Tika-OCRLanguage', implode('+', $languages));

        return $this;
    }

    /**
     * Get the timeout value for cURL
     *
     * @return int the CURLOPT_TIMEOUT option, or 0 when the option is not set
     */
    public function getTimeout(): int
    {
        return (int) ($this->getOption(CURLOPT_TIMEOUT) ?? 0);
    }

    /**
     * Set the timeout value for cURL
     *
     * @throws \Exception
     */
    public function setTimeout(int $value): self
    {
        $this->setOption(CURLOPT_TIMEOUT, $value);

        return $this;
    }

    /**
     * Returns the supported MIME types
     *
     * @return array decoded response of the "mime-types" resource, sorted by key
     * @throws \Exception if the request fails or the response is not a JSON object/array
     */
    public function getSupportedMIMETypes(): array
    {
        $mimeTypes = json_decode($this->request('mime-types'), true);

        if(!is_array($mimeTypes))
        {
            throw new Exception('Unexpected response for /mime-types');
        }

        ksort($mimeTypes);

        return $mimeTypes;
    }

    /**
     * Returns the available detectors
     *
     * @return array the root detector indexed by its name, with its children indexed by name
     * @throws \Exception
     */
    public function getAvailableDetectors(): array
    {
        return $this->indexComponentTree('detectors');
    }

    /**
     * Returns the available parsers
     *
     * @return array the root parser indexed by its name, with its children indexed by name
     * @throws \Exception
     */
    public function getAvailableParsers(): array
    {
        return $this->indexComponentTree('parsers');
    }

    /**
     * Request a component tree ("detectors" or "parsers") and index it by name
     *
     * The decoded response is treated as a single root entry with a "name" and an
     * optional list of "children"; both levels are re-keyed by their "name" value.
     *
     * @throws \Exception if the request fails or the response has no root name
     */
    private function indexComponentTree(string $type): array
    {
        $root = json_decode($this->request($type), true);

        if(!is_array($root) || !isset($root['name']))
        {
            throw new Exception("Unexpected response for /$type");
        }

        if(isset($root['children']))
        {
            $children = [];

            foreach($root['children'] as $child)
            {
                $children[$child['name']] = $child;
            }

            $root['children'] = $children;
        }

        return [$root['name'] => $root];
    }

    /**
     * Check if server is running
     *
     * @throws \Exception
     */
    public function check(): void
    {
        if($this->isChecked() === false)
        {
            // flag first: request() calls check() again and would recurse otherwise
            $this->setChecked(true);

            try
            {
                // throws an exception if server is unreachable or can't connect
                $this->request('version');
            }
            catch(\Throwable $exception)
            {
                // a failed check must not leave the client marked as checked
                $this->setChecked(false);

                throw $exception;
            }
        }
    }

    /**
     * Configure, make a request and return its results
     *
     * Requests for a file that end with status 500 are repeated up to getRetries() times.
     *
     * @param string      $type request type, see getParameters() for the accepted values
     * @param string|null $file local path or URL of the file to send, if any
     *
     * @throws \Exception
     */
    public function request(string $type, ?string $file = null): string
    {
        // check if not checked
        $this->check();

        // check if is cached
        if($file !== null && $this->isCached($type, $file))
        {
            return $this->getCachedResponse($type, $file) ?? '';
        }

        // parameters for cURL request
        [$resource, $headers] = $this->getParameters($type, $file);

        // check the request
        $file = $this->checkRequest($type, $file);

        // per-call counter: retries only apply to requests that send a file
        $remaining = $file !== null ? $this->retries : 0;

        do
        {
            // cURL options (rebuilt on each attempt because the input file handle is closed below)
            $options = $this->getCurlOptions($type, $file);

            // sets headers
            foreach($headers as $header)
            {
                $options[CURLOPT_HTTPHEADER][] = $header;
            }

            $options[CURLOPT_URL] = $this->getUrl() . "/$resource";

            try
            {
                // get the response and the HTTP status code
                [$response, $status] = $this->exec($options);
            }
            finally
            {
                // release the input file even when the request throws
                if(isset($options[CURLOPT_INFILE]) && is_resource($options[CURLOPT_INFILE]))
                {
                    fclose($options[CURLOPT_INFILE]);
                }
            }

            $status = (int) $status;
        }
        while($status === 500 && $remaining-- > 0);

        // request completed successfully
        if($status === 200)
        {
            // cache certain responses
            if($file !== null && $this->isCacheable($type))
            {
                $this->cacheResponse($type, $response, $file);
            }
        }
        // request completed successfully but result is empty
        elseif($status === 204)
        {
            $response = '';
        }
        // other status code is an error (including 500 once retries are exhausted)
        else
        {
            $this->error($status, $resource, $file);
        }

        return $this->filterResponse($response);
    }

    /**
     * Make a request to Apache Tika Server
     *
     * @param array $options cURL options, applied one by one with curl_setopt()
     *
     * @return array the trimmed response body and the HTTP status code, in that order
     * @throws \Exception if cURL can't be initialized or reports an error
     */
    protected function exec(array $options = []): array
    {
        // cURL init and options
        $curl = curl_init();

        if($curl === false)
        {
            throw new Exception('Unable to initialize cURL');
        }

        // we avoid curl_setopt_array($curl, $options) because strange Windows behaviour (issue #8)
        foreach($options as $option => $value)
        {
            curl_setopt($curl, $option, $value);
        }

        if(is_null($this->callback))
        {
            // make the request directly
            $this->response = (string) curl_exec($curl);
        }
        else
        {
            // with a callback, the response is appended on each block inside the callback
            $this->response = '';
            curl_exec($curl);
        }

        // exception if cURL fails
        $errno = curl_errno($curl);

        if($errno)
        {
            throw new Exception(curl_error($curl), $errno);
        }

        // return the response and the status code
        return [trim($this->response), curl_getinfo($curl, CURLINFO_HTTP_CODE)];
    }

    /**
     * Throws an exception for an error status code
     *
     * Known statuses (405, 415, 422, 500) keep their code; any other status is
     * reported with code 501.
     *
     * @throws \Exception always
     */
    protected function error(int $status, string $resource, ?string $file = null): void
    {
        $messages =
        [
            405 => 'Method not allowed',
            415 => 'Unsupported media type',
            422 => 'Unprocessable document',
            500 => 'Error while processing document'
        ];

        if(isset($messages[$status]))
        {
            $message = $messages[$status];

            // using remote files require Tika server to be launched with specific options
            if($status === 422 && $this->downloadRemote === false && $file !== null && preg_match('/^http/', $file))
            {
                $message .= ' (is server launched using "-enableUnsecureFeatures -enableFileUrl" arguments?)';
            }
        }
        else
        {
            // unexpected
            $message = "Unexpected response for /$resource ($status)";
            $status = 501;
        }

        throw new Exception($message, $status);
    }

    /**
     * Get the parameters to make the request
     *
     * @link https://wiki.apache.org/tika/TikaJAXRS#Specifying_a_URL_Instead_of_Putting_Bytes
     *
     * @return array the resource path, the extra HTTP headers and an optional response callback, in that order
     * @throws \Exception if the type is unknown or not supported by the server
     */
    protected function getParameters(string $type, ?string $file = null): array
    {
        $headers = [];
        $callback = null;

        // files starting with "http" are referenced by header instead of being uploaded
        if(!empty($file) && preg_match('/^http/', $file))
        {
            if($this->fetcherName)
            {
                $headers[] = "fetcherName:$this->fetcherName";
                $headers[] = "fetchKey:$file";
            }
            else
            {
                $headers[] = "fileUrl:$file";
            }
        }

        switch($type)
        {
            case 'html':
                $resource = 'tika';
                $headers[] = 'Accept: text/html';
                break;

            case 'lang':
                $resource = 'language/stream';
                break;

            case 'mime':
                $resource = 'detect/stream';

                if($file !== null)
                {
                    $name = basename($file);
                    $headers[] = "Content-Disposition: attachment, filename=$name";
                }
                break;

            case 'detectors':
            case 'parsers':
            case 'meta':
            case 'mime-types':
            case 'rmeta/html':
            case 'rmeta/ignore':
            case 'rmeta/text':
                $resource = $type;
                $headers[] = 'Accept: application/json';
                $callback = static function($response)
                {
                    return json_decode($response, true);
                };
                break;

            case 'text':
                $resource = 'tika';
                $headers[] = 'Accept: text/plain';
                break;

            case 'text-main':
                $resource = 'tika/main';
                $headers[] = 'Accept: text/plain';
                break;

            case 'version':
                $resource = $type;
                break;

            case 'xhtml':
                throw new Exception("Tika Server does not support XHTML output");

            default:
                throw new Exception("Unknown type $type");
        }

        return [$resource, $headers, $callback];
    }

    /**
     * Get the cURL options
     *
     * Starts from the configured options and adds the write callback and the
     * input file, when they apply.
     *
     * @throws \Exception if a file is required but can't be opened
     */
    protected function getCurlOptions(string $type, ?string $file = null): array
    {
        // base options
        $options = $this->options;

        // callback
        if(!is_null($this->callback))
        {
            $callback = $this->callback;

            $options[CURLOPT_WRITEFUNCTION] = function($handler, $data) use ($callback)
            {
                if($this->callbackAppend === true)
                {
                    $this->response .= $data;
                }

                $callback($data);

                // safe because cURL must receive the number of *bytes* written
                return strlen($data);
            };
        }

        // remote file: nothing to attach, the file is referenced through request headers
        if($file && preg_match('/^http/', $file))
        {
            return $options;
        }

        // local file options
        if($file && file_exists($file) && is_readable($file))
        {
            $handle = fopen($file, 'r');

            // fopen() can still fail after the checks above (e.g. the file was removed meanwhile)
            if($handle === false)
            {
                throw new Exception("File $file can't be opened");
            }

            $options[CURLOPT_INFILE] = $handle;
            $options[CURLOPT_INFILESIZE] = filesize($file);
        }
        // other options for specific requests
        elseif(in_array($type, ['detectors', 'mime-types', 'parsers', 'version'], true))
        {
            $options[CURLOPT_PUT] = false;
        }
        // file not accessible
        else
        {
            throw new Exception("File $file can't be opened");
        }

        return $options;
    }
}
786