WebClient::getAvailableParsers()   A
last analyzed

Complexity

Conditions 4
Paths 3

Size

Total Lines 22
Code Lines 9

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 9
c 0
b 0
f 0
dl 0
loc 22
rs 9.9666
cc 4
nc 3
nop 0
1
<?php
2
3
namespace Vaites\ApacheTika\Clients;
4
5
use Exception;
6
7
use Vaites\ApacheTika\Client;
8
9
/**
 * Apache Tika web client
 *
 * Sends requests to an Apache Tika Server instance over HTTP using cURL. Connection settings
 * (scheme, host, port), cURL options, HTTP headers and the number of retries are configurable
 * through fluent setters.
 *
 * @author  David Martínez <[email protected]>
 * @link    https://cwiki.apache.org/confluence/display/TIKA/TikaServer
 */
class WebClient extends Client
{
    protected const MODE = 'web';

    /**
     * Cached responses to avoid multiple request for the same file
     *
     * @var array
     */
    protected $cache = [];

    /**
     * Apache Tika server host (null until configured)
     *
     * @var string|null
     */
    protected $host = null;

    /**
     * Apache Tika server port (null until configured, getUrl() then falls back to 9998)
     *
     * @var int|null
     */
    protected $port = null;

    /**
     * Apache Tika server connection scheme
     *
     * @var string
     */
    protected $scheme = 'http';

    /**
     * Number of retries on server error (HTTP 500)
     *
     * @var int
     */
    protected $retries = 3;

    /**
     * Name of the fetcher to be used (for Tika >= 2.0.0 only)
     *
     * @var string|null
     */
    protected $fetcherName = null;

    /**
     * Default cURL options
     *
     * CURLINFO_HEADER_OUT, CURLOPT_PUT and CURLOPT_RETURNTRANSFER cannot be changed through setOption().
     *
     * @var array
     */
    protected $options =
    [
        CURLINFO_HEADER_OUT     => true,
        CURLOPT_HTTPHEADER      => [],
        CURLOPT_PUT             => true,
        CURLOPT_RETURNTRANSFER  => true,
        CURLOPT_TIMEOUT         => 5
    ];

    /**
     * Configure class and test if server is running
     *
     * @param string|null $host     host name or a full URL (scheme, host and port are then taken from it)
     * @param int|null    $port     server port, applied after $host so it overrides a port given in a URL
     * @param array       $options  cURL options, applied through setOptions()
     * @param bool        $check    when true, the server is contacted immediately through check()
     *
     * @throws \Exception
     */
    public function __construct(?string $host = null, ?int $port = null, array $options = [], bool $check = true)
    {
        parent::__construct();

        if(is_string($host) && filter_var($host, FILTER_VALIDATE_URL))
        {
            $this->setUrl($host);
        }
        elseif($host)
        {
            $this->setHost($host);
        }

        if($port !== null)
        {
            $this->setPort($port);
        }

        if(!empty($options))
        {
            $this->setOptions($options);
        }

        $this->setDownloadRemote(true);

        if($check === true)
        {
            $this->check();
        }
    }

    /**
     * Get the base URL
     *
     * Scheme defaults to "http" and port to 9998 when they are not set.
     */
    public function getUrl(): string
    {
        return sprintf('%s://%s:%d', $this->scheme ?: 'http', $this->host, $this->port ?: 9998);
    }

    /**
     * Set the scheme, host and port using an URL
     *
     * Components missing from the URL leave the current values untouched.
     */
    public function setUrl(string $url): self
    {
        $scheme = parse_url($url, PHP_URL_SCHEME);
        $host = parse_url($url, PHP_URL_HOST);
        $port = parse_url($url, PHP_URL_PORT);

        if(!empty($scheme))
        {
            $this->setScheme((string) $scheme);
        }

        if(!empty($host))
        {
            $this->setHost((string) $host);
        }

        if(!empty($port))
        {
            $this->setPort((int) $port);
        }

        return $this;
    }

    /**
     * Get the host
     */
    public function getHost(): ?string
    {
        return $this->host;
    }

    /**
     * Set the host
     */
    public function setHost(string $host): self
    {
        $this->host = $host;

        return $this;
    }

    /**
     * Get the port
     */
    public function getPort(): ?int
    {
        return $this->port;
    }

    /**
     * Set the port
     */
    public function setPort(int $port): self
    {
        $this->port = $port;

        return $this;
    }

    /**
     * Get the scheme
     */
    public function getScheme(): string
    {
        return $this->scheme;
    }

    /**
     * Set the scheme
     */
    public function setScheme(string $scheme): self
    {
        $this->scheme = $scheme;

        return $this;
    }

    /**
     * Get the number of retries
     */
    public function getRetries(): int
    {
        return $this->retries;
    }

    /**
     * Set the number of retries
     */
    public function setRetries(int $retries): self
    {
        $this->retries = $retries;

        return $this;
    }

    /**
     * Get the name of the fetcher to be used (for Tika >= 2.0.0 only)
     *
     * @return string|null
     */
    public function getFetcherName(): ?string
    {
        return $this->fetcherName;
    }

    /**
     * Set the name of the fetcher to be used (for Tika >= 2.0.0 only)
     *
     * @link https://cwiki.apache.org/confluence/display/TIKA/tika-pipes
     */
    public function setFetcherName(string $fetcherName): self
    {
        $this->fetcherName = $fetcherName;

        return $this;
    }

    /**
     * Get all the options
     */
    public function getOptions(): array
    {
        return $this->options;
    }

    /**
     * Get an specified option
     *
     * @return  mixed   the option value or null when the option is not set
     */
    public function getOption(int $key)
    {
        return $this->options[$key] ?? null;
    }

    /**
     * Set a cURL option to be set with curl_setopt()
     *
     * @link http://php.net/manual/en/curl.constants.php
     * @link http://php.net/manual/en/function.curl-setopt.php
     * @param mixed $value
     * @throws \Exception   when trying to change one of the options the client depends on
     */
    public function setOption(int $key, $value): self
    {
        if(in_array($key, [CURLINFO_HEADER_OUT, CURLOPT_PUT, CURLOPT_RETURNTRANSFER], true))
        {
            throw new Exception("Value for cURL option $key cannot be modified", 3);
        }

        $this->options[$key] = $value;

        return $this;
    }

    /**
     * Set the cURL options
     *
     * @throws \Exception
     */
    public function setOptions(array $options): self
    {
        foreach($options as $key => $value)
        {
            $this->setOption($key, $value);
        }

        return $this;
    }

    /**
     * Get all the HTTP headers
     */
    public function getHeaders(): array
    {
        // the option can be overwritten through setOption(), so never assume it is still an array
        return (array) ($this->options[CURLOPT_HTTPHEADER] ?? []);
    }

    /**
     * Get an specified HTTP header
     *
     * The name is matched case-insensitively against the start of each stored header line.
     *
     * @return string|null  the header value or null when the header is not set
     */
    public function getHeader(string $name): ?string
    {
        // quote the name so it is matched literally and anchor it so that a partial name
        // (e.g. "Language") does not match a longer header (e.g. "X-Tika-OCRLanguage")
        $pattern = '/^' . preg_quote($name, '/') . ':\s*(\S.*)/i';

        foreach($this->getHeaders() as $header)
        {
            if(is_string($header) && preg_match($pattern, $header, $match))
            {
                return $match[1];
            }
        }

        return null;
    }

    /**
     * Set a cURL header to be set with curl_setopt()
     *
     * A header previously stored under the same name (case-insensitive) is replaced, so the
     * last value set is the only one sent and the one returned by getHeader().
     *
     * @param mixed $value
     * @throws \Exception
     */
    public function setHeader(string $name, $value): self
    {
        $pattern = '/^' . preg_quote($name, '/') . ':/i';
        $headers = [];

        foreach($this->getHeaders() as $header)
        {
            if(!is_string($header) || !preg_match($pattern, $header))
            {
                $headers[] = $header;
            }
        }

        $headers[] = "$name: $value";

        $this->options[CURLOPT_HTTPHEADER] = $headers;

        return $this;
    }

    /**
     * Set the HTTP headers
     *
     * @param array $headers    header values indexed by header name
     * @throws \Exception
     */
    public function setHeaders(array $headers): self
    {
        foreach($headers as $name => $value)
        {
            $this->setHeader($name, $value);
        }

        return $this;
    }

    /**
     * Get the accepted OCR languages
     *
     * @return array    list of languages, empty when the X-Tika-OCRLanguage header is not set
     */
    public function getOCRLanguages(): array
    {
        $languages = $this->getHeader('X-Tika-OCRLanguage');

        // explode() on an empty string would return [''] instead of an empty list
        return $languages === null ? [] : explode('+', $languages);
    }

    /**
     * Set the accepted OCR language
     *
     * @throws \Exception
     */
    public function setOCRLanguage(string $language): self
    {
        $this->setHeader('X-Tika-OCRLanguage', $language);

        return $this;
    }

    /**
     * Set the accepted OCR languages
     *
     * @throws \Exception
     */
    public function setOCRLanguages(array $languages): self
    {
        $this->setHeader('X-Tika-OCRLanguage', implode('+', $languages));

        return $this;
    }

    /**
     * Get the timeout value for cURL
     *
     * @return int  timeout in seconds, 0 when the option has been unset
     */
    public function getTimeout(): int
    {
        // the option may have been set to null through setOption(), which is not a valid int return
        return (int) $this->getOption(CURLOPT_TIMEOUT);
    }

    /**
     * Set the timeout value for cURL
     *
     * @throws \Exception
     */
    public function setTimeout(int $value): self
    {
        $this->setOption(CURLOPT_TIMEOUT, $value);

        return $this;
    }

    /**
     * Returns the supported MIME types
     *
     * @return array    decoded "mime-types" response sorted by key
     * @throws \Exception
     */
    public function getSupportedMIMETypes(): array
    {
        $mimeTypes = $this->requestJson('mime-types');

        ksort($mimeTypes);

        return $mimeTypes;
    }

    /**
     * Returns the available detectors
     *
     * @return array    root detector indexed by its name, with its children indexed by their names
     * @throws \Exception
     */
    public function getAvailableDetectors(): array
    {
        return $this->requestComponentTree('detectors');
    }

    /**
     * Returns the available parsers
     *
     * @return array    root parser indexed by its name, with its children indexed by their names
     * @throws \Exception
     */
    public function getAvailableParsers(): array
    {
        return $this->requestComponentTree('parsers');
    }

    /**
     * Request a resource and decode its JSON response as an array
     *
     * @throws \Exception   when the response is not a JSON array/object
     */
    private function requestJson(string $type): array
    {
        $decoded = json_decode($this->request($type), true);

        if(!is_array($decoded))
        {
            throw new Exception("Unexpected response for /$type", 501);
        }

        return $decoded;
    }

    /**
     * Request a component tree ("detectors" or "parsers") and index it by component name
     *
     * Only the root component and its direct children are re-indexed.
     *
     * @throws \Exception   when the response is not a named component
     */
    private function requestComponentTree(string $type): array
    {
        $root = $this->requestJson($type);

        if(!isset($root['name']))
        {
            throw new Exception("Unexpected response for /$type", 501);
        }

        if(isset($root['children']) && is_array($root['children']))
        {
            $children = [];

            foreach($root['children'] as $child)
            {
                $children[$child['name']] = $child;
            }

            $root['children'] = $children;
        }

        return [$root['name'] => $root];
    }

    /**
     * Check if server is running
     *
     * @throws \Exception
     */
    public function check(): void
    {
        if($this->isChecked() === false)
        {
            // must be flagged before the request: request() calls check() again
            $this->setChecked(true);

            try
            {
                // throws an exception if server is unreachable or can't connect
                $this->request('version');
            }
            catch(Exception $exception)
            {
                // the server could not be verified, so the next request must check again
                $this->setChecked(false);

                throw $exception;
            }
        }
    }

    /**
     * Configure, make a request and return its results
     *
     * Responses for a file are served from the cache when available. When a file is involved
     * and the server answers with HTTP 500, the request is repeated up to the configured number
     * of retries before an exception is thrown.
     *
     * @throws \Exception
     */
    public function request(string $type, ?string $file = null): string
    {
        // check if not checked
        $this->check();

        // check if is cached
        if($file !== null && $this->isCached($type, $file))
        {
            return $this->getCachedResponse($type, $file) ?? '';
        }

        // retries budget is local to this call, so it never leaks between files, calls or instances
        $retries = max(0, $this->retries);

        do
        {
            // parameters for cURL request
            [$resource, $headers] = $this->getParameters($type, $file);

            // check the request (the returned value replaces the file for the rest of the request)
            $file = $this->checkRequest($type, $file);

            // cURL options (built on every attempt because the upload handle is consumed by cURL)
            $options = $this->getCurlOptions($type, $file);

            // sets headers
            foreach($headers as $header)
            {
                $options[CURLOPT_HTTPHEADER][] = $header;
            }

            $options[CURLOPT_URL] = $this->getUrl() . "/$resource";

            try
            {
                // get the response and the HTTP status code
                [$response, $status] = $this->exec($options);
            }
            finally
            {
                // release the uploaded file handle even when the request throws
                if(isset($options[CURLOPT_INFILE]) && is_resource($options[CURLOPT_INFILE]))
                {
                    fclose($options[CURLOPT_INFILE]);
                }
            }

            $status = (int) $status;

            // retry on request failed with error 500
            $retry = $status === 500 && $file !== null && $retries-- > 0;
        }
        while($retry);

        // request completed successfully
        if($status === 200)
        {
            // cache certain responses
            if($file !== null && $this->isCacheable($type))
            {
                $this->cacheResponse($type, $response, $file);
            }
        } // request completed successfully but result is empty
        elseif($status === 204)
        {
            $response = '';
        } // other status code is an error
        else
        {
            $this->error($status, $resource, $file);
        }

        return $this->filterResponse($response);
    }

    /**
     * Make a request to Apache Tika Server
     *
     * @param array $options    cURL options indexed by option constant
     * @return array            trimmed response body and HTTP status code, in that order
     * @throws \Exception       when cURL cannot be initialized or the transfer fails
     */
    protected function exec(array $options = []): array
    {
        // cURL init and options
        $curl = curl_init();

        if($curl === false)
        {
            throw new Exception('Unable to initialize cURL');
        }

        // we avoid curl_setopt_array($curl, $options) because strange Windows behaviour (issue #8)
        foreach($options as $option => $value)
        {
            curl_setopt($curl, $option, $value);
        }

        // make the request directly
        if(is_null($this->callback))
        {
            $this->response = (string) curl_exec($curl);
        }
        // with a callback, the response is appended on each block inside the callback
        else
        {
            $this->response = '';
            curl_exec($curl);
        }

        // exception if cURL fails
        if(curl_errno($curl))
        {
            throw new Exception(curl_error($curl), curl_errno($curl));
        }

        // return the response and the status code
        return [trim($this->response), curl_getinfo($curl, CURLINFO_HTTP_CODE)];
    }

    /**
     * Throws an exception for an error status code
     *
     * Known status codes keep their value as exception code, any other is reported with code 501.
     *
     * @throws \Exception   always
     */
    protected function error(int $status, string $resource, ?string $file = null): void
    {
        switch($status)
        {
            //  method not allowed
            case 405:
                $message = 'Method not allowed';
                break;

            //  unsupported media type
            case 415:
                $message = 'Unsupported media type';
                break;

            //  unprocessable entity
            case 422:
                $message = 'Unprocessable document';

                // using remote files require Tika server to be launched with specific options
                if($this->downloadRemote === false && $file !== null && preg_match('/^http/', $file))
                {
                    $message .= ' (is server launched using "-enableUnsecureFeatures -enableFileUrl" arguments?)';
                }

                break;

            // server error
            case 500:
                $message = 'Error while processing document';
                break;

            // unexpected
            default:
                $message = "Unexpected response for /$resource ($status)";
                $status = 501;
        }

        throw new Exception($message, $status);
    }

    /**
     * Get the parameters to make the request
     *
     * @link https://wiki.apache.org/tika/TikaJAXRS#Specifying_a_URL_Instead_of_Putting_Bytes
     * @return array    server resource, extra HTTP headers and an optional response decoder, in that order
     * @throws \Exception   when the request type is unknown or not supported by the server
     */
    protected function getParameters(string $type, ?string $file = null): array
    {
        $headers = [];
        $callback = null;

        // remote files are referenced using headers instead of being uploaded
        if(!empty($file) && preg_match('/^http/', $file))
        {
            if($this->fetcherName)
            {
                $headers[] = "fetcherName: $this->fetcherName";
                $headers[] = "fetchKey: $file";
            }
            $headers[] = "fileUrl: $file";
        }

        switch($type)
        {
            case 'html':
                $resource = 'tika';
                $headers[] = 'Accept: text/html';
                break;

            case 'lang':
                $resource = 'language/stream';
                break;

            case 'mime':
                $resource = 'detect/stream';

                if($file !== null)
                {
                    $name = basename($file);

                    // disposition parameters are separated by a semicolon (RFC 6266), not a comma
                    $headers[] = "Content-Disposition: attachment; filename=$name";
                }
                break;

            case 'detectors':
            case 'parsers':
            case 'meta':
            case 'mime-types':
            case 'rmeta/html':
            case 'rmeta/ignore':
            case 'rmeta/text':
                $resource = $type;
                $headers[] = 'Accept: application/json';
                $callback = static function($response)
                {
                    return json_decode($response, true);
                };
                break;

            case 'text':
                $resource = 'tika';
                $headers[] = 'Accept: text/plain';
                break;

            case 'text-main':
                $resource = 'tika/main';
                $headers[] = 'Accept: text/plain';
                break;

            case 'version':
                $resource = $type;
                break;

            case 'xhtml':
                throw new Exception("Tika Server does not support XHTML output");

            default:
                throw new Exception("Unknown type $type");
        }

        return [$resource, $headers, $callback];
    }

    /**
     * Get the cURL options
     *
     * Starts from the configured options and adds the write callback and the file to upload,
     * when they apply.
     *
     * @throws \Exception   when a file is required but can't be opened
     */
    protected function getCurlOptions(string $type, ?string $file = null): array
    {
        // base options
        $options = $this->options;

        // callback
        if(!is_null($this->callback))
        {
            $callback = $this->callback;

            $options[CURLOPT_WRITEFUNCTION] = function($handler, $data) use ($callback)
            {
                if($this->callbackAppend === true)
                {
                    $this->response .= $data;
                }

                $callback($data);

                // safe because cURL must receive the number of *bytes* written
                return strlen($data);
            };
        }

        // remote file options
        if($file && preg_match('/^http/', $file))
        {
            // nothing to add here, no local file is attached for remote files
        }
        // local file options
        elseif($file && file_exists($file) && is_readable($file))
        {
            $handle = fopen($file, 'r');

            if($handle === false)
            {
                throw new Exception("File $file can't be opened");
            }

            $options[CURLOPT_INFILE] = $handle;
            $options[CURLOPT_INFILESIZE] = filesize($file);
        } // other options for specific requests
        elseif(in_array($type, ['detectors', 'mime-types', 'parsers', 'version'], true))
        {
            $options[CURLOPT_PUT] = false;
        } // file not accesible
        else
        {
            throw new Exception("File $file can't be opened");
        }

        return $options;
    }
}
778