Passed
Push — master ( d0db71...18bd11 )
by David
03:30 queued 13s
created

WebClient::setPort()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 5
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 1
eloc 2
c 1
b 0
f 0
nc 1
nop 1
dl 0
loc 5
rs 10
1
<?php
2
3
namespace Vaites\ApacheTika\Clients;
4
5
use Exception;
6
7
use Vaites\ApacheTika\Client;
8
9
/**
10
 * Apache Tika web client
11
 *
12
 * @author  David Martínez <[email protected]>
13
 * @link    https://cwiki.apache.org/confluence/display/TIKA/TikaServer
14
 */
15
class WebClient extends Client
16
{
17
    protected const MODE = 'web';

    /**
     * Responses already received, kept so the same file is not requested twice
     *
     * @var array
     */
    protected $cache = [];

    /**
     * Host of the Apache Tika server (null until one is configured)
     *
     * @var string|null
     */
    protected $host = null;

    /**
     * Port of the Apache Tika server (null until one is configured)
     *
     * @var int|null
     */
    protected $port = null;

    /**
     * Scheme used to connect to the Apache Tika server
     *
     * @var string
     */
    protected $scheme = 'http';

    /**
     * How many times a request is retried after a server error
     *
     * @var int
     */
    protected $retries = 3;

    /**
     * Fetcher name to be used (for Tika >= 2.0.0 only)
     *
     * @var string|null
     */
    protected $fetcherName = null;

    /**
     * cURL options applied to every request unless overridden
     *
     * @var array
     */
    protected $options =
    [
        CURLINFO_HEADER_OUT     => true,
        CURLOPT_HTTPHEADER      => [],
        CURLOPT_PUT             => true,
        CURLOPT_RETURNTRANSFER  => true,
        CURLOPT_TIMEOUT         => 5
    ];
74
75
    /**
     * Configure the client and, optionally, test that the server is running
     *
     * @param string|null $host    host name, or a full URL from which scheme, host and port are taken
     * @param int|null    $port    server port; left untouched when null
     * @param array       $options cURL options, applied through setOptions()
     * @param bool        $check   when true, check() is called once everything is configured
     *
     * @throws \Exception
     */
    public function __construct(?string $host = null, ?int $port = null, array $options = [], bool $check = true)
    {
        parent::__construct();

        if($host !== null && filter_var($host, FILTER_VALIDATE_URL))
        {
            $this->setUrl($host);
        }
        elseif($host)
        {
            $this->setHost($host);
        }

        // an explicit port is applied last, so it wins over a port embedded in the URL
        if($port !== null)
        {
            $this->setPort($port);
        }

        if(!empty($options))
        {
            $this->setOptions($options);
        }

        $this->setDownloadRemote(true);

        if($check === true)
        {
            $this->check();
        }
    }
110
111
    /**
     * Build the base URL from scheme, host and port
     *
     * Falls back to "http" and 9998 when scheme or port are not set.
     */
    public function getUrl(): string
    {
        $scheme = $this->scheme ?: 'http';
        $port = $this->port ?: 9998;

        return sprintf('%s://%s:%d', $scheme, $this->host, $port);
    }
118
119
    /**
     * Set scheme, host and port from an URL
     *
     * Components missing from the URL leave the current values untouched.
     */
    public function setUrl(string $url): self
    {
        $parts = parse_url($url);

        // parse_url() returns false on seriously malformed URLs: nothing to apply then
        if(!is_array($parts))
        {
            return $this;
        }

        if(!empty($parts['scheme']))
        {
            $this->setScheme((string) $parts['scheme']);
        }

        if(!empty($parts['host']))
        {
            $this->setHost((string) $parts['host']);
        }

        if(!empty($parts['port']))
        {
            $this->setPort((int) $parts['port']);
        }

        return $this;
    }
145
146
    /**
     * Get the configured host, or null when none has been set
     */
    public function getHost(): ?string
    {
        return $this->host;
    }
153
154
    /**
     * Set the host
     *
     * @return $this
     */
    public function setHost(string $host): self
    {
        $this->host = $host;

        return $this;
    }
163
164
    /**
     * Get the configured port, or null when none has been set
     */
    public function getPort(): ?int
    {
        return $this->port;
    }
171
172
    /**
     * Set the port
     *
     * @return $this
     */
    public function setPort(int $port): self
    {
        $this->port = $port;

        return $this;
    }
181
182
    /**
     * Get the connection scheme
     */
    public function getScheme(): string
    {
        return $this->scheme;
    }
189
190
    /**
     * Set the connection scheme
     *
     * @return $this
     */
    public function setScheme(string $scheme): self
    {
        $this->scheme = $scheme;

        return $this;
    }
199
200
    /**
     * Get how many times a request is retried on server error
     */
    public function getRetries(): int
    {
        return $this->retries;
    }
207
208
    /**
     * Set how many times a request is retried on server error
     *
     * @return $this
     */
    public function setRetries(int $retries): self
    {
        $this->retries = $retries;

        return $this;
    }
217
218
    /**
     * Set the fetcher name to be used (for Tika >= 2.0.0 only)
     *
     * @return $this
     */
    public function setFetcherName(string $fetcherName): self
    {
        $this->fetcherName = $fetcherName;

        return $this;
    }
227
228
    /**
     * Get every cURL option currently configured
     */
    public function getOptions(): array
    {
        return $this->options;
    }
235
236
    /**
     * Get a single cURL option
     *
     * @return  mixed   the option value, or null when the option is not set
     */
    public function getOption(int $key)
    {
        return isset($this->options[$key]) ? $this->options[$key] : null;
    }
245
246
    /**
     * Store a cURL option that will later be applied with curl_setopt()
     *
     * CURLINFO_HEADER_OUT, CURLOPT_PUT and CURLOPT_RETURNTRANSFER are rejected.
     *
     * @link http://php.net/manual/en/curl.constants.php
     * @link http://php.net/manual/en/function.curl-setopt.php
     * @param mixed $value
     * @throws \Exception   when trying to modify one of the rejected options
     */
    public function setOption(int $key, $value): self
    {
        $locked = [CURLINFO_HEADER_OUT, CURLOPT_PUT, CURLOPT_RETURNTRANSFER];

        if(in_array($key, $locked, true))
        {
            throw new Exception("Value for cURL option $key cannot be modified", 3);
        }

        $this->options[$key] = $value;

        return $this;
    }
265
266
    /**
     * Store several cURL options at once, each one going through setOption()
     *
     * @throws \Exception
     */
    public function setOptions(array $options): self
    {
        foreach($options as $option => $setting)
        {
            $this->setOption($option, $setting);
        }

        return $this;
    }
280
281
    /**
     * Get the HTTP headers stored under the CURLOPT_HTTPHEADER option
     */
    public function getHeaders(): array
    {
        return $this->options[CURLOPT_HTTPHEADER];
    }
288
289
    /**
     * Get the value of a single HTTP header
     *
     * The name is matched case-insensitively against the start of each stored header line.
     *
     * @return string|null  value of the first matching header, or null when the header is not set
     */
    public function getHeader(string $name): ?string
    {
        // the name is quoted so that regex metacharacters in it are matched literally
        $pattern = '/^\s*' . preg_quote($name, '/') . ':\s*(.+)$/i';

        foreach($this->options[CURLOPT_HTTPHEADER] as $header)
        {
            if(preg_match($pattern, $header, $match))
            {
                return $match[1];
            }
        }

        // null (not an array) so the declared ?string return type is honoured
        return null;
    }
307
308
    /**
     * Append an HTTP header, formatted as "name: value", to the stored cURL headers
     *
     * @param mixed $value
     * @throws \Exception
     */
    public function setHeader(string $name, $value): self
    {
        $line = "$name: $value";

        $this->options[CURLOPT_HTTPHEADER][] = $line;

        return $this;
    }
320
321
    /**
     * Append several HTTP headers, given as a name => value map
     *
     * @throws \Exception
     */
    public function setHeaders(array $headers): self
    {
        foreach($headers as $header => $content)
        {
            $this->setHeader($header, $content);
        }

        return $this;
    }
335
336
    /**
     * Get the accepted OCR languages
     *
     * Languages are read from the X-Tika-OCRLanguage header, where they are joined with "+".
     *
     * @return array    list of languages, empty when the header is not set
     */
    public function getOCRLanguages(): array
    {
        $header = $this->getHeader('X-Tika-OCRLanguage');

        // explode() on an empty string would yield [''], which is not a list of languages
        if($header === null || $header === '')
        {
            return [];
        }

        return explode('+', $header);
    }
343
344
    /**
     * Set a single accepted OCR language through the X-Tika-OCRLanguage header
     *
     * @throws \Exception
     */
    public function setOCRLanguage(string $language): self
    {
        return $this->setHeader('X-Tika-OCRLanguage', $language);
    }
355
356
    /**
     * Set several accepted OCR languages, joined with "+" in the X-Tika-OCRLanguage header
     *
     * @throws \Exception
     */
    public function setOCRLanguages(array $languages): self
    {
        $joined = implode('+', $languages);

        return $this->setHeader('X-Tika-OCRLanguage', $joined);
    }
367
368
    /**
     * Get the timeout value for cURL
     *
     * @return int  the CURLOPT_TIMEOUT option, or 0 when the option has been unset
     */
    public function getTimeout(): int
    {
        // getOption() yields null for a missing option, which the int return type would reject
        return (int) ($this->getOption(CURLOPT_TIMEOUT) ?? 0);
    }
375
376
    /**
     * Set the timeout value for cURL (stored as CURLOPT_TIMEOUT)
     *
     * @throws \Exception
     */
    public function setTimeout(int $value): self
    {
        return $this->setOption(CURLOPT_TIMEOUT, $value);
    }
387
388
    /**
     * Returns the supported MIME types, sorted by key
     *
     * @throws \Exception   when the request fails or the response is not a JSON object/array
     */
    public function getSupportedMIMETypes(): array
    {
        $mimeTypes = json_decode($this->request('mime-types'), true);

        // json_decode() gives null on invalid or empty input; ksort() must only receive an array
        if(!is_array($mimeTypes))
        {
            throw new Exception('Unexpected response for /mime-types');
        }

        ksort($mimeTypes);

        return $mimeTypes;
    }
401
402
    /**
     * Returns the available detectors
     *
     * @return array    the root detector indexed by its name, with its children indexed by their names
     * @throws \Exception
     */
    public function getAvailableDetectors(): array
    {
        return $this->requestComponentTree('detectors');
    }

    /**
     * Returns the available parsers
     *
     * @return array    the root parser indexed by its name, with its children indexed by their names
     * @throws \Exception
     */
    public function getAvailableParsers(): array
    {
        return $this->requestComponentTree('parsers');
    }

    /**
     * Request a component listing and index it by name
     *
     * The decoded response is treated as a single root element with an optional list of children.
     *
     * @param string $resource  request type, either 'detectors' or 'parsers'
     * @throws \Exception       when the request fails or the response has no usable root element
     */
    private function requestComponentTree(string $resource): array
    {
        $root = json_decode($this->request($resource), true);

        if(!is_array($root) || !isset($root['name']))
        {
            throw new Exception("Unexpected response for /$resource");
        }

        // re-key the children by their name, keeping their order
        if(isset($root['children']) && is_array($root['children']))
        {
            $root['children'] = array_column($root['children'], null, 'name');
        }

        return [$root['name'] => $root];
    }
459
460
    /**
     * Check if server is running
     *
     * Does nothing when the client is already flagged as checked.
     *
     * @throws \Exception   when the version request fails
     */
    public function check(): void
    {
        if($this->isChecked() === false)
        {
            // flag first: request() calls check() again and the flag is what stops the recursion
            $this->setChecked(true);

            try
            {
                // throws an exception if server is unreachable or can't connect
                $this->request('version');
            }
            catch(Exception $exception)
            {
                // a failed probe must not leave the client flagged as checked
                $this->setChecked(false);

                throw $exception;
            }
        }
    }
475
476
    /**
     * Configure, make a request and return its results
     *
     * Requests that send a file are retried on HTTP 500, up to the configured number of retries.
     * The retry budget is local to each call, so a file that failed before gets a fresh budget.
     *
     * @param string      $type request type, as accepted by getParameters()
     * @param string|null $file local path or URL of the file to send, null for requests without a file
     *
     * @throws \Exception   on cURL failure or on any status other than 200 and 204
     */
    public function request(string $type, ?string $file = null): string
    {
        // check if not checked
        $this->check();

        // check if is cached
        if($file !== null && $this->isCached($type, $file))
        {
            return $this->getCachedResponse($type, $file) ?? '';
        }

        // parameters for cURL request
        [$resource, $headers] = $this->getParameters($type, $file);

        // check the request
        $file = $this->checkRequest($type, $file);

        // only requests with a file are retried
        $retries = $file !== null ? $this->retries : 0;

        do
        {
            // cURL options are rebuilt on every attempt, so a local file is reopened each time
            $options = $this->getCurlOptions($type, $file);

            // sets headers
            foreach($headers as $header)
            {
                $options[CURLOPT_HTTPHEADER][] = $header;
            }

            $options[CURLOPT_URL] = $this->getUrl() . "/$resource";

            try
            {
                // get the response and the HTTP status code
                [$response, $status] = $this->exec($options);
            }
            finally
            {
                // release the file handle even when exec() throws
                if(isset($options[CURLOPT_INFILE]) && is_resource($options[CURLOPT_INFILE]))
                {
                    fclose($options[CURLOPT_INFILE]);
                }
            }
        }
        while($status == 500 && $retries-- > 0);

        // request completed successfully
        if($status == 200)
        {
            // cache certain responses
            if($file !== null && $this->isCacheable($type))
            {
                $this->cacheResponse($type, $response, $file);
            }
        } // request completed successfully but result is empty
        elseif($status == 204)
        {
            $response = '';
        } // other status code is an error
        else
        {
            $this->error($status, $resource, $file);
        }

        return $this->filterResponse($response);
    }
549
550
    /**
     * Run a cURL request against Apache Tika Server
     *
     * @param array $options    cURL options, applied one by one with curl_setopt()
     * @return array            trimmed response body and HTTP status code, in that order
     * @throws \Exception       carrying the cURL error message and number when the transfer fails
     */
    protected function exec(array $options = []): array
    {
        $handle = curl_init();

        // curl_setopt_array($handle, $options) is avoided because of strange Windows behaviour (issue #8)
        foreach($options as $name => $setting)
        {
            curl_setopt($handle, $name, $setting);
        }

        if($this->callback === null)
        {
            // no callback: the whole body is taken from the return value
            $this->response = (string) curl_exec($handle);
        }
        else
        {
            // with a callback, the response is appended on each block inside the callback
            $this->response = '';
            curl_exec($handle);
        }

        // exception if cURL fails
        $errorCode = curl_errno($handle);

        if($errorCode)
        {
            throw new Exception(curl_error($handle), $errorCode);
        }

        $body = trim($this->response);
        $status = curl_getinfo($handle, CURLINFO_HTTP_CODE);

        return [$body, $status];
    }
587
588
    /**
     * Throws an exception for an error status code
     *
     * Known codes (405, 415, 422 and 500) keep their status as the exception code; any other status
     * is reported as unexpected, with 501 as the exception code.
     *
     * @param int         $status   HTTP status code received
     * @param string      $resource requested resource, used in the message for unexpected codes
     * @param string|null $file     file sent with the request, if any
     *
     * @throws \Exception   always
     */
    protected function error(int $status, string $resource, ?string $file = null): void
    {
        $messages =
        [
            405 => 'Method not allowed',
            415 => 'Unsupported media type',
            422 => 'Unprocessable document',
            500 => 'Error while processing document'
        ];

        // unexpected
        if(!isset($messages[$status]))
        {
            throw new Exception("Unexpected response for /$resource ($status)", 501);
        }

        $message = $messages[$status];

        // using remote files require Tika server to be launched with specific options
        if($status === 422 && $this->downloadRemote === false && $file !== null && preg_match('/^http/', $file))
        {
            $message .= ' (is server launched using "-enableUnsecureFeatures -enableFileUrl" arguments?)';
        }

        throw new Exception($message, $status);
    }
632
633
    /**
     * Get the parameters to make the request
     *
     * @link https://wiki.apache.org/tika/TikaJAXRS#Specifying_a_URL_Instead_of_Putting_Bytes
     *
     * @param string      $type request type
     * @param string|null $file local path or URL of the file, if any
     *
     * @return array        resource path, list of HTTP headers and an optional response callback, in that order
     * @throws \Exception   for the unsupported 'xhtml' type and for unknown types
     */
    protected function getParameters(string $type, ?string $file = null): array
    {
        $headers = [];
        $callback = null;

        // remote files are referenced through headers: fetcher headers when a fetcher is set, fileUrl otherwise
        if(!empty($file) && preg_match('/^http/', $file))
        {
            if($this->fetcherName)
            {
                $headers[] = "fetcherName:$this->fetcherName";
                $headers[] = "fetchKey:$file";
            }
            else
            {
                $headers[] = "fileUrl:$file";
            }
        }

        switch($type)
        {
            case 'html':
                $resource = 'tika';
                $headers[] = 'Accept: text/html';
                break;

            case 'lang':
                $resource = 'language/stream';
                break;

            case 'mime':
                $resource = 'detect/stream';

                if($file !== null)
                {
                    $name = basename($file);

                    // disposition parameters are separated from the type with ";", not ","
                    $headers[] = "Content-Disposition: attachment; filename=$name";
                }
                break;

            case 'detectors':
            case 'parsers':
            case 'meta':
            case 'mime-types':
            case 'rmeta/html':
            case 'rmeta/ignore':
            case 'rmeta/text':
                $resource = $type;
                $headers[] = 'Accept: application/json';
                $callback = function($response)
                {
                    return json_decode($response, true);
                };
                break;

            case 'text':
                $resource = 'tika';
                $headers[] = 'Accept: text/plain';
                break;

            case 'text-main':
                $resource = 'tika/main';
                $headers[] = 'Accept: text/plain';
                break;

            case 'version':
                $resource = $type;
                break;

            case 'xhtml':
                throw new Exception("Tika Server does not support XHTML output");

            default:
                throw new Exception("Unknown type $type");
        }

        return [$resource, $headers, $callback];
    }
713
714
    /**
     * Get the cURL options
     *
     * Starts from the configured options, adds a write function when a callback is set, and then
     * adds the upload options that match the kind of file being sent.
     *
     * @param string      $type request type
     * @param string|null $file local path or URL of the file, if any
     *
     * @throws \Exception   when a file is needed but can't be opened
     */
    protected function getCurlOptions(string $type, ?string $file = null): array
    {
        // base options
        $options = $this->options;

        // callback
        if(!is_null($this->callback))
        {
            $callback = $this->callback;

            $options[CURLOPT_WRITEFUNCTION] = function($handler, $data) use ($callback)
            {
                if($this->callbackAppend === true)
                {
                    $this->response .= $data;
                }

                $callback($data);

                // safe because cURL must receive the number of *bytes* written
                return strlen($data);
            };
        }

        // remote file: no upload options are added here
        if($file && preg_match('/^http/', $file))
        {
            return $options;
        }

        // local file options
        if($file && file_exists($file) && is_readable($file))
        {
            $handle = fopen($file, 'r');

            // fopen() can still fail after the checks above; never hand false to cURL
            if($handle === false)
            {
                throw new Exception("File $file can't be opened");
            }

            $options[CURLOPT_INFILE] = $handle;
            $options[CURLOPT_INFILESIZE] = filesize($file);

            return $options;
        }

        // other options for specific requests
        if(in_array($type, ['detectors', 'mime-types', 'parsers', 'version'], true))
        {
            $options[CURLOPT_PUT] = false;

            return $options;
        }

        // file not accesible
        throw new Exception("File $file can't be opened");
    }
765
}
766