Passed
Push — master ( d3a7db...c8c09a )
by David
04:00 queued 10s
created

WebClient::check()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 8
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 2
Bugs 0 Features 0
Metric Value
cc 2
eloc 3
c 2
b 0
f 0
nc 2
nop 0
dl 0
loc 8
rs 10
1
<?php
2
3
namespace Vaites\ApacheTika\Clients;
4
5
use Exception;
6
7
use Vaites\ApacheTika\Client;
8
9
/**
 * Apache Tika web client
 *
 * Sends requests to a running Apache Tika Server over HTTP using cURL.
 * Caching, callbacks and request validation rely on members inherited from
 * Client, which is defined outside this file.
 *
 * @author  David Martínez <[email protected]>
 * @link    https://cwiki.apache.org/confluence/display/TIKA/TikaServer
 */
class WebClient extends Client
{
    protected const MODE = 'web';

    /**
     * Cached responses to avoid multiple requests for the same file
     *
     * @var array
     */
    protected $cache = [];

    /**
     * Apache Tika server host
     *
     * @var string|null
     */
    protected $host = null;

    /**
     * Apache Tika server port (getUrl() falls back to 9998 when unset)
     *
     * @var int|null
     */
    protected $port = null;

    /**
     * Number of retries on server error
     *
     * @var int
     */
    protected $retries = 3;

    /**
     * Default cURL options
     *
     * @var array
     */
    protected $options =
    [
        CURLINFO_HEADER_OUT     => true,
        CURLOPT_HTTPHEADER      => [],
        CURLOPT_PUT             => true,
        CURLOPT_RETURNTRANSFER  => true,
        CURLOPT_TIMEOUT         => 5
    ];

    /**
     * Configure class and test if server is running
     *
     * @param string|null $host     host name, or a full URL from which host and port are taken
     * @param int|null    $port     server port
     * @param array       $options  cURL options, applied through setOptions()
     * @param bool        $check    when true, check() is called at the end of construction
     * @throws \Exception
     */
    public function __construct(?string $host = null, ?int $port = null, array $options = [], bool $check = true)
    {
        parent::__construct();

        if($host !== null && filter_var($host, FILTER_VALIDATE_URL))
        {
            $this->setUrl($host);
        }
        elseif($host)
        {
            $this->setHost($host);
        }

        // an explicit port overrides the one that may have been taken from the URL
        if($port !== null)
        {
            $this->setPort($port);
        }

        if(!empty($options))
        {
            $this->setOptions($options);
        }

        $this->setDownloadRemote(true);

        if($check === true)
        {
            $this->check();
        }
    }

    /**
     * Get the base URL (always built with the http scheme, port defaults to 9998)
     */
    public function getUrl(): string
    {
        return sprintf('http://%s:%d', $this->host, $this->port ?: 9998);
    }

    /**
     * Set the host and port using an URL
     *
     * Only the host and port components are used; a missing component leaves
     * the current value untouched.
     */
    public function setUrl(string $url): self
    {
        $host = parse_url($url, PHP_URL_HOST);
        $port = parse_url($url, PHP_URL_PORT);

        if(!empty($host))
        {
            $this->setHost((string) $host);
        }

        if(!empty($port))
        {
            $this->setPort((int) $port);
        }

        return $this;
    }

    /**
     * Get the host
     */
    public function getHost(): ?string
    {
        return $this->host;
    }

    /**
     * Set the host
     */
    public function setHost(string $host): self
    {
        $this->host = $host;

        return $this;
    }

    /**
     * Get the port
     */
    public function getPort(): ?int
    {
        return $this->port;
    }

    /**
     * Set the port
     */
    public function setPort(int $port): self
    {
        $this->port = $port;

        return $this;
    }

    /**
     * Get the number of retries
     */
    public function getRetries(): int
    {
        return $this->retries;
    }

    /**
     * Set the number of retries
     */
    public function setRetries(int $retries): self
    {
        $this->retries = $retries;

        return $this;
    }

    /**
     * Get all the options
     */
    public function getOptions(): array
    {
        return $this->options;
    }

    /**
     * Get an specified option
     *
     * @return  mixed   the option value, or null when the option is not set
     */
    public function getOption(int $key)
    {
        return $this->options[$key] ?? null;
    }

    /**
     * Set a cURL option to be set with curl_setopt()
     *
     * @link http://php.net/manual/en/curl.constants.php
     * @link http://php.net/manual/en/function.curl-setopt.php
     * @param mixed $value
     * @throws \Exception   when trying to change one of the options this client depends on
     */
    public function setOption(int $key, $value): self
    {
        if(in_array($key, [CURLINFO_HEADER_OUT, CURLOPT_PUT, CURLOPT_RETURNTRANSFER], true))
        {
            throw new Exception("Value for cURL option $key cannot be modified", 3);
        }

        $this->options[$key] = $value;

        return $this;
    }

    /**
     * Set the cURL options
     *
     * @throws \Exception
     */
    public function setOptions(array $options): self
    {
        foreach($options as $key => $value)
        {
            $this->setOption($key, $value);
        }

        return $this;
    }

    /**
     * Get all the HTTP headers
     */
    public function getHeaders(): array
    {
        return (array) ($this->options[CURLOPT_HTTPHEADER] ?? []);
    }

    /**
     * Get an specified HTTP header
     *
     * The name is matched case-insensitively against the start of each stored header line.
     *
     * @return  string|null     the value of the first matching header, or null when it is not set
     */
    public function getHeader(string $name): ?string
    {
        // the name is quoted so characters like "." or "/" are not treated as regex syntax
        $pattern = '/^' . preg_quote($name, '/') . ':\s*(\S.*)$/i';

        foreach($this->getHeaders() as $header)
        {
            if(preg_match($pattern, (string) $header, $match))
            {
                return $match[1];
            }
        }

        return null;
    }

    /**
     * Set a cURL header to be set with curl_setopt()
     *
     * An existing header with the same name (case-insensitive) is replaced, so
     * the header is sent once and getHeader() returns the newest value.
     *
     * @param mixed $value
     */
    public function setHeader(string $name, $value): self
    {
        $pattern = '/^' . preg_quote($name, '/') . '\s*:/i';

        $headers = array_values(array_filter($this->getHeaders(), static function($header) use ($pattern)
        {
            return !preg_match($pattern, (string) $header);
        }));

        $headers[] = "$name: $value";

        $this->options[CURLOPT_HTTPHEADER] = $headers;

        return $this;
    }

    /**
     * Set the HTTP headers
     *
     * @param array $headers    header values indexed by header name
     */
    public function setHeaders(array $headers): self
    {
        foreach($headers as $name => $value)
        {
            $this->setHeader($name, $value);
        }

        return $this;
    }

    /**
     * Get the accepted OCR languages
     *
     * @return  array   language codes from the X-Tika-OCRLanguage header, empty when the header is not set
     */
    public function getOCRLanguages(): array
    {
        $header = $this->getHeader('X-Tika-OCRLanguage');

        if($header === null || $header === '')
        {
            return [];
        }

        return explode('+', $header);
    }

    /**
     * Set the accepted OCR language
     */
    public function setOCRLanguage(string $language): self
    {
        $this->setHeader('X-Tika-OCRLanguage', $language);

        return $this;
    }

    /**
     * Set the accepted OCR languages (sent joined with "+")
     */
    public function setOCRLanguages(array $languages): self
    {
        $this->setHeader('X-Tika-OCRLanguage', implode('+', $languages));

        return $this;
    }

    /**
     * Get the timeout value for cURL
     *
     * @return  int     the configured CURLOPT_TIMEOUT, or 0 when the option is not set
     */
    public function getTimeout(): int
    {
        return (int) ($this->getOption(CURLOPT_TIMEOUT) ?? 0);
    }

    /**
     * Set the timeout value for cURL
     *
     * @throws \Exception
     */
    public function setTimeout(int $value): self
    {
        $this->setOption(CURLOPT_TIMEOUT, $value);

        return $this;
    }

    /**
     * Returns the supported MIME types
     *
     * @return  array   decoded response of the mime-types resource, sorted by key
     * @throws \Exception
     */
    public function getSupportedMIMETypes(): array
    {
        $mimeTypes = $this->requestJson('mime-types');

        ksort($mimeTypes);

        return $mimeTypes;
    }

    /**
     * Returns the available detectors
     *
     * @return  array   the root detector indexed by its name, with its children indexed by name
     * @throws \Exception
     */
    public function getAvailableDetectors(): array
    {
        return $this->requestTree('detectors');
    }

    /**
     * Returns the available parsers
     *
     * @return  array   the root parser indexed by its name, with its children indexed by name
     * @throws \Exception
     */
    public function getAvailableParsers(): array
    {
        return $this->requestTree('parsers');
    }

    /**
     * Check if server is running
     *
     * @throws \Exception
     */
    public function check(): void
    {
        if($this->isChecked() === false)
        {
            // flagged before the request because request() calls check() again
            $this->setChecked(true);

            // throws an exception if server is unreachable or can't connect
            $this->request('version');
        }
    }

    /**
     * Configure, make a request and return its results
     *
     * A request that sends a file and gets a 500 response is repeated up to
     * the configured number of retries. A 204 response yields an empty string.
     *
     * @param string      $type     request type, one of those accepted by getParameters()
     * @param string|null $file     local path or URL of the file to process, if any
     * @return  string  the response body
     * @throws \Exception
     */
    public function request(string $type, ?string $file = null): string
    {
        // check if not checked
        $this->check();

        // check if is cached
        if($file !== null && $this->isCached($type, $file))
        {
            return (string) $this->getCachedResponse($type, $file);
        }

        // parameters for cURL request
        [$resource, $headers] = $this->getParameters($type, $file);

        // check the request (checkRequest() is inherited; its result replaces the file)
        $file = $this->checkRequest($type, $file);

        $url = $this->getUrl() . "/$resource";

        // only requests with a file are retried; the counter is local so it cannot leak between calls
        $remaining = $file !== null ? max(0, $this->retries) : 0;

        do
        {
            // options are rebuilt on every attempt because the input stream is closed after each one
            $options = $this->getCurlOptions($type, $file);

            foreach($headers as $header)
            {
                $options[CURLOPT_HTTPHEADER][] = $header;
            }

            $options[CURLOPT_URL] = $url;

            try
            {
                // get the response and the HTTP status code
                [$response, $status] = $this->exec($options);
            }
            finally
            {
                // release the input stream even when the request throws
                if(isset($options[CURLOPT_INFILE]) && is_resource($options[CURLOPT_INFILE]))
                {
                    fclose($options[CURLOPT_INFILE]);
                }
            }
        }
        while($status == 500 && $remaining-- > 0);

        // request completed successfully
        if($status == 200)
        {
            // cache certain responses
            if($file !== null && $this->isCacheable($type))
            {
                $this->cacheResponse($type, $response, $file);
            }
        }
        // request completed successfully but result is empty
        elseif($status == 204)
        {
            $response = '';
        }
        // other status code is an error (including a 500 after the last retry)
        else
        {
            $this->error($status, $resource, $file);
        }

        return (string) $response;
    }

    /**
     * Make a request to Apache Tika Server
     *
     * @param array $options    cURL options indexed by option constant
     * @return  array   the trimmed response body and the HTTP status code, in that order
     * @throws \Exception   with the cURL error message and number when the transfer fails
     */
    protected function exec(array $options = []): array
    {
        // cURL init and options
        $curl = curl_init();

        // we avoid curl_setopt_array($curl, $options) because strange Windows behaviour (issue #8)
        foreach($options as $option => $value)
        {
            curl_setopt($curl, $option, $value);
        }

        // make the request directly
        if($this->callback === null)
        {
            $this->response = (string) curl_exec($curl);
        }
        // with a callback, the response is appended on each block inside the callback
        else
        {
            $this->response = '';
            curl_exec($curl);
        }

        // exception if cURL fails
        $errno = curl_errno($curl);

        if($errno)
        {
            throw new Exception(curl_error($curl), $errno);
        }

        // return the response and the status code
        return [trim($this->response), curl_getinfo($curl, CURLINFO_HTTP_CODE)];
    }

    /**
     * Throws an exception for an error status code
     *
     * @param int         $status   HTTP status code; unknown codes are reported with exception code 501
     * @param string      $resource requested resource, used in the message for unknown codes
     * @param string|null $file     file of the failed request, if any
     * @throws \Exception   always
     */
    protected function error(int $status, string $resource, ?string $file = null): void
    {
        switch($status)
        {
            //  method not allowed
            case 405:
                $message = 'Method not allowed';
                break;

            //  unsupported media type
            case 415:
                $message = 'Unsupported media type';
                break;

            //  unprocessable entity
            case 422:
                $message = 'Unprocessable document';

                // using remote files require Tika server to be launched with specific options
                if($this->downloadRemote === false && $file !== null && preg_match('/^http/', $file))
                {
                    $message .= ' (is server launched using "-enableUnsecureFeatures -enableFileUrl" arguments?)';
                }

                break;

            // server error
            case 500:
                $message = 'Error while processing document';
                break;

            // unexpected
            default:
                $message = "Unexpected response for /$resource ($status)";
                $status = 501;
        }

        throw new Exception($message, $status);
    }

    /**
     * Get the parameters to make the request
     *
     * @link https://wiki.apache.org/tika/TikaJAXRS#Specifying_a_URL_Instead_of_Putting_Bytes
     * @param string      $type     request type
     * @param string|null $file     local path or URL of the file to process, if any
     * @return  array   the resource path, the extra headers and an optional response decoder, in that order
     * @throws \Exception   for the unsupported "xhtml" type and for unknown types
     */
    protected function getParameters(string $type, ?string $file = null): array
    {
        $headers = [];
        $callback = null;

        if(!empty($file) && preg_match('/^http/', $file))
        {
            $headers[] = "fileUrl:$file";
        }

        switch($type)
        {
            case 'html':
                $resource = 'tika';
                $headers[] = 'Accept: text/html';
                break;

            case 'lang':
                $resource = 'language/stream';
                break;

            case 'mime':
                $resource = 'detect/stream';

                if($file !== null)
                {
                    $name = basename($file);

                    // disposition type and parameters are separated by ";" (RFC 6266)
                    $headers[] = "Content-Disposition: attachment; filename=$name";
                }
                break;

            case 'detectors':
            case 'parsers':
            case 'meta':
            case 'mime-types':
            case 'rmeta/html':
            case 'rmeta/ignore':
            case 'rmeta/text':
                $resource = $type;
                $headers[] = 'Accept: application/json';
                $callback = static function($response)
                {
                    return json_decode($response, true);
                };
                break;

            case 'text':
                $resource = 'tika';
                $headers[] = 'Accept: text/plain';
                break;

            case 'text-main':
                $resource = 'tika/main';
                $headers[] = 'Accept: text/plain';
                break;

            case 'version':
                $resource = $type;
                break;

            case 'xhtml':
                throw new Exception("Tika Server does not support XHTML output");

            default:
                throw new Exception("Unknown type $type");
        }

        return [$resource, $headers, $callback];
    }

    /**
     * Get the cURL options
     *
     * @param string      $type     request type
     * @param string|null $file     local path or URL of the file to process, if any
     * @return  array   the base options plus the write callback and input stream when they apply
     * @throws \Exception   when a file is required but missing, unreadable or cannot be opened
     */
    protected function getCurlOptions(string $type, ?string $file = null): array
    {
        // base options
        $options = $this->options;

        // callback
        if($this->callback !== null)
        {
            $callback = $this->callback;

            $options[CURLOPT_WRITEFUNCTION] = function($handler, $data) use ($callback)
            {
                if($this->callbackAppend === true)
                {
                    $this->response .= $data;
                }

                $callback($data);

                // safe because cURL must receive the number of *bytes* written
                return strlen($data);
            };
        }

        $hasFile = $file !== null && $file !== '';

        // remote file options
        if($hasFile && preg_match('/^http/', $file))
        {
            // no input stream is attached for remote files
        }
        // local file options
        elseif($hasFile && file_exists($file) && is_readable($file))
        {
            $stream = fopen($file, 'r');
            $size = filesize($file);

            if($stream === false || $size === false)
            {
                if(is_resource($stream))
                {
                    fclose($stream);
                }

                throw new Exception("File $file can't be opened");
            }

            $options[CURLOPT_INFILE] = $stream;
            $options[CURLOPT_INFILESIZE] = $size;
        }
        // other options for specific requests
        elseif(in_array($type, ['detectors', 'mime-types', 'parsers', 'version'], true))
        {
            $options[CURLOPT_PUT] = false;
        }
        // file not accesible
        else
        {
            throw new Exception("File $file can't be opened");
        }

        return $options;
    }

    /**
     * Request a resource and decode its JSON response as an array
     *
     * @throws \Exception   when the request fails or the response is not a JSON array/object
     */
    private function requestJson(string $type): array
    {
        $decoded = json_decode($this->request($type), true);

        if(!is_array($decoded))
        {
            throw new Exception("Unexpected response for /$type (invalid JSON)");
        }

        return $decoded;
    }

    /**
     * Request a resource that describes a root element with optional children
     * and index both levels by their "name" entry
     *
     * @throws \Exception
     */
    private function requestTree(string $type): array
    {
        $root = $this->requestJson($type);

        if(isset($root['children']))
        {
            $children = [];

            foreach($root['children'] as $child)
            {
                $children[$child['name']] = $child;
            }

            $root['children'] = $children;
        }

        return [$root['name'] => $root];
    }
}
714