Passed
Push — master ( dcc276...a6d10d )
by David
03:59
created

WebClient::getParameters()   D

Complexity

Conditions 18
Paths 32

Size

Total Lines 68
Code Lines 46

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 1 Features 0
Metric Value
eloc 46
c 1
b 1
f 0
dl 0
loc 68
rs 4.8666
cc 18
nc 32
nop 2

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method.

1
<?php
2
3
namespace Vaites\ApacheTika\Clients;
4
5
use Exception;
6
7
use Vaites\ApacheTika\Client;
8
9
/**
10
 * Apache Tika web client
11
 *
12
 * @author  David Martínez <[email protected]>
13
 * @link    https://cwiki.apache.org/confluence/display/TIKA/TikaServer
14
 */
15
class WebClient extends Client
16
{
17
    protected const MODE = 'web';
18
19
    /**
20
     * Cached responses to avoid multiple requests for the same file
21
     *
22
     * @var array
23
     */
24
    protected $cache = [];
25
26
    /**
27
     * Apache Tika server host
28
     *
29
     * @var string|null
30
     */
31
    protected $host = null;
32
33
    /**
34
     * Apache Tika server port
35
     *
36
     * @var int|null
37
     */
38
    protected $port = null;
39
40
    /**
41
     * Number of retries on server error
42
     *
43
     * @var int
44
     */
45
    protected $retries = 3;
46
47
    /**
48
     * Default cURL options
49
     *
50
     * @var array
51
     */
52
    protected $options =
53
    [
54
        CURLINFO_HEADER_OUT     => true,
55
        CURLOPT_HTTPHEADER      => [],
56
        CURLOPT_PUT             => true,
57
        CURLOPT_RETURNTRANSFER  => true,
58
        CURLOPT_TIMEOUT         => 5
59
    ];
60
61
    /**
     * Configure the client and, optionally, test that the server is running
     *
     * @param string|null $host    Host name, or a full URL from which host and port are extracted
     * @param int|null    $port    Server port; the stored port is only changed when a value is given
     * @param array       $options cURL options, applied through setOptions()
     * @param bool        $check   Whether check() is called once the configuration is done
     *
     * @throws \Exception
     */
    public function __construct(?string $host = null, ?int $port = null, array $options = [], bool $check = true)
    {
        parent::__construct();

        // a valid URL carries both host and port, anything else is taken as a bare host name
        if($host !== null && filter_var($host, FILTER_VALIDATE_URL))
        {
            $this->setUrl($host);
        }
        elseif($host)
        {
            $this->setHost($host);
        }

        // the parameter is typed, so a null test is all that is required
        if($port !== null)
        {
            $this->setPort($port);
        }

        if(!empty($options))
        {
            $this->setOptions($options);
        }

        $this->setDownloadRemote(true);

        if($check === true)
        {
            $this->check();
        }
    }
96
97
    /**
     * Build the base URL of the server
     *
     * The scheme is always "http"; port 9998 is used when no port (or a zero port) is stored.
     */
    public function getUrl(): string
    {
        $port = $this->port ?: 9998;

        return sprintf('http://%s:%d', $this->host, $port);
    }
104
105
    /**
     * Set the host and port using a URL
     *
     * Components that are missing from the URL (or empty) leave the stored value untouched.
     */
    public function setUrl(string $url): self
    {
        // parse_url() returns false for seriously malformed URLs
        $parts = parse_url($url) ?: [];

        if(!empty($parts['host']))
        {
            $this->setHost((string) $parts['host']);
        }

        if(!empty($parts['port']))
        {
            $this->setPort((int) $parts['port']);
        }

        return $this;
    }
125
126
    /**
     * Get the configured server host
     *
     * @return string|null Null while no host has been set
     */
    public function getHost(): ?string
    {
        return $this->host;
    }
133
134
    /**
     * Store the server host
     *
     * @return self This instance, to allow chained calls
     */
    public function setHost(string $host): self
    {
        $this->host = $host;

        return $this;
    }
143
144
    /**
     * Get the configured server port
     *
     * @return int|null Null while no port has been set
     */
    public function getPort(): ?int
    {
        return $this->port;
    }
151
152
    /**
     * Store the server port
     *
     * @return self This instance, to allow chained calls
     */
    public function setPort(int $port): self
    {
        $this->port = $port;

        return $this;
    }
161
162
    /**
     * Get the configured number of retries
     */
    public function getRetries(): int
    {
        return $this->retries;
    }
169
170
    /**
     * Store the number of retries
     *
     * @return self This instance, to allow chained calls
     */
    public function setRetries(int $retries): self
    {
        $this->retries = $retries;

        return $this;
    }
179
180
    /**
     * Get every stored cURL option, keyed by its cURL constant
     */
    public function getOptions(): array
    {
        return $this->options;
    }
187
188
    /**
     * Get a specified option
     *
     * @return  mixed   The stored value, or null when the option is not set
     */
    public function getOption(int $key)
    {
        return $this->options[$key] ?? null;
    }
197
198
    /**
     * Store a cURL option that will later be applied with curl_setopt()
     *
     * CURLINFO_HEADER_OUT, CURLOPT_PUT and CURLOPT_RETURNTRANSFER are rejected.
     *
     * @link http://php.net/manual/en/curl.constants.php
     * @link http://php.net/manual/en/function.curl-setopt.php
     * @param mixed $value
     * @throws \Exception   When the option is one of the rejected ones (code 3)
     */
    public function setOption(int $key, $value): self
    {
        $locked = [CURLINFO_HEADER_OUT, CURLOPT_PUT, CURLOPT_RETURNTRANSFER];

        if(in_array($key, $locked, true))
        {
            throw new Exception("Value for cURL option $key cannot be modified", 3);
        }

        $this->options[$key] = $value;

        return $this;
    }
217
218
    /**
     * Store several cURL options at once
     *
     * Every pair goes through setOption(), so the same restrictions apply.
     *
     * @throws \Exception
     */
    public function setOptions(array $options): self
    {
        array_walk($options, function($value, $key)
        {
            $this->setOption($key, $value);
        });

        return $this;
    }
232
233
    /**
     * Get the timeout value for cURL
     *
     * @return int The stored CURLOPT_TIMEOUT value, or 0 when the option holds no value
     */
    public function getTimeout(): int
    {
        // getOption() returns null for a missing or null value, which the int return type would reject
        return (int) ($this->getOption(CURLOPT_TIMEOUT) ?? 0);
    }
240
241
    /**
     * Store the timeout value for cURL (CURLOPT_TIMEOUT)
     *
     * @return self This instance, to allow chained calls
     * @throws \Exception
     */
    public function setTimeout(int $value): self
    {
        // $value is already an int thanks to the parameter type
        $this->setOption(CURLOPT_TIMEOUT, $value);

        return $this;
    }
252
253
    /**
     * Returns the supported MIME types
     *
     * @return array Decoded "mime-types" response, sorted by key
     * @throws \Exception When the request fails or its response is not a JSON array/object
     */
    public function getSupportedMIMETypes(): array
    {
        $mimeTypes = json_decode($this->request('mime-types'), true);

        // json_decode() yields null on malformed input, which would break both ksort() and the return type
        if(!is_array($mimeTypes))
        {
            throw new Exception('Invalid response for /mime-types');
        }

        ksort($mimeTypes);

        return $mimeTypes;
    }
266
267
    /**
     * Returns the available detectors
     *
     * The decoded "detectors" response is returned keyed by its own name, with its children
     * (when present) re-keyed by their names in their original order.
     *
     * @throws \Exception When the request fails or its response is not a named JSON object
     */
    public function getAvailableDetectors(): array
    {
        $root = json_decode($this->request('detectors'), true);

        // without this guard a malformed response would be turned into ['' => null]
        if(!is_array($root) || !isset($root['name']))
        {
            throw new Exception('Invalid response for /detectors');
        }

        if(isset($root['children']))
        {
            $children = [];

            foreach($root['children'] as $child)
            {
                $children[$child['name']] = $child;
            }

            $root['children'] = $children;
        }

        return [$root['name'] => $root];
    }
295
296
    /**
     * Returns the available parsers
     *
     * The decoded "parsers" response is returned keyed by its own name, with its children
     * (when present) re-keyed by their names in their original order.
     *
     * @throws \Exception When the request fails or its response is not a named JSON object
     */
    public function getAvailableParsers(): array
    {
        $root = json_decode($this->request('parsers'), true);

        // without this guard a malformed response would be turned into ['' => null]
        if(!is_array($root) || !isset($root['name']))
        {
            throw new Exception('Invalid response for /parsers');
        }

        if(isset($root['children']))
        {
            $children = [];

            foreach($root['children'] as $child)
            {
                $children[$child['name']] = $child;
            }

            $root['children'] = $children;
        }

        return [$root['name'] => $root];
    }
324
325
    /**
     * Verify once that the server answers
     *
     * Does nothing when the instance is already flagged as checked.
     *
     * @throws \Exception
     */
    public function check(): void
    {
        if($this->isChecked() !== false)
        {
            return;
        }

        // flag first: request() calls check() itself
        $this->setChecked(true);

        // throws an exception if server is unreachable or can't connect
        $this->request('version');
    }
340
341
    /**
     * Configure, make a request and return its results
     *
     * A response already cached for the given type and file is returned without contacting the server.
     * Status 200 returns the response (caching it when the type is cacheable), status 204 returns an
     * empty string, and status 500 on a file request calls this method again while retries remain.
     * Every other outcome is handed to error(), which throws.
     *
     * @param string      $type Request type, as accepted by getParameters()
     * @param string|null $file File path or URL the request refers to, if any
     *
     * @throws \Exception
     */
    public function request(string $type, ?string $file = null): string
    {
        // remaining retries per file; static so the nested retry calls share a single budget
        static $retries = [];

        // check if not checked
        $this->check();

        // check if is cached
        if($file !== null && $this->isCached($type, $file))
        {
            return (string) $this->getCachedResponse($type, $file);
        }

        $initialKey = $file !== null ? sha1($file) : null;

        if($initialKey !== null && !isset($retries[$initialKey]))
        {
            $retries[$initialKey] = $this->retries;
        }

        $retryKey = $initialKey;

        try
        {
            // parameters for cURL request
            [$resource, $headers] = $this->getParameters($type, $file);

            // check the request
            $file = $this->checkRequest($type, $file);

            // $file has just been replaced, so the counter is looked up under its current value
            $retryKey = $file !== null ? sha1($file) : null;

            // cURL options
            $options = $this->getCurlOptions($type, $file);

            // sets headers
            foreach($headers as $header)
            {
                $options[CURLOPT_HTTPHEADER][] = $header;
            }

            $options[CURLOPT_URL] = $this->getUrl() . "/$resource";

            try
            {
                // get the response and the HTTP status code
                [$response, $status] = $this->exec($options);
            }
            finally
            {
                // close the input file even when the transfer throws
                if(isset($options[CURLOPT_INFILE]) && is_resource($options[CURLOPT_INFILE]))
                {
                    fclose($options[CURLOPT_INFILE]);
                }
            }

            // request completed successfully
            if($status == 200)
            {
                // cache certain responses
                if($file !== null && $this->isCacheable($type))
                {
                    $this->cacheResponse($type, $response, $file);
                }
            } // request completed successfully but result is empty
            elseif($status == 204)
            {
                // null is not allowed by the declared return type
                $response = '';
            } // retry on request failed with error 500, never letting the counter drop below zero
            elseif($status == 500 && $retryKey !== null && ($retries[$retryKey] ?? 0) > 0)
            {
                $retries[$retryKey]--;

                $response = $this->request($type, $file);
            } // other status code is an error
            else
            {
                $this->error($status, $resource, $file);
            }

            return $response;
        }
        finally
        {
            // nested calls have finished by now: forget the counters so a later call starts afresh
            foreach([$initialKey, $retryKey] as $key)
            {
                if($key !== null)
                {
                    unset($retries[$key]);
                }
            }
        }
    }
414
415
    /**
     * Make a request to Apache Tika Server
     *
     * Without a callback the whole response body is stored in $this->response. With a callback the
     * property is reset to an empty string before the transfer starts.
     *
     * @param array $options cURL options keyed by their cURL constant
     *
     * @return array Two items: the trimmed response and the HTTP status code
     * @throws \Exception When the cURL session can't be created or the transfer fails
     */
    protected function exec(array $options = []): array
    {
        // cURL init and options
        $curl = curl_init();

        // curl_init() returns false on failure, which no other cURL function accepts
        if($curl === false)
        {
            throw new Exception('Unable to initialize cURL');
        }

        // we avoid curl_setopt_array($curl, $options) because strange Windows behaviour (issue #8)
        foreach($options as $option => $value)
        {
            curl_setopt($curl, $option, $value);
        }

        // make the request directly
        if(is_null($this->callback))
        {
            $this->response = (string) curl_exec($curl);
        }
        // with a callback, the response is appended on each block inside the callback
        else
        {
            $this->response = '';
            curl_exec($curl);
        }

        // exception if cURL fails
        if(curl_errno($curl))
        {
            throw new Exception(curl_error($curl), curl_errno($curl));
        }

        // return the response and the status code
        return [trim($this->response), curl_getinfo($curl, CURLINFO_HTTP_CODE)];
    }
452
453
    /**
     * Throws an exception for an error status code
     *
     * Statuses 405, 415, 422 and 500 keep their code and get a specific message. Any other status
     * is reported as an unexpected response with code 501.
     *
     * @param int         $status   HTTP status code returned by the server
     * @param string      $resource Requested resource, used in the "unexpected response" message
     * @param string|null $file     File path or URL the request referred to, if any
     *
     * @throws \Exception Always
     */
    protected function error(int $status, string $resource, ?string $file = null): void
    {
        $messages =
        [
            405 => 'Method not allowed',
            415 => 'Unsupported media type',
            422 => 'Unprocessable document',
            500 => 'Error while processing document'
        ];

        // unexpected
        if(!isset($messages[$status]))
        {
            throw new Exception("Unexpected response for /$resource ($status)", 501);
        }

        $message = $messages[$status];

        // using remote files require Tika server to be launched with specific options
        if($status === 422 && $this->downloadRemote === false && $file !== null && preg_match('/^http/', $file))
        {
            $message .= ' (is server launched using "-enableUnsecureFeatures -enableFileUrl" arguments?)';
        }

        throw new Exception($message, $status);
    }
497
498
    /**
     * Get the parameters to make the request
     *
     * A "fileUrl" header is added when $file starts with "http". The "mime" type adds a
     * Content-Disposition header carrying the base name of $file, and the JSON types come with a
     * callback that decodes the response into an array.
     *
     * @link https://wiki.apache.org/tika/TikaJAXRS#Specifying_a_URL_Instead_of_Putting_Bytes
     *
     * @param string      $type Request type
     * @param string|null $file File path or URL the request refers to, if any
     *
     * @return array Three items: resource path, list of header lines and the callback (or null)
     * @throws \Exception For the unsupported "xhtml" type and for unknown types
     */
    protected function getParameters(string $type, ?string $file = null): array
    {
        if($type === 'xhtml')
        {
            throw new Exception("Tika Server does not support XHTML output");
        }

        // resource path and Accept header (null for none) of every supported type
        $routes =
        [
            'html'          => ['tika', 'text/html'],
            'lang'          => ['language/stream', null],
            'mime'          => ['detect/stream', null],
            'detectors'     => ['detectors', 'application/json'],
            'parsers'       => ['parsers', 'application/json'],
            'meta'          => ['meta', 'application/json'],
            'mime-types'    => ['mime-types', 'application/json'],
            'rmeta/html'    => ['rmeta/html', 'application/json'],
            'rmeta/ignore'  => ['rmeta/ignore', 'application/json'],
            'rmeta/text'    => ['rmeta/text', 'application/json'],
            'text'          => ['tika', 'text/plain'],
            'text-main'     => ['tika/main', 'text/plain'],
            'version'       => ['version', null]
        ];

        if(!isset($routes[$type]))
        {
            throw new Exception("Unknown type $type");
        }

        [$resource, $accept] = $routes[$type];

        $headers = [];
        $callback = null;

        if(!empty($file) && preg_match('/^http/', $file))
        {
            $headers[] = "fileUrl:$file";
        }

        if($accept !== null)
        {
            $headers[] = "Accept: $accept";
        }

        if($type === 'mime' && $file !== null)
        {
            $name = basename($file);

            // the disposition type and its parameters are separated by a semicolon (RFC 6266), not a comma
            $headers[] = "Content-Disposition: attachment; filename=$name";
        }

        if($accept === 'application/json')
        {
            $callback = function($response)
            {
                return json_decode($response, true);
            };
        }

        return [$resource, $headers, $callback];
    }
573
574
    /**
     * Get the cURL options
     *
     * Starts from the stored options and adds a write function when a callback is set. A readable
     * local file is attached as the request body; a $file starting with "http" adds nothing here.
     * Without a usable file, only the "detectors", "mime-types", "parsers" and "version" types are
     * accepted, and for them CURLOPT_PUT is switched off.
     *
     * @param string      $type Request type
     * @param string|null $file File path or URL the request refers to, if any
     *
     * @throws \Exception When a file is required but can't be opened
     */
    protected function getCurlOptions(string $type, ?string $file = null): array
    {
        // base options
        $options = $this->options;

        // callback
        if(!is_null($this->callback))
        {
            $callback = $this->callback;

            $options[CURLOPT_WRITEFUNCTION] = function($handler, $data) use ($callback)
            {
                if($this->callbackAppend === true)
                {
                    $this->response .= $data;
                }

                $callback($data);

                // safe because cURL must receive the number of *bytes* written
                return strlen($data);
            };
        }

        // remote file: nothing to add here
        if($file && preg_match('/^http/', $file))
        {
            return $options;
        }

        // local file options
        if($file && file_exists($file) && is_readable($file))
        {
            $handle = fopen($file, 'r');

            // fopen() can still fail after the checks above; false is not a valid CURLOPT_INFILE
            if($handle === false)
            {
                throw new Exception("File $file can't be opened");
            }

            $options[CURLOPT_INFILE] = $handle;
            $options[CURLOPT_INFILESIZE] = filesize($file);

            return $options;
        }

        // file not accessible
        if(!in_array($type, ['detectors', 'mime-types', 'parsers', 'version'], true))
        {
            throw new Exception("File $file can't be opened");
        }

        // other options for specific requests
        $options[CURLOPT_PUT] = false;

        return $options;
    }
624
}
625