Passed
Push — master ( c2e92a...6d03a1 )
by David
01:20
created

WebClient::setUrl()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 12
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 2
eloc 5
nc 2
nop 1
dl 0
loc 12
rs 10
c 0
b 0
f 0
1
<?php
2
3
namespace Vaites\ApacheTika\Clients;
4
5
use Exception;
6
7
use Vaites\ApacheTika\Client;
8
use Vaites\ApacheTika\Metadata\Metadata;
9
10
/**
11
 * Apache Tika web client
12
 *
13
 * @author  David Martínez <[email protected]>
14
 * @link    http://wiki.apache.org/tika/TikaJAXRS
15
 * @link    https://tika.apache.org/1.12/formats.html
16
 */
17
class WebClient extends Client
18
{
19
    const MODE = 'web';

    /**
     * Responses already received, indexed by the SHA-1 of the file and then by request type
     *
     * @var array
     */
    protected $cache = [];

    /**
     * Host name of the Apache Tika server
     *
     * @var string|null
     */
    protected $host;

    /**
     * Port of the Apache Tika server (getUrl() uses 9998 while this is not set)
     *
     * @var int|null
     */
    protected $port;

    /**
     * How many times a request is repeated after a server error
     *
     * @var int
     */
    protected $retries = 3;

    /**
     * Base cURL options every request starts from
     *
     * @var array
     */
    protected $options =
    [
        CURLINFO_HEADER_OUT    => true,
        CURLOPT_HTTPHEADER     => [],
        CURLOPT_PUT            => true,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_TIMEOUT        => 5
    ];
62
63
    /**
     * Configure the client and check that the server is running
     *
     * $host may be a bare host name or a full URL; a full URL is handed to setUrl(), anything else to setHost().
     *
     * @param   string  $host       host name or URL of the Apache Tika server
     * @param   int     $port       server port; a falsy value (null, 0) leaves the port untouched
     * @param   array   $options    extra cURL options, applied through setOptions()
     * @throws  \Exception
     */
    public function __construct($host = null, $port = null, $options = [])
    {
        if($host)
        {
            if(filter_var($host, FILTER_VALIDATE_URL))
            {
                $this->setUrl($host);
            }
            else
            {
                $this->setHost($host);
            }
        }

        // truthiness check on purpose: null and 0 are both skipped, and getUrl() uses 9998 for a falsy port anyway
        if($port)
        {
            $this->setPort($port);
        }

        if($options)
        {
            $this->setOptions($options);
        }

        $this->setDownloadRemote(true);

        // exception if the server is not running
        $this->getVersion();
    }
96
97
    /**
     * Build the base URL of the server
     *
     * The scheme is always "http" and the port falls back to 9998 while none (or a falsy one) is configured.
     *
     * @return string
     */
    public function getUrl()
    {
        $port = $this->port ? $this->port : 9998;

        return sprintf('http://%s:%d', $this->host, $port);
    }
106
107
    /**
     * Set the host and port using an URL
     *
     * Only the host and port components are used; the port is left untouched when the URL has none.
     *
     * @param   string  $url
     * @return  $this
     * @throws  \Exception  when no host can be extracted from the URL
     */
    public function setUrl($url)
    {
        $parts = parse_url($url);

        // parse_url() returns false for seriously malformed URLs and omits "host" for values such as a
        // bare "localhost", so fail with a clear message instead of storing a null host
        if(!is_array($parts) || empty($parts['host']))
        {
            throw new Exception("Invalid URL $url");
        }

        $this->setHost($parts['host']);

        if(!empty($parts['port']))
        {
            $this->setPort($parts['port']);
        }

        return $this;
    }
126
127
    /**
     * Host currently configured, or null when none has been set
     *
     * @return  null|string
     */
    public function getHost()
    {
        return $this->host;
    }
136
137
    /**
     * Store the host of the Apache Tika server
     *
     * The value is stored as given, without validation.
     *
     * @param   string  $host
     * @return  $this
     */
    public function setHost($host)
    {
        $this->host = $host;

        return $this;
    }
149
150
    /**
     * Port currently configured, or null when none has been set
     *
     * @return  null|int
     */
    public function getPort()
    {
        return $this->port;
    }
159
160
    /**
     * Store the port of the Apache Tika server
     *
     * The value is stored as given, without validation.
     *
     * @param   int     $port
     * @return  $this
     */
    public function setPort($port)
    {
        $this->port = $port;

        return $this;
    }
172
173
    /**
     * Number of times a request is repeated after a server error
     *
     * @return  int
     */
    public function getRetries()
    {
        return $this->retries;
    }
182
183
    /**
     * Change how many times a request is repeated after a server error
     *
     * @param   int     $retries
     * @return  $this
     */
    public function setRetries($retries)
    {
        $this->retries = $retries;

        return $this;
    }
195
196
    /**
     * All the cURL options currently configured, indexed by cURL option constant
     *
     * @return  null|array
     */
    public function getOptions()
    {
        return $this->options;
    }
205
206
    /**
     * Value of a single cURL option
     *
     * @param   int     $key    cURL option constant
     * @return  mixed   the stored value, or null when the option is not set (or is set to null)
     */
    public function getOption($key)
    {
        if(!isset($this->options[$key]))
        {
            return null;
        }

        return $this->options[$key];
    }
216
217
    /**
     * Store a cURL option that will later be applied with curl_setopt()
     *
     * CURLINFO_HEADER_OUT, CURLOPT_PUT and CURLOPT_RETURNTRANSFER are reserved and cannot be changed.
     *
     * @link    http://php.net/manual/en/curl.constants.php
     * @link    http://php.net/manual/en/function.curl-setopt.php
     * @param   int     $key    cURL option constant
     * @param   mixed   $value
     * @return  $this
     * @throws  \Exception      when trying to change a reserved option
     */
    public function setOption($key, $value)
    {
        $reserved = [CURLINFO_HEADER_OUT, CURLOPT_PUT, CURLOPT_RETURNTRANSFER];

        // loose comparison on purpose: a numeric string key would be cast to the same integer array key by PHP
        if(in_array($key, $reserved))
        {
            throw new Exception("Value for cURL option $key cannot be modified", 3);
        }

        $this->options[$key] = $value;

        return $this;
    }
238
239
    /**
     * Store several cURL options at once
     *
     * Every pair goes through setOption(), so reserved options are rejected here too; options stored
     * before the rejected one are kept.
     *
     * @param   array   $options    values indexed by cURL option constant
     * @return  $this
     * @throws  \Exception
     */
    public function setOptions($options)
    {
        foreach($options as $option => $setting)
        {
            $this->setOption($option, $setting);
        }

        return $this;
    }
255
256
    /**
     * Timeout configured for cURL (the CURLOPT_TIMEOUT option)
     *
     * @return  int
     */
    public function getTimeout()
    {
        $timeout = $this->getOption(CURLOPT_TIMEOUT);

        return $timeout;
    }
265
266
    /**
     * Change the timeout for cURL (the CURLOPT_TIMEOUT option)
     *
     * @param   int     $value  cast to integer before being stored
     * @return  $this
     * @throws  \Exception
     */
    public function setTimeout($value)
    {
        $timeout = (int) $value;

        $this->setOption(CURLOPT_TIMEOUT, $timeout);

        return $this;
    }
279
280
    /**
     * Configure, make a request and return its results
     *
     * Responses of type "lang" and "meta" are cached per file. A request answered with HTTP 500 is
     * repeated up to $this->retries times; every call to this method gets its own retry budget.
     *
     * @param   string  $type
     * @param   string  $file
     * @return  mixed   response text, the object built by Metadata::make() for "meta" requests or null on HTTP 204
     * @throws  \Exception
     */
    public function request($type, $file = null)
    {
        // computed once, before $file is reassigned below, so cache lookups and stores always use the same key
        $hash = sha1($file);

        // check if is cached
        if(isset($this->cache[$hash][$type]))
        {
            return $this->cache[$hash][$type];
        }

        // parameters for cURL request
        list($resource, $headers) = $this->getParameters($type, $file);

        // check the request
        $file = parent::checkRequest($type, $file);

        // local counter instead of shared static state: it can never go below zero and keep retrying forever
        $retries = (int) $this->retries;

        do
        {
            // cURL options are rebuilt on every attempt so a local file is opened again from its beginning
            $options = $this->getCurlOptions($type, $file);

            // sets headers
            foreach($headers as $header)
            {
                $options[CURLOPT_HTTPHEADER][] = $header;
            }

            $options[CURLOPT_URL] = $this->getUrl() . "/$resource";

            // get the response and the HTTP status code
            list($response, $status) = $this->exec($options);
        }
        while($status == 500 && $retries-- > 0);

        // request completed successfully
        if($status == 200)
        {
            if($type == 'meta')
            {
                $response = Metadata::make($response, $file);
            }

            // cache certain responses
            if(in_array($type, ['lang', 'meta']))
            {
                $this->cache[$hash][$type] = $response;
            }
        }
        // request completed successfully but result is empty
        elseif($status == 204)
        {
            $response = null;
        }
        // other status code (including a 500 that survived all the retries) is an error
        else
        {
            $this->error($status, $resource);
        }

        return $response;
    }
355
356
    /**
     * Make a request to Apache Tika Server
     *
     * The raw response is kept in $this->response. When a callback is configured, the response starts
     * empty and curl_exec() is run without capturing its return value.
     *
     * @param   array   $options    cURL options indexed by cURL option constant
     * @return  array   trimmed response and HTTP status code, in that order
     * @throws  \Exception          when cURL reports an error
     */
    protected function exec(array $options = [])
    {
        $handle = curl_init();

        // options are applied one by one, curl_setopt_array() is avoided because of strange Windows behaviour (issue #8)
        foreach($options as $name => $setting)
        {
            curl_setopt($handle, $name, $setting);
        }

        // make the request
        if(!is_null($this->callback))
        {
            $this->response = '';
            curl_exec($handle);
        }
        else
        {
            $this->response = curl_exec($handle);
        }

        // exception if cURL fails
        if(curl_errno($handle))
        {
            throw new Exception(curl_error($handle), curl_errno($handle));
        }

        $body = trim($this->response);
        $status = curl_getinfo($handle, CURLINFO_HTTP_CODE);

        return [$body, $status];
    }
394
395
    /**
     * Turn an error status code into an exception
     *
     * Known status codes are thrown with their own message and code, any other one is reported as an
     * unexpected response with code 501. This method never returns.
     *
     * @codeCoverageIgnore
     *
     * @param   int       $status
     * @param   string    $resource
     * @throws  \Exception
     */
    protected function error($status, $resource)
    {
        $messages =
        [
            405 => 'Method not allowed',
            415 => 'Unsupported media type',
            422 => 'Unprocessable document',
            500 => 'Error while processing document'
        ];

        if(isset($messages[$status]))
        {
            throw new Exception($messages[$status], (int) $status);
        }

        // unexpected
        throw new Exception("Unexpected response for /$resource ($status)", 501);
    }
433
434
    /**
     * Resolve the server resource and the HTTP headers for a request type
     *
     * A file whose name starts with "http" is announced to the server through a "fileUrl" header.
     *
     * @link    https://wiki.apache.org/tika/TikaJAXRS#Specifying_a_URL_Instead_of_Putting_Bytes
     * @param   string  $type
     * @param   string  $file
     * @return  array   resource and list of headers, in that order
     * @throws  \Exception  when the type is unknown
     */
    protected function getParameters($type, $file = null)
    {
        $headers = [];

        if(!empty($file) && preg_match('/^http/', $file))
        {
            $headers[] = "fileUrl:$file";
        }

        // header specific to the request type, if any
        $extra = null;

        switch($type)
        {
            case 'html':
                $resource = 'tika';
                $extra = 'Accept: text/html';
                break;

            case 'lang':
                $resource = 'language/stream';
                break;

            case 'mime':
                $name = basename($file);
                $resource = 'detect/stream';
                $extra = "Content-Disposition: attachment, filename=$name";
                break;

            case 'meta':
                $resource = 'meta';
                $extra = 'Accept: application/json';
                break;

            case 'text':
                $resource = 'tika';
                $extra = 'Accept: text/plain';
                break;

            case 'text-main':
                $resource = 'tika/main';
                $extra = 'Accept: text/plain';
                break;

            // informational requests, the resource is named after the type
            case 'detectors':
            case 'parsers':
            case 'mime-types':
            case 'version':
                $resource = $type;
                break;

            default:
                throw new Exception("Unknown type $type");
        }

        if($extra !== null)
        {
            $headers[] = $extra;
        }

        return [$resource, $headers];
    }
497
498
    /**
     * Build the cURL options for a request
     *
     * Starts from the configured options, adds a write function when a callback is set and then either
     * attaches a readable local file as request body or, for informational requests, disables PUT.
     *
     * @param   string  $type
     * @param   string  $file
     * @return  array   cURL options indexed by cURL option constant
     * @throws  \Exception  when a file is required but cannot be opened
     */
    protected function getCurlOptions($type, $file = null)
    {
        // base options
        $options = $this->options;

        // callback
        if(!is_null($this->callback))
        {
            $userCallback = $this->callback;

            $options[CURLOPT_WRITEFUNCTION] = function($handler, $chunk) use($userCallback)
            {
                // the full response is still collected while chunks are passed to the callback
                $this->response .= $chunk;

                $userCallback($chunk);

                // safe because cURL must receive the number of *bytes* written
                return strlen($chunk);
            };
        }

        // remote files need no extra options
        if($file && preg_match('/^http/', $file))
        {
            return $options;
        }

        // local file options
        if($file && file_exists($file) && is_readable($file))
        {
            $options[CURLOPT_INFILE] = fopen($file, 'r');
            $options[CURLOPT_INFILESIZE] = filesize($file);
        }
        // other options for specific requests
        elseif(in_array($type,  ['detectors', 'mime-types', 'parsers', 'version']))
        {
            $options[CURLOPT_PUT] = false;
        }
        // file not accessible
        else
        {
            throw new Exception("File $file can't be opened");
        }

        return $options;
    }
551
}
552