Passed
Push — master ( d8574a...8cadc1 )
by David
02:55
created

Client::setChecked()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 5
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 2
c 0
b 0
f 0
dl 0
loc 5
rs 10
cc 1
nc 1
nop 1
1
<?php
2
3
namespace Vaites\ApacheTika;
4
5
use Closure;
6
use Exception;
7
use stdClass;
8
9
use Vaites\ApacheTika\Clients\CLIClient;
10
use Vaites\ApacheTika\Clients\WebClient;
11
use Vaites\ApacheTika\Metadata\Metadata;
12
use Vaites\ApacheTika\Metadata\MetadataInterface;
13
14
/**
15
 * Apache Tika client interface
16
 *
17
 * @author  David Martínez <[email protected]>
18
 * @link    https://tika.apache.org/1.24/formats.html
19
 */
20
abstract class Client
21
{
22
    protected const MODE = null;
23
24
    /**
25
     * Checked flag
26
     *
27
     * @var bool
28
     */
29
    protected $checked = false;
30
31
    /**
32
     * Response using callbacks
33
     *
34
     * @var string|null
35
     */
36
    protected $response = null;
37
38
    /**
39
     * Platform (unix or win)
40
     *
41
     * @var string
42
     */
43
    protected $platform = null;
44
45
    /**
46
     * Cached responses to avoid multiple request for the same file.
47
     *
48
     * @var array
49
     */
50
    protected $cache = [];
51
52
    /**
53
     * Text encoding
54
     *
55
     * @var string|null
56
     */
57
    protected $encoding = null;
58
59
    /**
60
     * Callback called on sequential read
61
     *
62
     * @var callable|null
63
     */
64
    protected $callback = null;
65
66
    /**
67
     * Enable or disable appending when using callback
68
     *
69
     * @var bool
70
     */
71
    protected $callbackAppend = true;
72
73
    /**
74
     * Size of chunks for callback
75
     *
76
     * @var int
77
     */
78
    protected $chunkSize = 1048576;
79
80
    /**
81
     * Remote download flag
82
     *
83
     * @var bool
84
     */
85
    protected $downloadRemote = false;
86
87
    /**
     * Initialise the client by recording the platform family PHP is running on.
     *
     * The platform is stored as "win" when the PHP_WINDOWS_VERSION_MAJOR constant
     * is defined and as "unix" otherwise.
     */
    public function __construct()
    {
        $isWindows = defined('PHP_WINDOWS_VERSION_MAJOR');

        if($isWindows)
        {
            $this->platform = 'win';
        }
        else
        {
            $this->platform = 'unix';
        }
    }
94
95
    /**
     * Get a class instance throwing an exception if check fails
     *
     * A CLI client is built when the first argument ends in ".jar"; any other value
     * (including no value at all) produces a web client.
     *
     * @param string|null     $param1 path or host
     * @param string|int|null $param2 Java binary path or port for web client
     * @param array           $options options for cURL request (only passed to the web client)
     * @param bool            $check check JAR file or server connection
     * @return \Vaites\ApacheTika\Clients\CLIClient|\Vaites\ApacheTika\Clients\WebClient
     * @throws \Exception
     */
    public static function make(?string $param1 = null, $param2 = null, array $options = [], bool $check = true): Client
    {
        // inspect the parameter itself: func_get_arg(0) fails when make() is called without
        // arguments, and passing null to preg_match() is deprecated
        if($param1 !== null && preg_match('/\.jar$/', $param1) === 1)
        {
            return new CLIClient($param1, $param2, $check);
        }

        return new WebClient($param1, $param2, $options, $check);
    }
116
117
    /**
     * Build a client exactly like make() does, but with the check flag set to false.
     *
     * @param string $param1 path or host
     * @param int    $param2 Java binary path or port for web client
     * @param array  $options options for cURL request
     * @return \Vaites\ApacheTika\Clients\CLIClient|\Vaites\ApacheTika\Clients\WebClient
     * @throws \Exception
     */
    public static function prepare($param1 = null, $param2 = null, $options = []): Client
    {
        $check = false;

        return self::make($param1, $param2, $options, $check);
    }
130
131
    /**
     * Return the configured text encoding, or null when none has been set.
     */
    public function getEncoding(): ?string
    {
        $current = $this->encoding;

        return $current;
    }
138
139
    /**
     * Store the text encoding and return the instance for chaining.
     *
     * @throws \Exception when the given value is considered empty by PHP's empty()
     */
    public function setEncoding(string $encoding): self
    {
        // reject unusable values first so the happy path stays flat
        if(empty($encoding))
        {
            throw new Exception('Invalid encoding');
        }

        $this->encoding = $encoding;

        return $this;
    }
157
158
    /**
     * Return the callback currently registered on this instance, or null when there is none.
     */
    public function getCallback(): ?Closure
    {
        $registered = $this->callback;

        return $registered;
    }
165
166
    /**
     * Set the callback (callable or closure) for call on sequential read
     *
     * The callback is always stored as a Closure so that getCallback(), which is
     * declared to return ?Closure, cannot fail with a TypeError:
     *  - closures are stored untouched
     *  - array callables are converted with Closure::fromCallable()
     *  - string callables are wrapped so that only the chunk is forwarded to them
     *
     * @param callable $callback callable to register
     * @param bool     $append   value stored in the callbackAppend flag
     * @throws \Exception when the callable is neither a closure, an array nor a string
     */
    public function setCallback(callable $callback, bool $append = true): self
    {
        if($callback instanceof Closure)
        {
            $closure = $callback;
        }
        elseif(is_array($callback))
        {
            // storing the raw array would break the ?Closure return type of getCallback()
            $closure = Closure::fromCallable($callback);
        }
        elseif(is_string($callback))
        {
            $closure = function($chunk) use ($callback)
            {
                return call_user_func_array($callback, [$chunk]);
            };
        }
        else
        {
            throw new Exception('Invalid callback');
        }

        $this->callbackAppend = $append;
        $this->callback = $closure;

        return $this;
    }
193
194
    /**
     * Return the chunk size currently configured on this instance.
     */
    public function getChunkSize(): int
    {
        $size = $this->chunkSize;

        return $size;
    }
201
202
    /**
     * Set the chunk size for sequential read
     *
     * The value is only accepted when the late-bound MODE constant is exactly 'cli'.
     * This class declares MODE as null, so the call only succeeds on a subclass
     * that overrides the constant with 'cli'.
     *
     * @throws \Exception when MODE is anything other than 'cli'
     */
    public function setChunkSize(int $size): self
    {
        // strict comparison: a loose == would let a numeric MODE of 0 match 'cli' before PHP 8
        if(static::MODE !== 'cli')
        {
            throw new Exception('Chunk size is not supported on web mode');
        }

        $this->chunkSize = $size;

        return $this;
    }
220
221
    /**
     * Return the current value of the remote download flag.
     */
    public function getDownloadRemote(): bool
    {
        $flag = $this->downloadRemote;

        return $flag;
    }
228
229
    /**
     * Store the remote download flag and return the instance for chaining.
     */
    public function setDownloadRemote(bool $download): self
    {
        // the parameter is already typed as bool, so no cast is required
        $this->downloadRemote = $download;

        return $this;
    }
238
239
    /**
     * Request the 'meta' response for a file, decode it and wrap it as metadata.
     *
     * @throws \Exception when the decoded response is not a stdClass object, or when
     *                    the request or the JSON parsing fails
     */
    public function getMetadata(string $file): MetadataInterface
    {
        $raw = $this->request('meta', $file);
        $decoded = $this->parseJsonResponse($raw);

        if(!($decoded instanceof stdClass))
        {
            throw new Exception("Unexpected metadata response for $file");
        }

        return Metadata::make($decoded, $file);
    }
255
256
    /**
     * Gets recursive file metadata, returned as an array indexed by file name.
     *
     * Each key is the base name of the given file followed by the item's
     * 'X-TIKA:embedded_resource_path' value when that property is set.
     *
     * Example: for a sample.zip containing an example.doc file, the returned array looks like:
     *
     *  [
     *      'sample.zip' => new Metadata()
     *      'sample.zip/example.doc' => new DocumentMetadata()
     *  ]
     *
     * NOTE(review): the signature and the error message both mention null as a valid format,
     * but the validation below only lets 'text', 'html' and 'ignore' through.
     *
     * @link https://cwiki.apache.org/confluence/display/TIKA/TikaServer#TikaServer-RecursiveMetadataandContent
     * @throws \Exception when the format is not accepted or the decoded response is not an array
     */
    public function getRecursiveMetadata(string $file, ?string $format = 'ignore'): array
    {
        $allowed = ['text', 'html', 'ignore'];

        if(!in_array($format, $allowed))
        {
            throw new Exception("Unknown recursive type (must be text, html, ignore or null)");
        }

        $decoded = $this->parseJsonResponse($this->request("rmeta/$format", $file));

        if(!is_array($decoded))
        {
            throw new Exception("Unexpected metadata response for $file");
        }

        // the prefix is the same for every item, so compute it once
        $prefix = basename($file);
        $result = [];

        foreach($decoded as $entry)
        {
            $suffix = $entry->{'X-TIKA:embedded_resource_path'} ?? '';

            $result[$prefix . $suffix] = Metadata::make($entry, $file);
        }

        return $result;
    }
298
299
    /**
     * Return the result of a 'lang' request for the given file.
     *
     * @throws \Exception
     */
    public function getLanguage(string $file): string
    {
        $language = $this->request('lang', $file);

        return $language;
    }
308
309
    /**
     * Return the result of a 'mime' request for the given file.
     *
     * @throws \Exception
     */
    public function getMIME(string $file): string
    {
        $mime = $this->request('mime', $file);

        return $mime;
    }
318
319
    /**
     * Extracts HTML
     *
     * When a callback is given it is registered through setCallback() before the
     * 'html' request is made; nothing in this method unregisters it afterwards.
     *
     * @param string        $file     file passed to the request
     * @param callable|null $callback optional callback to register before the request
     * @param bool          $append   forwarded to setCallback() together with the callback
     * @throws \Exception
     */
    public function getHTML(string $file, ?callable $callback = null, bool $append = true): string
    {
        // explicit nullable type: relying on the "= null" default alone is deprecated since PHP 8.4
        if($callback !== null)
        {
            $this->setCallback($callback, $append);
        }

        return $this->request('html', $file);
    }
333
334
    /**
     * Extracts text
     *
     * When a callback is given it is registered through setCallback() before the
     * 'text' request is made; nothing in this method unregisters it afterwards.
     *
     * @param string        $file     file passed to the request
     * @param callable|null $callback optional callback to register before the request
     * @param bool          $append   forwarded to setCallback() together with the callback
     * @throws \Exception
     */
    public function getText(string $file, ?callable $callback = null, bool $append = true): string
    {
        // explicit nullable type: relying on the "= null" default alone is deprecated since PHP 8.4
        if($callback !== null)
        {
            $this->setCallback($callback, $append);
        }

        return $this->request('text', $file);
    }
348
349
    /**
     * Extracts main text
     *
     * When a callback is given it is registered through setCallback() before the
     * 'text-main' request is made; nothing in this method unregisters it afterwards.
     *
     * @param string        $file     file passed to the request
     * @param callable|null $callback optional callback to register before the request
     * @param bool          $append   forwarded to setCallback() together with the callback
     * @throws \Exception
     */
    public function getMainText(string $file, ?callable $callback = null, bool $append = true): string
    {
        // explicit nullable type: relying on the "= null" default alone is deprecated since PHP 8.4
        if($callback !== null)
        {
            $this->setCallback($callback, $append);
        }

        return $this->request('text-main', $file);
    }
363
364
    /**
     * Return the result of a 'version' request (no file is involved).
     *
     * @throws \Exception
     */
    public function getVersion(): string
    {
        $version = $this->request('version');

        return $version;
    }
373
374
    /**
     * Return the list of Apache Tika supported versions
     *
     * The list is read from the "extra" => "supported-versions" entry of the composer.json
     * file located in the parent of this file's directory, and is kept in a static
     * variable so the file is only read once per process.
     *
     * Only a valid, non-empty list is cached: a failed read throws on every call instead
     * of throwing once and then silently returning the invalid value.
     *
     * @throws \Exception when the file can't be read or holds no usable version list
     */
    public function getSupportedVersions(): array
    {
        static $versions = null;

        if(is_null($versions))
        {
            $path = dirname(__DIR__) . '/composer.json';

            // guard each step so a missing file or malformed JSON ends in the exception below
            $contents = is_readable($path) ? file_get_contents($path) : false;
            $composer = is_string($contents) ? json_decode($contents, true) : null;
            $supported = is_array($composer) ? ($composer['extra']['supported-versions'] ?? null) : null;

            if(empty($supported) || !is_array($supported))
            {
                throw new Exception("An error ocurred trying to read package's composer.json file");
            }

            $versions = $supported;
        }

        return $versions;
    }
396
397
    /**
     * Store the checked flag and return the instance for chaining.
     */
    public function setChecked(bool $checked): self
    {
        // the parameter is already typed as bool, so no cast is required
        $this->checked = $checked;

        return $this;
    }
406
407
    /**
     * Return the current value of the checked flag.
     */
    public function isChecked(): bool
    {
        $flag = $this->checked;

        return $flag;
    }
414
415
    /**
     * Tell whether a non-null response of the given type is stored for the file.
     *
     * Cache entries are keyed by the SHA-1 hash of the file argument and then by type.
     */
    protected function isCached(string $type, string $file): bool
    {
        $key = sha1($file);

        return isset($this->cache[$key][$type]);
    }
422
423
    /**
     * Return the response stored for the file and type, or null when there is none.
     *
     * Cache entries are keyed by the SHA-1 hash of the file argument and then by type.
     */
    protected function getCachedResponse(string $type, string $file)
    {
        $key = sha1($file);

        if(isset($this->cache[$key][$type]))
        {
            return $this->cache[$key][$type];
        }

        return null;
    }
430
431
    /**
     * Tell whether responses of the given request type are cached.
     *
     * Only the 'lang' and 'meta' types qualify.
     */
    protected function isCacheable(string $type): bool
    {
        return $type === 'lang' || $type === 'meta';
    }
438
439
    /**
     * Store a response under the SHA-1 hash of the file argument and the request type.
     *
     * @return bool always true
     */
    protected function cacheResponse(string $type, $response, string $file): bool
    {
        $key = sha1($file);

        $this->cache[$key][$type] = $response;

        return true;
    }
448
449
    /**
     * Checks if a specific version is supported
     *
     * The version must match one of the entries returned by getSupportedVersions() exactly.
     *
     * @param string $version version to look up
     * @return bool true when the version is in the supported list
     * @throws \Exception when the supported versions can't be read
     */
    public function isVersionSupported(string $version): bool
    {
        // strict search: a loose comparison treats numeric strings as numbers,
        // which makes '1.1' match '1.10' and '1.2' match '1.20'
        return in_array($version, $this->getSupportedVersions(), true);
    }
456
457
    /**
     * Tell whether the MIME type is a key of the array returned by getSupportedMIMETypes().
     *
     * @param string $mime
     * @return bool
     * @throws \Exception
     */
    public function isMIMETypeSupported(string $mime): bool
    {
        $supported = $this->getSupportedMIMETypes();

        return array_key_exists($mime, $supported);
    }
468
469
    /**
     * Check the request before executing
     *
     * - the 'detectors', 'mime-types', 'parsers' and 'version' types need no file and are not checked
     * - a file whose name does not start with "http" must exist locally
     * - a file whose name starts with "http" must answer with a status line containing "200"
     * - a valid remote file is downloaded with downloadFile() when the remote download flag is set
     *
     * @param string      $type request type
     * @param string|null $file local path or URL, may be omitted for the unchecked types
     * @return string|null the file to use for the request (the downloaded copy when one was made)
     * @throws \Exception when the file can't be opened (code 2 for remote files) or downloaded
     */
    public function checkRequest(string $type, ?string $file = null): ?string
    {
        // no checks for getters
        if(in_array($type, ['detectors', 'mime-types', 'parsers', 'version'], true))
        {
            return $file;
        }

        // a missing file is handled as an empty path instead of passing null to string functions
        $path = (string) $file;
        $isRemote = preg_match('/^http/', $path) === 1;

        // invalid local file
        if(!$isRemote)
        {
            if(!file_exists($path))
            {
                throw new Exception("File $file can't be opened");
            }

            return $file;
        }

        // invalid remote file: get_headers() returns false on failure, so never index it blindly
        $headers = get_headers($path);
        $status = (is_array($headers) && isset($headers[0])) ? (string) $headers[0] : '';

        if(!preg_match('/200/', $status))
        {
            throw new Exception("File $file can't be opened", 2);
        }

        // download remote file if required only for integrated downloader
        if($this->downloadRemote)
        {
            return $this->downloadFile($path);
        }

        return $file;
    }
496
497
    /**
     * Decode a JSON response and return the decoded value.
     *
     * @return mixed whatever json_decode() produces for the response
     * @throws \Exception with message 'Empty response' when the response is empty or blank,
     *                    or with the JSON error message and code when decoding fails
     */
    protected function parseJsonResponse(string $response)
    {
        $isBlank = empty($response) || trim($response) == '';

        if($isBlank)
        {
            throw new Exception('Empty response');
        }

        $decoded = json_decode($response);
        $errorCode = json_last_error();

        // any code other than JSON_ERROR_NONE means the payload could not be decoded
        if($errorCode !== JSON_ERROR_NONE)
        {
            throw new Exception(json_last_error_msg(), $errorCode);
        }

        return $decoded;
    }
524
525
    /**
     * Download file to a temporary folder
     *
     * The file is fetched with cURL (5 second timeout) into a temporary file created in the
     * system temporary directory with the 'TIKA' prefix. The cURL handle and the file handle
     * are always released, and the temporary file is removed when the download fails.
     *
     * @link https://wiki.apache.org/tika/TikaJAXRS#Specifying_a_URL_Instead_of_Putting_Bytes
     * @param string $file URL to download
     * @return string path of the temporary file holding the downloaded content
     * @throws \Exception when the temporary file can't be created or opened, on cURL errors,
     *                    or when the HTTP status is not 200 (the status is used as exception code)
     */
    protected function downloadFile(string $file): string
    {
        $dest = tempnam(sys_get_temp_dir(), 'TIKA');

        if($dest === false)
        {
            throw new Exception("Temporary file can't be created");
        }

        $fp = fopen($dest, 'w+');

        if($fp === false)
        {
            throw new Exception("$dest can't be opened");
        }

        $ch = curl_init($file);
        $downloaded = false;

        try
        {
            if($ch === false)
            {
                throw new Exception("$file can't be downloaded");
            }

            curl_setopt($ch, CURLOPT_FILE, $fp);
            curl_setopt($ch, CURLOPT_TIMEOUT, 5);
            curl_exec($ch);

            if(curl_errno($ch))
            {
                throw new Exception(curl_error($ch));
            }

            $code = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);

            if($code !== 200)
            {
                throw new Exception("$file can't be downloaded", $code);
            }

            $downloaded = true;
        }
        finally
        {
            // release the transfer first, then the file it was writing to
            if($ch !== false)
            {
                curl_close($ch);
            }

            fclose($fp);

            // do not leave partial or useless files behind in the temporary directory
            if(!$downloaded && file_exists($dest))
            {
                unlink($dest);
            }
        }

        return $dest;
    }
563
564
    /**
     * Implementations must return the supported MIME types as an array.
     *
     * isMIMETypeSupported() looks MIME types up among the keys of this array.
     *
     * @throws \Exception
     */
    abstract public function getSupportedMIMETypes(): array;
570
571
    /**
     * Implementations must return the available detectors as an array.
     *
     * @throws \Exception
     */
    abstract public function getAvailableDetectors(): array;
577
578
    /**
     * Implementations must return the available parsers as an array.
     *
     * @throws \Exception
     */
    abstract public function getAvailableParsers(): array;
584
585
    /**
     * Implementations must verify the Java binary, the JAR path or the server connection.
     */
    abstract public function check(): void;
589
590
    /**
     * Configure and make a request and return its results.
     *
     * @param string      $type request type (for example 'meta', 'lang', 'mime', 'html', 'text' or 'version')
     * @param string|null $file file the request refers to; omitted for requests such as 'version'
     * @return string raw response
     * @throws \Exception
     */
    abstract public function request(string $type, ?string $file = null): string;
596
}
597