Passed
Push — master ( c8c09a...264914 )
by David
03:27 queued 10s
created

Client::__construct()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 2
eloc 1
c 1
b 0
f 0
nc 2
nop 0
dl 0
loc 3
rs 10
1
<?php
2
3
namespace Vaites\ApacheTika;
4
5
use Closure;
6
use Exception;
7
use stdClass;
8
9
use Vaites\ApacheTika\Clients\CLIClient;
10
use Vaites\ApacheTika\Clients\WebClient;
11
use Vaites\ApacheTika\Metadata\Metadata;
12
use Vaites\ApacheTika\Metadata\MetadataInterface;
13
14
/**
15
 * Apache Tika client interface
16
 *
17
 * @author  David Martínez <[email protected]>
18
 * @link    https://tika.apache.org/1.24/formats.html
19
 */
20
abstract class Client
21
{
22
    protected const MODE = null;
23
24
    /**
25
     * Checked flag
26
     *
27
     * @var bool
28
     */
29
    protected $checked = false;
30
31
    /**
32
     * Response using callbacks
33
     *
34
     * @var string|null
35
     */
36
    protected $response = null;
37
38
    /**
39
     * Platform (unix or win)
40
     *
41
     * @var string|null
42
     */
43
    protected $platform = null;
44
45
    /**
46
     * Cached responses to avoid multiple request for the same file.
47
     *
48
     * @var array
49
     */
50
    protected $cache = [];
51
52
    /**
53
     * Text encoding
54
     *
55
     * @var string|null
56
     */
57
    protected $encoding = null;
58
59
    /**
60
     * Callback called on sequential read
61
     *
62
     * @var callable|null
63
     */
64
    protected $callback = null;
65
66
    /**
67
     * Enable or disable appending when using callback
68
     *
69
     * @var bool
70
     */
71
    protected $callbackAppend = true;
72
73
    /**
74
     * Size of chunks for callback
75
     *
76
     * @var int
77
     */
78
    protected $chunkSize = 1048576;
79
80
    /**
81
     * Remote download flag
82
     *
83
     * @var bool
84
     */
85
    protected $downloadRemote = false;
86
87
    /**
     * Initialise the client, detecting the platform PHP is running on.
     *
     * The platform is "win" when the PHP_WINDOWS_VERSION_MAJOR constant is defined
     * and "unix" otherwise.
     */
    public function __construct()
    {
        $isWindows = defined('PHP_WINDOWS_VERSION_MAJOR');

        $this->platform = $isWindows ? 'win' : 'unix';
    }
94
95
    /**
     * Get a class instance throwing an exception if check fails
     *
     * A CLI client is returned when the first parameter ends with ".jar"; in any other
     * case (including no parameters at all) a web client is returned.
     *
     * @param string|null     $param1   path or host
     * @param string|int|null $param2   Java binary path or port for web client
     * @param array           $options  options for cURL request
     * @param bool            $check    check JAR file or server connection
     * @return \Vaites\ApacheTika\Clients\CLIClient|\Vaites\ApacheTika\Clients\WebClient
     * @throws \Exception
     */
    public static function make(?string $param1 = null, $param2 = null, array $options = [], bool $check = true): Client
    {
        // use the declared parameter instead of func_get_arg(0), which fails when make()
        // is called without arguments, and never hand a null subject to preg_match()
        $isJar = $param1 !== null && preg_match('/\.jar$/', $param1) === 1;

        if($isJar)
        {
            $path = $param1 ? (string) $param1 : null;
            $java = $param2 ? (string) $param2 : null;

            return new CLIClient($path, $java, $check);
        }

        $host = $param1 ? (string) $param1 : null;
        $port = $param2 ? (int) $param2 : null;

        return new WebClient($host, $port, $options, $check);
    }
122
123
    /**
     * Build a client exactly like make() does, but with the check flag disabled.
     *
     * @param string|null     $param1  path or host
     * @param string|int|null $param2  Java binary path or port for web client
     * @param array           $options options for cURL request
     * @return \Vaites\ApacheTika\Clients\CLIClient|\Vaites\ApacheTika\Clients\WebClient
     * @throws \Exception
     */
    public static function prepare($param1 = null, $param2 = null, $options = []): Client
    {
        $check = false;

        return self::make($param1, $param2, $options, $check);
    }
136
137
    /**
     * Return the configured text encoding, or null when none has been set.
     */
    public function getEncoding(): ?string
    {
        $current = $this->encoding;

        return $current;
    }
144
145
    /**
     * Store the text encoding to use.
     *
     * Values considered empty by PHP (such as '' or '0') are rejected.
     *
     * @throws \Exception when the given encoding is empty
     */
    public function setEncoding(string $encoding): self
    {
        // guard clause: refuse empty values before touching the state
        if(empty($encoding))
        {
            throw new Exception('Invalid encoding');
        }

        $this->encoding = $encoding;

        return $this;
    }
163
164
    /**
     * Return the callback registered for sequential reads, or null when there is none.
     */
    public function getCallback(): ?callable
    {
        $registered = $this->callback;

        return $registered;
    }
171
172
    /**
     * Set the callback (any callable) to call on sequential read
     *
     * Closures, array callables and invokable objects are stored as given. Function names
     * passed as strings are wrapped in a closure that forwards only the chunk, so the named
     * function always receives exactly one argument.
     *
     * @param callable $callback callable receiving each chunk
     * @param bool     $append   value stored in the "append when using callback" flag
     */
    public function setCallback(callable $callback, bool $append = true): self
    {
        // the callable type declaration already guarantees a valid callback, so there is
        // no need to reject anything here (invokable objects were wrongly refused before)
        if(is_string($callback))
        {
            $function = $callback;

            $callback = function($chunk) use ($function)
            {
                return call_user_func_array($function, [$chunk]);
            };
        }

        $this->callbackAppend = $append;
        $this->callback = $callback;

        return $this;
    }
199
200
    /**
     * Return the size of the chunks handed to the callback.
     */
    public function getChunkSize(): int
    {
        $size = $this->chunkSize;

        return $size;
    }
207
208
    /**
     * Set the chunk size for sequential read.
     *
     * The MODE constant is resolved with late static binding, so the value declared by
     * the concrete client class decides whether the chunk size can be changed.
     *
     * @throws \Exception when the client mode is not "cli"
     */
    public function setChunkSize(int $size): self
    {
        if(static::MODE != 'cli')
        {
            throw new Exception('Chunk size is not supported on web mode');
        }

        $this->chunkSize = $size;

        return $this;
    }
226
227
    /**
     * Tell whether the remote download flag is enabled.
     */
    public function getDownloadRemote(): bool
    {
        $enabled = $this->downloadRemote;

        return $enabled;
    }
234
235
    /**
     * Enable or disable the remote download flag.
     */
    public function setDownloadRemote(bool $download): self
    {
        // the parameter is already a bool thanks to its type declaration
        $this->downloadRemote = $download;

        return $this;
    }
244
245
    /**
     * Gets file metadata
     *
     * Performs a "meta" request, decodes the JSON response and builds the metadata
     * instance through Metadata::make().
     *
     * @throws \Exception when the response is empty, is not valid JSON or is not a JSON object
     */
    public function getMetadata(string $file): MetadataInterface
    {
        // request() may return null: turn it into an empty string so the parser raises
        // its "Empty response" exception instead of a TypeError
        $response = $this->parseJsonResponse($this->request('meta', $file) ?? '');

        if(!$response instanceof stdClass)
        {
            throw new Exception("Unexpected metadata response for $file");
        }

        return Metadata::make($response, $file);
    }
261
262
    /**
     * Gets recursive file metadata where the returned array indexes are the file name.
     *
     * Each key is the base name of the file, followed by the value of the
     * "X-TIKA:embedded_resource_path" property when the item has one. Example for a
     * sample.zip with an example.doc file inside:
     *
     *  [
     *      'sample.zip'             => (metadata of the container)
     *      'sample.zip/example.doc' => (metadata of the embedded document)
     *  ]
     *
     * NOTE(review): the signature and the error message allow a null format, but null
     * does not pass the whitelist check below -- confirm the intended behaviour.
     *
     * @link https://cwiki.apache.org/confluence/display/TIKA/TikaServer#TikaServer-RecursiveMetadataandContent
     * @throws \Exception when the format is unknown or the response is not a JSON array
     */
    public function getRecursiveMetadata(string $file, ?string $format = 'ignore'): array
    {
        if(!in_array($format, ['text', 'html', 'ignore'], true))
        {
            throw new Exception("Unknown recursive type (must be text, html, ignore or null)");
        }

        // request() may return null: turn it into an empty string so the parser raises
        // its "Empty response" exception instead of a TypeError
        $response = $this->parseJsonResponse($this->request("rmeta/$format", $file) ?? '');

        if(!is_array($response))
        {
            throw new Exception("Unexpected metadata response for $file");
        }

        // the base name is the same for every item, so compute it only once
        $base = basename($file);
        $metadata = [];

        foreach($response as $item)
        {
            $name = $base;

            if(isset($item->{'X-TIKA:embedded_resource_path'}))
            {
                $name .= $item->{'X-TIKA:embedded_resource_path'};
            }

            $metadata[$name] = Metadata::make($item, $file);
        }

        return $metadata;
    }
304
305
    /**
     * Detect language
     *
     * Performs a "lang" request for the given file. An empty string is returned when
     * the request yields no response.
     *
     * @throws \Exception
     */
    public function getLanguage(string $file): string
    {
        // request() is nullable while this method must return a string
        return $this->request('lang', $file) ?? '';
    }
314
315
    /**
     * Detect MIME type
     *
     * Performs a "mime" request for the given file. An empty string is returned when
     * the request yields no response.
     *
     * @throws \Exception
     */
    public function getMIME(string $file): string
    {
        // request() is nullable while this method must return a string
        return $this->request('mime', $file) ?? '';
    }
324
325
    /**
     * Extracts HTML
     *
     * When a callback is given it is registered through setCallback() before the "html"
     * request is made. An empty string is returned when the request yields no response.
     *
     * @param string        $file     file to process
     * @param callable|null $callback optional callback for sequential read
     * @param bool          $append   append flag forwarded to setCallback()
     * @throws \Exception
     */
    public function getHTML(string $file, ?callable $callback = null, bool $append = true): string
    {
        if($callback !== null)
        {
            $this->setCallback($callback, $append);
        }

        // request() is nullable while this method must return a string
        return $this->request('html', $file) ?? '';
    }
339
340
    /**
     * Extracts XHTML
     *
     * When a callback is given it is registered through setCallback() before the "xhtml"
     * request is made. An empty string is returned when the request yields no response.
     *
     * @param string        $file     file to process
     * @param callable|null $callback optional callback for sequential read
     * @param bool          $append   append flag forwarded to setCallback()
     * @throws \Exception
     */
    public function getXHTML(string $file, ?callable $callback = null, bool $append = true): string
    {
        if($callback !== null)
        {
            $this->setCallback($callback, $append);
        }

        // request() is nullable while this method must return a string
        return $this->request('xhtml', $file) ?? '';
    }
354
355
    /**
     * Extracts text
     *
     * When a callback is given it is registered through setCallback() before the "text"
     * request is made. An empty string is returned when the request yields no response.
     *
     * @param string        $file     file to process
     * @param callable|null $callback optional callback for sequential read
     * @param bool          $append   append flag forwarded to setCallback()
     * @throws \Exception
     */
    public function getText(string $file, ?callable $callback = null, bool $append = true): string
    {
        if($callback !== null)
        {
            $this->setCallback($callback, $append);
        }

        // request() is nullable while this method must return a string
        return $this->request('text', $file) ?? '';
    }
369
370
    /**
     * Extracts main text
     *
     * When a callback is given it is registered through setCallback() before the
     * "text-main" request is made. An empty string is returned when the request yields
     * no response.
     *
     * @param string        $file     file to process
     * @param callable|null $callback optional callback for sequential read
     * @param bool          $append   append flag forwarded to setCallback()
     * @throws \Exception
     */
    public function getMainText(string $file, ?callable $callback = null, bool $append = true): string
    {
        if($callback !== null)
        {
            $this->setCallback($callback, $append);
        }

        // request() is nullable while this method must return a string
        return $this->request('text-main', $file) ?? '';
    }
384
385
    /**
     * Returns current Tika version
     *
     * Performs a "version" request. An empty string is returned when the request yields
     * no response.
     *
     * @throws \Exception
     */
    public function getVersion(): string
    {
        // request() is nullable while this method must return a string
        return $this->request('version') ?? '';
    }
394
395
    /**
     * Return the list of Apache Tika supported versions.
     *
     * The list is read from the "extra" > "supported-versions" entry of the composer.json
     * file located one directory above this one, and is kept in a static variable so the
     * file is not read again on later calls.
     *
     * @throws \Exception when composer.json can't be read or the list is missing or empty
     */
    public function getSupportedVersions(): array
    {
        static $versions = null;

        // already loaded by a previous call
        if($versions !== null)
        {
            return $versions;
        }

        $path = dirname(__DIR__) . '/composer.json';
        $contents = file_get_contents($path);

        if($contents === false)
        {
            throw new Exception("An error ocurred trying to read package's composer.json file");
        }

        $versions = json_decode($contents, true)['extra']['supported-versions'] ?? null;

        if(empty($versions))
        {
            throw new Exception("An error ocurred trying to read package's composer.json file");
        }

        return $versions;
    }
423
424
    /**
     * Store the checked flag.
     */
    public function setChecked(bool $checked): self
    {
        // the parameter is already a bool thanks to its type declaration
        $this->checked = $checked;

        return $this;
    }
433
434
    /**
     * Tell whether the checked flag is set for this instance.
     */
    public function isChecked(): bool
    {
        $flag = $this->checked;

        return $flag;
    }
441
442
    /**
     * Tell whether a response of the given type is cached for the file.
     *
     * Cache entries are indexed by the SHA-1 hash of the file name and then by request type.
     */
    protected function isCached(string $type, string $file): bool
    {
        $key = sha1($file);

        return isset($this->cache[$key][$type]);
    }
449
450
    /**
     * Return the cached response of the given type for the file.
     *
     * @return mixed the cached value, or null when nothing is cached
     */
    protected function getCachedResponse(string $type, string $file)
    {
        $key = sha1($file);

        return $this->cache[$key][$type] ?? null;
    }
459
460
    /**
     * Tell whether responses of the given request type must be cached.
     *
     * Only the "lang" and "meta" types are cacheable.
     */
    protected function isCacheable(string $type): bool
    {
        return $type === 'lang' || $type === 'meta';
    }
467
468
    /**
     * Store a response in the cache, indexed by the SHA-1 hash of the file name and the type.
     *
     * @param mixed $response value to store
     * @return bool always true
     */
    protected function cacheResponse(string $type, $response, string $file): bool
    {
        $key = sha1($file);

        $this->cache[$key][$type] = $response;

        return true;
    }
479
480
    /**
     * Checks if a specific version is supported
     *
     * Versions are compared as exact strings: a loose comparison treats numeric strings
     * as numbers, so "1.1" would wrongly match a supported "1.10".
     *
     * @throws \Exception when the list of supported versions can't be loaded
     */
    public function isVersionSupported(string $version): bool
    {
        // normalise the entries to strings so the strict comparison is type-safe
        $supported = array_map('strval', $this->getSupportedVersions());

        return in_array($version, $supported, true);
    }
487
488
    /**
     * Tell whether a MIME type is one of the keys of the supported MIME types list.
     *
     * @param string $mime MIME type to look for
     * @return bool
     * @throws \Exception
     */
    public function isMIMETypeSupported(string $mime): bool
    {
        $supported = $this->getSupportedMIMETypes();

        return array_key_exists($mime, $supported);
    }
499
500
    /**
     * Check the request before executing
     *
     * Requests of type detectors, mime-types, parsers and version, as well as requests
     * without a file, are not checked. Local files must exist. Remote files (anything
     * starting with "http") must answer with a 200 status and are downloaded to a
     * temporary file when the remote download flag is enabled.
     *
     * @param string      $type request type
     * @param string|null $file local path or remote URL
     * @return string|null the file to use for the request (the temporary copy when downloaded)
     * @throws \Exception when the file can't be opened or downloaded
     */
    public function checkRequest(string $type, ?string $file = null): ?string
    {
        // no checks for getters nor for requests without a file
        if($file === null || in_array($type, ['detectors', 'mime-types', 'parsers', 'version'], true))
        {
            return $file;
        }

        $isRemote = preg_match('/^http/', $file) === 1;

        // invalid local file
        if(!$isRemote)
        {
            if(!file_exists($file))
            {
                throw new Exception("File $file can't be opened");
            }

            return $file;
        }

        // invalid remote file
        $headers = get_headers($file);

        if(empty($headers) || !preg_match('/200/', $headers[0]))
        {
            throw new Exception("File $file can't be opened", 2);
        }

        // download remote file if required only for integrated downloader; this used to be
        // a separate elseif that could never run because the remote check above matched first
        if($this->downloadRemote)
        {
            $file = $this->downloadFile($file);
        }

        return $file;
    }
532
533
    /**
     * Clean up a response, removing known noise.
     *
     * The Log4j2 "getCallerClass is not supported" warning is stripped and the
     * surrounding whitespace is trimmed.
     *
     * @param string $response raw response
     * @return string filtered response
     */
    protected function filterResponse(string $response): string
    {
        // fix Log4j2 warning
        $warning = 'WARNING: sun.reflect.Reflection.getCallerClass is not supported. This will impact performance.';

        $filtered = str_replace($warning, '', $response);

        return trim($filtered);
    }
551
552
    /**
     * Parse the response returned by Apache Tika
     *
     * @param string $response JSON text
     * @return mixed the decoded JSON value (objects are decoded as stdClass)
     * @throws \Exception when the response is empty or is not valid JSON; for invalid JSON
     *                    the exception code is the json_last_error() value
     */
    protected function parseJsonResponse(string $response)
    {
        // an empty response throws an error
        if(empty($response) || trim($response) == '')
        {
            throw new Exception('Empty response');
        }

        // decode the JSON response
        $json = json_decode($response);

        // exceptions if metadata is not valid (the leftover dd() debug call was removed:
        // it is not defined by this package and would abort before the exception is thrown)
        $error = json_last_error();

        if($error !== JSON_ERROR_NONE)
        {
            throw new Exception(json_last_error_msg(), $error);
        }

        return $json;
    }
581
582
    /**
     * Download file to a temporary folder
     *
     * The file is fetched with cURL (5 seconds timeout) into a temporary file created in
     * the system temporary directory. The file handle and the cURL handle are always
     * released, and the temporary file is removed when the download fails.
     *
     * @link https://wiki.apache.org/tika/TikaJAXRS#Specifying_a_URL_Instead_of_Putting_Bytes
     * @param string $file URL of the file to download
     * @return string path of the temporary file holding the download
     * @throws \Exception when the temporary file can't be created or opened, on cURL
     *                    errors or when the HTTP status is not 200
     */
    protected function downloadFile(string $file): string
    {
        $dest = tempnam(sys_get_temp_dir(), 'TIKA');

        if($dest === false)
        {
            throw new Exception("Can't create a temporary file at " . sys_get_temp_dir());
        }

        $fp = fopen($dest, 'w+');

        if($fp === false)
        {
            // tempnam() already created the file
            if(is_file($dest))
            {
                unlink($dest);
            }

            throw new Exception("$dest can't be opened");
        }

        $completed = false;

        try
        {
            $ch = curl_init($file);

            if($ch === false)
            {
                throw new Exception("$file can't be downloaded");
            }

            try
            {
                curl_setopt($ch, CURLOPT_FILE, $fp);
                curl_setopt($ch, CURLOPT_TIMEOUT, 5);
                curl_exec($ch);

                if(curl_errno($ch))
                {
                    throw new Exception(curl_error($ch));
                }

                $code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
            }
            finally
            {
                curl_close($ch);
            }

            if($code != 200)
            {
                throw new Exception("$file can't be downloaded", $code);
            }

            $completed = true;
        }
        finally
        {
            // closing the handle flushes buffered data so the file is complete on disk
            fclose($fp);

            // do not leave partial downloads behind
            if(!$completed && is_file($dest))
            {
                unlink($dest);
            }
        }

        return $dest;
    }
631
632
    /**
     * Must return the supported MIME types
     *
     * The MIME types are expected as array keys, as isMIMETypeSupported() looks them up
     * with array_key_exists().
     *
     * @throws \Exception
     */
    abstract public function getSupportedMIMETypes(): array;

    /**
     * Must return the available detectors
     *
     * @throws \Exception
     */
    abstract public function getAvailableDetectors(): array;

    /**
     * Must return the available parsers
     *
     * @throws \Exception
     */
    abstract public function getAvailableParsers(): array;

    /**
     * Check Java binary, JAR path or server connection
     */
    abstract public function check(): void;

    /**
     * Configure and make a request and return its results.
     *
     * @param string      $type request type (for example meta, lang, mime, html, text or version)
     * @param string|null $file file to process, when the request type needs one
     * @return string|null
     * @throws \Exception
     */
    abstract public function request(string $type, ?string $file = null): ?string;
664
}
665