Client::cacheResponse()   A
last analyzed

Complexity

Conditions 1
Paths 1

Size

Total Lines 5
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 2
nc 1
nop 3
dl 0
loc 5
rs 10
c 0
b 0
f 0
1
<?php
2
3
namespace Vaites\ApacheTika;
4
5
use Closure;
6
use Exception;
7
use stdClass;
8
9
use Vaites\ApacheTika\Clients\CLIClient;
10
use Vaites\ApacheTika\Clients\WebClient;
11
use Vaites\ApacheTika\Metadata\Metadata;
12
use Vaites\ApacheTika\Metadata\MetadataInterface;
13
14
/**
15
 * Apache Tika client interface
16
 *
17
 * @author  David Martínez <[email protected]>
18
 * @link    https://tika.apache.org/1.24/formats.html
19
 */
20
abstract class Client
21
{
22
    protected const MODE = null;
23
24
    /**
25
     * Checked flag
26
     *
27
     * @var bool
28
     */
29
    protected $checked = false;
30
31
    /**
32
     * Response using callbacks
33
     *
34
     * @var string
35
     */
36
    protected $response = null;
37
38
    /**
39
     * Platform (unix or win)
40
     *
41
     * @var string
42
     */
43
    protected $platform = null;
44
45
    /**
46
     * Apache Tika version
47
     * 
48
     * @var string
49
     */
50
    protected $version = null;
51
52
    /**
53
     * Cached responses to avoid multiple request for the same file.
54
     *
55
     * @var array
56
     */
57
    protected $cache = [];
58
59
    /**
60
     * Text encoding
61
     *
62
     * @var string|null
63
     */
64
    protected $encoding = null;
65
66
    /**
67
     * Callback called on sequential read
68
     *
69
     * @var callable|null
70
     */
71
    protected $callback = null;
72
73
    /**
74
     * Enable or disable appending when using callback
75
     *
76
     * @var bool
77
     */
78
    protected $callbackAppend = true;
79
80
    /**
81
     * Size of chunks for callback
82
     *
83
     * @var int
84
     */
85
    protected $chunkSize = 1048576;
86
87
    /**
88
     * Remote download flag
89
     *
90
     * @var bool
91
     */
92
    protected $downloadRemote = false;
93
94
    /**
     * Initialise the client by detecting the host platform.
     *
     * The platform is 'win' when PHP defines PHP_WINDOWS_VERSION_MAJOR and 'unix' otherwise.
     */
    public function __construct()
    {
        if(defined('PHP_WINDOWS_VERSION_MAJOR'))
        {
            $this->platform = 'win';
        }
        else
        {
            $this->platform = 'unix';
        }
    }
101
102
    /**
     * Get a class instance throwing an exception if check fails
     *
     * A first parameter ending in ".jar" selects the command line client; anything else
     * (including no parameter at all) selects the web client.
     *
     * @param string|null     $param1   path or host
     * @param string|int|null $param2   Java binary path or port for web client
     * @param array           $options  options for cURL request (web client only)
     * @param bool            $check    check JAR file or server connection
     * @return \Vaites\ApacheTika\Clients\CLIClient|\Vaites\ApacheTika\Clients\WebClient
     * @throws \Exception
     */
    public static function make(?string $param1 = null, $param2 = null, array $options = [], bool $check = true): Client
    {
        // test $param1 directly: func_get_arg(0) fails when make() is called without arguments,
        // and passing null as the preg_match() subject is deprecated
        if($param1 !== null && preg_match('/\.jar$/', $param1))
        {
            // falsy values ('', 0, null) mean "no Java binary given"
            $java = $param2 ? (string) $param2 : null;

            return new CLIClient($param1, $java, $check);
        }

        // falsy values ('', '0', 0, null) are passed to the web client as null
        $host = $param1 ? (string) $param1 : null;
        $port = $param2 ? (int) $param2 : null;

        return new WebClient($host, $port, $options, $check);
    }
129
130
    /**
     * Get a class instance delaying the check
     *
     * Equivalent to calling make() with the check flag set to false.
     *
     * @param string|null     $param1  path or host
     * @param string|int|null $param2  Java binary path or port for web client
     * @param array           $options options for cURL request
     * @return \Vaites\ApacheTika\Clients\CLIClient|\Vaites\ApacheTika\Clients\WebClient
     * @throws \Exception
     */
    public static function prepare($param1 = null, $param2 = null, $options = []): Client
    {
        $check = false;

        return self::make($param1, $param2, $options, $check);
    }
143
144
    /**
     * Get the configured text encoding
     *
     * @return string|null the encoding, or null when none has been set
     */
    public function getEncoding(): ?string
    {
        $encoding = $this->encoding;

        return $encoding;
    }
151
152
    /**
     * Set the text encoding
     *
     * @param string $encoding encoding name; values PHP considers empty ('' and '0') are rejected
     * @return $this
     * @throws \Exception if the encoding is empty
     */
    public function setEncoding(string $encoding): self
    {
        // reject empty values up front
        if(empty($encoding))
        {
            throw new Exception('Invalid encoding');
        }

        $this->encoding = $encoding;

        return $this;
    }
170
171
    /**
     * Get the callback used on sequential read
     *
     * @return callable|null the registered callback, or null when none has been set
     */
    public function getCallback(): ?callable
    {
        $callback = $this->callback;

        return $callback;
    }
178
179
    /**
     * Set the callback (any callable) to call on sequential read
     *
     * Closures, array callables and invokable objects are stored as given. A string
     * callable (function name or "Class::method") is wrapped in a closure that forwards
     * only the chunk to it.
     *
     * @param callable $callback callback that receives each chunk
     * @param bool     $append   enable or disable appending when using the callback
     * @return $this
     * @throws \Exception
     */
    public function setCallback(callable $callback, bool $append = true): self
    {
        $this->callbackAppend = $append;

        if(is_string($callback))
        {
            // wrap named functions so they are always called with the chunk as the only argument
            $this->callback = function($chunk) use ($callback)
            {
                return call_user_func_array($callback, [$chunk]);
            };
        }
        else
        {
            // the callable type hint guarantees an array callable or an object (Closure or
            // invokable); invokable objects used to be rejected with "Invalid callback"
            $this->callback = $callback;
        }

        return $this;
    }
206
207
    /**
     * Get the chunk size used on sequential read
     *
     * @return int the configured chunk size
     */
    public function getChunkSize(): int
    {
        $size = $this->chunkSize;

        return $size;
    }
214
215
    /**
     * Set the chunk size for sequential read
     *
     * Only accepted when the concrete client declares MODE as 'cli'.
     *
     * @param int $size chunk size
     * @return $this
     * @throws \Exception if the client mode is not 'cli'
     */
    public function setChunkSize(int $size): self
    {
        // static:: resolves the MODE constant of the concrete client class
        if(static::MODE != 'cli')
        {
            throw new Exception('Chunk size is not supported on web mode');
        }

        $this->chunkSize = $size;

        return $this;
    }
233
234
    /**
     * Get the remote download flag
     *
     * @return bool current value of the flag
     */
    public function getDownloadRemote(): bool
    {
        $flag = $this->downloadRemote;

        return $flag;
    }
241
242
    /**
     * Set the remote download flag
     *
     * @param bool $download new value of the flag
     * @return $this
     */
    public function setDownloadRemote(bool $download): self
    {
        // the parameter type already guarantees a boolean
        $this->downloadRemote = $download;

        return $this;
    }
251
252
    /**
     * Gets file metadata
     *
     * Performs a 'meta' request, decodes the JSON response and builds the metadata object from it.
     *
     * @param string $file file to analyze
     * @throws \Exception if the response can't be decoded or is not a JSON object
     */
    public function getMetadata(string $file): MetadataInterface
    {
        $raw = $this->request('meta', $file);

        // a falsy response is replaced by a non-JSON placeholder so that decoding fails
        $decoded = $this->parseJsonResponse($raw ?: 'ERROR');

        if(!($decoded instanceof stdClass))
        {
            throw new Exception("Unexpected metadata response for $file");
        }

        return Metadata::make($decoded, $file);
    }
268
269
    /**
     * Gets recursive file metadata as an array indexed by file name.
     *
     * Example: for a sample.zip containing an example.doc file, the returned array looks like:
     *
     *  [
     *      'sample.zip' => new Metadata()
     *      'sample.zip/example.doc' => new DocumentMetadata()
     *  ]
     *
     * Each key is the base name of $file followed by the item's
     * 'X-TIKA:embedded_resource_path' value when the item has one.
     *
     * @link https://cwiki.apache.org/confluence/display/TIKA/TikaServer#TikaServer-RecursiveMetadataandContent
     * @param string      $file   file to analyze
     * @param string|null $format one of 'text', 'html' or 'ignore'
     * @throws \Exception on unknown format or if the response is not a JSON array
     */
    public function getRecursiveMetadata(string $file, ?string $format = 'ignore'): array
    {
        // NOTE(review): the message mentions null as a valid value but null is rejected here — confirm intent
        $allowed = ['text', 'html', 'ignore'];

        if(!in_array($format, $allowed))
        {
            throw new Exception("Unknown recursive type (must be text, html, ignore or null)");
        }

        $raw = $this->request("rmeta/$format", $file);
        $items = $this->parseJsonResponse($raw ?: 'ERROR');

        if(!is_array($items))
        {
            throw new Exception("Unexpected metadata response for $file");
        }

        // the base name is the same for every item
        $base = basename($file);
        $embedded = 'X-TIKA:embedded_resource_path';
        $metadata = [];

        foreach($items as $item)
        {
            $key = isset($item->{$embedded}) ? $base . $item->{$embedded} : $base;

            $metadata[$key] = Metadata::make($item, $file);
        }

        return $metadata;
    }
311
312
    /**
     * Detect language
     *
     * @param string $file file to analyze
     * @return string|null result of the 'lang' request
     * @throws \Exception
     */
    public function getLanguage(string $file): ?string
    {
        $language = $this->request('lang', $file);

        return $language;
    }
321
322
    /**
     * Detect MIME type
     *
     * @param string $file file to analyze
     * @return string|null result of the 'mime' request
     * @throws \Exception
     */
    public function getMIME(string $file): ?string
    {
        $mime = $this->request('mime', $file);

        return $mime;
    }
331
332
    /**
     * Extracts HTML
     *
     * A given callback is registered through setCallback() before the request is made
     * and is not reset by this method afterwards.
     *
     * @param string        $file     file to analyze
     * @param callable|null $callback optional callback for sequential read
     * @param bool          $append   enable or disable appending when using the callback
     * @return string|null result of the 'html' request
     * @throws \Exception
     */
    public function getHTML(string $file, ?callable $callback = null, bool $append = true): ?string
    {
        if($callback !== null)
        {
            $this->setCallback($callback, $append);
        }

        $html = $this->request('html', $file);

        return $html;
    }
346
347
    /**
     * Extracts XHTML
     *
     * A given callback is registered through setCallback() before the request is made
     * and is not reset by this method afterwards.
     *
     * @param string        $file     file to analyze
     * @param callable|null $callback optional callback for sequential read
     * @param bool          $append   enable or disable appending when using the callback
     * @return string|null result of the 'xhtml' request
     * @throws \Exception
     */
    public function getXHTML(string $file, ?callable $callback = null, bool $append = true): ?string
    {
        if($callback !== null)
        {
            $this->setCallback($callback, $append);
        }

        $xhtml = $this->request('xhtml', $file);

        return $xhtml;
    }
361
362
    /**
     * Extracts text
     *
     * A given callback is registered through setCallback() before the request is made
     * and is not reset by this method afterwards.
     *
     * @param string        $file     file to analyze
     * @param callable|null $callback optional callback for sequential read
     * @param bool          $append   enable or disable appending when using the callback
     * @return string|null result of the 'text' request
     * @throws \Exception
     */
    public function getText(string $file, ?callable $callback = null, bool $append = true): ?string
    {
        if($callback !== null)
        {
            $this->setCallback($callback, $append);
        }

        $text = $this->request('text', $file);

        return $text;
    }
376
377
    /**
     * Extracts main text
     *
     * A given callback is registered through setCallback() before the request is made
     * and is not reset by this method afterwards.
     *
     * @param string        $file     file to analyze
     * @param callable|null $callback optional callback for sequential read
     * @param bool          $append   enable or disable appending when using the callback
     * @return string|null result of the 'text-main' request
     * @throws \Exception
     */
    public function getMainText(string $file, ?callable $callback = null, bool $append = true): ?string
    {
        if($callback !== null)
        {
            $this->setCallback($callback, $append);
        }

        $text = $this->request('text-main', $file);

        return $text;
    }
391
392
    /**
     * Returns current Tika version
     *
     * The version is requested once and then kept on the instance. If the request yields
     * no version, null is returned and the next call tries again.
     *
     * @return string|null the version, or null when it could not be obtained
     * @throws \Exception
     */
    public function getVersion(): ?string
    {
        if($this->version === null)
        {
            $version = $this->request('version');

            // request() may return null, which setVersion(string) would reject with a TypeError
            if($version !== null)
            {
                $this->setVersion($version);
            }
        }

        return $this->version;
    }
406
407
    /**
     * Set the Tika version
     *
     * Also marks the instance as checked.
     *
     * @param string $version version to store
     * @return $this
     */
    public function setVersion(string $version): self
    {
        $this->version = $version;
        $this->checked = true;

        return $this;
    }
417
418
    /**
     * Return the list of Apache Tika supported versions
     *
     * The list is read from the "extra" > "supported-versions" entry of the composer.json
     * file located one directory above this file, and memoised in a static variable.
     *
     * @return array non-empty list of supported versions
     * @throws \Exception if composer.json can't be read or holds no usable list
     */
    public function getSupportedVersions(): array
    {
        // only ever holds a validated list, so an invalid read is never served from the cache
        static $versions = null;

        if($versions === null)
        {
            $composer = file_get_contents(dirname(__DIR__) . '/composer.json');

            if($composer === false)
            {
                throw new Exception("An error ocurred trying to read package's composer.json file");
            }

            // ?? also covers invalid JSON (json_decode returns null) and missing keys
            $supported = json_decode($composer, true)['extra']['supported-versions'] ?? null;

            // validate before caching: an empty value assigned directly to the static
            // variable would be returned by the next call without any exception
            if(!is_array($supported) || empty($supported))
            {
                throw new Exception("An error ocurred trying to read package's composer.json file");
            }

            $versions = $supported;
        }

        return $versions;
    }
446
447
    /**
     * Sets the checked flag
     *
     * @param bool $checked new value of the flag
     * @return $this
     */
    public function setChecked(bool $checked): self
    {
        // the parameter type already guarantees a boolean
        $this->checked = $checked;

        return $this;
    }
456
457
    /**
     * Checks if instance is checked
     *
     * @return bool current value of the checked flag
     */
    public function isChecked(): bool
    {
        $checked = $this->checked;

        return $checked;
    }
464
465
    /**
     * Check if a response is cached
     *
     * Cache entries are keyed by the SHA-1 of the file name and then by request type.
     * A stored null value is reported as not cached.
     *
     * @param string $type request type
     * @param string $file file the response belongs to
     */
    protected function isCached(string $type, string $file): bool
    {
        $key = sha1($file);

        return isset($this->cache[$key][$type]);
    }
472
473
    /**
     * Get a cached response
     *
     * @param string $type request type
     * @param string $file file the response belongs to
     * @return mixed the cached value, or null when there is none
     */
    protected function getCachedResponse(string $type, string $file)
    {
        $key = sha1($file);

        return $this->cache[$key][$type] ?? null;
    }
482
483
    /**
     * Check if a request type must be cached
     *
     * @param string $type request type
     * @return bool true for 'lang' and 'meta' requests only
     */
    protected function isCacheable(string $type): bool
    {
        return $type === 'lang' || $type === 'meta';
    }
490
491
    /**
     * Caches a response
     *
     * The value is stored under the SHA-1 of the file name and the request type,
     * replacing any previous entry.
     *
     * @param string $type     request type
     * @param mixed  $response value to store
     * @param string $file     file the response belongs to
     * @return bool always true
     */
    protected function cacheResponse(string $type, $response, string $file): bool
    {
        $key = sha1($file);

        $this->cache[$key][$type] = $response;

        return true;
    }
502
503
    /**
     * Checks if a specific version is supported
     *
     * @param string $version version to look for, e.g. "1.24"
     * @return bool true if the exact version string is in the supported list
     * @throws \Exception
     */
    public function isVersionSupported(string $version): bool
    {
        // normalise to strings so the strict comparison below works for any stored scalar
        $supported = array_map('strval', $this->getSupportedVersions());

        // strict: a loose comparison treats numeric strings as numbers, so "1.10" would match "1.1"
        return in_array($version, $supported, true);
    }
510
511
    /**
     * Check if a mime type is supported
     *
     * The MIME type is looked up among the keys of the array returned by getSupportedMIMETypes().
     *
     * @param string $mime
     * @return bool
     * @throws \Exception
     */
    public function isMIMETypeSupported(string $mime): bool
    {
        $types = $this->getSupportedMIMETypes();

        return array_key_exists($mime, $types);
    }
522
523
    /**
     * Check the request before executing
     *
     * Returns the file to use for the request: the given one, or the path of a local copy
     * when the file is remote and the remote download flag is enabled.
     *
     * @param string      $type request type
     * @param string|null $file local path or URL
     * @return string|null
     * @throws \Exception if the local file doesn't exist or the remote file can't be reached
     */
    public function checkRequest(string $type, ?string $file = null): ?string
    {
        // no checks for getters, and nothing to check without a file
        $isGetter = in_array($type, ['detectors', 'mime-types', 'parsers', 'version']);

        if($isGetter || $file === null)
        {
            return $file;
        }

        // NOTE(review): any name starting with "http" is treated as remote, local files included — confirm
        $isRemote = (bool) preg_match('/^http/', $file);

        // local file: it must exist
        if(!$isRemote)
        {
            if(!file_exists($file))
            {
                throw new Exception("File $file can't be opened");
            }

            return $file;
        }

        // remote file handled by the integrated downloader
        if($this->downloadRemote)
        {
            return $this->downloadFile($file);
        }

        // remote file left to the caller: the first header line must contain "200"
        $headers = get_headers($file);

        if(empty($headers) || !preg_match('/200/', $headers[0]))
        {
            throw new Exception("File $file can't be opened", 2);
        }

        return $file;
    }
558
559
    /**
     * Filter response to fix common issues
     *
     * Removes the Log4j2 "getCallerClass is not supported" warning and trims
     * surrounding whitespace.
     *
     * @param string $response
     * @return string
     */
    protected function filterResponse(string $response): string
    {
        // Log4j2 warning that can appear in the output
        $warning = 'WARNING: sun.reflect.Reflection.getCallerClass is not supported. This will impact performance.';

        $cleaned = str_replace($warning, '', $response);

        return trim($cleaned);
    }
577
578
    /**
     * Parse the response returned by Apache Tika
     *
     * @param string $response raw JSON text
     * @return mixed the decoded value (objects are decoded as stdClass)
     * @throws \Exception if the response is blank or is not valid JSON; for invalid JSON
     *                    the exception code is the json_last_error() value
     */
    protected function parseJsonResponse(string $response)
    {
        // a blank response throws an error; empty() is avoided because it also rejects "0"
        if(trim($response) === '')
        {
            throw new Exception('Empty response');
        }

        // decode the JSON response
        $json = json_decode($response);

        // exception if the response is not valid JSON
        $error = json_last_error();

        if($error !== JSON_ERROR_NONE)
        {
            throw new Exception(json_last_error_msg(), $error);
        }

        return $json;
    }
605
606
    /**
     * Download file to a temporary folder and return its path
     *
     * The file is fetched with cURL (following redirects, 5 second timeout) into a file
     * created with tempnam(). On any failure the temporary file is removed before throwing.
     *
     * @link https://wiki.apache.org/tika/TikaJAXRS#Specifying_a_URL_Instead_of_Putting_Bytes
     * @param string $file URL to download
     * @return string path of the downloaded copy
     * @throws \Exception if the temporary file can't be created or the download fails; for a
     *                    non-200 response the exception code is the HTTP status
     */
    protected function downloadFile(string $file): string
    {
        $dest = tempnam(sys_get_temp_dir(), 'TIKA');

        if($dest === false)
        {
            throw new Exception("Can't create a temporary file at " . sys_get_temp_dir());
        }

        $fp = fopen($dest, 'w+');

        if($fp === false)
        {
            unlink($dest);

            throw new Exception("$dest can't be opened");
        }

        $ch = curl_init($file);

        if($ch === false)
        {
            fclose($fp);
            unlink($dest);

            throw new Exception("$file can't be downloaded");
        }

        curl_setopt($ch, CURLOPT_FILE, $fp);
        curl_setopt($ch, CURLOPT_TIMEOUT, 5);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_exec($ch);

        // collect the outcome first so both handles are released on every path
        $errno = curl_errno($ch);
        $error = curl_error($ch);
        $code = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);

        curl_close($ch);
        fclose($fp);

        if($errno)
        {
            unlink($dest);

            throw new Exception($error);
        }

        if($code !== 200)
        {
            unlink($dest);

            throw new Exception("$file can't be downloaded", $code);
        }

        return $dest;
    }
656
657
    /**
658
     * Must return the supported MIME types
659
     *
660
     * @throws \Exception
661
     */
662
    abstract public function getSupportedMIMETypes(): array;
663
664
    /**
665
     * Must return the available detectors
666
     *
667
     * @throws \Exception
668
     */
669
    abstract public function getAvailableDetectors(): array;
670
671
    /**
672
     * Must return the available parsers
673
     *
674
     * @throws \Exception
675
     */
676
    abstract public function getAvailableParsers(): array;
677
678
    /**
679
     * Check Java binary, JAR path or server connection
680
     */
681
    abstract public function check(): void;
682
683
    /**
684
     * Configure and make a request and return its results.
685
     *
686
     * @throws \Exception
687
     */
688
    abstract public function request(string $type, ?string $file = null): ?string;
689
}
690