Passed
Push — master ( f99a92...dcc276 )
by David
02:43
created

Client::getXHTML()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 8
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 3
c 0
b 0
f 0
dl 0
loc 8
rs 10
cc 2
nc 2
nop 3
1
<?php
2
3
namespace Vaites\ApacheTika;
4
5
use Closure;
6
use Exception;
7
use stdClass;
8
9
use Vaites\ApacheTika\Clients\CLIClient;
10
use Vaites\ApacheTika\Clients\WebClient;
11
use Vaites\ApacheTika\Metadata\Metadata;
12
use Vaites\ApacheTika\Metadata\MetadataInterface;
13
14
/**
15
 * Apache Tika client interface
16
 *
17
 * @author  David Martínez <[email protected]>
18
 * @link    https://tika.apache.org/1.24/formats.html
19
 */
20
abstract class Client
21
{
22
    protected const MODE = null;
23
24
    /**
25
     * Checked flag
26
     *
27
     * @var bool
28
     */
29
    protected $checked = false;
30
31
    /**
32
     * Response using callbacks
33
     *
34
     * @var string|null
35
     */
36
    protected $response = null;
37
38
    /**
39
     * Platform (unix or win)
40
     *
41
     * @var string
42
     */
43
    protected $platform = null;
44
45
    /**
46
     * Cached responses to avoid multiple requests for the same file.
47
     *
48
     * @var array
49
     */
50
    protected $cache = [];
51
52
    /**
53
     * Text encoding
54
     *
55
     * @var string|null
56
     */
57
    protected $encoding = null;
58
59
    /**
60
     * Callback called on sequential read
61
     *
62
     * @var callable|null
63
     */
64
    protected $callback = null;
65
66
    /**
67
     * Enable or disable appending when using callback
68
     *
69
     * @var bool
70
     */
71
    protected $callbackAppend = true;
72
73
    /**
74
     * Size of chunks for callback
75
     *
76
     * @var int
77
     */
78
    protected $chunkSize = 1048576;
79
80
    /**
81
     * Remote download flag
82
     *
83
     * @var bool
84
     */
85
    protected $downloadRemote = false;
86
87
    /**
     * Initialise the client by detecting the host platform
     *
     * The platform is stored as 'win' when the PHP_WINDOWS_VERSION_MAJOR constant is defined
     * and as 'unix' otherwise.
     */
    public function __construct()
    {
        if(defined('PHP_WINDOWS_VERSION_MAJOR'))
        {
            $this->platform = 'win';
        }
        else
        {
            $this->platform = 'unix';
        }
    }
94
95
    /**
     * Get a class instance throwing an exception if check fails
     *
     * A CLI client is built when the first argument ends with ".jar"; any other value
     * (including null or no argument at all) produces a web client.
     *
     * @param string|null     $param1 path or host
     * @param string|int|null $param2 Java binary path or port for web client
     * @param array           $options options for cURL request (only passed to the web client)
     * @param bool            $check check JAR file or server connection
     * @return \Vaites\ApacheTika\Clients\CLIClient|\Vaites\ApacheTika\Clients\WebClient
     * @throws \Exception
     */
    public static function make(?string $param1 = null, $param2 = null, array $options = [], bool $check = true): Client
    {
        // read the parameter itself: func_get_arg(0) fails when make() is called without arguments
        // and preg_match() must not receive null
        if($param1 !== null && preg_match('/\.jar$/', $param1))
        {
            return new CLIClient($param1, $param2, $check);
        }

        return new WebClient($param1, $param2, $options, $check);
    }
116
117
    /**
     * Get a class instance delaying the check
     *
     * Delegates to make() with the check flag set to false.
     *
     * @param string|null     $param1 path or host
     * @param string|int|null $param2 Java binary path or port for web client
     * @param array           $options options for cURL request
     * @return \Vaites\ApacheTika\Clients\CLIClient|\Vaites\ApacheTika\Clients\WebClient
     * @throws \Exception
     */
    public static function prepare($param1 = null, $param2 = null, $options = []): Client
    {
        $check = false;

        return self::make($param1, $param2, $options, $check);
    }
130
131
    /**
     * Get the configured text encoding
     *
     * @return string|null the encoding, or null when none has been set
     */
    public function getEncoding(): ?string
    {
        return $this->encoding;
    }
138
139
    /**
     * Set the text encoding
     *
     * Values considered empty by PHP (such as '' or '0') are rejected.
     *
     * @return $this
     * @throws \Exception if the encoding is empty
     */
    public function setEncoding(string $encoding): self
    {
        if(empty($encoding))
        {
            throw new Exception('Invalid encoding');
        }

        $this->encoding = $encoding;

        return $this;
    }
157
158
    /**
     * Get the callback used on sequential read
     *
     * @return \Closure|null the stored callback, or null when none has been set
     */
    public function getCallback(): ?Closure
    {
        return $this->callback;
    }
165
166
    /**
     * Set the callback (callable or closure) to call on sequential read
     *
     * The callback is always stored as a Closure so that getCallback() can honour its
     * ?Closure return type:
     *
     *  - closures are stored untouched
     *  - function names (strings) are wrapped so that they only receive the chunk
     *  - any other callable (array callables, invokable objects) is converted
     *
     * @param callable $callback callable invoked with each chunk
     * @param bool     $append   value stored in the callbackAppend flag
     * @return $this
     */
    public function setCallback(callable $callback, bool $append = true): self
    {
        if($callback instanceof Closure)
        {
            $closure = $callback;
        }
        elseif(is_string($callback))
        {
            $closure = function($chunk) use ($callback)
            {
                return call_user_func_array($callback, [$chunk]);
            };
        }
        else
        {
            // storing an array here would make getCallback() fail with a TypeError
            $closure = Closure::fromCallable($callback);
        }

        $this->callbackAppend = $append;
        $this->callback = $closure;

        return $this;
    }
193
194
    /**
     * Get the size of the chunks used for sequential read
     *
     * @return int the configured chunk size (1048576 unless changed)
     */
    public function getChunkSize(): int
    {
        return $this->chunkSize;
    }
201
202
    /**
     * Set the chunk size for sequential read
     *
     * Only allowed when the MODE constant is 'cli'. MODE is read through late static binding,
     * so a subclass that overrides the constant decides the outcome.
     *
     * @return $this
     * @throws \Exception if the mode is not 'cli'
     */
    public function setChunkSize(int $size): self
    {
        if(static::MODE != 'cli')
        {
            throw new Exception('Chunk size is not supported on web mode');
        }

        $this->chunkSize = $size;

        return $this;
    }
220
221
    /**
     * Get the remote download flag
     *
     * @return bool true when checkRequest() downloads remote files through downloadFile()
     */
    public function getDownloadRemote(): bool
    {
        return $this->downloadRemote;
    }
228
229
    /**
     * Set the remote download flag
     *
     * @param bool $download new value of the flag
     * @return $this
     */
    public function setDownloadRemote(bool $download): self
    {
        // the parameter is already typed as bool, no cast is needed
        $this->downloadRemote = $download;

        return $this;
    }
238
239
    /**
     * Gets file metadata
     *
     * Performs a 'meta' request, decodes the JSON response and builds the metadata
     * instance through Metadata::make().
     *
     * @throws \Exception if the decoded response is not an object
     */
    public function getMetadata(string $file): MetadataInterface
    {
        $raw = $this->request('meta', $file);
        $decoded = $this->parseJsonResponse($raw);

        if(!($decoded instanceof stdClass))
        {
            throw new Exception("Unexpected metadata response for $file");
        }

        return Metadata::make($decoded, $file);
    }
255
256
    /**
     * Gets recursive file metadata where the returned array indexes are the file names.
     *
     * Example: for a sample.zip with an example.doc file, the returned array looks as if it were defined as:
     *
     *  [
     *      'sample.zip' => new Metadata()
     *      'sample.zip/example.doc' => new DocumentMetadata()
     *  ]
     *
     * @link https://cwiki.apache.org/confluence/display/TIKA/TikaServer#TikaServer-RecursiveMetadataandContent
     *
     * @param string      $file   file to analyse
     * @param string|null $format one of text, html or ignore; null falls back to ignore
     * @return array metadata instances indexed by file name
     * @throws \Exception if the format is unknown or the response is not a list
     */
    public function getRecursiveMetadata(string $file, ?string $format = 'ignore'): array
    {
        // null is accepted by the signature and named in the error message, so it is treated as the default
        $format = $format ?? 'ignore';

        if(!in_array($format, ['text', 'html', 'ignore'], true))
        {
            throw new Exception("Unknown recursive type (must be text, html, ignore or null)");
        }

        $response = $this->parseJsonResponse($this->request("rmeta/$format", $file));

        if(!is_array($response))
        {
            throw new Exception("Unexpected metadata response for $file");
        }

        // the base name is the same for every item, compute it once
        $base = basename($file);
        $metadata = [];

        foreach($response as $item)
        {
            $name = $base;

            // embedded resources append their own path to the container name
            if(isset($item->{'X-TIKA:embedded_resource_path'}))
            {
                $name .= $item->{'X-TIKA:embedded_resource_path'};
            }

            $metadata[$name] = Metadata::make($item, $file);
        }

        return $metadata;
    }
298
299
    /**
     * Detect the language of a file
     *
     * @return string result of the 'lang' request
     * @throws \Exception
     */
    public function getLanguage(string $file): string
    {
        $language = $this->request('lang', $file);

        return $language;
    }
308
309
    /**
     * Detect the MIME type of a file
     *
     * @return string result of the 'mime' request
     * @throws \Exception
     */
    public function getMIME(string $file): string
    {
        $mime = $this->request('mime', $file);

        return $mime;
    }
318
319
    /**
     * Extracts HTML
     *
     * When a callback is given it is registered through setCallback() before the request is made.
     *
     * @param string        $file     file to extract
     * @param callable|null $callback optional callback for sequential read
     * @param bool          $append   append flag forwarded to setCallback()
     * @return string result of the 'html' request
     * @throws \Exception
     */
    public function getHTML(string $file, ?callable $callback = null, bool $append = true): string
    {
        if($callback !== null)
        {
            $this->setCallback($callback, $append);
        }

        return $this->request('html', $file);
    }
333
334
    /**
     * Extracts XHTML
     *
     * When a callback is given it is registered through setCallback() before the request is made.
     *
     * @param string        $file     file to extract
     * @param callable|null $callback optional callback for sequential read
     * @param bool          $append   append flag forwarded to setCallback()
     * @return string result of the 'xhtml' request
     * @throws \Exception
     */
    public function getXHTML(string $file, ?callable $callback = null, bool $append = true): string
    {
        if($callback !== null)
        {
            $this->setCallback($callback, $append);
        }

        return $this->request('xhtml', $file);
    }
348
349
    /**
     * Extracts text
     *
     * When a callback is given it is registered through setCallback() before the request is made.
     *
     * @param string        $file     file to extract
     * @param callable|null $callback optional callback for sequential read
     * @param bool          $append   append flag forwarded to setCallback()
     * @return string result of the 'text' request
     * @throws \Exception
     */
    public function getText(string $file, ?callable $callback = null, bool $append = true): string
    {
        if($callback !== null)
        {
            $this->setCallback($callback, $append);
        }

        return $this->request('text', $file);
    }
363
364
    /**
     * Extracts main text
     *
     * When a callback is given it is registered through setCallback() before the request is made.
     *
     * @param string        $file     file to extract
     * @param callable|null $callback optional callback for sequential read
     * @param bool          $append   append flag forwarded to setCallback()
     * @return string result of the 'text-main' request
     * @throws \Exception
     */
    public function getMainText(string $file, ?callable $callback = null, bool $append = true): string
    {
        if($callback !== null)
        {
            $this->setCallback($callback, $append);
        }

        return $this->request('text-main', $file);
    }
378
379
    /**
     * Returns current Tika version
     *
     * @return string result of the 'version' request
     * @throws \Exception
     */
    public function getVersion(): string
    {
        $version = $this->request('version');

        return $version;
    }
388
389
    /**
     * Return the list of Apache Tika supported versions
     *
     * The list is read from the "extra.supported-versions" key of the composer.json file located
     * in the parent directory of this file and is memoised for later calls.
     *
     * @return array the supported versions
     * @throws \Exception if the file can't be read or the list is missing, empty or not an array
     */
    public function getSupportedVersions(): array
    {
        static $versions = null;

        if(is_null($versions))
        {
            $path = dirname(__DIR__) . '/composer.json';
            $contents = is_readable($path) ? file_get_contents($path) : false;
            $composer = $contents === false ? null : json_decode($contents, true);
            $supported = is_array($composer) ? ($composer['extra']['supported-versions'] ?? null) : null;

            // validate before memoising, otherwise a later call would return the invalid value silently
            if(empty($supported) || !is_array($supported))
            {
                throw new Exception("An error ocurred trying to read package's composer.json file");
            }

            $versions = $supported;
        }

        return $versions;
    }
411
412
    /**
     * Sets the checked flag
     *
     * @param bool $checked new value of the flag
     * @return $this
     */
    public function setChecked(bool $checked): self
    {
        // the parameter is already typed as bool, no cast is needed
        $this->checked = $checked;

        return $this;
    }
421
422
    /**
     * Checks if instance is checked
     *
     * @return bool current value of the checked flag (false until setChecked() changes it)
     */
    public function isChecked(): bool
    {
        return $this->checked;
    }
429
430
    /**
     * Check if a response is cached
     *
     * Cache entries are indexed by the SHA-1 hash of the file name and then by request type.
     */
    protected function isCached(string $type, string $file): bool
    {
        $key = sha1($file);

        return isset($this->cache[$key][$type]);
    }
437
438
    /**
     * Get a cached response
     *
     * @return mixed the cached response, or null when there is no entry for the file and type
     */
    protected function getCachedResponse(string $type, string $file)
    {
        $key = sha1($file);

        return $this->cache[$key][$type] ?? null;
    }
445
446
    /**
     * Check if a request type must be cached
     *
     * Only the 'lang' and 'meta' request types are cacheable.
     */
    protected function isCacheable(string $type): bool
    {
        return $type === 'lang' || $type === 'meta';
    }
453
454
    /**
     * Caches a response
     *
     * The response is stored under the SHA-1 hash of the file name and the request type.
     *
     * @param mixed $response value to store
     * @return bool always true
     */
    protected function cacheResponse(string $type, $response, string $file): bool
    {
        $key = sha1($file);

        $this->cache[$key][$type] = $response;

        return true;
    }
463
464
    /**
     * Checks if a specific version is supported
     *
     * Versions are compared as strings: a loose comparison treats numeric strings as numbers,
     * so '1.1' would wrongly match '1.10'.
     *
     * @throws \Exception if the supported versions can't be read
     */
    public function isVersionSupported(string $version): bool
    {
        $supported = array_map('strval', $this->getSupportedVersions());

        return in_array($version, $supported, true);
    }
471
472
    /**
     * Check if a mime type is supported
     *
     * The MIME type is looked up among the keys of the array returned by getSupportedMIMETypes().
     *
     * @param string $mime
     * @return bool
     * @throws \Exception
     */
    public function isMIMETypeSupported(string $mime): bool
    {
        $types = $this->getSupportedMIMETypes();

        return array_key_exists($mime, $types);
    }
483
484
    /**
     * Check the request before executing
     *
     * Getter requests (detectors, mime-types, parsers and version) are not checked. For any other
     * type the file must exist locally or, when its name starts with "http", the first header line
     * returned by get_headers() must contain "200". Remote files are downloaded to a temporary
     * file when the remote download flag is enabled.
     *
     * @param string      $type request type
     * @param string|null $file local path or remote URL
     * @return string|null the file to use for the request (the temporary file for downloaded remote files)
     * @throws \Exception if the file can't be opened (code 2 for remote files) or downloaded
     */
    public function checkRequest(string $type, ?string $file = null): ?string
    {
        // no checks for getters
        if(in_array($type, ['detectors', 'mime-types', 'parsers', 'version'], true))
        {
            return $file;
        }

        // a missing file is handled as an empty path, which never exists
        $path = (string) $file;

        // NOTE(review): any name starting with "http" is considered remote, including local files
        $remote = preg_match('/^http/', $path) === 1;

        // invalid local file
        if(!$remote)
        {
            if(!file_exists($path))
            {
                throw new Exception("File $path can't be opened");
            }

            return $path;
        }

        // invalid remote file: get_headers() returns false when the request fails
        $headers = get_headers($path);
        $status = is_array($headers) ? (string) ($headers[0] ?? '') : '';

        if(!preg_match('/200/', $status))
        {
            throw new Exception("File $path can't be opened", 2);
        }

        // download remote file if required only for integrated downloader
        return $this->downloadRemote ? $this->downloadFile($path) : $path;
    }
511
512
    /**
     * Parse the response returned by Apache Tika
     *
     * @return mixed the decoded JSON value
     * @throws \Exception if the response is empty or is not valid JSON (the code is the JSON error code)
     */
    protected function parseJsonResponse(string $response)
    {
        // an empty response throws an error
        $blank = empty($response) || trim($response) == '';

        if($blank)
        {
            throw new Exception('Empty response');
        }

        // decode the JSON response
        $decoded = json_decode($response);
        $error = json_last_error();

        // exceptions if metadata is not valid
        if($error)
        {
            if(function_exists('json_last_error_msg'))
            {
                $message = json_last_error_msg();
            }
            else
            {
                $message = 'Error parsing JSON response';
            }

            throw new Exception($message, $error);
        }

        return $decoded;
    }
539
540
    /**
     * Download file to a temporary folder
     *
     * The file is written to a temporary file created with the "TIKA" prefix, using cURL with a
     * timeout of 5 seconds. Both handles are always released and the temporary file is removed
     * when the download fails.
     *
     * @link https://wiki.apache.org/tika/TikaJAXRS#Specifying_a_URL_Instead_of_Putting_Bytes
     *
     * @param string $file URL of the remote file
     * @return string path of the temporary file
     * @throws \Exception if the temporary file or the cURL handle can't be created, if cURL reports an
     *                    error or if the HTTP code is not 200 (the code is used as exception code)
     */
    protected function downloadFile(string $file): string
    {
        $dest = tempnam(sys_get_temp_dir(), 'TIKA');

        if($dest === false)
        {
            throw new Exception("Temporary file can't be created");
        }

        $fp = fopen($dest, 'w+');

        if($fp === false)
        {
            $this->removeTemporaryFile($dest);

            throw new Exception("$dest can't be opened");
        }

        $ch = curl_init($file);

        if($ch === false)
        {
            fclose($fp);
            $this->removeTemporaryFile($dest);

            throw new Exception("$file can't be downloaded");
        }

        curl_setopt($ch, CURLOPT_FILE, $fp);
        curl_setopt($ch, CURLOPT_TIMEOUT, 5);
        curl_exec($ch);

        // collect the outcome first so that both handles can be released before any exception is thrown
        $errno = curl_errno($ch);
        $error = curl_error($ch);
        $code = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);

        curl_close($ch);
        fclose($fp);

        if($errno)
        {
            $this->removeTemporaryFile($dest);

            throw new Exception($error);
        }

        if($code != 200)
        {
            $this->removeTemporaryFile($dest);

            throw new Exception("$file can't be downloaded", $code);
        }

        return $dest;
    }

    /**
     * Remove a temporary file left by a failed download
     */
    private function removeTemporaryFile(string $path): void
    {
        if(is_file($path))
        {
            unlink($path);
        }
    }
578
579
    /**
     * Must return the supported MIME types
     *
     * isMIMETypeSupported() looks MIME types up among the keys of the returned array.
     *
     * @return array
     * @throws \Exception
     */
    abstract public function getSupportedMIMETypes(): array;
585
586
    /**
     * Must return the available detectors
     *
     * @return array
     * @throws \Exception
     */
    abstract public function getAvailableDetectors(): array;
592
593
    /**
     * Must return the available parsers
     *
     * @return array
     * @throws \Exception
     */
    abstract public function getAvailableParsers(): array;
599
600
    /**
     * Must check the Java binary, the JAR path or the server connection
     *
     * @return void
     */
    abstract public function check(): void;
604
605
    /**
     * Configure and make a request and return its results.
     *
     * @param string      $type request type (for example 'meta', 'lang', 'mime', 'html', 'text' or 'version')
     * @param string|null $file file to process; omitted by requests such as 'version'
     * @return string raw response
     * @throws \Exception
     */
    abstract public function request(string $type, ?string $file = null): string;
611
}
612