CLIClient::getVersion()   B
last analyzed

Complexity

Conditions 7
Paths 4

Size

Total Lines 22
Code Lines 9

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 9
c 0
b 0
f 0
dl 0
loc 22
rs 8.8333
cc 7
nc 4
nop 0
1
<?php
2
3
namespace Vaites\ApacheTika\Clients;
4
5
use Exception;
6
use ZipArchive;
7
8
use Vaites\ApacheTika\Client;
9
10
/**
11
 * Apache Tika command line interface client
12
 *
13
 * @author  David Martínez <[email protected]>
14
 * @link    https://tika.apache.org/1.23/gettingstarted.html#Using_Tika_as_a_command_line_utility
15
 */
16
class CLIClient extends Client
{
    protected const MODE = 'cli';

    /**
     * Apache Tika app path (null until configured)
     *
     * @var string|null
     */
    protected $path = null;

    /**
     * Java binary path (null means the plain "java" command is used)
     *
     * @var string|null
     */
    protected $java = null;

    /**
     * Java arguments appended at the end of the command line
     *
     * @var string|null
     */
    protected $javaArgs = null;

    /**
     * Environment variables merged over the current environment when running commands
     *
     * @var array
     */
    protected $envVars = [];

    /**
     * Configure client
     *
     * @param string|null $path  Apache Tika app JAR path, ignored when empty
     * @param string|null $java  Java binary path, ignored when empty
     * @param bool        $check whether to run check() right away
     *
     * @throws \Exception
     */
    public function __construct(?string $path = null, ?string $java = null, bool $check = true)
    {
        parent::__construct();

        if($path)
        {
            $this->setPath($path);
        }

        if($java)
        {
            $this->setJava($java);
        }

        if($check === true)
        {
            $this->check();
        }
    }

    /**
     * Get the path
     */
    public function getPath(): ?string
    {
        return $this->path;
    }

    /**
     * Set the path
     */
    public function setPath(string $path): self
    {
        $this->path = $path;

        return $this;
    }

    /**
     * Get the Java path
     */
    public function getJava(): ?string
    {
        return $this->java;
    }

    /**
     * Set the Java path
     */
    public function setJava(string $java): self
    {
        $this->java = $java;

        return $this;
    }

    /**
     * Get the Java arguments
     */
    public function getJavaArgs(): ?string
    {
        return $this->javaArgs;
    }

    /**
     * Set the Java arguments
     *
     * NOTE: to modify child process jvm args, prepend "J" to each argument (-JXmx4g)
     */
    public function setJavaArgs(string $args): self
    {
        $this->javaArgs = $args;

        return $this;
    }

    /**
     * Get the environment variables
     */
    public function getEnvVars(): array
    {
        return $this->envVars;
    }

    /**
     * Set the environment variables
     */
    public function setEnvVars(array $variables): self
    {
        $this->envVars = $variables;

        return $this;
    }

    /**
     * Returns current Tika version
     *
     * The version is read from the "Implementation-Version" entry of the JAR manifest when the
     * zip extension is loaded and the JAR can be opened; otherwise a "version" request is made.
     *
     * @throws \Exception
     */
    public function getVersion(): string
    {
        $manifest = [];

        if(class_exists(ZipArchive::class) && $this->path !== null && file_exists($this->path))
        {
            $zip = new ZipArchive();

            // open() returns true on success and an integer error code (which is truthy) on failure
            if($zip->open($this->path) === true)
            {
                $content = $zip->getFromName('META-INF/MANIFEST.MF') ?: 'ERROR';
                $zip->close();

                if(preg_match_all('/(.+):\s+(.+)\r?\n/U', $content, $match))
                {
                    foreach($match[1] as $index => $key)
                    {
                        $manifest[$key] = $match[2][$index];
                    }
                }
            }
        }

        return $manifest['Implementation-Version'] ?? $this->request('version');
    }

    /**
     * Returns the supported MIME types
     *
     * NOTE: the data provided by the CLI must be parsed: mime type has no spaces, aliases go next prefixed with spaces
     *
     * @return array MIME type => attributes, where "alias" is always a list
     *
     * @throws \Exception
     */
    public function getSupportedMIMETypes(): array
    {
        $mime = null;
        $mimeTypes = [];

        $response = preg_split("/\n/", $this->request('mime-types')) ?: [];

        foreach($response as $line)
        {
            // a line starting with a word character declares a new MIME type
            if(preg_match('/^\w+/', $line))
            {
                $mime = trim($line);
                $mimeTypes[$mime] = ['alias' => []];

                continue;
            }

            $line = trim($line);

            // skip blank lines and attributes that appear before any MIME type
            if($line === '' || $mime === null)
            {
                continue;
            }

            // limit to two parts so a value containing ": " is not truncated
            $parts = preg_split('/:\s+/', $line, 2) ?: [];

            if(count($parts) < 2)
            {
                continue;
            }

            [$key, $value] = $parts;

            if($key === 'alias')
            {
                $mimeTypes[$mime]['alias'][] = $value;
            }
            else
            {
                $mimeTypes[$mime][$key] = $value;
            }
        }

        return $mimeTypes;
    }

    /**
     * Returns the available detectors
     *
     * @return array composite detector name => ['children' => [...], 'composite' => true, 'name' => ...]
     *
     * @throws \Exception
     */
    public function getAvailableDetectors(): array
    {
        $lines = preg_split("/\n/", $this->request('detectors')) ?: [];

        return $this->parseCompositeList($lines);
    }

    /**
     * Returns the available parsers
     *
     * @return array composite parser name => ['children' => [...], 'composite' => true, 'name' => ..., 'decorated' => false]
     *
     * @throws \Exception
     */
    public function getAvailableParsers(): array
    {
        $lines = preg_split("/\n/", $this->request('parsers')) ?: [];

        // the first line of the output is discarded
        array_shift($lines);

        return $this->parseCompositeList($lines, ['decorated' => false]);
    }

    /**
     * Group the lines of a listing into composite items and their children
     *
     * A line containing "composite" (case-insensitive) starts a new group; every other non-blank
     * line becomes a child of the most recent group.
     *
     * @param array $lines raw output lines
     * @param array $extra additional key/value pairs appended to every item
     */
    private function parseCompositeList(array $lines, array $extra = []): array
    {
        $items = [];
        $parent = '';

        foreach($lines as $line)
        {
            // blank lines would otherwise be registered as children with an empty name
            if(trim($line) === '')
            {
                continue;
            }

            if(preg_match('/composite/i', $line))
            {
                $parent = trim(preg_replace('/\(.+\):/', '', $line) ?: '');
                $items[$parent] = ['children' => [], 'composite' => true, 'name' => $parent] + $extra;
            }
            else
            {
                $child = trim($line);
                $items[$parent]['children'][$child] = ['composite' => false, 'name' => $child] + $extra;
            }
        }

        return $items;
    }

    /**
     * Check Java binary, JAR path or server connection
     *
     * Does nothing when the client is already marked as checked.
     *
     * @throws \Exception
     */
    public function check(): void
    {
        if($this->isChecked() === false)
        {
            // Java command must not return an error
            try
            {
                $this->exec(($this->java ?: 'java') . ' -version');
            }
            catch(Exception $exception)
            {
                // keep the original failure available through getPrevious()
                throw new Exception('Java command not found', 0, $exception);
            }

            // JAR path must be set and must exist
            if($this->path === null || file_exists($this->path) === false)
            {
                throw new Exception('Apache Tika app JAR not found');
            }

            $this->setChecked(true);
        }
    }

    /**
     * Configure and make a request and return its results
     *
     * @param string      $type request type (see getArguments() for the accepted values)
     * @param string|null $file file to process, if the request needs one
     *
     * @throws \Exception
     */
    public function request(string $type, ?string $file = null): string
    {
        // check if not checked
        $this->check();

        // check if is cached
        if($file !== null && $this->isCached($type, $file))
        {
            $cached = $this->getCachedResponse($type, $file);

            // a null cached value cannot be returned as string, so run a fresh request instead
            if($cached !== null)
            {
                return $cached;
            }
        }

        // command arguments
        $arguments = $this->getArguments($type, $file);

        // check the request
        $file = $this->checkRequest($type, $file);

        // add last argument
        if($file)
        {
            $arguments[] = escapeshellarg($file);
        }

        // build command
        $jar = escapeshellarg($this->path);
        $command = trim(($this->java ?: 'java') . " -jar $jar " . implode(' ', $arguments) . " {$this->javaArgs}");

        // run command
        $response = $this->exec($command);

        // error if command fails
        if($response === null)
        {
            throw new Exception('An error occurred running Java command');
        }

        // metadata response ("rmeta/..." types are matched by the part before the slash)
        if($file !== null && in_array(preg_replace('/\/.+/', '', $type), ['meta', 'rmeta'], true))
        {
            // fix for invalid? json returned only with images
            $response = str_replace(basename($file) . '"}{', '", ', $response);

            // on Windows, response must be encoded to UTF8
            // NOTE(review): utf8_encode() is deprecated since PHP 8.2 - replace it once mbstring can be required
            if(version_compare($this->getVersion(), '2.1.0', '<'))
            {
                $response = $this->platform === 'win' ? utf8_encode($response) : $response;
            }
        }

        // cache certain responses
        if($file !== null && $this->isCacheable($type))
        {
            $this->cacheResponse($type, $response, $file);
        }

        return $this->filterResponse($response);
    }

    /**
     * Run the command and return its results
     *
     * Standard output is read in chunks, each chunk is passed to the configured callback (if any)
     * and appended to the response when callbackAppend is enabled. Standard error is appended to
     * "tika-error.log" in the system temporary directory.
     *
     * @throws \Exception if the process cannot be started or exits with a value greater than zero
     */
    public function exec(string $command): ?string
    {
        // get env variables for proc_open()
        $env = empty($this->envVars) ? null : array_merge(getenv(), $this->envVars);

        // run command
        $logfile = sys_get_temp_dir() . DIRECTORY_SEPARATOR . 'tika-error.log';
        $descriptors = [['pipe', 'r'], ['pipe', 'w'], ['file', $logfile, 'a']];
        $process = proc_open($command, $descriptors, $pipes, null, $env);
        $callback = $this->callback;

        // without a process there is no output: fail instead of returning a previous response
        if(!is_resource($process))
        {
            throw new Exception("Unable to run command $command");
        }

        fclose($pipes[0]);
        $this->response = '';

        // compare against false explicitly: a chunk containing just "0" is falsy but valid output
        while(($chunk = stream_get_line($pipes[1], $this->chunkSize)) !== false && $chunk !== '')
        {
            if(!is_null($callback))
            {
                $callback($chunk);
            }

            if($this->callbackAppend === true)
            {
                $this->response .= $chunk;
            }
        }

        fclose($pipes[1]);
        $exit = proc_close($process);

        // exception if exit value is not zero
        if($exit > 0)
        {
            throw new Exception("Unexpected exit value ($exit) for command $command");
        }

        return $this->filterResponse($this->response);
    }

    /**
     * Get the arguments to run the command
     *
     * The list starts with the --encoding option when an encoding is set, followed by the
     * option(s) matching the request type.
     *
     * @throws  Exception if the type is unknown
     */
    protected function getArguments(string $type, ?string $file = null): array
    {
        // request type => command line option(s)
        $options = [
            'html'          => '--html',
            'lang'          => '--language',
            'mime'          => '--detect',
            'meta'          => '--metadata --json',
            'text'          => '--text',
            'text-main'     => '--text-main',
            'mime-types'    => '--list-supported-types',
            'detectors'     => '--list-detectors',
            'parsers'       => '--list-parsers',
            'version'       => '--version',
            'rmeta/ignore'  => '--metadata --jsonRecursive',
            'rmeta/html'    => '--html --jsonRecursive',
            'rmeta/text'    => '--text --jsonRecursive',
            'xhtml'         => '--xml',
        ];

        if(!isset($options[$type]))
        {
            throw new Exception($file ? "Unknown type $type for $file" : "Unknown type $type");
        }

        $arguments = $this->encoding ? ["--encoding={$this->encoding}"] : [];
        $arguments[] = $options[$type];

        return $arguments;
    }
}
490