Passed
Push — master ( d3a7db...c8c09a )
by David
04:00 queued 10s
created

CLIClient::check()   A

Complexity

Conditions 5
Paths 4

Size

Total Lines 21
Code Lines 8

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 5
eloc 8
nc 4
nop 0
dl 0
loc 21
rs 9.6111
c 0
b 0
f 0
1
<?php
2
3
namespace Vaites\ApacheTika\Clients;
4
5
use Exception;
6
use ZipArchive;
7
8
use Vaites\ApacheTika\Client;
9
10
/**
11
 * Apache Tika command line interface client
12
 *
13
 * @author  David Martínez <[email protected]>
14
 * @link    https://tika.apache.org/1.23/gettingstarted.html#Using_Tika_as_a_command_line_utility
15
 */
16
class CLIClient extends Client
{
    protected const MODE = 'cli';

    /**
     * Apache Tika app path
     *
     * @var string|null
     */
    protected $path = null;

    /**
     * Java binary path (the plain "java" command is used when not set)
     *
     * @var string|null
     */
    protected $java = null;

    /**
     * Java arguments, appended verbatim at the end of the command line
     *
     * @var string|null
     */
    protected $javaArgs = null;

    /**
     * Environment variables merged over the current environment when running commands
     *
     * @var array
     */
    protected $envVars = [];

    /**
     * Configure client
     *
     * @param string|null $path  Apache Tika app JAR path, ignored when empty
     * @param string|null $java  Java binary path, ignored when empty
     * @param bool        $check whether to run check() right away
     *
     * @throws \Exception if $check is true and check() fails
     */
    public function __construct(?string $path = null, ?string $java = null, bool $check = true)
    {
        parent::__construct();

        if($path)
        {
            $this->setPath($path);
        }

        if($java)
        {
            $this->setJava($java);
        }

        if($check === true)
        {
            $this->check();
        }
    }

    /**
     * Get the path
     */
    public function getPath(): ?string
    {
        return $this->path;
    }

    /**
     * Set the path
     */
    public function setPath(string $path): self
    {
        $this->path = $path;

        return $this;
    }

    /**
     * Get the Java path
     */
    public function getJava(): ?string
    {
        return $this->java;
    }

    /**
     * Set the Java path
     *
     * NOTE: the value is placed unescaped at the start of the command line
     */
    public function setJava(string $java): self
    {
        $this->java = $java;

        return $this;
    }

    /**
     * Get the Java arguments
     */
    public function getJavaArgs(): ?string
    {
        return $this->javaArgs;
    }

    /**
     * Set the Java arguments
     *
     * NOTE: to modify child process jvm args, prepend "J" to each argument (-JXmx4g)
     * NOTE: the value is appended unescaped at the end of the command line
     */
    public function setJavaArgs(string $args): self
    {
        $this->javaArgs = $args;

        return $this;
    }

    /**
     * Get the environment variables
     */
    public function getEnvVars(): array
    {
        return $this->envVars;
    }

    /**
     * Set the environment variables
     */
    public function setEnvVars(array $variables): self
    {
        $this->envVars = $variables;

        return $this;
    }

    /**
     * Returns current Tika version
     *
     * The version is read from the "Implementation-Version" entry of the JAR manifest when the Zip extension is
     * available and the JAR can be opened, otherwise a "version" request is made.
     *
     * @throws \Exception
     */
    public function getVersion(): string
    {
        $manifest = [];

        if(class_exists(ZipArchive::class) && $this->path !== null && file_exists($this->path))
        {
            $zip = new ZipArchive();

            // open() returns true on success and a (truthy) integer error code on failure
            if($zip->open($this->path) === true)
            {
                $content = $zip->getFromName('META-INF/MANIFEST.MF');
                $zip->close();

                // getFromName() returns false when the entry cannot be read
                if(is_string($content) && preg_match_all('/(.+):\s+(.+)\r?\n/U', $content, $match))
                {
                    foreach($match[1] as $index => $key)
                    {
                        $manifest[$key] = $match[2][$index];
                    }
                }
            }
        }

        return $manifest['Implementation-Version'] ?? $this->request('version');
    }

    /**
     * Returns the supported MIME types
     *
     * NOTE: the data provided by the CLI must be parsed: mime type has no spaces, aliases go next prefixed with spaces
     *
     * @return array MIME types as keys, each one with an "alias" list plus any other "key: value" property found
     *
     * @throws \Exception
     */
    public function getSupportedMIMETypes(): array
    {
        $mime = null;
        $mimeTypes = [];

        $response = preg_split("/\n/", $this->request('mime-types')) ?: [];

        foreach($response as $line)
        {
            // lines starting with a word character declare a new MIME type
            if(preg_match('/^\w+/', $line))
            {
                $mime = trim($line);
                $mimeTypes[$mime] = ['alias' => []];

                continue;
            }

            // any other line is a "key: value" property of the current MIME type
            $parts = preg_split('/:\s+/', trim($line), 2) ?: [];

            // skip blank or malformed lines and properties found before any MIME type
            if($mime === null || count($parts) !== 2)
            {
                continue;
            }

            [$key, $value] = $parts;

            if($key === 'alias')
            {
                $mimeTypes[$mime]['alias'][] = $value;
            }
            else
            {
                $mimeTypes[$mime][$key] = $value;
            }
        }

        return $mimeTypes;
    }

    /**
     * Returns the available detectors
     *
     * @return array composite detectors by name, each one with its "children"
     *
     * @throws \Exception
     */
    public function getAvailableDetectors(): array
    {
        $lines = preg_split("/\n/", $this->request('detectors')) ?: [];

        return $this->parseCompositeList($lines);
    }

    /**
     * Returns the available parsers
     *
     * @return array composite parsers by name, each one with its "children"
     *
     * @throws \Exception
     */
    public function getAvailableParsers(): array
    {
        $lines = preg_split("/\n/", $this->request('parsers')) ?: [];

        // the first line of the output is discarded
        array_shift($lines);

        return $this->parseCompositeList($lines, ['decorated' => false]);
    }

    /**
     * Group the lines of a detectors/parsers list by composite
     *
     * Lines matching "composite" (case insensitive) start a new group, any other non blank line is a child of the
     * latest group. Children found before any composite are grouped under the empty string key, which is the key
     * a null offset resolves to.
     *
     * @param array $lines    lines returned by the command
     * @param array $defaults extra values appended to every item
     */
    private function parseCompositeList(array $lines, array $defaults = []): array
    {
        $items = [];
        $parent = '';

        foreach($lines as $line)
        {
            // blank lines would create items with an empty name
            if(trim($line) === '')
            {
                continue;
            }

            if(preg_match('/composite/i', $line))
            {
                $parent = trim(preg_replace('/\(.+\):/', '', $line) ?: '');

                $items[$parent] = ['children' => [], 'composite' => true, 'name' => $parent] + $defaults;
            }
            else
            {
                $child = trim($line);

                $items[$parent]['children'][$child] = ['composite' => false, 'name' => $child] + $defaults;
            }
        }

        return $items;
    }

    /**
     * Check Java binary and JAR path
     *
     * Nothing is done when the client is already flagged as checked.
     *
     * @throws \Exception if the Java command fails or the JAR does not exist
     */
    public function check(): void
    {
        if($this->isChecked() !== false)
        {
            return;
        }

        // Java command must not return an error
        try
        {
            $output = $this->exec(($this->java ?: 'java') . ' -version');
        }
        catch(Exception $exception)
        {
            // keep the original failure available to the caller
            throw new Exception('Java command not found', 0, $exception);
        }

        // exec() returns null when the command could not be started
        if($output === null)
        {
            throw new Exception('Java command not found');
        }

        // JAR path must exists
        if($this->path === null || file_exists($this->path) === false)
        {
            throw new Exception('Apache Tika app JAR not found');
        }

        $this->setChecked(true);
    }

    /**
     * Configure and make a request and return its results
     *
     * @param string      $type request type, one of the types supported by getArguments()
     * @param string|null $file file to process, not needed for informational requests
     *
     * @throws \Exception if the check fails, the type is unknown or the command fails
     */
    public function request(string $type, ?string $file = null): string
    {
        // check if not checked
        $this->check();

        // check if is cached
        if($file !== null && $this->isCached($type, $file))
        {
            $cached = $this->getCachedResponse($type, $file);

            // a null cached response can't be returned as string: run the command instead
            if($cached !== null)
            {
                return $cached;
            }
        }

        // command arguments
        $arguments = $this->getArguments($type, $file);

        // check the request
        $file = $this->checkRequest($type, $file);

        // add last argument
        if($file)
        {
            $arguments[] = escapeshellarg($file);
        }

        // build command
        $jar = escapeshellarg((string) $this->path);
        $command = trim(($this->java ?: 'java') . " -jar $jar " . implode(' ', $arguments) . " {$this->javaArgs}");

        // run command
        $response = $this->exec($command);

        // error if command fails
        if($response === null)
        {
            throw new Exception('An error occurred running Java command');
        }

        // metadata response
        if($file !== null && in_array(preg_replace('/\/.+/', '', $type), ['meta', 'rmeta'], true))
        {
            // fix for invalid? json returned only with images
            $response = str_replace(basename($file) . '"}{', '", ', $response);

            // on Windows, response must be encoded to UTF8 (platform is tested first because getting the version
            // may run another command)
            // NOTE(review): utf8_encode() is deprecated as of PHP 8.2
            if($this->platform == 'win' && version_compare($this->getVersion(), '2.1.0', '<'))
            {
                $response = utf8_encode($response);
            }
        }

        // cache certain responses
        if($file !== null && $this->isCacheable($type))
        {
            $this->cacheResponse($type, $response, $file);
        }

        return $response;
    }

    /**
     * Run the command and return its results
     *
     * Every chunk read from the command output is passed to the callback (if any) and, when callbackAppend is
     * true, appended to the response. Error output is appended to "tika-error.log" in the temporary directory.
     *
     * @return string|null trimmed response or null if the command could not be started
     *
     * @throws \Exception if the exit value is greater than zero
     */
    public function exec(string $command): ?string
    {
        // get env variables for proc_open()
        $env = empty($this->envVars) ? null : array_merge(getenv(), $this->envVars);

        // run command
        $logfile = sys_get_temp_dir() . DIRECTORY_SEPARATOR . 'tika-error.log';
        $descriptors = [['pipe', 'r'], ['pipe', 'w'], ['file', $logfile, 'a']];
        $process = proc_open($command, $descriptors, $pipes, null, $env);

        // the response of a previous command must not be returned as the result of this one
        if(is_resource($process) === false)
        {
            return null;
        }

        $callback = $this->callback;

        // nothing is sent to the command
        fclose($pipes[0]);

        $this->response = '';

        // compare against false: a chunk like "0" is falsy but is still valid output
        while(($chunk = stream_get_line($pipes[1], $this->chunkSize)) !== false)
        {
            // an empty read has no data, stop if it is caused by the end of the output
            if($chunk === '')
            {
                if(feof($pipes[1]))
                {
                    break;
                }

                continue;
            }

            if(!is_null($callback))
            {
                $callback($chunk);
            }

            if($this->callbackAppend === true)
            {
                $this->response .= $chunk;
            }
        }

        fclose($pipes[1]);
        $exit = proc_close($process);

        // exception if exit value is not zero
        if($exit > 0)
        {
            throw new Exception("Unexpected exit value ($exit) for command $command");
        }

        return trim($this->response);
    }

    /**
     * Get the arguments to run the command
     *
     * @param string      $type request type
     * @param string|null $file file to process, only used for the error message
     *
     * @return array encoding argument (if an encoding is set) followed by the arguments of the type
     *
     * @throws Exception if the type is unknown
     */
    protected function getArguments(string $type, ?string $file = null): array
    {
        // arguments for every supported type
        $options =
        [
            'html'          => '--html',
            'lang'          => '--language',
            'mime'          => '--detect',
            'meta'          => '--metadata --json',
            'text'          => '--text',
            'text-main'     => '--text-main',
            'mime-types'    => '--list-supported-types',
            'detectors'     => '--list-detectors',
            'parsers'       => '--list-parsers',
            'version'       => '--version',
            'rmeta/ignore'  => '--metadata --jsonRecursive',
            'rmeta/html'    => '--html --jsonRecursive',
            'rmeta/text'    => '--text --jsonRecursive',
            'xhtml'         => '--xml',
        ];

        if(isset($options[$type]) === false)
        {
            throw new Exception($file ? "Unknown type $type for $file" : "Unknown type $type");
        }

        $arguments = $this->encoding ? ["--encoding={$this->encoding}"] : [];
        $arguments[] = $options[$type];

        return $arguments;
    }
}
489