Passed
Push — master ( dcc276...a6d10d )
by David
03:59
created

CLIClient::getEnvVars()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 1
c 0
b 0
f 0
dl 0
loc 3
rs 10
cc 1
nc 1
nop 0
1
<?php
2
3
namespace Vaites\ApacheTika\Clients;
4
5
use Exception;
6
7
use Vaites\ApacheTika\Client;
8
9
/**
10
 * Apache Tika command line interface client
11
 *
12
 * @author  David Martínez <[email protected]>
13
 * @link    https://tika.apache.org/1.23/gettingstarted.html#Using_Tika_as_a_command_line_utility
14
 */
15
class CLIClient extends Client
16
{
17
    protected const MODE = 'cli';

    /**
     * Apache Tika app path (null until one is set)
     *
     * @var string|null
     */
    protected $path = null;

    /**
     * Java binary path (null until one is set)
     *
     * @var string|null
     */
    protected $java = null;

    /**
     * Java arguments (null until some are set)
     *
     * @var string|null
     */
    protected $javaArgs = null;

    /**
     * Environment variables
     *
     * @var array
     */
    protected $envVars = [];
46
47
    /**
     * Configure client
     *
     * @param string|null $path  Apache Tika app path, stored only when not empty
     * @param string|null $java  Java binary path, stored only when not empty
     * @param bool        $check when true, check() is run once the paths are stored
     *
     * @throws \Exception
     */
    public function __construct(?string $path = null, ?string $java = null, bool $check = true)
    {
        parent::__construct();

        if($path)
        {
            $this->setPath($path);
        }

        if($java)
        {
            $this->setJava($java);
        }

        if($check === true)
        {
            $this->check();
        }
    }
71
72
    /**
     * Return the configured Apache Tika app path
     *
     * @return string|null the stored path, null when none has been set
     */
    public function getPath(): ?string
    {
        return $this->path;
    }
79
80
    /**
     * Store the Apache Tika app path
     *
     * @param string $path path to store
     *
     * @return self the same instance, to allow chained calls
     */
    public function setPath(string $path): self
    {
        $this->path = $path;

        return $this;
    }
89
90
    /**
     * Return the configured Java binary path
     *
     * @return string|null the stored path, null when none has been set
     */
    public function getJava(): ?string
    {
        return $this->java;
    }
97
98
    /**
     * Store the Java binary path
     *
     * @param string $java path to store
     *
     * @return self the same instance, to allow chained calls
     */
    public function setJava(string $java): self
    {
        $this->java = $java;

        return $this;
    }
107
108
    /**
     * Return the configured Java arguments
     *
     * @return string|null the stored arguments, null when none have been set
     */
    public function getJavaArgs(): ?string
    {
        return $this->javaArgs;
    }
115
116
    /**
     * Store the Java arguments
     *
     * NOTE: to modify child process jvm args, prepend "J" to each argument (-JXmx4g)
     *
     * @param string $args arguments to store, as a single string
     *
     * @return self the same instance, to allow chained calls
     */
    public function setJavaArgs(string $args): self
    {
        $this->javaArgs = $args;

        return $this;
    }
127
128
    /**
     * Return the configured environment variables
     *
     * @return array the stored variables, an empty array when none have been set
     */
    public function getEnvVars(): array
    {
        return $this->envVars;
    }
135
136
    /**
     * Store the environment variables, replacing any previously stored ones
     *
     * @param array $variables variables to store
     *
     * @return self the same instance, to allow chained calls
     */
    public function setEnvVars(array $variables): self
    {
        $this->envVars = $variables;

        return $this;
    }
145
146
    /**
     * Returns the supported MIME types
     *
     * NOTE: the data provided by the CLI must be parsed: mime type has no spaces, aliases go next prefixed with spaces
     *
     * Every MIME type maps to an array holding an "alias" list plus one entry per "key: value" detail line that
     * follows it. Blank lines, detail lines without a value and detail lines found before the first MIME type
     * are ignored.
     *
     * @return array details indexed by MIME type
     *
     * @throws \Exception
     */
    public function getSupportedMIMETypes(): array
    {
        $mime = null;
        $mimeTypes = [];

        $response = preg_split("/\n/", $this->request('mime-types')) ?: [];

        foreach($response as $line)
        {
            $trimmed = trim($line);

            // blank lines carry no information
            if($trimmed === '')
            {
                continue;
            }

            // a line starting with a word character opens a new MIME type
            if(preg_match('/^\w+/', $line))
            {
                $mime = $trimmed;
                $mimeTypes[$mime] = ['alias' => []];

                continue;
            }

            // a detail line cannot be attached to anything before the first MIME type
            if($mime === null)
            {
                continue;
            }

            // split only once so a value containing ": " is kept intact
            $parts = preg_split('/:\s+/', $trimmed, 2) ?: [];

            // no "key: value" pair on this line
            if(count($parts) < 2)
            {
                continue;
            }

            [$key, $value] = $parts;

            if($key === 'alias')
            {
                $mimeTypes[$mime]['alias'][] = $value;
            }
            else
            {
                $mimeTypes[$mime][$key] = $value;
            }
        }

        return $mimeTypes;
    }
185
186
    /**
     * Returns the available detectors
     *
     * Lines mentioning "composite" open a new parent entry, any other non-blank line is stored as a child of
     * the most recent parent. Children listed before any parent are grouped under an empty name.
     *
     * @return array detectors indexed by parent name
     *
     * @throws \Exception
     */
    public function getAvailableDetectors(): array
    {
        $detectors = [];

        $split = preg_split("/\n/", $this->request('detectors')) ?: [];

        // explicit empty name instead of relying on a null array offset being converted to ''
        $parent = '';
        foreach($split as $line)
        {
            // an empty response or a blank line must not be registered as a detector
            if(trim($line) === '')
            {
                continue;
            }

            if(preg_match('/composite/i', $line))
            {
                // strip the "(...):" suffix to keep only the name
                $parent = trim(preg_replace('/\(.+\):/', '', $line) ?: '');
                $detectors[$parent] = ['children' => [], 'composite' => true, 'name' => $parent];
            }
            else
            {
                $child = trim($line);
                $detectors[$parent]['children'][$child] = ['composite' => false, 'name' => $child];
            }
        }

        return $detectors;
    }
214
215
    /**
     * Returns the available parsers
     *
     * The first line of the output is discarded. Lines mentioning "composite" open a new parent entry, any other
     * non-blank line is stored as a child of the most recent parent. Children listed before any parent are
     * grouped under an empty name.
     *
     * @return array parsers indexed by parent name
     *
     * @throws \Exception
     */
    public function getAvailableParsers(): array
    {
        $parsers = [];

        $split = preg_split("/\n/", $this->request('parsers')) ?: [];
        array_shift($split);

        // explicit empty name instead of relying on a null array offset being converted to ''
        $parent = '';
        foreach($split as $line)
        {
            // a blank line must not be registered as a parser
            if(trim($line) === '')
            {
                continue;
            }

            if(preg_match('/composite/i', $line))
            {
                // strip the "(...):" suffix to keep only the name
                $parent = trim(preg_replace('/\(.+\):/', '', $line) ?: '');

                $parsers[$parent] = ['children' => [], 'composite' => true, 'name' => $parent, 'decorated' => false];
            }
            else
            {
                $child = trim($line);

                $parsers[$parent]['children'][$child] = ['composite' => false, 'name' => $child, 'decorated' => false];
            }
        }

        return $parsers;
    }
246
247
    /**
     * Check Java binary and JAR path
     *
     * Nothing is done when the client is already flagged as checked. Otherwise the Java binary is run with
     * "-version" and the JAR path must exist; on success the client is flagged as checked.
     *
     * @throws \Exception when the Java command fails or the JAR path is unset or missing
     */
    public function check(): void
    {
        if($this->isChecked() === false)
        {
            // Java command must not return an error
            try
            {
                $this->exec(($this->java ?: 'java') . ' -version');
            }
            catch(Exception $exception)
            {
                // keep the original failure reachable through getPrevious()
                throw new Exception('Java command not found', 0, $exception);
            }

            // JAR path must exist: a path that was never set is reported as missing too
            if($this->path === null || file_exists($this->path) === false)
            {
                throw new Exception('Apache Tika app JAR not found');
            }

            $this->setChecked(true);
        }
    }
275
276
    /**
     * Configure and make a request and return its results
     *
     * A cached response is returned when one is available for the given type and file. Otherwise the command
     * is built from the Java binary, the JAR path, the arguments for the type, the file (if any) and the Java
     * arguments, then it is run and its output is cached when the type is cacheable.
     *
     * @param string      $type request type, must be one accepted by getArguments()
     * @param string|null $file file to process, null for requests that need no file
     *
     * @return string command output
     *
     * @throws \Exception
     */
    public function request(string $type, ?string $file = null): string
    {
        // check if not checked
        $this->check();

        // check if is cached
        if($file !== null && $this->isCached($type, $file))
        {
            $cached = $this->getCachedResponse($type, $file);

            // a missing entry falls through to a regular request, as null is not a valid return value
            if($cached !== null)
            {
                return $cached;
            }
        }

        // command arguments
        $arguments = $this->getArguments($type, $file);

        // check the request
        $file = $this->checkRequest($type, $file);

        // add last argument
        if($file)
        {
            $arguments[] = escapeshellarg($file);
        }

        // build command
        $jar = escapeshellarg($this->path);
        $command = trim(($this->java ?: 'java') . " -jar $jar " . implode(' ', $arguments) . " {$this->javaArgs}");

        // run command
        $response = $this->exec($command);

        // error if command fails
        if($response === null)
        {
            throw new Exception('An error occurred running Java command');
        }

        // metadata response ("meta" and "rmeta/..." types)
        if($file !== null && in_array(preg_replace('/\/.+/', '', $type), ['meta', 'rmeta'], true))
        {
            // fix for invalid? json returned only with images
            $response = str_replace(basename($file) . '"}{', '", ', $response);

            // on Windows, response must be encoded to UTF8
            // NOTE(review): utf8_encode() is deprecated as of PHP 8.2, a replacement should be planned
            $response = $this->platform == 'win' ? utf8_encode($response) : $response;
        }

        // cache certain responses
        if($file !== null && $this->isCacheable($type))
        {
            $this->cacheResponse($type, $response, $file);
        }

        return $response;
    }
335
336
    /**
     * Run the command and return its results
     *
     * Standard output is read in chunks: every chunk is handed to the configured callback (if any) and is
     * appended to the stored response only when appending is enabled. Standard error is appended to the
     * "tika-error.log" file of the system temporary directory.
     *
     * @param string $command full command line to run
     *
     * @return string|null trimmed response, null when the process could not be started
     *
     * @throws \Exception when the command exits with a value greater than zero
     */
    public function exec(string $command): ?string
    {
        // get env variables for proc_open(): null keeps the environment of the current process
        $env = empty($this->envVars) ? null : array_merge(getenv(), $this->envVars);

        // run command
        $logfile = sys_get_temp_dir() . DIRECTORY_SEPARATOR . 'tika-error.log';
        $descriptors = [['pipe', 'r'], ['pipe', 'w'], ['file', $logfile, 'a']];
        $process = proc_open($command, $descriptors, $pipes, null, $env);

        // without a process there is no output: report it instead of returning a previous response
        if(!is_resource($process))
        {
            return null;
        }

        $callback = $this->callback;

        // nothing is written to the command, so its input is closed right away
        fclose($pipes[0]);
        $this->response = '';

        // explicit comparisons so a chunk like "0" does not end the loop before the output is exhausted
        while(($chunk = stream_get_line($pipes[1], $this->chunkSize)) !== false && $chunk !== '')
        {
            if($callback !== null)
            {
                $callback($chunk);
            }

            if($this->callbackAppend === true)
            {
                $this->response .= $chunk;
            }
        }

        fclose($pipes[1]);
        $exit = proc_close($process);

        // exception if exit value is not zero
        if($exit > 0)
        {
            throw new Exception("Unexpected exit value ($exit) for command $command");
        }

        return trim($this->response);
    }
382
383
    /**
     * Get the arguments to run the command
     *
     * The result holds the encoding option (only when an encoding is set) followed by the command line
     * flags of the requested type.
     *
     * @param string      $type request type
     * @param string|null $file file being processed, only used to build the error message
     *
     * @return array command line arguments
     *
     * @throws Exception when the type is unknown
     */
    protected function getArguments(string $type, ?string $file = null): array
    {
        $arguments = $this->encoding ? ["--encoding={$this->encoding}"] : [];

        // command line flags of every supported request type
        $flags = [
            'html'          => '--html',
            'lang'          => '--language',
            'mime'          => '--detect',
            'meta'          => '--metadata --json',
            'text'          => '--text',
            'text-main'     => '--text-main',
            'mime-types'    => '--list-supported-types',
            'detectors'     => '--list-detectors',
            'parsers'       => '--list-parsers',
            'version'       => '--version',
            'rmeta/ignore'  => '--metadata --jsonRecursive',
            'rmeta/html'    => '--html --jsonRecursive',
            'rmeta/text'    => '--text --jsonRecursive',
            'xhtml'         => '--xml',
        ];

        if(!isset($flags[$type]))
        {
            throw new Exception($file ? "Unknown type $type for $file" : "Unknown type $type");
        }

        $arguments[] = $flags[$type];

        return $arguments;
    }
456
}
457