Passed
Push — master ( f99a92...dcc276 )
by David
02:43
created

CLIClient::setJavaArgs()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 5
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 2
c 0
b 0
f 0
dl 0
loc 5
rs 10
cc 1
nc 1
nop 1
1
<?php
2
3
namespace Vaites\ApacheTika\Clients;
4
5
use Exception;
6
7
use Vaites\ApacheTika\Client;
8
9
/**
10
 * Apache Tika command line interface client
11
 *
12
 * @author  David Martínez <[email protected]>
13
 * @link    https://tika.apache.org/1.23/gettingstarted.html#Using_Tika_as_a_command_line_utility
14
 */
15
class CLIClient extends Client
16
{
17
    protected const MODE = 'cli';
18
19
    /**
20
     * Apache Tika app path
21
     *
22
     * @var string|null
23
     */
24
    protected $path = null;
25
26
    /**
27
     * Java binary path
28
     *
29
     * @var string|null
30
     */
31
    protected $java = null;
32
33
    /**
34
     * Java arguments
35
     *
36
     * @var string|null
37
     */
38
    protected $javaArgs = null;
39
40
    /**
     * Configure client
     *
     * Calls the parent constructor, stores the optional paths and, unless
     * disabled, validates the environment through check().
     *
     * @param string|null $path  Apache Tika app path (ignored when empty)
     * @param string|null $java  Java binary path (ignored when empty)
     * @param bool        $check whether check() must be run right away
     *
     * @throws \Exception
     */
    public function __construct(?string $path = null, ?string $java = null, bool $check = true)
    {
        parent::__construct();

        if($path)
        {
            $this->setPath($path);
        }

        if($java)
        {
            $this->setJava($java);
        }

        if($check === true)
        {
            $this->check();
        }
    }
64
65
    /**
     * Get the Apache Tika app path
     *
     * @return string|null the configured path, or null when none has been set
     */
    public function getPath(): ?string
    {
        return $this->path;
    }
72
73
    /**
     * Set the Apache Tika app path
     *
     * @param string $path path stored as-is; it is validated later by check()
     *
     * @return self the same instance, to allow chaining
     */
    public function setPath($path): self
    {
        $this->path = $path;

        return $this;
    }
82
83
    /**
     * Get the Java binary path
     *
     * @return string|null the configured binary, or null when none has been set
     */
    public function getJava(): ?string
    {
        return $this->java;
    }
90
91
    /**
     * Set the Java binary path
     *
     * @param string $java binary used instead of the default "java" command
     *
     * @return self the same instance, to allow chaining
     */
    public function setJava(string $java): self
    {
        $this->java = $java;

        return $this;
    }
100
101
    /**
     * Get the Java arguments
     *
     * @return string|null the configured arguments, or null when none have been set
     */
    public function getJavaArgs(): ?string
    {
        return $this->javaArgs;
    }
108
109
    /**
     * Set the Java arguments
     *
     * The string is appended verbatim at the end of the command built by request().
     *
     * NOTE: to modify child process jvm args, prepend "J" to each argument (-JXmx4g)
     *
     * @param string $args raw arguments
     *
     * @return self the same instance, to allow chaining
     */
    public function setJavaArgs(string $args): self
    {
        $this->javaArgs = $args;

        return $this;
    }
120
121
    /**
     * Returns the supported MIME types
     *
     * NOTE: the data provided by the CLI must be parsed: mime type has no spaces, aliases go next prefixed with spaces
     *
     * Each MIME type maps to an array that always has an "alias" list plus any
     * other "key: value" attribute found below it in the response.
     *
     * @return array MIME types indexed by name
     *
     * @throws \Exception
     */
    public function getSupportedMIMETypes(): array
    {
        $mime = null;
        $mimeTypes = [];

        foreach(explode("\n", $this->request('mime-types')) as $line)
        {
            // a line starting with a word character declares a new MIME type
            if(preg_match('/^\w+/', $line))
            {
                $mime = trim($line);
                $mimeTypes[$mime] = ['alias' => []];

                continue;
            }

            // any other line is a "key: value" attribute; the limit keeps values containing ": " intact
            $parts = preg_split('/:\s+/', trim($line), 2);

            // skip blank or malformed lines and attributes that appear before any MIME type
            if($mime === null || !is_array($parts) || count($parts) < 2)
            {
                continue;
            }

            [$key, $value] = $parts;

            if($key === 'alias')
            {
                $mimeTypes[$mime]['alias'][] = $value;
            }
            else
            {
                $mimeTypes[$mime][$key] = $value;
            }
        }

        return $mimeTypes;
    }
160
161
    /**
     * Returns the available detectors
     *
     * Lines containing "composite" (case-insensitive) open a new parent entry;
     * every other non-blank line is stored as a child of the latest parent.
     *
     * @return array composite detectors indexed by name, each with its children
     *
     * @throws \Exception
     */
    public function getAvailableDetectors(): array
    {
        $detectors = [];

        // lines found before any composite are grouped under the empty key,
        // which is the key PHP uses for a null array offset
        $parent = '';

        foreach(explode("\n", $this->request('detectors')) as $line)
        {
            // blank lines carry no detector
            if(trim($line) === '')
            {
                continue;
            }

            if(preg_match('/composite/i', $line))
            {
                // drop the "(...):" suffix to keep only the name
                $parent = trim(preg_replace('/\(.+\):/', '', $line));
                $detectors[$parent] = ['children' => [], 'composite' => true, 'name' => $parent];
            }
            else
            {
                $child = trim($line);
                $detectors[$parent]['children'][$child] = ['composite' => false, 'name' => $child];
            }
        }

        return $detectors;
    }
189
190
    /**
     * Returns the available parsers
     *
     * The first line of the response is discarded. Lines containing "composite"
     * (case-insensitive) open a new parent entry; every other non-blank line is
     * stored as a child of the latest parent.
     *
     * @return array composite parsers indexed by name, each with its children
     *
     * @throws \Exception
     */
    public function getAvailableParsers(): array
    {
        $parsers = [];

        // explode() always returns an array here, so array_shift() never receives false
        $lines = explode("\n", $this->request('parsers'));
        array_shift($lines);

        // lines found before any composite are grouped under the empty key,
        // which is the key PHP uses for a null array offset
        $parent = '';

        foreach($lines as $line)
        {
            // blank lines carry no parser
            if(trim($line) === '')
            {
                continue;
            }

            if(preg_match('/composite/i', $line))
            {
                // drop the "(...):" suffix to keep only the name
                $parent = trim(preg_replace('/\(.+\):/', '', $line));

                $parsers[$parent] = ['children' => [], 'composite' => true, 'name' => $parent, 'decorated' => false];
            }
            else
            {
                $child = trim($line);

                $parsers[$parent]['children'][$child] = ['composite' => false, 'name' => $child, 'decorated' => false];
            }
        }

        return $parsers;
    }
221
222
    /**
     * Check Java binary and JAR path
     *
     * Does nothing when the client is already flagged as checked; otherwise
     * runs "<java> -version", verifies that the JAR exists and flags the
     * client as checked.
     *
     * @throws \Exception if the Java command fails or the JAR file does not exist
     */
    public function check(): void
    {
        if($this->isChecked() !== false)
        {
            return;
        }

        // Java command must not return an error
        try
        {
            $this->exec(($this->java ?: 'java') . ' -version');
        }
        catch(Exception $exception)
        {
            // keep the original failure attached for debugging purposes
            throw new Exception('Java command not found', 0, $exception);
        }

        // JAR path must exist (an unset path can never exist)
        if($this->path === null || file_exists($this->path) === false)
        {
            throw new Exception('Apache Tika app JAR not found');
        }

        $this->setChecked(true);
    }
250
251
    /**
     * Configure and make a request and return its results
     *
     * Runs check(), returns the cached response when one is available, otherwise
     * builds the command line, executes it and caches the response for cacheable types.
     *
     * @param string      $type request type, as accepted by getArguments()
     * @param string|null $file file to process, if the request needs one
     *
     * @return string command output (post-processed for meta/rmeta requests)
     *
     * @throws \Exception
     */
    public function request(string $type, ?string $file = null): string
    {
        // check if not checked
        $this->check();

        // check if is cached (a null cached value falls through to a real request)
        if($file !== null && $this->isCached($type, $file))
        {
            $cached = $this->getCachedResponse($type, $file);

            if($cached !== null)
            {
                return $cached;
            }
        }

        // command arguments
        $arguments = $this->getArguments($type, $file);

        // check the request
        $file = $this->checkRequest($type, $file);

        // add last argument
        if($file)
        {
            $arguments[] = escapeshellarg($file);
        }

        // build command
        $jar = escapeshellarg($this->path);
        $command = trim(($this->java ?: 'java') . " -jar $jar " . implode(' ', $arguments) . " {$this->javaArgs}");

        // run command
        $response = (string) $this->exec($command);

        // metadata response (type prefix before the first slash is "meta" or "rmeta")
        if(in_array(preg_replace('/\/.+/', '', $type), ['meta', 'rmeta'], true))
        {
            // fix for invalid? json returned only with images
            $response = str_replace(basename((string) $file) . '"}{', '", ', $response);

            // on Windows, response must be encoded to UTF8
            // NOTE(review): utf8_encode() is deprecated since PHP 8.2, a replacement will be needed
            $response = $this->platform == 'win' ? utf8_encode($response) : $response;
        }

        // cache certain responses (only possible when there is a file to index them by)
        if($this->isCacheable($type) && is_string($file))
        {
            $this->cacheResponse($type, $response, $file);
        }

        return $response;
    }
304
305
    /**
     * Run the command and return its results
     *
     * The standard output is read in chunks of $this->chunkSize; every chunk is
     * passed to the callback (if any) and appended to the response when
     * $this->callbackAppend is true. The standard error output is appended to
     * "tika-error.log" in the system temporary directory.
     *
     * @param string $command full command line to run
     *
     * @return string|null trimmed response
     *
     * @throws \Exception if the process cannot be started or exits with a value greater than zero
     */
    public function exec(string $command): ?string
    {
        // run command
        $logfile = sys_get_temp_dir() . DIRECTORY_SEPARATOR . 'tika-error.log';
        $descriptors = [['pipe', 'r'], ['pipe', 'w'], ['file', $logfile, 'a']];
        $process = proc_open($command, $descriptors, $pipes);
        $callback = $this->callback;

        // without a process there is no output: fail instead of returning a stale response
        if(!is_resource($process))
        {
            throw new Exception("Unable to run command $command");
        }

        // nothing is sent to the command through its standard input
        fclose($pipes[0]);

        // get output, stopping at the end of the stream or on an empty chunk
        // (a truthiness test would also stop on a chunk equal to "0")
        $this->response = '';
        while(($chunk = stream_get_line($pipes[1], $this->chunkSize)) !== false && $chunk !== '')
        {
            if($callback !== null)
            {
                $callback($chunk);
            }

            if($this->callbackAppend === true)
            {
                $this->response .= $chunk;
            }
        }
        fclose($pipes[1]);
        $exit = proc_close($process);

        // exception if exit value is greater than zero
        if($exit > 0)
        {
            throw new Exception("Unexpected exit value ($exit) for command $command");
        }

        return trim($this->response);
    }
348
349
    /**
     * Get the arguments to run the command
     *
     * The result starts with the "--encoding" option when an encoding is set,
     * followed by the command line option(s) matching the request type.
     *
     * @param string      $type request type
     * @param string|null $file file being processed, only used in the error message
     *
     * @return array command line arguments
     *
     * @throws Exception if the request type is unknown
     */
    protected function getArguments(string $type, ?string $file = null): array
    {
        // command line option(s) for every supported request type
        $options =
        [
            'html'          => '--html',
            'lang'          => '--language',
            'mime'          => '--detect',
            'meta'          => '--metadata --json',
            'text'          => '--text',
            'text-main'     => '--text-main',
            'mime-types'    => '--list-supported-types',
            'detectors'     => '--list-detectors',
            'parsers'       => '--list-parsers',
            'version'       => '--version',
            'rmeta/ignore'  => '--metadata --jsonRecursive',
            'rmeta/html'    => '--html --jsonRecursive',
            'rmeta/text'    => '--text --jsonRecursive',
            'xhtml'         => '--xml',
        ];

        if(!isset($options[$type]))
        {
            throw new Exception($file ? "Unknown type $type for $file" : "Unknown type $type");
        }

        $arguments = $this->encoding ? ["--encoding={$this->encoding}"] : [];
        $arguments[] = $options[$type];

        return $arguments;
    }
422
}
423