Passed
Push — master ( d8574a...8cadc1 )
by David
02:55
created

CLIClient::check()   A

Complexity

Conditions 5
Paths 4

Size

Total Lines 21
Code Lines 8

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 5
eloc 8
nc 4
nop 0
dl 0
loc 21
rs 9.6111
c 0
b 0
f 0
1
<?php
2
3
namespace Vaites\ApacheTika\Clients;
4
5
use Exception;
6
7
use Vaites\ApacheTika\Client;
8
9
/**
10
 * Apache Tika command line interface client
11
 *
12
 * @author  David Martínez <[email protected]>
13
 * @link    https://tika.apache.org/1.23/gettingstarted.html#Using_Tika_as_a_command_line_utility
14
 */
15
class CLIClient extends Client
16
{
17
    /**
     * Mode identifier of this client
     */
    protected const MODE = 'cli';

    /**
     * Apache Tika app path (null until a path is configured)
     *
     * @var string|null
     */
    protected $path = null;

    /**
     * Java binary path (when empty, the plain `java` command is used instead)
     *
     * @var string|null
     */
    protected $java = null;
32
33
    /**
     * Configure client
     *
     * @param string|null $path  Apache Tika app path, only stored when not empty
     * @param string|null $java  Java binary path, only stored when not empty
     * @param bool        $check run check() immediately when strictly true
     *
     * @throws \Exception
     */
    public function __construct(?string $path = null, ?string $java = null, bool $check = true)
    {
        parent::__construct();

        if($path)
        {
            $this->setPath($path);
        }

        if($java)
        {
            $this->setJava($java);
        }

        if($check === true)
        {
            $this->check();
        }
    }
57
58
    /**
     * Return the configured Apache Tika app path, or null when none has been set
     */
    public function getPath(): ?string
    {
        return $this->path;
    }
65
66
    /**
     * Store the Apache Tika app path
     *
     * @param string $path
     *
     * @return $this to allow chained calls
     */
    public function setPath($path): self
    {
        $this->path = $path;

        return $this;
    }
75
76
    /**
     * Return the configured Java binary path, or null when none has been set
     */
    public function getJava(): ?string
    {
        return $this->java;
    }
83
84
    /**
     * Store the Java binary path
     *
     * @param string $java
     *
     * @return $this to allow chained calls
     */
    public function setJava($java): self
    {
        $this->java = $java;

        return $this;
    }
93
94
    /**
     * Returns the supported MIME types
     *
     * NOTE: the data provided by the CLI must be parsed: a line starting with a word character is a MIME type,
     * the following indented "key: value" lines describe it. Every "alias" value is collected in a list, any
     * other key is stored as a single value.
     *
     * Blank lines and lines that cannot be attached to a MIME type are ignored.
     *
     * @return array MIME types as keys, each one with at least an "alias" list
     *
     * @throws \Exception
     */
    public function getSupportedMIMETypes(): array
    {
        $mime = null;
        $mimeTypes = [];

        // preg_split() returns false on failure, iterate over nothing in that case
        $lines = preg_split("/\n/", $this->request('mime-types')) ?: [];

        foreach($lines as $line)
        {
            $trimmed = trim($line);

            // an empty response or a blank line carries no information
            if($trimmed === '')
            {
                continue;
            }

            // no leading space: a new MIME type starts here
            if(preg_match('/^\w+/', $line))
            {
                $mime = $trimmed;
                $mimeTypes[$mime] = ['alias' => []];

                continue;
            }

            $parts = preg_split('/:\s+/', $trimmed) ?: [];

            // an attribute needs both a value and a MIME type to belong to
            if($mime === null || count($parts) < 2)
            {
                continue;
            }

            [$key, $value] = $parts;

            if($key === 'alias')
            {
                $mimeTypes[$mime]['alias'][] = $value;
            }
            else
            {
                $mimeTypes[$mime][$key] = $value;
            }
        }

        return $mimeTypes;
    }
133
134
    /**
     * Returns the available detectors
     *
     * A line containing "composite" (case insensitive) opens a new group, named after the line with its
     * "(...):" suffix removed. Every other non-blank line is registered as a child of the last opened group.
     *
     * @return array composite detectors indexed by name, each one with its "children"
     *
     * @throws \Exception
     */
    public function getAvailableDetectors(): array
    {
        $detectors = [];

        // preg_split() returns false on failure, iterate over nothing in that case
        $lines = preg_split("/\n/", $this->request('detectors')) ?: [];

        // children listed before any composite line are grouped under the empty string key
        $parent = '';
        foreach($lines as $line)
        {
            // an empty response or a blank line must not create a nameless detector
            if(trim($line) === '')
            {
                continue;
            }

            if(preg_match('/composite/i', $line))
            {
                // preg_replace() returns null on failure
                $parent = trim(preg_replace('/\(.+\):/', '', $line) ?? '');
                $detectors[$parent] = ['children' => [], 'composite' => true, 'name' => $parent];
            }
            else
            {
                $child = trim($line);
                $detectors[$parent]['children'][$child] = ['composite' => false, 'name' => $child];
            }
        }

        return $detectors;
    }
162
163
    /**
     * Returns the available parsers
     *
     * The first line of the response is always discarded. After that, a line containing "composite" (case
     * insensitive) opens a new group, named after the line with its "(...):" suffix removed, and every other
     * non-blank line is registered as a child of the last opened group.
     *
     * @return array composite parsers indexed by name, each one with its "children"
     *
     * @throws \Exception
     */
    public function getAvailableParsers(): array
    {
        $parsers = [];

        // preg_split() returns false on failure, which array_shift() does not accept
        $lines = preg_split("/\n/", $this->request('parsers')) ?: [];

        // NOTE(review): presumably the first line is a top level wrapper that must not be listed - confirm
        array_shift($lines);

        // children listed before any composite line are grouped under the empty string key
        $parent = '';
        foreach($lines as $line)
        {
            // a blank line must not create a nameless parser
            if(trim($line) === '')
            {
                continue;
            }

            if(preg_match('/composite/i', $line))
            {
                // preg_replace() returns null on failure
                $parent = trim(preg_replace('/\(.+\):/', '', $line) ?? '');

                $parsers[$parent] = ['children' => [], 'composite' => true, 'name' => $parent, 'decorated' => false];
            }
            else
            {
                $child = trim($line);

                $parsers[$parent]['children'][$child] = ['composite' => false, 'name' => $child, 'decorated' => false];
            }
        }

        return $parsers;
    }
194
195
    /**
     * Check Java binary and JAR path
     *
     * Nothing is done when the client is already flagged as checked. Otherwise the Java command is run with
     * "-version" and the JAR path is looked up; the client is flagged as checked only if both succeed.
     *
     * @throws \Exception if the Java command fails or the JAR path does not exist
     */
    public function check(): void
    {
        if($this->isChecked() === false)
        {
            // Java command must not return an error
            try
            {
                $this->exec(($this->java ?: 'java') . ' -version');
            }
            catch(Exception $exception)
            {
                // keep the original failure reachable through getPrevious()
                throw new Exception('Java command not found', 0, $exception);
            }

            // JAR path must exist (a null path is rejected before reaching file_exists())
            if($this->path === null || file_exists($this->path) === false)
            {
                throw new Exception('Apache Tika app JAR not found');
            }

            $this->setChecked(true);
        }
    }
223
224
    /**
     * Configure and make a request and return its results
     *
     * Steps: check the client, return the cached response when there is one, build the command arguments,
     * validate the request, run the command, fix metadata responses and cache the response when the type
     * is cacheable.
     *
     * @param string      $type request type, as accepted by getArguments()
     * @param string|null $file file to process, null for requests that need no file
     *
     * @throws \Exception
     */
    public function request(string $type, ?string $file = null): string
    {
        // check if not checked
        $this->check();

        // check if is cached
        if($file !== null && $this->isCached($type, $file))
        {
            $cached = $this->getCachedResponse($type, $file);

            // a null entry cannot be returned as string, run the command again instead
            if($cached !== null)
            {
                return $cached;
            }
        }

        // command arguments
        $arguments = $this->getArguments($type, $file);

        // check the request
        $file = $this->checkRequest($type, $file);

        // add last argument
        if($file)
        {
            $arguments[] = escapeshellarg($file);
        }

        // build command
        $jar = escapeshellarg($this->path);
        $command = ($this->java ?: 'java') . " -jar $jar " . implode(' ', $arguments);

        // run command (exec() is declared as nullable, the cast keeps the declared return type)
        $response = (string) $this->exec($command);

        // metadata response: only the part of the type before the first slash is relevant
        if(in_array(preg_replace('/\/.+/', '', $type), ['meta', 'rmeta'], true))
        {
            // fix for invalid? json returned only with images
            $response = str_replace(basename((string) $file) . '"}{', '", ', $response);

            // on Windows, response must be encoded to UTF8
            // NOTE(review): utf8_encode() is deprecated as of PHP 8.2, a replacement will be needed
            $response = $this->platform == 'win' ? utf8_encode($response) : $response;
        }

        // cache certain responses (there is nothing to index the cache by without a file)
        if($this->isCacheable($type) && $file !== null)
        {
            $this->cacheResponse($type, $response, $file);
        }

        return $response;
    }
277
278
    /**
     * Run the command and return its results
     *
     * The standard output of the command is read in chunks of at most $this->chunkSize bytes. Every chunk is
     * passed to the callback, if there is one, and appended to the response only when callbackAppend is
     * strictly true. The standard error is appended to the "tika-error.log" file of the system temp directory.
     *
     * @return string|null trimmed response
     *
     * @throws \Exception if the command cannot be started or its exit value is greater than zero
     */
    public function exec(string $command): ?string
    {
        $logfile = sys_get_temp_dir() . DIRECTORY_SEPARATOR . 'tika-error.log';
        $descriptors = [['pipe', 'r'], ['pipe', 'w'], ['file', $logfile, 'a']];
        $callback = $this->callback;

        // reset before running, so a failure never exposes the response of a previous command
        $this->response = '';

        // run command
        $process = proc_open($command, $descriptors, $pipes);

        // without a process there is no output to read nor exit value to check
        if(!is_resource($process))
        {
            throw new Exception("Unable to run command $command");
        }

        // nothing is sent to the command
        fclose($pipes[0]);

        // compare against false and empty string explicitly: a chunk like "0" is falsy but must be processed
        while(($chunk = stream_get_line($pipes[1], $this->chunkSize)) !== false && $chunk !== '')
        {
            if(!is_null($callback))
            {
                $callback($chunk);
            }

            if($this->callbackAppend === true)
            {
                $this->response .= $chunk;
            }
        }

        fclose($pipes[1]);
        $exit = proc_close($process);

        // exception if exit value is not zero
        if($exit > 0)
        {
            throw new Exception("Unexpected exit value ($exit) for command $command");
        }

        return trim($this->response);
    }
321
322
    /**
     * Get the arguments to run the command
     *
     * The result starts with the "--encoding" option when an encoding is set, followed by the command line
     * flags of the requested type.
     *
     * @param string      $type request type
     * @param string|null $file file being processed, only used to build the error message
     *
     * @throws  Exception if the type is unknown
     */
    protected function getArguments(string $type, ?string $file = null): array
    {
        // command line flags of every supported request type
        $flags = [
            'html'          => '--html',
            'lang'          => '--language',
            'mime'          => '--detect',
            'meta'          => '--metadata --json',
            'text'          => '--text',
            'text-main'     => '--text-main',
            'mime-types'    => '--list-supported-types',
            'detectors'     => '--list-detectors',
            'parsers'       => '--list-parsers',
            'version'       => '--version',
            'rmeta/ignore'  => '--metadata --jsonRecursive',
            'rmeta/html'    => '--html --jsonRecursive',
            'rmeta/text'    => '--text --jsonRecursive',
        ];

        $arguments = $this->encoding ? ["--encoding={$this->encoding}"] : [];

        if(!isset($flags[$type]))
        {
            throw new Exception($file ? "Unknown type $type for $file" : "Unknown type $type");
        }

        $arguments[] = $flags[$type];

        return $arguments;
    }
391
}
392