AppService::extractMetaData()   A
last analyzed

Complexity

Conditions 1
Paths 1

Size

Total Lines 26
Code Lines 18

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 22
CRAP Score 1

Importance

Changes 0
Metric Value
cc 1
eloc 18
nc 1
nop 1
dl 0
loc 26
ccs 22
cts 22
cp 1
crap 1
rs 9.6666
c 0
b 0
f 0
1
<?php
2
3
declare(strict_types=1);
4
5
namespace ApacheSolrForTypo3\Tika\Service\Tika;
6
7
/*
8
 * This file is part of the TYPO3 CMS project.
9
 *
10
 * It is free software; you can redistribute it and/or modify it under
11
 * the terms of the GNU General Public License, either version 2
12
 * of the License, or any later version.
13
 *
14
 * For the full copyright and license information, please read the
15
 * LICENSE.txt file that was distributed with this source code.
16
 *
17
 * The TYPO3 project - inspiring people to share!
18
 */
19
20
use ApacheSolrForTypo3\Tika\Utility\FileUtility;
21
use ApacheSolrForTypo3\Tika\Utility\ShellUtility;
22
use RuntimeException;
23
use TYPO3\CMS\Core\Resource\FileInterface;
24
use TYPO3\CMS\Core\Utility\CommandUtility;
25
use TYPO3\CMS\Core\Utility\GeneralUtility;
26
27
/**
28
 * A Tika service implementation using the tika-app.jar
29
 *
30
 * @copyright (c) 2015 Ingo Renner <[email protected]>
31
 */
32
class AppService extends AbstractService
33
{
34
    /**
     * MIME types supported by the Tika app, kept in a static property.
     *
     * Starts out empty; getSupportedMimeTypes() fills it with the result of
     * buildSupportedMimeTypes() and returns the stored value on later calls.
     *
     * @var array
     */
37
    protected static array $supportedMimeTypes = [];
38
39
    /**
     * Checks the prerequisites for calling the Tika app.
     *
     * @throws RuntimeException with code 1266864929 when the configured tikaPath does not
     *                          resolve to a file, or with code 1421208775 when the java
     *                          command is not found
     */
    protected function initializeService(): void
    {
        $absoluteJarPath = FileUtility::getAbsoluteFilePath($this->configuration['tikaPath']);
        $jarIsPresent = is_file($absoluteJarPath);
        if ($jarIsPresent === false) {
            $message = 'Invalid path or filename for Tika application jar: ' . $this->configuration['tikaPath'];

            throw new RuntimeException($message, 1266864929);
        }

        $javaIsCallable = CommandUtility::checkCommand('java');
        if (!$javaIsCallable) {
            throw new RuntimeException('Could not find Java', 1421208775);
        }
    }
56
57
    /**
     * Asks the configured Tika app jar for its version by running it with -V.
     *
     * @return string The shell output of that call, or an empty string when the call yields nothing
     */
    public function getTikaVersion(): string
    {
        $commandParts = [
            /** @scrutinizer ignore-type */ CommandUtility::getCommand('java'),
            ' -Dfile.encoding=UTF8', // forces UTF8 output
            $this->getAdditionalCommandOptions(),
            ' -jar ',
            escapeshellarg(FileUtility::getAbsoluteFilePath($this->configuration['tikaPath'])),
            ' -V',
        ];
        $versionOutput = shell_exec(implode('', $commandParts));

        return $versionOutput ?: '';
    }
72
73
    /**
     * Runs the Tika app in text mode (-t) on a file and returns what it prints.
     *
     * The file, the executed command and the raw shell output are passed to log().
     *
     * @param FileInterface $file File to extract the text from
     * @return string The shell output, cast to string
     */
    public function extractText(FileInterface $file): string
    {
        $localPath = $file->getForLocalProcessing(false);

        $commandParts = [
            ShellUtility::getLanguagePrefix(),
            /** @scrutinizer ignore-type */ CommandUtility::getCommand('java'),
            ' -Dfile.encoding=UTF8', // forces UTF8 output
            $this->getAdditionalCommandOptions(),
            ' -jar ',
            escapeshellarg(FileUtility::getAbsoluteFilePath($this->configuration['tikaPath'])),
            ' -t',
            ' ',
            ShellUtility::escapeShellArgument($localPath),
        ];
        $tikaCommand = implode('', $commandParts);
        $shellOutput = shell_exec($tikaCommand);

        $logContext = [
            'file' => $file,
            'tika command' => $tikaCommand,
            'shell output' => $shellOutput,
        ];
        $this->log('Text Extraction using local Tika', $logContext);

        return (string)$shellOutput;
    }
103
104
    /**
     * Runs the Tika app in meta data mode (-m) on a file and parses the output.
     *
     * The output lines collected by exec() are converted with shellOutputToArray();
     * the file, the command, the raw lines and the parsed result are passed to log().
     *
     * @param FileInterface $file File to read the meta data from
     * @return array The parsed meta data as returned by shellOutputToArray()
     */
    public function extractMetaData(FileInterface $file): array
    {
        $localPath = $file->getForLocalProcessing(false);

        $commandParts = [
            ShellUtility::getLanguagePrefix(),
            /** @scrutinizer ignore-type */ CommandUtility::getCommand('java'),
            ' -Dfile.encoding=UTF8',
            $this->getAdditionalCommandOptions(),
            ' -jar ',
            escapeshellarg(FileUtility::getAbsoluteFilePath($this->configuration['tikaPath'])),
            ' -m',
            ' ',
            ShellUtility::escapeShellArgument($localPath),
        ];
        $tikaCommand = implode('', $commandParts);

        $outputLines = [];
        exec($tikaCommand, $outputLines);
        $parsedMetaData = $this->shellOutputToArray($outputLines);

        $logContext = [
            'file' => $file,
            'tika command' => $tikaCommand,
            'shell output' => $outputLines,
            'meta data' => $parsedMetaData,
        ];
        $this->log('Meta Data Extraction using local Tika', $logContext);

        return $parsedMetaData;
    }
137
138
    /**
     * Detects the language of a file's content.
     *
     * Obtains a local path for the file and delegates to detectLanguageFromLocalFile().
     *
     * @param FileInterface $file File to inspect
     * @return string Whatever detectLanguageFromLocalFile() returns for the local path
     */
    public function detectLanguageFromFile(FileInterface $file): string
    {
        return $this->detectLanguageFromLocalFile(
            $file->getForLocalProcessing(false)
        );
    }
150
151
    /**
     * Takes a string as input and detects its language.
     *
     * The input is written to a temporary file which is then handed to
     * detectLanguageFromLocalFile(). The temporary file is removed afterwards,
     * including when the detection throws.
     *
     * @param string $input Text whose language should be detected
     * @return string Whatever detectLanguageFromLocalFile() returns for the temporary file
     */
    public function detectLanguageFromString(string $input): string
    {
        $tempFilePath = GeneralUtility::tempnam('Tx_Tika_AppService_DetectLanguage');
        file_put_contents($tempFilePath, $input);

        try {
            // detect language
            return $this->detectLanguageFromLocalFile($tempFilePath);
        } finally {
            // cleanup, runs on both the normal and the exceptional path
            unlink($tempFilePath);
        }
    }
170
171
    /**
     * Performs the language detection by running the Tika app with -l.
     *
     * The path, the executed command and the trimmed output are passed to log().
     *
     * @param string $localFilePath Path to a local file
     * @return string The trimmed shell output (presumably a language code; the format is
     *                determined by Tika), or an empty string when the call yields nothing
     */
    protected function detectLanguageFromLocalFile(string $localFilePath): string
    {
        $commandParts = [
            ShellUtility::getLanguagePrefix(),
            /** @scrutinizer ignore-type */ CommandUtility::getCommand('java'),
            ' -Dfile.encoding=UTF8',
            $this->getAdditionalCommandOptions(),
            ' -jar ',
            escapeshellarg(FileUtility::getAbsoluteFilePath($this->configuration['tikaPath'])),
            ' -l',
            ' ',
            ShellUtility::escapeShellArgument($localFilePath),
        ];
        $tikaCommand = implode('', $commandParts);

        $rawOutput = shell_exec($tikaCommand);
        $detectedLanguage = trim($rawOutput ?: '');

        $logContext = [
            'file' => $localFilePath,
            'tika command' => $tikaCommand,
            'shell output' => $detectedLanguage,
        ];
        $this->log('Language Detection using local Tika', $logContext);

        return $detectedLanguage;
    }
200
201
    /**
     * Returns the supported MIME types, building them on first use.
     *
     * The list is stored in the static $supportedMimeTypes property; while that
     * property is empty, buildSupportedMimeTypes() is called to fill it.
     *
     * @return array The stored list of supported MIME types
     */
    public function getSupportedMimeTypes(): array
    {
        // The property is typed as array, so "empty" is the only unfilled state.
        if (self::$supportedMimeTypes === []) {
            self::$supportedMimeTypes = $this->buildSupportedMimeTypes();
        }

        return self::$supportedMimeTypes;
    }
214
215
    /**
     * Builds the list of supported MIME types from the Tika jar's type listing.
     *
     * Collects the leading non-whitespace token of every output line plus the
     * values of all "alias:" lines, drops empty entries and sorts the result
     * with asort(), which keeps the original array keys.
     *
     * @return array Sorted MIME type names, keys preserved from collection order
     */
    public function buildSupportedMimeTypes(): array
    {
        $tikaOutput = $this->getMimeTypeOutputFromTikaJar();

        $coreMatches = [];
        preg_match_all('/^[^\s]*/im', $tikaOutput, $coreMatches);

        $aliasMatches = [];
        preg_match_all('/^[\s]*alias:[\s]*.*/im', $tikaOutput, $aliasMatches);

        // Strip the "alias:" marker and surrounding whitespace from each alias line.
        $aliasNames = array_map(
            static fn (string $aliasLine): string => trim(str_replace('alias:', '', $aliasLine)),
            $aliasMatches[0]
        );

        // Lines that start with whitespace yield empty core matches; array_filter removes them.
        $mimeTypes = array_filter(array_merge($coreMatches[0], $aliasNames));
        asort($mimeTypes);

        return $mimeTypes;
    }
236
237
    /**
     * Takes the shell output from exec() and turns it into an array of key => value
     * pairs.
     *
     * Each line is split at its first colon into key and value. For lines whose key
     * is one of the known namespace prefixes (dc, dcterms, meta, tiff, xmp, xmpTPg,
     * xmpDM) the value is split once more, so "dc:title: Foo" becomes the key
     * "dc:title" with the value "Foo". Lines without a colon produce an empty value.
     *
     * A key that occurs several times with different values is stored as a list of
     * its distinct values. Values are compared strictly as strings, so "10" and
     * "10.0" count as different values.
     *
     * @param array $shellOutput An array containing the shell output from exec() with one line per entry
     * @return array Key => value pairs; a value is a string, or a list of strings for repeated keys
     */
    protected function shellOutputToArray(array $shellOutput): array
    {
        $namespacePrefixes = [
            'dc',
            'dcterms',
            'meta',
            'tiff',
            'xmp',
            'xmpTPg',
            'xmpDM',
        ];
        $metaData = [];

        foreach ($shellOutput as $line) {
            // array_pad guarantees two elements, so a line without ":" does not
            // trigger an undefined-index warning when destructuring
            [$key, $value] = array_pad(explode(':', $line, 2), 2, '');
            $value = trim($value);

            if (in_array($key, $namespacePrefixes, true)) {
                // Dublin Core metadata and co
                $keyPrefix = $key;
                [$key, $value] = array_pad(explode(':', $value, 2), 2, '');

                $key = $keyPrefix . ':' . $key;
                $value = trim($value);
            }

            if (!array_key_exists($key, $metaData)) {
                $metaData[$key] = $value;
                continue;
            }

            if ($metaData[$key] === $value) {
                // first duplicate key hit, but also duplicate value
                continue;
            }

            // allow a meta data key to appear multiple times
            if (!is_array($metaData[$key])) {
                $metaData[$key] = [$metaData[$key]];
            }

            // but do not allow duplicate values
            if (!in_array($value, $metaData[$key], true)) {
                $metaData[$key][] = $value;
            }
        }

        return $metaData;
    }
291
292
    /**
     * Reports whether the Tika app can be used.
     *
     * That is the case when the configured tikaPath resolves to an existing file
     * and the java command is found. The java check only runs when the jar exists.
     *
     * @return bool true when both checks pass, false otherwise
     */
    public function isAvailable(): bool
    {
        $absoluteJarPath = FileUtility::getAbsoluteFilePath($this->configuration['tikaPath']);

        return is_file($absoluteJarPath) && CommandUtility::checkCommand('java');
    }
311
312
    /**
     * Runs the Tika app with --list-supported-types and returns its output.
     *
     * @return string The trimmed shell output, or an empty string when the call yields nothing
     */
    protected function getMimeTypeOutputFromTikaJar(): string
    {
        $commandParts = [
            ShellUtility::getLanguagePrefix(),
            /** @scrutinizer ignore-type */ CommandUtility::getCommand('java'),
            ' -Dfile.encoding=UTF8',
            $this->getAdditionalCommandOptions(),
            ' -jar ',
            escapeshellarg(FileUtility::getAbsoluteFilePath($this->configuration['tikaPath'])),
            ' --list-supported-types',
        ];
        $rawOutput = shell_exec(implode('', $commandParts));

        return trim($rawOutput ?: '');
    }
326
}
327