1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
declare(strict_types=1); |
4
|
|
|
|
5
|
|
|
namespace ApacheSolrForTypo3\Tika\Service\Tika; |
6
|
|
|
|
7
|
|
|
/* |
8
|
|
|
* This file is part of the TYPO3 CMS project. |
9
|
|
|
* |
10
|
|
|
* It is free software; you can redistribute it and/or modify it under |
11
|
|
|
* the terms of the GNU General Public License, either version 2 |
12
|
|
|
* of the License, or any later version. |
13
|
|
|
* |
14
|
|
|
* For the full copyright and license information, please read the |
15
|
|
|
* LICENSE.txt file that was distributed with this source code. |
16
|
|
|
* |
17
|
|
|
* The TYPO3 project - inspiring people to share! |
18
|
|
|
*/ |
19
|
|
|
|
20
|
|
|
use ApacheSolrForTypo3\Tika\Utility\FileUtility; |
21
|
|
|
use ApacheSolrForTypo3\Tika\Utility\ShellUtility; |
22
|
|
|
use RuntimeException; |
23
|
|
|
use TYPO3\CMS\Core\Resource\FileInterface; |
24
|
|
|
use TYPO3\CMS\Core\Utility\CommandUtility; |
25
|
|
|
use TYPO3\CMS\Core\Utility\GeneralUtility; |
26
|
|
|
|
27
|
|
|
/** |
28
|
|
|
* A Tika service implementation using the tika-app.jar |
29
|
|
|
* |
30
|
|
|
* @copyright (c) 2015 Ingo Renner <[email protected]> |
31
|
|
|
*/ |
32
|
|
|
class AppService extends AbstractService
{
    /**
     * Cache of the MIME types reported by the Tika app jar.
     *
     * The property is static, so a non-empty list is reused by every
     * instance of this class for the rest of the PHP process.
     *
     * @var string[]
     */
    protected static array $supportedMimeTypes = [];

    /**
     * Service initialization
     *
     * Verifies that the configured Tika app jar is an existing file and that
     * a "java" command can be found.
     *
     * @throws RuntimeException if the jar file or the Java command is missing
     */
    protected function initializeService(): void
    {
        if (!is_file(FileUtility::getAbsoluteFilePath($this->configuration['tikaPath']))) {
            throw new RuntimeException(
                'Invalid path or filename for Tika application jar: ' . $this->configuration['tikaPath'],
                1266864929
            );
        }

        if (!CommandUtility::checkCommand('java')) {
            throw new RuntimeException('Could not find Java', 1421208775);
        }
    }

    /**
     * Gets the Tika app version
     *
     * Runs the jar with "-V" and returns the raw (untrimmed) shell output.
     *
     * @return string Tika app version string, empty string if the command produced no output
     */
    public function getTikaVersion(): string
    {
        // The version call is the only one issued without the language prefix.
        $tikaCommand = $this->buildTikaCommand('-V', false);

        return shell_exec($tikaCommand) ?: '';
    }

    /**
     * Takes a file reference and extracts the text from it.
     *
     * Runs the jar with "-t" on the file's local processing path and logs the
     * command together with its output.
     *
     * @param FileInterface $file
     * @return string Extracted text, empty string if the command produced no output
     */
    public function extractText(FileInterface $file): string
    {
        $localTempFilePath = $file->getForLocalProcessing(false);
        $tikaCommand = $this->buildTikaCommand(
            '-t ' . ShellUtility::escapeShellArgument($localTempFilePath)
        );

        $extractedText = shell_exec($tikaCommand);

        $this->log(
            'Text Extraction using local Tika',
            [
                'file' => $file,
                'tika command' => $tikaCommand,
                'shell output' => $extractedText,
            ]
        );

        return (string)$extractedText;
    }

    /**
     * Takes a file reference and extracts its meta-data.
     *
     * Runs the jar with "-m" on the file's local processing path, parses the
     * output lines into key => value pairs and logs the result.
     *
     * @param FileInterface $file
     * @return array Meta data as key => value pairs; a value is an array when a key occurred with several distinct values
     */
    public function extractMetaData(FileInterface $file): array
    {
        $localTempFilePath = $file->getForLocalProcessing(false);
        $tikaCommand = $this->buildTikaCommand(
            '-m ' . ShellUtility::escapeShellArgument($localTempFilePath)
        );

        $shellOutput = [];
        exec($tikaCommand, $shellOutput);
        $metaData = $this->shellOutputToArray($shellOutput);

        $this->log(
            'Meta Data Extraction using local Tika',
            [
                'file' => $file,
                'tika command' => $tikaCommand,
                'shell output' => $shellOutput,
                'meta data' => $metaData,
            ]
        );

        return $metaData;
    }

    /**
     * Takes a file reference and detects its content's language.
     *
     * @param FileInterface $file
     * @return string Language ISO code
     */
    public function detectLanguageFromFile(FileInterface $file): string
    {
        $localTempFilePath = $file->getForLocalProcessing(false);

        return $this->detectLanguageFromLocalFile($localTempFilePath);
    }

    /**
     * Takes a string as input and detects its language.
     *
     * The input is written to a temporary file which is handed to the
     * file-based detection; the temporary file is removed afterwards, also
     * when the detection fails with an exception.
     *
     * @param string $input
     * @return string Language ISO code
     * @throws RuntimeException if the temporary file cannot be written
     */
    public function detectLanguageFromString(string $input): string
    {
        $tempFilePath = GeneralUtility::tempnam('Tx_Tika_AppService_DetectLanguage');

        try {
            // Detecting on a file that was never written would yield a meaningless result.
            if (file_put_contents($tempFilePath, $input) === false) {
                throw new RuntimeException(
                    'Could not write temporary file for language detection: ' . $tempFilePath,
                    1715000000
                );
            }

            return $this->detectLanguageFromLocalFile($tempFilePath);
        } finally {
            // cleanup, regardless of how the detection ended
            if (is_file($tempFilePath)) {
                unlink($tempFilePath);
            }
        }
    }

    /**
     * The actual language detection
     *
     * Runs the jar with "-l" on the given path, trims the output and logs
     * the command together with the detected language.
     *
     * @param string $localFilePath Path to a local file
     * @return string The file content's language, empty string if the command produced no output
     */
    protected function detectLanguageFromLocalFile(string $localFilePath): string
    {
        $tikaCommand = $this->buildTikaCommand(
            '-l ' . ShellUtility::escapeShellArgument($localFilePath)
        );

        $language = trim(shell_exec($tikaCommand) ?: '');

        $this->log(
            'Language Detection using local Tika',
            [
                'file' => $localFilePath,
                'tika command' => $tikaCommand,
                'shell output' => $language,
            ]
        );

        return $language;
    }

    /**
     * Returns the supported MIME types, building the list on first use.
     *
     * An empty result is not cached, so the list is rebuilt on the next call.
     *
     * @return array
     */
    public function getSupportedMimeTypes(): array
    {
        if (self::$supportedMimeTypes === []) {
            self::$supportedMimeTypes = $this->buildSupportedMimeTypes();
        }

        return self::$supportedMimeTypes;
    }

    /**
     * Builds the list of supported MIME types from the jar's
     * "--list-supported-types" output.
     *
     * The first non-whitespace token of every line is taken as a core type,
     * the remainder of every "alias:" line as an additional type. Empty
     * entries are dropped and the list is sorted with asort(), i.e. the
     * original array keys are preserved.
     *
     * @return array
     */
    public function buildSupportedMimeTypes(): array
    {
        $mimeTypeOutput = $this->getMimeTypeOutputFromTikaJar();

        $coreTypes = [];
        preg_match_all('/^[^\s]*/im', $mimeTypeOutput, $coreTypes);

        $aliasTypes = [];
        preg_match_all('/^[\s]*alias:[\s]*.*/im', $mimeTypeOutput, $aliasTypes);

        $supportedTypes = $coreTypes[0];
        foreach ($aliasTypes[0] as $aliasType) {
            $supportedTypes[] = trim(str_replace('alias:', '', $aliasType));
        }

        // Indented (alias) lines produce empty core matches, remove them.
        $supportedTypes = array_filter($supportedTypes);
        asort($supportedTypes);

        return $supportedTypes;
    }

    /**
     * Takes the shell output from exec() and turns it into an array of key => value
     * pairs.
     *
     * Each line is split at its first colon. For the known prefixes (dc,
     * dcterms, meta, tiff, xmp, xmpTPg, xmpDM) the remainder is split once
     * more so that the key becomes "prefix:name". A key that occurs several
     * times with different values is turned into a list of its distinct
     * values. Blank lines are ignored; lines without a colon yield an empty
     * string as value.
     *
     * @param array $shellOutput An array containing the shell output from exec() with one line per entry
     * @return array Key => value pairs
     */
    protected function shellOutputToArray(array $shellOutput): array
    {
        // Dublin Core metadata and co
        $prefixedNamespaces = ['dc', 'dcterms', 'meta', 'tiff', 'xmp', 'xmpTPg', 'xmpDM'];
        $metaData = [];

        foreach ($shellOutput as $line) {
            if (trim($line) === '') {
                continue;
            }

            // array_pad() guarantees two elements even when the line has no colon.
            [$key, $value] = array_pad(explode(':', $line, 2), 2, '');
            $value = trim($value);

            if (in_array($key, $prefixedNamespaces, true)) {
                $keyPrefix = $key;
                [$key, $value] = array_pad(explode(':', $value, 2), 2, '');

                $key = $keyPrefix . ':' . $key;
                $value = trim($value);
            }

            if (!array_key_exists($key, $metaData)) {
                $metaData[$key] = $value;
                continue;
            }

            if ($metaData[$key] === $value) {
                // first duplicate key hit, but also duplicate value
                continue;
            }

            // allow a meta data key to appear multiple times
            if (!is_array($metaData[$key])) {
                $metaData[$key] = [$metaData[$key]];
            }

            // but do not allow duplicate values
            if (!in_array($value, $metaData[$key], true)) {
                $metaData[$key][] = $value;
            }
        }

        return $metaData;
    }

    /**
     * The app is available when the configured jar is an existing file and
     * a "java" command can be found.
     *
     * @return bool
     */
    public function isAvailable(): bool
    {
        return is_file(FileUtility::getAbsoluteFilePath($this->configuration['tikaPath']))
            && CommandUtility::checkCommand('java');
    }

    /**
     * Runs the jar with "--list-supported-types" and returns the trimmed output.
     *
     * @return string Raw type listing, empty string if the command produced no output
     */
    protected function getMimeTypeOutputFromTikaJar(): string
    {
        $tikaCommand = $this->buildTikaCommand('--list-supported-types');

        return trim(shell_exec($tikaCommand) ?: '');
    }

    /**
     * Assembles the shell command that runs the configured Tika app jar.
     *
     * @param string $tikaArguments Arguments appended after the jar path; callers must escape any file paths themselves
     * @param bool $prependLanguagePrefix Whether to put the result of ShellUtility::getLanguagePrefix() in front of the command
     * @return string The complete shell command
     */
    private function buildTikaCommand(string $tikaArguments, bool $prependLanguagePrefix = true): string
    {
        $command = /** @scrutinizer ignore-type */ CommandUtility::getCommand('java')
            . ' -Dfile.encoding=UTF8' // forces UTF8 output
            . $this->getAdditionalCommandOptions()
            . ' -jar ' . escapeshellarg(FileUtility::getAbsoluteFilePath($this->configuration['tikaPath']))
            . ' ' . $tikaArguments;

        if ($prependLanguagePrefix) {
            return ShellUtility::getLanguagePrefix() . $command;
        }

        return $command;
    }
}
327
|
|
|
|