1
|
|
|
<?php |
2
|
|
|
namespace ApacheSolrForTypo3\Tika\Service\Tika; |
3
|
|
|
|
4
|
|
|
/*************************************************************** |
5
|
|
|
* Copyright notice |
6
|
|
|
* |
7
|
|
|
* (c) 2015 Ingo Renner <[email protected]> |
8
|
|
|
* All rights reserved |
9
|
|
|
* |
10
|
|
|
* This script is part of the TYPO3 project. The TYPO3 project is |
11
|
|
|
* free software; you can redistribute it and/or modify |
12
|
|
|
* it under the terms of the GNU General Public License as published by |
13
|
|
|
* the Free Software Foundation; either version 2 of the License, or |
14
|
|
|
* (at your option) any later version. |
15
|
|
|
* |
16
|
|
|
* The GNU General Public License can be found at |
17
|
|
|
* http://www.gnu.org/copyleft/gpl.html. |
18
|
|
|
* |
19
|
|
|
* This script is distributed in the hope that it will be useful, |
20
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
21
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
22
|
|
|
* GNU General Public License for more details. |
23
|
|
|
* |
24
|
|
|
* This copyright notice MUST APPEAR in all copies of the script! |
25
|
|
|
***************************************************************/ |
26
|
|
|
|
27
|
|
|
use ApacheSolrForTypo3\Tika\Process; |
28
|
|
|
use ApacheSolrForTypo3\Tika\Utility\FileUtility; |
29
|
|
|
use TYPO3\CMS\Core\Registry; |
30
|
|
|
use TYPO3\CMS\Core\Resource\FileInterface; |
31
|
|
|
use TYPO3\CMS\Core\Utility\CommandUtility; |
32
|
|
|
use TYPO3\CMS\Core\Utility\GeneralUtility; |
33
|
|
|
|
34
|
|
|
/** |
35
|
|
|
* A Tika service implementation using the tika-server.jar |
36
|
|
|
* |
37
|
|
|
*/ |
38
|
|
|
class ServerService extends AbstractService |
39
|
|
|
{ |
40
|
|
|
|
41
|
|
|
/** |
42
|
|
|
* Tika server URL |
43
|
|
|
* |
44
|
|
|
* @var string |
45
|
|
|
*/ |
46
|
|
|
protected $tikaUrl; |
47
|
|
|
|
48
|
|
|
/** |
49
|
|
|
* @var array MIME types supported by the Tika server, cached statically once they have been built |
50
|
|
|
*/ |
51
|
|
|
protected static $supportedMimeTypes = []; |
52
|
|
|
|
53
|
|
|
/**
 * Builds the base URL of the Tika server from the service configuration.
 *
 * The URL is assembled as <scheme>://<host>[:<port>]. The scheme falls
 * back to "http" and the port is left out when it is not configured.
 *
 * @return void
 */
protected function initializeService()
{
    // Use the configured scheme if there is one, plain http otherwise
    $scheme = 'http';
    if (!empty($this->configuration['tikaServerScheme'])) {
        $scheme = $this->configuration['tikaServerScheme'];
    }

    $serverUrl = $scheme . '://' . $this->configuration['tikaServerHost'];

    // The port is optional, only append it if it is configured
    if (!empty($this->configuration['tikaServerPort'])) {
        $serverUrl .= ':' . $this->configuration['tikaServerPort'];
    }

    $this->tikaUrl = $serverUrl;
}
73
|
|
|
|
74
|
|
|
/**
 * Creates a process object for the java executable.
 *
 * @param string $arguments Arguments handed to the process together with the java command
 * @return \ApacheSolrForTypo3\Tika\Process
 */
protected function getProcess($arguments = '')
{
    return GeneralUtility::makeInstance(
        Process::class,
        CommandUtility::getCommand('java'),
        $arguments
    );
}
86
|
|
|
|
87
|
|
|
/**
 * Assembles the java arguments used to start the Tika server.
 *
 * The result has the form "-jar <path to tika server jar> -p <port>",
 * with both values taken from the service configuration.
 *
 * @return string
 */
protected function getStartCommand()
{
    $jarPath = FileUtility::getAbsoluteFilePath($this->configuration['tikaServerPath']);

    $arguments = implode(' ', [
        '-jar',
        escapeshellarg($jarPath),
        '-p',
        escapeshellarg($this->configuration['tikaServerPort']),
    ]);

    // NOTE(review): escapeshellcmd() runs over arguments that are already
    // quoted by escapeshellarg(); for paths containing shell meta characters
    // this may escape twice - confirm whether that is intended.
    return escapeshellcmd($arguments);
}
102
|
|
|
|
103
|
|
|
/**
 * Starts the Tika server and stores its pid in the TYPO3 registry.
 *
 * @return void
 */
public function startServer()
{
    $serverProcess = $this->getProcess($this->getStartCommand());
    $serverProcess->start();
    $serverPid = $serverProcess->getPid();

    // Keep the pid in the registry (namespace tx_tika, key server.pid)
    GeneralUtility::makeInstance(Registry::class)
        ->set('tx_tika', 'server.pid', $serverPid);
}
117
|
|
|
|
118
|
|
|
/**
 * Stops the Tika server and removes its pid from the TYPO3 registry.
 *
 * @return void
 */
public function stopServer()
{
    $serverPid = $this->getServerPid();

    $serverProcess = $this->getProcess();
    $serverProcess->setPid($serverPid);
    $serverProcess->stop();

    // The stored pid is no longer valid, drop it from the registry
    GeneralUtility::makeInstance(Registry::class)
        ->remove('tx_tika', 'server.pid');
}
135
|
|
|
|
136
|
|
|
/**
 * Returns the pid of the Tika server.
 *
 * The pid stored in the TYPO3 registry is preferred. If nothing is stored
 * there, a process object built from the start command is asked to find it.
 *
 * @return int|null Null if the pid can't be found, otherwise the pid
 */
public function getServerPid()
{
    $storedPid = GeneralUtility::makeInstance(Registry::class)
        ->get('tx_tika', 'server.pid');

    if (!empty($storedPid)) {
        return $storedPid;
    }

    // Nothing usable in the registry, look the process up instead
    return $this->getProcess($this->getStartCommand())->findPid();
}
155
|
|
|
|
156
|
|
|
/**
 * Tells whether the Tika server is running, i.e. whether a pid
 * could be determined for it.
 *
 * @return bool
 */
public function isServerRunning()
{
    return !empty($this->getServerPid());
}
167
|
|
|
|
168
|
|
|
/**
 * Pings the Tika server by requesting its /tika endpoint and checking
 * that the answer begins with the Tika server greeting.
 *
 * @return bool true if the Tika server can be reached, false if not
 * @throws \Exception
 */
public function ping()
{
    $greeting = $this->queryTika('/tika');

    return GeneralUtility::isFirstPartOfStr($greeting, 'This is Tika Server');
}
181
|
|
|
|
182
|
|
|
/**
 * Availability check: the Tika server counts as available
 * when it answers a ping.
 *
 * @return bool
 */
public function isAvailable()
{
    $serverReachable = $this->ping();

    return $serverReachable;
}
191
|
|
|
|
192
|
|
|
/**
 * Returns the Tika server URL that was assembled during
 * service initialization.
 *
 * @return string Tika server URL
 */
public function getTikaServerUrl()
{
    return $this->tikaUrl;
}
201
|
|
|
|
202
|
|
|
/**
 * Returns the version reported by the Tika server's /version endpoint.
 *
 * @return string Tika server version string, "unknown" if the server is not running
 * @throws \Exception
 */
public function getTikaVersion()
{
    if (!$this->isServerRunning()) {
        return 'unknown';
    }

    return $this->queryTika('/version');
}
218
|
|
|
|
219
|
|
|
/**
 * Sends a request to an endpoint of the Tika server.
 *
 * Exceptions whose message contains "Connection refused" or
 * "HTTP request failed" are swallowed and an empty string is returned;
 * any other exception is passed on to the caller.
 *
 * @param string $endpoint Endpoint path, appended to the Tika server URL
 * @param resource $context optional stream context
 * @return string Tika output
 * @throws \Exception
 */
protected function queryTika($endpoint, $context = null)
{
    $requestUrl = $this->getTikaServerUrl() . $endpoint;
    $output = '';

    try {
        $output = file_get_contents($requestUrl, false, $context);
    } catch (\Exception $exception) {
        $reason = $exception->getMessage();
        $serverUnavailable = strpos($reason, 'Connection refused') !== false
            || strpos($reason, 'HTTP request failed') !== false;

        if (!$serverUnavailable) {
            // An unavailable server would report one of the messages above.
            // Anything else means something different went wrong.
            throw $exception;
        }
    }

    return $output;
}
246
|
|
|
|
247
|
|
|
/**
 * Extracts the text of a file by sending its contents to the
 * Tika server's /tika endpoint using a PUT request.
 *
 * The outcome is logged; a failed request (response is false) is
 * logged with 2 as third log argument.
 *
 * @param \TYPO3\CMS\Core\Resource\FileInterface $file
 * @return string
 */
public function extractText(FileInterface $file)
{
    $requestHeaders = [
        $this->getUserAgent(),
        'Accept: text/plain',
        'Content-Type: application/octet-stream',
        'Connection: close',
    ];
    $httpOptions = [
        'protocol_version' => 1.1,
        'method' => 'PUT',
        'header' => implode(CRLF, $requestHeaders),
        'content' => $file->getContents(),
    ];

    $extractedText = $this->queryTika('/tika', stream_context_create(['http' => $httpOptions]));

    if ($extractedText !== false) {
        $this->log('Text Extraction using Tika Server', $this->getLogData($file, $extractedText));
    } else {
        $this->log('Text Extraction using Tika Server failed', $this->getLogData($file, $extractedText), 2);
    }

    return $extractedText;
}
269
|
|
|
|
270
|
|
|
/**
 * Extracts the meta data of a file by sending its contents to the
 * Tika server's /meta endpoint using a PUT request.
 *
 * The JSON answer is decoded and cast to an array. If the request fails
 * or the answer is not valid JSON, the failure is logged (with 2 as third
 * log argument) and an empty array is returned.
 *
 * @param \TYPO3\CMS\Core\Resource\FileInterface $file
 * @return array
 */
public function extractMetaData(FileInterface $file)
{
    $headers = [
        $this->getUserAgent(),
        'Accept: application/json',
        'Content-Type: application/octet-stream',
        'Connection: close',
    ];
    $context = stream_context_create([
        'http' => [
            'protocol_version' => 1.1,
            'method' => 'PUT',
            'header' => implode(CRLF, $headers),
            'content' => $file->getContents(),
        ],
    ]);

    $rawResponse = $this->queryTika('/meta', $context);

    // The failure has to be detected before casting to array: the cast
    // result is always an array and therefore never identical to false.
    $decoded = null;
    $failed = true;
    if (is_string($rawResponse) && $rawResponse !== '') {
        $decoded = json_decode($rawResponse);
        $failed = json_last_error() !== JSON_ERROR_NONE;
    }
    $response = (array)$decoded;

    if ($failed) {
        // Log what the server actually sent, the decoded array is empty here
        $this->log('Meta Data Extraction using Tika Server failed', $this->getLogData($file, $rawResponse), 2);
    } else {
        $this->log('Meta Data Extraction using Tika Server', $this->getLogData($file, $response));
    }

    return $response;
}
294
|
|
|
|
295
|
|
|
/**
 * Detects the language of a file's content by sending the content to
 * the Tika server's /language/stream endpoint using a PUT request.
 *
 * The outcome is logged; a failed request (response is false) is
 * logged with 2 as third log argument.
 *
 * @param \TYPO3\CMS\Core\Resource\FileInterface $file
 * @return string Language ISO code
 */
public function detectLanguageFromFile(FileInterface $file)
{
    $requestHeaders = [
        $this->getUserAgent(),
        'Content-Type: application/octet-stream',
        'Connection: close',
    ];
    $httpOptions = [
        'protocol_version' => 1.1,
        'method' => 'PUT',
        'header' => implode(CRLF, $requestHeaders),
        'content' => $file->getContents(),
    ];

    $language = $this->queryTika('/language/stream', stream_context_create(['http' => $httpOptions]));

    if ($language !== false) {
        $this->log('Language Detection using Tika Server', $this->getLogData($file, $language));
    } else {
        $this->log('Language Detection using Tika Server failed', $this->getLogData($file, $language), 2);
    }

    return $language;
}
317
|
|
|
|
318
|
|
|
/**
 * Detects the language of a string by sending it to the Tika server's
 * /language/string endpoint using a PUT request.
 *
 * @param string $input Text to detect the language of
 * @return string Language ISO code
 */
public function detectLanguageFromString($input)
{
    $requestHeaders = [
        $this->getUserAgent(),
        'Content-Type: application/octet-stream',
        'Connection: close',
    ];
    $httpOptions = [
        'protocol_version' => 1.1,
        'method' => 'PUT',
        'header' => implode(CRLF, $requestHeaders),
        'content' => $input,
    ];

    return $this->queryTika('/language/string', stream_context_create(['http' => $httpOptions]));
}
334
|
|
|
|
335
|
|
|
/**
 * Returns the MIME types supported by the Tika server.
 *
 * The list is kept in a static property and only built when that
 * property does not hold a non-empty array yet.
 *
 * @return array
 */
public function getSupportedMimeTypes()
{
    $isCached = is_array(self::$supportedMimeTypes) && count(self::$supportedMimeTypes) > 0;

    if (!$isCached) {
        self::$supportedMimeTypes = $this->buildSupportedMimeTypes();
    }

    return self::$supportedMimeTypes;
}
348
|
|
|
|
349
|
|
|
/**
 * Requests the Tika server's /mime-types endpoint as JSON
 * using a GET request.
 *
 * @return string Response of the Tika server
 */
protected function getMimeTypeJsonFromTikaServer()
{
    $requestHeaders = [
        $this->getUserAgent(),
        'Content-Type: application/octet-stream',
        'Accept: application/json',
        'Connection: close',
    ];
    $httpOptions = [
        'protocol_version' => 1.1,
        'method' => 'GET',
        'header' => implode(CRLF, $requestHeaders),
    ];

    return $this->queryTika('/mime-types', stream_context_create(['http' => $httpOptions]));
}
361
|
|
|
|
362
|
|
|
/**
 * Builds the list of supported MIME types from the Tika server's
 * MIME type definitions.
 *
 * The decoded JSON is expected to be an object whose property names are
 * the core MIME types; a definition may carry an "alias" array with
 * further names for the same type. Core types and all aliases are
 * collected, empty entries and duplicates are removed and the result
 * is sorted by value (keys are kept, so they are not sequential).
 *
 * @return array Supported MIME types, empty if the server response is not usable
 */
protected function buildSupportedMimeTypes()
{
    $response = $this->getMimeTypeJsonFromTikaServer();

    $definitions = is_string($response) ? json_decode($response) : null;
    if (!is_object($definitions)) {
        // Failed request, empty or invalid JSON: nothing to build from
        return [];
    }

    $supportedTypes = [];
    foreach (get_object_vars($definitions) as $coreMimeType => $configuration) {
        $supportedTypes[] = $coreMimeType;

        if (isset($configuration->alias) && is_array($configuration->alias)) {
            // Append instead of using the array union operator: "+" keeps
            // only keys that are not present yet, which silently drops most
            // entries of these numerically indexed lists.
            foreach ($configuration->alias as $aliasType) {
                $supportedTypes[] = $aliasType;
            }
        }
    }

    $supportedTypes = array_unique(array_filter($supportedTypes));
    asort($supportedTypes);

    return $supportedTypes;
}
385
|
|
|
|
386
|
|
|
/**
 * Returns the User-Agent header line used for requests to the Tika server.
 *
 * The agent name is read from
 * $GLOBALS['TYPO3_CONF_VARS']['HTTP']['headers']['User-Agent'] and falls
 * back to "TYPO3" if that setting does not exist.
 *
 * @return string Header line in the form "User-Agent: <name>"
 */
protected function getUserAgent()
{
    // The parentheses are required: "." binds tighter than "??", without
    // them the fallback would be applied to the whole concatenation and
    // therefore never be used.
    return 'User-Agent: ' . ($GLOBALS['TYPO3_CONF_VARS']['HTTP']['headers']['User-Agent'] ?? 'TYPO3');
}
393
|
|
|
|
394
|
|
|
/**
 * Collects the data that is attached to a log entry about a Tika request.
 *
 * @param \TYPO3\CMS\Core\Resource\FileInterface $file File the request was made for
 * @param string $response Response of the Tika server, passed through unchanged
 * @return array Array with the keys file, file_path, tika_url and response
 */
protected function getLogData($file, $response)
{
    return [
        'file' => $file->getName(),
        'file_path' => $file->getPublicUrl(),
        'tika_url' => $this->getTikaServerUrl(),
        'response' => $response,
    ];
}
404
|
|
|
} |
405
|
|
|
|