1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
declare(strict_types=1); |
4
|
|
|
|
5
|
|
|
namespace ApacheSolrForTypo3\Tika\Service\Tika; |
6
|
|
|
|
7
|
|
|
/* |
8
|
|
|
* This file is part of the TYPO3 CMS project. |
9
|
|
|
* |
10
|
|
|
* It is free software; you can redistribute it and/or modify it under |
11
|
|
|
* the terms of the GNU General Public License, either version 2 |
12
|
|
|
* of the License, or any later version. |
13
|
|
|
* |
14
|
|
|
* For the full copyright and license information, please read the |
15
|
|
|
* LICENSE.txt file that was distributed with this source code. |
16
|
|
|
* |
17
|
|
|
* The TYPO3 project - inspiring people to share! |
18
|
|
|
*/ |
19
|
|
|
|
20
|
|
|
use ApacheSolrForTypo3\Solr\ConnectionManager; |
21
|
|
|
use ApacheSolrForTypo3\Solr\System\Solr\SolrConnection; |
22
|
|
|
use Solarium\QueryType\Extract\Query; |
23
|
|
|
use TYPO3\CMS\Core\Resource\FileInterface; |
24
|
|
|
use TYPO3\CMS\Core\Utility\GeneralUtility; |
25
|
|
|
|
26
|
|
|
/**
 * A Tika service implementation using a Solr server
 *
 * Text and meta-data extraction requests are sent as Solarium extract queries
 * through the write service of a Solr connection. Language detection is not
 * supported by this service.
 *
 * @author Ingo Renner <[email protected]>
 */
class SolrCellService extends AbstractService
{
    /**
     * Solr connection used for every request issued by this service.
     *
     * @var SolrConnection
     */
    protected SolrConnection $solrConnection;

    /**
     * Service initialization
     *
     * Builds the Solr connection from the "solrHost", "solrPort", "solrPath"
     * and "solrScheme" entries of the service configuration.
     */
    protected function initializeService(): void
    {
        // EM might define a different connection than already in use by
        // Index Queue
        /** @var ConnectionManager $connectionManager */
        $connectionManager = GeneralUtility::makeInstance(ConnectionManager::class);

        $readNode = [
            'host' => $this->configuration['solrHost'],
            'port' => (int)$this->configuration['solrPort'],
            'path' => $this->configuration['solrPath'],
            'scheme' => $this->configuration['solrScheme'],
        ];
        // Reading and writing both go to the same configured endpoint.
        $writeNode = $readNode;
        $this->solrConnection = $connectionManager->getSolrConnectionForNodes($readNode, $writeNode);
    }

    /**
     * Retrieves a configuration value or a default value when not available.
     *
     * The default is also returned when the key exists but holds null,
     * because the lookup uses the null coalescing operator.
     *
     * @param string $key Configuration key to look up
     * @param mixed $defaultValue Value returned when the key is missing or null
     * @return mixed
     */
    protected function getConfigurationOrDefaultValue(string $key, $defaultValue)
    {
        return $this->configuration[$key] ?? $defaultValue;
    }

    /**
     * Takes a file reference and extracts the text from it.
     *
     * @param FileInterface $file
     * @return string First element of the Solr response, or an empty string when it is missing
     */
    public function extractText(FileInterface $file): string
    {
        $query = $this->createExtractQuery($file);

        $writer = $this->solrConnection->getWriteService();
        $response = $writer->extractByQuery($query);

        $this->log('Text Extraction using Solr', [
            'file' => $file,
            'solr connection' => (array)$writer,
            'query' => (array)$query,
            'response' => $response,
        ]);

        return $response[0] ?? '';
    }

    /**
     * Takes a file reference and extracts its meta-data.
     *
     * @param FileInterface $file
     * @return array Field name => value pairs; empty when the response carries no meta-data array
     */
    public function extractMetaData(FileInterface $file): array
    {
        $query = $this->createExtractQuery($file);

        $writer = $this->solrConnection->getWriteService();
        $response = $writer->extractByQuery($query);

        // The meta-data is read from the second element of the response.
        $metaData = [];
        if (isset($response[1]) && is_array($response[1])) {
            $metaData = $this->solrResponseToArray($response[1]);
        }

        $this->log('Meta Data Extraction using Solr', [
            'file' => $file,
            'solr connection' => (array)$writer,
            'query' => (array)$query,
            'response' => $response,
            'meta data' => $metaData,
        ]);

        return $metaData;
    }

    /**
     * Builds the extract-only query shared by text and meta-data extraction.
     *
     * @param FileInterface $file File whose local processing path is attached to the query
     * @return Query
     */
    private function createExtractQuery(FileInterface $file): Query
    {
        // NOTE(review): "false" presumably requests a non-writable local path
        // (no forced copy) - confirm against the FileInterface documentation.
        $localTempFilePath = $file->getForLocalProcessing(false);

        /** @var Query $query */
        $query = GeneralUtility::makeInstance(Query::class);
        $query->setFile($localTempFilePath);
        $query->setExtractOnly(true);
        $query->addParam('extractFormat', 'text');

        return $query;
    }

    /**
     * Takes a file reference and detects its content's language.
     *
     * @param FileInterface $file
     * @return string Language ISO code
     * @throws UnsupportedOperationException Always, language detection is not available
     */
    public function detectLanguageFromFile(FileInterface $file): string
    {
        // TODO check whether Solr supports language detection now
        throw new UnsupportedOperationException(
            'The Tika Solr service does not support language detection',
            1423457153
        );
    }

    /**
     * Takes a string as input and detects its language.
     *
     * @param string $input
     * @return string Language ISO code
     * @throws UnsupportedOperationException Always, language detection is not available
     */
    public function detectLanguageFromString(string $input): string
    {
        // TODO check whether Solr supports language detection now
        throw new UnsupportedOperationException(
            'The Tika Solr service does not support language detection',
            1423457153
        );
    }

    /**
     * Turns the nested Solr response into the same format as produced by a
     * local Tika jar call
     *
     * The response is read as a flat list of alternating entries: a field
     * name followed by its value. When the value is an array, only its first
     * element is kept; a missing value or an empty array yields an empty
     * string. Scalar values are kept as they are, and entries whose name
     * cannot serve as an array key are skipped.
     *
     * @param array $metaDataResponse The part of the Solr response containing the meta-data
     * @return array The cleaned meta-data, matching the Tika jar call format
     */
    protected function solrResponseToArray(array $metaDataResponse = []): array
    {
        $cleanedData = [];

        // Pair entries by position, independent of the keys of the incoming array.
        $entries = array_values($metaDataResponse);
        $entryCount = count($entries);

        for ($index = 0; $index < $entryCount; $index += 2) {
            $fieldName = $entries[$index];
            if (!is_string($fieldName) && !is_int($fieldName)) {
                // Anything else would be an illegal array offset.
                continue;
            }

            $fieldValue = $entries[$index + 1] ?? '';
            if (is_array($fieldValue)) {
                $fieldValue = array_key_exists(0, $fieldValue) ? $fieldValue[0] : '';
            }

            $cleanedData[$fieldName] = $fieldValue;
        }

        return $cleanedData;
    }

    /**
     * Gets the Tika version
     *
     * @return string Apache Solr server version string
     */
    public function getTikaVersion(): string
    {
        // TODO add patch for endpoint on Apache Solr to return Tika version
        // for now returns the Solr version string f.e. "Apache Solr X.Y.Z"
        return $this->solrConnection->getAdminService()->getSolrServerVersion();
    }

    /**
     * Since solr cell does not allow to query the supported mimetypes, we return a list of known supported mimetypes here.
     *
     * @return array List of mime type strings
     */
    public function getSupportedMimeTypes(): array
    {
        // Only the keys are returned; the file extensions are kept for reference.
        $mapping = [
            'application/epub+zip' => ['epub'],
            'application/gzip' => ['gz', 'tgz'],
            'application/msword' => ['doc'],
            'application/pdf' => ['pdf'],
            'application/rtf' => ['rtf'],
            'application/vnd.ms-excel' => ['xls'],
            'application/vnd.ms-outlook' => ['msg'],
            'application/vnd.oasis.opendocument.formula' => ['odf'],
            'application/vnd.oasis.opendocument.text' => ['odt'],
            'application/vnd.openxmlformats-officedocument.presentationml.presentation' => ['pptx'],
            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' => ['xlsx'],
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document' => ['docx'],
            'application/vnd.sun.xml.writer' => ['sxw'],
            'application/zip' => ['zip'],
            'application/x-midi' => ['mid'],
            'application/xml' => ['xml'],
            'audio/aiff' => ['aif', 'aiff'],
            'audio/basic' => ['au'],
            'audio/midi' => ['mid'],
            'audio/mpeg3' => ['mp3'],
            'audio/mpeg' => ['mp3'],
            'audio/wav' => ['wav'],
            'audio/x-mpeg-3' => ['mp3'],
            'audio/x-wav' => ['wav'],
            'image/bmp' => ['bmp'],
            'image/gif' => ['gif'],
            'image/jpeg' => ['jpg', 'jpeg'],
            'image/png' => ['png'],
            'image/svg+xml' => ['svg'],
            'image/tiff' => ['tif', 'tiff'],
            'text/html' => ['html', 'htm'],
            'text/plain' => ['txt'],
            'text/xml' => ['xml'],
            'video/mpeg' => ['mpg', 'mpeg'],
            'video/x-mpeg' => ['mpg', 'mpeg'],
        ];

        return array_keys($mapping);
    }

    /**
     * The service is available when the solr server is reachable.
     *
     * @return bool Result of pinging the Solr write service
     */
    public function isAvailable(): bool
    {
        return $this->solrConnection->getWriteService()->ping();
    }
}
258
|
|
|
|