1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
declare(strict_types=1); |
4
|
|
|
|
5
|
|
|
namespace ApacheSolrForTypo3\Tika\Service\Extractor; |
6
|
|
|
|
7
|
|
|
/* |
8
|
|
|
* This file is part of the TYPO3 CMS project. |
9
|
|
|
* |
10
|
|
|
* It is free software; you can redistribute it and/or modify it under |
11
|
|
|
* the terms of the GNU General Public License, either version 2 |
12
|
|
|
* of the License, or any later version. |
13
|
|
|
* |
14
|
|
|
* For the full copyright and license information, please read the |
15
|
|
|
* LICENSE.txt file that was distributed with this source code. |
16
|
|
|
* |
17
|
|
|
* The TYPO3 project - inspiring people to share! |
18
|
|
|
*/ |
19
|
|
|
|
20
|
|
|
use ApacheSolrForTypo3\Tika\Service\Tika\AppService; |
21
|
|
|
use ApacheSolrForTypo3\Tika\Service\Tika\ServerService; |
22
|
|
|
use ApacheSolrForTypo3\Tika\Service\Tika\ServiceFactory; |
23
|
|
|
use ApacheSolrForTypo3\Tika\Service\Tika\SolrCellService; |
24
|
|
|
use Psr\Http\Client\ClientExceptionInterface; |
25
|
|
|
use TYPO3\CMS\Core\Configuration\Exception\ExtensionConfigurationExtensionNotConfiguredException; |
26
|
|
|
use TYPO3\CMS\Core\Configuration\Exception\ExtensionConfigurationPathDoesNotExistException; |
27
|
|
|
use TYPO3\CMS\Core\Resource\File; |
28
|
|
|
use TYPO3\CMS\Core\Resource\FileInterface; |
29
|
|
|
use TYPO3\CMS\Core\Utility\GeneralUtility; |
30
|
|
|
|
31
|
|
|
/** |
32
|
|
|
* A service to extract meta-data from files using Apache Tika |
33
|
|
|
* |
34
|
|
|
* @author Ingo Renner <[email protected]> |
35
|
|
|
*/ |
36
|
|
|
class MetaDataExtractor extends AbstractExtractor
{
    /**
     * Priority of this extractor.
     *
     * NOTE(review): presumably evaluated by the parent class / the extractor
     * registry to order competing extractors - confirm against AbstractExtractor.
     */
    protected int $priority = 100;

    /**
     * Checks if the given file can be processed by this Extractor.
     *
     * A file is accepted when its MIME type is supported by the configured
     * Tika service (minus the MIME types excluded via configuration) and the
     * file size validator reports the file as below the limit.
     *
     * @param File $file
     * @return bool
     * @throws ClientExceptionInterface
     * @throws ExtensionConfigurationExtensionNotConfiguredException
     * @throws ExtensionConfigurationPathDoesNotExistException
     */
    public function canProcess(File $file): bool
    {
        $tikaService = $this->getExtractor();
        $supportedMimeTypes = $tikaService->getSupportedMimeTypes();
        $allowedMimeTypes = $this->mergeAllowedMimeTypes($supportedMimeTypes);

        // Strict comparison: MIME types are compared as exact strings.
        $isAllowedMimetype = in_array($file->getMimeType(), $allowedMimeTypes, true);
        $isSizeBelowLimit = $this->fileSizeValidator->isBelowLimit($file);

        return $isAllowedMimetype && $isSizeBelowLimit;
    }

    /**
     * Returns a filtered $mimeTypes list - excludes the ones defined in
     * $this->configuration['excludeMimeTypes'] (a comma separated list).
     *
     * When no exclusions are configured the list is returned unchanged.
     * Keys of the given array are preserved (array_diff() semantics).
     *
     * @param array $mimeTypes MIME types supported by the Tika service
     * @return array MIME types that remain allowed after applying the exclusions
     */
    protected function mergeAllowedMimeTypes(array $mimeTypes): array
    {
        if (empty($this->configuration['excludeMimeTypes'])) {
            return $mimeTypes;
        }

        $excludedMimeTypes = GeneralUtility::trimExplode(',', $this->configuration['excludeMimeTypes']);

        return array_diff($mimeTypes, $excludedMimeTypes);
    }

    /**
     * Returns the Tika service selected by $this->configuration['extractor'].
     *
     * @return AppService|ServerService|SolrCellService
     * @throws ExtensionConfigurationExtensionNotConfiguredException
     * @throws ExtensionConfigurationPathDoesNotExistException
     */
    protected function getExtractor()
    {
        return ServiceFactory::getTika($this->configuration['extractor']);
    }

    /**
     * Extracts meta-data from a file using Apache Tika and returns it with
     * normalized keys.
     *
     * @param File $file
     * @param array $previousExtractedData Already extracted/existing data (not used by this implementation)
     * @return array
     * @throws ClientExceptionInterface
     * @throws ExtensionConfigurationExtensionNotConfiguredException
     * @throws ExtensionConfigurationPathDoesNotExistException
     */
    public function extractMetaData(File $file, array $previousExtractedData = []): array
    {
        $extractedMetaData = $this->getExtractedMetaDataFromTikaService($file);

        return $this->normalizeMetaData($extractedMetaData);
    }

    /**
     * Creates an instance of the service and returns the result from "extractMetaData".
     *
     * @param FileInterface $file
     * @return array
     * @throws ClientExceptionInterface
     * @throws ExtensionConfigurationExtensionNotConfiguredException
     * @throws ExtensionConfigurationPathDoesNotExistException
     */
    protected function getExtractedMetaDataFromTikaService(FileInterface $file): array
    {
        $tikaService = $this->getExtractor();

        return $tikaService->extractMetaData($file);
    }

    /**
     * Normalizes the names / keys of the meta-data found.
     *
     * Array values are joined with ", ", empty values are skipped and all
     * keys that are not listed below are dropped. If several raw keys map to
     * the same normalized key, the one iterated last wins.
     *
     * @param array $metaData An array of raw meta-data from a file
     * @return array An array with cleaned meta-data keys
     */
    protected function normalizeMetaData(array $metaData): array
    {
        $metaDataCleaned = [];

        foreach ($metaData as $key => $value) {
            if (is_array($value)) {
                $value = implode(', ', $value);
            }

            if (empty($value)) {
                continue;
            }

            // clean / add values under alternative names
            switch ($key) {
                case 'dc:title':
                case 'title':
                    $metaDataCleaned['title'] = $value;
                    break;
                case 'dc:creator':
                case 'meta:author':
                case 'Author':
                case 'creator':
                    $metaDataCleaned['creator'] = $value;
                    break;
                case 'dc:publisher':
                    $metaDataCleaned['publisher'] = $value;
                    break;
                case 'height':
                    $metaDataCleaned['height'] = $value;
                    break;
                case 'Exif Image Height':
                    // keep only the part before the first space
                    [$height] = explode(' ', $value, 2);
                    $metaDataCleaned['height'] = $height;
                    break;
                case 'width':
                    $metaDataCleaned['width'] = $value;
                    break;
                case 'Exif Image Width':
                    // keep only the part before the first space
                    [$width] = explode(' ', $value, 2);
                    $metaDataCleaned['width'] = $width;
                    break;
                case 'Color space':
                    if ($value !== 'Undefined') {
                        $metaDataCleaned['color_space'] = $value;
                    }
                    break;
                case 'Image Description':
                case 'Jpeg Comment':
                case 'subject':
                case 'dc:description':
                    $metaDataCleaned['description'] = $value;
                    break;
                case 'Headline':
                    $metaDataCleaned['alternative'] = $value;
                    break;
                case 'dc:subject':
                case 'meta:keyword':
                case 'Keywords':
                    $metaDataCleaned['keywords'] = $value;
                    break;
                case 'Copyright Notice':
                    $metaDataCleaned['note'] = $value;
                    break;
                case 'dcterms:created':
                case 'meta:creation-date':
                case 'Creation-Date':
                    // NOTE(review): strtotime() yields false for unparseable
                    // strings and that value is stored as-is - confirm that
                    // consumers cope with it.
                    $metaDataCleaned['content_creation_date'] = strtotime($value);
                    break;
                case 'Date/Time Original':
                    $metaDataCleaned['content_creation_date'] = $this->exifDateToTimestamp($value);
                    break;
                case 'dcterms:modified':
                case 'meta:save-date':
                case 'Last-Save-Date':
                case 'Last-Modified':
                    // NOTE(review): see the remark on strtotime() above.
                    $metaDataCleaned['content_modification_date'] = strtotime($value);
                    break;
                case 'xmpTPg:NPages':
                case 'Page-Count':
                    $metaDataCleaned['pages'] = $value;
                    break;
                case 'Application-Name':
                case 'xmp:CreatorTool':
                    $metaDataCleaned['creator_tool'] = $value;
                    break;
                default:
                    // ignore
            }
        }

        return $metaDataCleaned;
    }

    /**
     * Converts a date string into timestamp
     * exif-tags: 2002:09:07 15:29:52
     *
     * strtotime() signals failure with false (not -1), so an unparseable
     * date is mapped to 0 to honour the int return type under strict types.
     *
     * @param string $date An exif date string
     * @return int Unix timestamp, 0 if the date could not be parsed
     */
    protected function exifDateToTimestamp(string $date): int
    {
        $timestamp = strtotime($date);

        return $timestamp === false ? 0 : $timestamp;
    }
}
239
|
|
|
|