MetaDataExtractor   B
last analyzed

Complexity

Total Complexity 47

Size/Duplication

Total Lines 201
Duplicated Lines 0 %

Test Coverage

Coverage 57.28%

Importance

Changes 5
Bugs 0 Features 0
Metric Value
eloc 101
dl 0
loc 201
ccs 59
cts 103
cp 0.5728
rs 8.64
c 5
b 0
f 0
wmc 47

7 Methods

Rating   Name   Duplication   Size   Complexity  
A extractMetaData() 0 4 1
A canProcess() 0 10 2
A mergeAllowedMimeTypes() 0 9 2
A getExtractor() 0 3 1
A exifDateToTimestamp() 0 9 2
A getExtractedMetaDataFromTikaService() 0 4 1
D normalizeMetaData() 0 92 38

How to fix   Complexity   

Complex Class

Complex classes like MetaDataExtractor often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use MetaDataExtractor, and based on these observations, apply Extract Interface, too.

1
<?php
2
3
declare(strict_types=1);
4
5
namespace ApacheSolrForTypo3\Tika\Service\Extractor;
6
7
/*
8
 * This file is part of the TYPO3 CMS project.
9
 *
10
 * It is free software; you can redistribute it and/or modify it under
11
 * the terms of the GNU General Public License, either version 2
12
 * of the License, or any later version.
13
 *
14
 * For the full copyright and license information, please read the
15
 * LICENSE.txt file that was distributed with this source code.
16
 *
17
 * The TYPO3 project - inspiring people to share!
18
 */
19
20
use ApacheSolrForTypo3\Tika\Service\Tika\AppService;
21
use ApacheSolrForTypo3\Tika\Service\Tika\ServerService;
22
use ApacheSolrForTypo3\Tika\Service\Tika\ServiceFactory;
23
use ApacheSolrForTypo3\Tika\Service\Tika\SolrCellService;
24
use Psr\Http\Client\ClientExceptionInterface;
25
use TYPO3\CMS\Core\Configuration\Exception\ExtensionConfigurationExtensionNotConfiguredException;
26
use TYPO3\CMS\Core\Configuration\Exception\ExtensionConfigurationPathDoesNotExistException;
27
use TYPO3\CMS\Core\Resource\File;
28
use TYPO3\CMS\Core\Resource\FileInterface;
29
use TYPO3\CMS\Core\Utility\GeneralUtility;
30
31
/**
32
 * A service to extract meta-data from files using Apache Tika
33
 *
34
 * @author Ingo Renner <[email protected]>
35
 */
36
class MetaDataExtractor extends AbstractExtractor
{
    /**
     * Raw meta-data keys whose value is copied unchanged, mapped to the
     * normalized field name the value is stored under.
     *
     * @var array<string, string>
     */
    private const DIRECT_FIELDS = [
        'dc:title' => 'title',
        'title' => 'title',
        'dc:creator' => 'creator',
        'meta:author' => 'creator',
        'Author' => 'creator',
        'creator' => 'creator',
        'dc:publisher' => 'publisher',
        'height' => 'height',
        'width' => 'width',
        'Image Description' => 'description',
        'Jpeg Comment' => 'description',
        'subject' => 'description',
        'dc:description' => 'description',
        'Headline' => 'alternative',
        'dc:subject' => 'keywords',
        'meta:keyword' => 'keywords',
        'Keywords' => 'keywords',
        'Copyright Notice' => 'note',
        'xmpTPg:NPages' => 'pages',
        'Page-Count' => 'pages',
        'Application-Name' => 'creator_tool',
        'xmp:CreatorTool' => 'creator_tool',
    ];

    /**
     * Raw meta-data keys of which only the part before the first space is
     * kept, mapped to the normalized field name.
     *
     * @var array<string, string>
     */
    private const LEADING_TOKEN_FIELDS = [
        'Exif Image Height' => 'height',
        'Exif Image Width' => 'width',
    ];

    /**
     * Raw meta-data keys holding a date string that is converted to a Unix
     * timestamp, mapped to the normalized field name.
     *
     * @var array<string, string>
     */
    private const DATE_FIELDS = [
        'dcterms:created' => 'content_creation_date',
        'meta:creation-date' => 'content_creation_date',
        'Creation-Date' => 'content_creation_date',
        'dcterms:modified' => 'content_modification_date',
        'meta:save-date' => 'content_modification_date',
        'Last-Save-Date' => 'content_modification_date',
        'Last-Modified' => 'content_modification_date',
    ];

    /**
     * Priority of this extractor.
     *
     * @var int
     */
    protected int $priority = 100;

    /**
     * Checks if the given file can be processed by this Extractor.
     *
     * A file is accepted when its mime type is among the mime types reported
     * by the Tika service (minus the configured exclusions) and the file size
     * validator reports the file as below the limit.
     *
     * @param File $file
     * @return bool
     * @throws ClientExceptionInterface
     * @throws ExtensionConfigurationExtensionNotConfiguredException
     * @throws ExtensionConfigurationPathDoesNotExistException
     */
    public function canProcess(File $file): bool
    {
        $supportedMimeTypes = $this->getExtractor()->getSupportedMimeTypes();
        $allowedMimeTypes = $this->mergeAllowedMimeTypes($supportedMimeTypes);

        // Strict comparison: mime types are strings, no type juggling wanted.
        $isAllowedMimetype = in_array($file->getMimeType(), $allowedMimeTypes, true);
        $isSizeBelowLimit = $this->fileSizeValidator->isBelowLimit($file);

        return $isAllowedMimetype && $isSizeBelowLimit;
    }

    /**
     * Returns the given mime types without the ones listed (comma separated)
     * in $this->configuration['excludeMimeTypes'].
     *
     * The original array keys of $mimeTypes are preserved.
     *
     * @param array $mimeTypes
     * @return array
     */
    protected function mergeAllowedMimeTypes(array $mimeTypes): array
    {
        if (empty($this->configuration['excludeMimeTypes'])) {
            return $mimeTypes;
        }

        // Empty list entries (e.g. from a trailing comma) are dropped.
        $excludedMimeTypes = GeneralUtility::trimExplode(
            ',',
            $this->configuration['excludeMimeTypes'],
            true
        );

        return array_diff($mimeTypes, $excludedMimeTypes);
    }

    /**
     * Returns the Tika service selected by $this->configuration['extractor'].
     *
     * @return AppService|ServerService|SolrCellService
     * @throws ExtensionConfigurationExtensionNotConfiguredException
     * @throws ExtensionConfigurationPathDoesNotExistException
     */
    protected function getExtractor()
    {
        return ServiceFactory::getTika($this->configuration['extractor']);
    }

    /**
     * Extracts meta-data from a file using Apache Tika and returns it with
     * normalized keys.
     *
     * @param File $file
     * @param array $previousExtractedData Already extracted/existing data (not used by this extractor)
     * @return array
     * @throws ClientExceptionInterface
     * @throws ExtensionConfigurationExtensionNotConfiguredException
     * @throws ExtensionConfigurationPathDoesNotExistException
     */
    public function extractMetaData(File $file, array $previousExtractedData = []): array
    {
        $extractedMetaData = $this->getExtractedMetaDataFromTikaService($file);

        return $this->normalizeMetaData($extractedMetaData);
    }

    /**
     * Creates an instance of the service and returns the result from "extractMetaData".
     *
     * @param FileInterface $file
     * @return array
     * @throws ClientExceptionInterface
     * @throws ExtensionConfigurationExtensionNotConfiguredException
     * @throws ExtensionConfigurationPathDoesNotExistException
     */
    protected function getExtractedMetaDataFromTikaService(FileInterface $file): array
    {
        return $this->getExtractor()->extractMetaData($file);
    }

    /**
     * Normalizes the names / keys of the meta-data found.
     *
     * Array values are joined with ", ", empty values are skipped and keys
     * that are not known are ignored. Entries are processed in the order
     * given, so when several raw keys map to the same normalized field the
     * last one wins. Date values that cannot be parsed are stored as 0.
     *
     * @param array $metaData An array of raw meta-data from a file
     * @return array An array with cleaned meta-data keys
     */
    protected function normalizeMetaData(array $metaData): array
    {
        $metaDataCleaned = [];

        foreach ($metaData as $key => $value) {
            if (is_array($value)) {
                $value = implode(', ', $value);
            }

            if (empty($value)) {
                continue;
            }

            if (isset(self::DIRECT_FIELDS[$key])) {
                $metaDataCleaned[self::DIRECT_FIELDS[$key]] = $value;
                continue;
            }

            if (isset(self::LEADING_TOKEN_FIELDS[$key])) {
                // Only the part before the first space is kept.
                [$leadingToken] = explode(' ', (string)$value, 2);
                $metaDataCleaned[self::LEADING_TOKEN_FIELDS[$key]] = $leadingToken;
                continue;
            }

            if (isset(self::DATE_FIELDS[$key])) {
                $metaDataCleaned[self::DATE_FIELDS[$key]] = $this->parseTimestamp((string)$value);
                continue;
            }

            if ($key === 'Color space') {
                if ($value !== 'Undefined') {
                    $metaDataCleaned['color_space'] = $value;
                }
                continue;
            }

            if ($key === 'Date/Time Original') {
                $metaDataCleaned['content_creation_date'] = $this->exifDateToTimestamp((string)$value);
            }
        }

        return $metaDataCleaned;
    }

    /**
     * Converts a date string into timestamp
     * exif-tags: 2002:09:07 15:29:52
     *
     * @param string $date An exif date string
     * @return int Unix timestamp, 0 if the date string could not be parsed
     */
    protected function exifDateToTimestamp(string $date): int
    {
        return $this->parseTimestamp($date);
    }

    /**
     * Parses a date string with strtotime() and maps its failure value to 0.
     *
     * strtotime() signals failure with false (not -1), which must not leak
     * into an int return value or into the normalized meta-data.
     *
     * @param string $date A date string
     * @return int Unix timestamp, 0 if the date string could not be parsed
     */
    private function parseTimestamp(string $date): int
    {
        $timestamp = strtotime($date);

        return $timestamp === false ? 0 : $timestamp;
    }
}
239