SolrCellService::getTikaVersion()   A
last analyzed

Complexity

Conditions 1
Paths 1

Size

Total Lines 5
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 2

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 1
eloc 1
nc 1
nop 0
dl 0
loc 5
ccs 0
cts 2
cp 0
crap 2
rs 10
c 1
b 0
f 0
1
<?php
2
3
declare(strict_types=1);
4
5
namespace ApacheSolrForTypo3\Tika\Service\Tika;
6
7
/*
8
 * This file is part of the TYPO3 CMS project.
9
 *
10
 * It is free software; you can redistribute it and/or modify it under
11
 * the terms of the GNU General Public License, either version 2
12
 * of the License, or any later version.
13
 *
14
 * For the full copyright and license information, please read the
15
 * LICENSE.txt file that was distributed with this source code.
16
 *
17
 * The TYPO3 project - inspiring people to share!
18
 */
19
20
use ApacheSolrForTypo3\Solr\ConnectionManager;
21
use ApacheSolrForTypo3\Solr\System\Solr\SolrConnection;
22
use Solarium\QueryType\Extract\Query;
23
use TYPO3\CMS\Core\Resource\FileInterface;
24
use TYPO3\CMS\Core\Utility\GeneralUtility;
25
26
/**
27
 * A Tika service implementation using a Solr server
28
 *
29
 * @author Ingo Renner <[email protected]>
30
 */
31
class SolrCellService extends AbstractService
{
    /**
     * Solr connection used for all extraction, version and ping requests.
     *
     * Assigned in initializeService().
     *
     * @var SolrConnection
     */
    protected SolrConnection $solrConnection;

    /**
     * Service initialization
     *
     * Builds the Solr connection from the configuration keys "solrHost",
     * "solrPort", "solrPath" and "solrScheme". The same node definition is
     * used for reading and writing.
     */
    protected function initializeService(): void
    {
        // EM might define a different connection than already in use by
        // Index Queue
        /** @var ConnectionManager $connectionManager */
        $connectionManager = GeneralUtility::makeInstance(ConnectionManager::class);

        $readNode = [
            'host' => $this->configuration['solrHost'],
            'port' => (int)$this->configuration['solrPort'],
            'path' => $this->configuration['solrPath'],
            'scheme' => $this->configuration['solrScheme'],
        ];
        $writeNode = $readNode;
        $this->solrConnection = $connectionManager->getSolrConnectionForNodes($readNode, $writeNode);
    }

    /**
     * Retrieves a configuration value or a default value when not available.
     *
     * The default is also returned when the key exists but holds null.
     *
     * @param string $key
     * @param mixed $defaultValue
     * @return mixed
     */
    protected function getConfigurationOrDefaultValue(string $key, $defaultValue)
    {
        return $this->configuration[$key] ?? $defaultValue;
    }

    /**
     * Builds the "extract only" query shared by text and meta data extraction.
     *
     * @param FileInterface $file The file to send to Solr
     * @return Query Query configured with the local file, extractOnly and extractFormat=text
     */
    private function createExtractQuery(FileInterface $file): Query
    {
        $localTempFilePath = $file->getForLocalProcessing(false);

        /** @var Query $query */
        $query = GeneralUtility::makeInstance(Query::class);
        $query->setFile($localTempFilePath);
        $query->setExtractOnly(true);
        $query->addParam('extractFormat', 'text');

        return $query;
    }

    /**
     * Takes a file reference and extracts the text from it.
     *
     * @param FileInterface $file
     * @return string The first element of the Solr response, or an empty string when it is missing
     */
    public function extractText(FileInterface $file): string
    {
        $query = $this->createExtractQuery($file);

        $writer = $this->solrConnection->getWriteService();
        $response = $writer->extractByQuery($query);

        $this->log('Text Extraction using Solr', [
            'file' => $file,
            'solr connection' => (array)$writer,
            'query' => (array)$query,
            'response' => $response,
        ]);

        return $response[0] ?? '';
    }

    /**
     * Takes a file reference and extracts its meta-data.
     *
     * @param FileInterface $file
     * @return array Field name => value pairs; empty when the response carries no meta data array
     */
    public function extractMetaData(FileInterface $file): array
    {
        $query = $this->createExtractQuery($file);

        $writer = $this->solrConnection->getWriteService();
        $response = $writer->extractByQuery($query);

        // The second element of the response is expected to hold the meta data
        $metaData = [];
        if (isset($response[1]) && is_array($response[1])) {
            $metaData = $this->solrResponseToArray($response[1]);
        }

        $this->log('Meta Data Extraction using Solr', [
            'file' => $file,
            'solr connection' => (array)$writer,
            'query' => (array)$query,
            'response' => $response,
            'meta data' => $metaData,
        ]);

        return $metaData;
    }

    /**
     * Takes a file reference and detects its content's language.
     *
     * Not supported by this service, always throws.
     *
     * @param FileInterface $file
     * @return string Language ISO code
     * @throws UnsupportedOperationException
     */
    public function detectLanguageFromFile(FileInterface $file): string
    {
        // TODO check whether Solr supports language detection now
        throw new UnsupportedOperationException(
            'The Tika Solr service does not support language detection',
            1423457153
        );
    }

    /**
     * Takes a string as input and detects its language.
     *
     * Not supported by this service, always throws.
     *
     * @param string $input
     * @return string Language ISO code
     * @throws UnsupportedOperationException
     */
    public function detectLanguageFromString(string $input): string
    {
        // TODO check whether Solr supports language detection now
        throw new UnsupportedOperationException(
            'The Tika Solr service does not support language detection',
            1423457153
        );
    }

    /**
     * Turns the nested Solr response into the same format as produced by a
     * local Tika jar call
     *
     * The response is read as a flat sequence: entries at even indexes are
     * field names, the entry following each name holds its value(s).
     *
     * @param array $metaDataResponse The part of the Solr response containing the meta-data
     * @return array The cleaned meta-data, matching the Tika jar call format
     */
    protected function solrResponseToArray(array $metaDataResponse = []): array
    {
        $cleanedData = [];

        foreach ($metaDataResponse as $index => $fieldName) {
            // Odd positions hold values; they are consumed together with
            // the field name that precedes them.
            if ($index % 2 !== 0) {
                continue;
            }

            $fieldValue = $metaDataResponse[$index + 1] ?? [''];

            // Value lists contribute their first entry. An empty list yields
            // an empty string instead of an "undefined array key" warning.
            // A plain (non-list) value is kept as is rather than being cut
            // down to its first character by a string offset access.
            $cleanedData[$fieldName] = is_array($fieldValue)
                ? ($fieldValue[0] ?? '')
                : $fieldValue;
        }

        return $cleanedData;
    }

    /**
     * Gets the Tika version
     *
     * @return string Apache Solr server version string
     */
    public function getTikaVersion(): string
    {
        // TODO add patch for endpoint on Apache Solr to return Tika version
        // for now returns the Solr version string f.e. "Apache Solr X.Y.Z"
        return $this->solrConnection->getAdminService()->getSolrServerVersion();
    }

    /**
     * Since solr cell does not allow to query the supported mimetypes, we return a list of known supported mimetypes here.
     *
     * @return array List of mime type strings
     */
    public function getSupportedMimeTypes(): array
    {
        // Only the keys (mime types) are returned; the file extensions are
        // kept for reference.
        $mapping = [
            'application/epub+zip' => ['epub'],
            'application/gzip' => ['gz', 'tgz'],
            'application/msword' => ['doc'],
            'application/pdf' => ['pdf'],
            'application/rtf' => ['rtf'],
            'application/vnd.ms-excel' => ['xls'],
            'application/vnd.ms-outlook' => ['msg'],
            'application/vnd.oasis.opendocument.formula' => ['odf'],
            'application/vnd.oasis.opendocument.text' => ['odt'],
            'application/vnd.openxmlformats-officedocument.presentationml.presentation' => ['pptx'],
            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' => ['xlsx'],
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document' => ['docx'],
            'application/vnd.sun.xml.writer' => ['sxw'],
            'application/zip' => ['zip'],
            'application/x-midi' => ['mid'],
            'application/xml' => ['xml'],
            'audio/aiff' => ['aif', 'aiff'],
            'audio/basic' => ['au'],
            'audio/midi' => ['mid'],
            'audio/mpeg3' => ['mp3'],
            'audio/mpeg' => ['mp3'],
            'audio/wav' => ['wav'],
            'audio/x-mpeg-3' => ['mp3'],
            'audio/x-wav' => ['wav'],
            'image/bmp' => ['bmp'],
            'image/gif' => ['gif'],
            'image/jpeg' => ['jpg', 'jpeg'],
            'image/png' => ['png'],
            'image/svg+xml' => ['svg'],
            'image/tiff' => ['tif', 'tiff'],
            'text/html' => ['html', 'htm'],
            'text/plain' => ['txt'],
            'text/xml' => ['xml'],
            'video/mpeg' => ['mp3'],
            'video/x-mpeg' => ['mp3'],
        ];

        return array_keys($mapping);
    }

    /**
     * The service is available when the solr server is reachable.
     *
     * @return bool Result of pinging the Solr write service
     */
    public function isAvailable(): bool
    {
        return $this->solrConnection->getWriteService()->ping();
    }
}
258