Passed
Push — release-11.0.x ( 910681...ed160c )
by Rafael
11:00
created

SolrCellService::extractText()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 20
Code Lines 13

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 14
CRAP Score 1

Importance

Changes 2
Bugs 0 Features 0
Metric Value
cc 1
eloc 13
c 2
b 0
f 0
nc 1
nop 1
dl 0
loc 20
ccs 14
cts 14
cp 1
crap 1
rs 9.8333
1
<?php
2
3
declare(strict_types=1);
4
5
namespace ApacheSolrForTypo3\Tika\Service\Tika;
6
7
/*
8
 * This file is part of the TYPO3 CMS project.
9
 *
10
 * It is free software; you can redistribute it and/or modify it under
11
 * the terms of the GNU General Public License, either version 2
12
 * of the License, or any later version.
13
 *
14
 * For the full copyright and license information, please read the
15
 * LICENSE.txt file that was distributed with this source code.
16
 *
17
 * The TYPO3 project - inspiring people to share!
18
 */
19
20
use ApacheSolrForTypo3\Solr\ConnectionManager;
21
use ApacheSolrForTypo3\Solr\System\Solr\SolrConnection;
22
use Solarium\QueryType\Extract\Query;
23
use TYPO3\CMS\Core\Resource\FileInterface;
24
use TYPO3\CMS\Core\Utility\GeneralUtility;
25
26
/**
27
 * A Tika service implementation using a Solr server
28
 *
29
 * @author Ingo Renner <[email protected]>
30
 */
31
class SolrCellService extends AbstractService
32
{
33
34
    /**
35
     * Solr connection
36
     *
37
     * @var SolrConnection
38
     */
39
    protected SolrConnection $solrConnection;
40
41
    /**
     * Service initialization
     *
     * Creates the Solr connection used by all other methods of this service.
     * The endpoint is described by the "solrHost", "solrPort", "solrPath" and
     * "solrScheme" entries of the service configuration and is used for both
     * reading and writing.
     */
    protected function initializeService(): void
    {
        // The extension configuration (EM) might define a different
        // connection than the one already in use by the Index Queue, so a
        // connection for exactly these nodes is requested.
        /** @var ConnectionManager $connections */
        $connections = GeneralUtility::makeInstance(ConnectionManager::class);

        // Node option name => key of the service configuration holding its value.
        $optionMap = [
            'host' => 'solrHost',
            'port' => 'solrPort',
            'path' => 'solrPath',
            'scheme' => 'solrScheme',
        ];

        $node = [];
        foreach ($optionMap as $option => $configurationKey) {
            $node[$option] = $this->configuration[$configurationKey];
        }

        // Read and write node are identical for this service.
        $this->solrConnection = $connections->getSolrConnectionForNodes($node, $node);
    }
60
61
    /**
     * Looks up a single entry of the service configuration.
     *
     * An entry that is missing or set to null counts as "not available", in
     * which case the given fallback is handed back instead.
     *
     * @param string $key Name of the configuration entry
     * @param mixed $defaultValue Fallback used when the entry is not available
     * @return mixed The configured value or the fallback
     */
    protected function getConfigurationOrDefaultValue(string $key, $defaultValue)
    {
        if (isset($this->configuration[$key])) {
            return $this->configuration[$key];
        }

        return $defaultValue;
    }
72
73
    /**
     * Extracts the plain text content of a file through Solr.
     *
     * The file is sent to the Solr write service as an extract-only query
     * with the "extractFormat" parameter set to "text". The call and its
     * response are logged.
     *
     * @param FileInterface $file The file to extract text from
     * @return string The first element of the Solr response, or an empty
     *                string when the response does not contain one
     */
    public function extractText(FileInterface $file): string
    {
        $localPath = $file->getForLocalProcessing(false);

        /** @var Query $extractQuery */
        $extractQuery = GeneralUtility::makeInstance(Query::class);
        $extractQuery->setFile($localPath);
        $extractQuery->setExtractOnly(true);
        $extractQuery->addParam('extractFormat', 'text');

        $writeService = $this->solrConnection->getWriteService();
        $solrResponse = $writeService->extractByQuery($extractQuery);

        $logContext = [
            'file' => $file,
            'solr connection' => (array)$writeService,
            'query' => (array)$extractQuery,
            'response' => $solrResponse,
        ];
        $this->log('Text Extraction using Solr', $logContext);

        // The extracted text is expected at index 0 of the response.
        if (isset($solrResponse[0])) {
            return $solrResponse[0];
        }

        return '';
    }
100
101
    /**
     * Extracts the meta data of a file through Solr.
     *
     * The file is sent to the Solr write service as an extract-only query.
     * When the second element of the response is an array it is converted
     * with solrResponseToArray(); otherwise an empty array is returned. The
     * call, its response and the resulting meta data are logged.
     *
     * @param FileInterface $file The file to read meta data from
     * @return array The converted meta data, empty when none was returned
     */
    public function extractMetaData(FileInterface $file): array
    {
        $localPath = $file->getForLocalProcessing(false);

        /** @var Query $extractQuery */
        $extractQuery = GeneralUtility::makeInstance(Query::class);
        $extractQuery->setFile($localPath);
        $extractQuery->setExtractOnly(true);
        $extractQuery->addParam('extractFormat', 'text');

        $writeService = $this->solrConnection->getWriteService();
        $solrResponse = $writeService->extractByQuery($extractQuery);

        // The meta data is expected at index 1 of the response.
        $metaData = is_array($solrResponse[1] ?? null)
            ? $this->solrResponseToArray($solrResponse[1])
            : [];

        $logContext = [
            'file' => $file,
            'solr connection' => (array)$writeService,
            'query' => (array)$extractQuery,
            'response' => $solrResponse,
            'meta data' => $metaData,
        ];
        $this->log('Meta Data Extraction using Solr', $logContext);

        return $metaData;
    }
134
135
    /**
     * Language detection for a file is not available with this service.
     *
     * The method exists to satisfy the service contract and always throws.
     *
     * @param FileInterface $file The file whose language should be detected (unused)
     * @return string Language ISO code (never returned)
     * @throws UnsupportedOperationException Always, with code 1423457153
     */
    public function detectLanguageFromFile(FileInterface $file): string
    {
        // TODO check whether Solr supports language detection now
        $message = 'The Tika Solr service does not support language detection';

        throw new UnsupportedOperationException($message, 1423457153);
    }
149
150
    /**
     * Language detection for a string is not available with this service.
     *
     * The method exists to satisfy the service contract and always throws.
     *
     * @param string $input The text whose language should be detected (unused)
     * @return string Language ISO code (never returned)
     * @throws UnsupportedOperationException Always, with code 1423457153
     */
    public function detectLanguageFromString(string $input): string
    {
        // TODO check whether Solr supports language detection now
        $message = 'The Tika Solr service does not support language detection';

        throw new UnsupportedOperationException($message, 1423457153);
    }
164
165
    /**
     * Turns the nested Solr response into the same format as produced by a
     * local Tika jar call
     *
     * The response is read as a flat list of alternating entries: a field
     * name on every even index, followed by that field's value(s) on the next
     * (odd) index. For each field only the first value is kept.
     *
     * - A field name without a following entry gets an empty string.
     * - An empty value list results in an empty string as well.
     * - A value that is not wrapped in an array is taken as is.
     *
     * @param array $metaDataResponse The part of the Solr response containing the meta-data
     * @return array The cleaned meta-data (field name => first value), matching the Tika jar call format
     */
    protected function solrResponseToArray(array $metaDataResponse = []): array
    {
        $cleanedData = [];

        foreach ($metaDataResponse as $index => $fieldName) {
            // Odd positions hold the values of the preceding field name and
            // are consumed together with it below.
            if ($index % 2 !== 0) {
                continue;
            }

            $fieldValue = $metaDataResponse[$index + 1] ?? [''];

            // Only index a real list; indexing a string would return just
            // its first character, and an empty list has no index 0.
            $cleanedData[$fieldName] = is_array($fieldValue)
                ? ($fieldValue[0] ?? '')
                : $fieldValue;
        }

        return $cleanedData;
    }
188
189
    /**
     * Gets the version string reported for this service.
     *
     * Note that this is the version of the Solr server, not of the Tika
     * library bundled with it.
     *
     * @return string Apache Solr server version string
     */
    public function getTikaVersion(): string
    {
        // TODO add patch for endpoint on Apache Solr to return Tika version
        // Until then the Solr version string is returned, f.e. "Apache Solr X.Y.Z"
        $adminService = $this->solrConnection->getAdminService();

        return $adminService->getSolrServerVersion();
    }
200
201
    /**
     * Since solr cell does not allow to query the supported mimetypes, we return a list of known supported mimetypes here.
     *
     * The list is maintained as a map of mime type => typical file
     * extensions. The extensions are informational only; just the mime types
     * (the keys of the map) are returned.
     *
     * @return array List of supported mime type strings
     */
    public function getSupportedMimeTypes(): array
    {
        $mapping = [
            'application/epub+zip' => ['epub'],
            'application/gzip' => ['gz', 'tgz'],
            'application/msword' => ['doc'],
            'application/pdf' => ['pdf'],
            'application/rtf' => ['rtf'],
            'application/vnd.ms-excel' => ['xls'],
            'application/vnd.ms-outlook' => ['msg'],
            'application/vnd.oasis.opendocument.formula' => ['odf'],
            'application/vnd.oasis.opendocument.text' => ['odt'],
            'application/vnd.openxmlformats-officedocument.presentationml.presentation' => ['pptx'],
            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' => ['xlsx'],
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document' => ['docx'],
            'application/vnd.sun.xml.writer' => ['sxw'],
            'application/zip' => ['zip'],
            'application/x-midi' => ['mid'],
            'application/xml' => ['xml'],
            'audio/aiff' => ['aif', 'aiff'],
            'audio/basic' => ['au'],
            'audio/midi' => ['mid'],
            'audio/mpeg3' => ['mp3'],
            'audio/mpeg' => ['mp3'],
            'audio/wav' => ['wav'],
            'audio/x-mpeg-3' => ['mp3'],
            'audio/x-wav' => ['wav'],
            'image/bmp' => ['bmp'],
            'image/gif' => ['gif'],
            'image/jpeg' => ['jpg', 'jpeg'],
            'image/png' => ['png'],
            'image/svg+xml' => ['svg'],
            'image/tiff' => ['tif', 'tiff'],
            'text/html' => ['html', 'htm'],
            'text/plain' => ['txt'],
            'text/xml' => ['xml'],
            'video/mpeg' => ['mp3'],
            'video/x-mpeg' => ['mp3'],
        ];

        return array_keys($mapping);
    }
248
249
    /**
     * Tells whether this service can currently be used.
     *
     * Availability is the result of pinging the write service of the Solr
     * connection.
     *
     * @return bool The result of the ping
     */
    public function isAvailable(): bool
    {
        $writeService = $this->solrConnection->getWriteService();

        return $writeService->ping();
    }
258
}
259