TextExtractor::extractText()   A
last analyzed

Complexity

Conditions 1
Paths 1

Size

Total Lines 4
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 2

Importance

Changes 2
Bugs 0 Features 0
Metric Value
cc 1
eloc 2
c 2
b 0
f 0
nc 1
nop 1
dl 0
loc 4
ccs 0
cts 3
cp 0
crap 2
rs 10
1
<?php
2
3
declare(strict_types=1);
4
5
namespace ApacheSolrForTypo3\Tika\Service\Extractor;
6
7
/*
8
 * This file is part of the TYPO3 CMS project.
9
 *
10
 * It is free software; you can redistribute it and/or modify it under
11
 * the terms of the GNU General Public License, either version 2
12
 * of the License, or any later version.
13
 *
14
 * For the full copyright and license information, please read the
15
 * LICENSE.txt file that was distributed with this source code.
16
 *
17
 * The TYPO3 project - inspiring people to share!
18
 */
19
20
use ApacheSolrForTypo3\Tika\Service\File\SizeValidator;
21
use ApacheSolrForTypo3\Tika\Service\Tika\ServiceFactory;
22
use ApacheSolrForTypo3\Tika\Util;
23
use Psr\Http\Client\ClientExceptionInterface;
24
use TYPO3\CMS\Core\Configuration\Exception\ExtensionConfigurationExtensionNotConfiguredException;
25
use TYPO3\CMS\Core\Configuration\Exception\ExtensionConfigurationPathDoesNotExistException;
26
use TYPO3\CMS\Core\Resource\FileInterface;
27
use TYPO3\CMS\Core\Resource\TextExtraction\TextExtractorInterface;
28
use TYPO3\CMS\Core\Utility\GeneralUtility;
29
30
/**
31
 * A service to extract text from files using Apache Tika
32
 *
33
 * @author Ingo Renner <[email protected]>
34
 */
35
class TextExtractor implements TextExtractorInterface
{
    /**
     * Tika extension configuration as returned by
     * Util::getTikaExtensionConfiguration(); the 'extractor' entry is
     * handed to the ServiceFactory when text is extracted.
     *
     * @var array
     */
    protected array $configuration;

    /**
     * Supported file types (by extension)
     * @TODO query Tika for supported extensions
     *
     * @var string[]
     */
    protected array $supportedFileTypes = [
        'doc',
        'docx',
        'epub',
        'htm',
        'html',
        'msg',
        'odf',
        'odt',
        'pdf',
        'ppt',
        'pptx',
        'rtf',
        'sxw',
        'txt',
        'xls',
        'xlsx',
        'zip',
    ];

    /**
     * Validator consulted to decide whether a file is below the size limit.
     *
     * @var SizeValidator
     */
    private SizeValidator $fileSizeValidator;

    /**
     * Constructor
     *
     * Loads the Tika extension configuration and creates the file size
     * validator through GeneralUtility::makeInstance().
     */
    public function __construct()
    {
        $this->configuration = Util::getTikaExtensionConfiguration();
        $this->fileSizeValidator = GeneralUtility::makeInstance(SizeValidator::class);
    }

    /**
     * Checks if the given file can be processed by this Extractor
     *
     * A file qualifies when its extension is listed in $supportedFileTypes
     * and the size validator reports it as below the limit.
     *
     * @param FileInterface $file
     * @return bool TRUE if both the extension and the size check pass
     */
    public function canExtractText(FileInterface $file): bool
    {
        // Strict comparison: extensions are matched as exact strings, without type juggling.
        $isSupportedFileExtension = in_array($file->getExtension(), $this->supportedFileTypes, true);
        $isSizeBelowLimit = $this->fileSizeValidator->isBelowLimit($file);

        return $isSizeBelowLimit && $isSupportedFileExtension;
    }

    /**
     * Extracts text from a file using Apache Tika
     *
     * Obtains the Tika service for the configured 'extractor' from the
     * ServiceFactory and delegates the extraction to it.
     *
     * @param FileInterface $file
     * @return string Text extracted from the input file
     * @throws ClientExceptionInterface
     * @throws ExtensionConfigurationExtensionNotConfiguredException
     * @throws ExtensionConfigurationPathDoesNotExistException
     */
    public function extractText(FileInterface $file): string
    {
        $tika = ServiceFactory::getTika($this->configuration['extractor']);

        return $tika->extractText($file);
    }
}
111