LanguageDetector::canProcess()   A
last analyzed

Complexity

Conditions 2
Paths 2

Size

Total Lines 6
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 6

Importance

Changes 0
Metric Value
eloc 3
dl 0
loc 6
ccs 0
cts 4
cp 0
rs 10
c 0
b 0
f 0
cc 2
nc 2
nop 1
crap 6
1
<?php
2
3
declare(strict_types=1);
4
5
namespace ApacheSolrForTypo3\Tika\Service\Extractor;
6
7
/*
8
 * This file is part of the TYPO3 CMS project.
9
 *
10
 * It is free software; you can redistribute it and/or modify it under
11
 * the terms of the GNU General Public License, either version 2
12
 * of the License, or any later version.
13
 *
14
 * For the full copyright and license information, please read the
15
 * LICENSE.txt file that was distributed with this source code.
16
 *
17
 * The TYPO3 project - inspiring people to share!
18
 */
19
20
use ApacheSolrForTypo3\Tika\Service\Tika\ServiceFactory;
21
use Psr\Http\Client\ClientExceptionInterface;
22
use TYPO3\CMS\Core\Configuration\Exception\ExtensionConfigurationExtensionNotConfiguredException;
23
use TYPO3\CMS\Core\Configuration\Exception\ExtensionConfigurationPathDoesNotExistException;
24
use TYPO3\CMS\Core\Resource\File;
25
26
/**
27
 * A service to detect a text's language using Apache Tika
28
 *
29
 * @author Ingo Renner <[email protected]>
30
 */
31
class LanguageDetector extends AbstractExtractor
{
    /**
     * File extensions this extractor accepts.
     *
     * Entries are compared strictly against the file's "extension"
     * property, so they must match it exactly (lowercase, no leading dot).
     *
     * @var list<string>
     */
    protected array $supportedFileTypes = [
        'doc',
        'docx',
        'epub',
        'htm',
        'html',
        'msg',
        'odf',
        'odt',
        'pdf',
        'ppt',
        'pptx',
        'rtf',
        'sxw',
        'txt',
        'xls',
        'xlsx',
    ];

    /**
     * Priority of this extractor.
     *
     * NOTE(review): presumably read by the parent class / extractor
     * registry to order extractors - confirm against AbstractExtractor.
     */
    protected int $priority = 98;

    /**
     * Checks if the given file can be processed by this extractor.
     *
     * A file qualifies when its "extension" property is one of the
     * supported file types and the file size validator reports the file
     * as being below the limit.
     *
     * @param File $file The file to check
     * @return bool TRUE if both conditions hold, FALSE otherwise
     */
    public function canProcess(File $file): bool
    {
        // Strict comparison: only a string identical to a listed extension
        // counts, no loose type juggling on the property value.
        $isSupportedFileType = in_array(
            $file->getProperty('extension'),
            $this->supportedFileTypes,
            true
        );
        $isSizeBelowLimit = $this->fileSizeValidator->isBelowLimit($file);

        return $isSupportedFileType && $isSizeBelowLimit;
    }

    /**
     * Detects the language of a file using Apache Tika and returns it as
     * meta data.
     *
     * @param File $file The file to analyze
     * @param array $previousExtractedData Already extracted/existing data; not used by this extractor
     * @return array Array with a single "language" key holding the value
     *               returned by the Tika service's detectLanguageFromFile()
     *
     * @throws ClientExceptionInterface
     * @throws ExtensionConfigurationExtensionNotConfiguredException
     * @throws ExtensionConfigurationPathDoesNotExistException
     */
    public function extractMetaData(
        File $file,
        array $previousExtractedData = []
    ): array {
        // The Tika service variant is selected by the "extractor" configuration entry.
        $tika = ServiceFactory::getTika($this->configuration['extractor']);

        return [
            'language' => $tika->detectLanguageFromFile($file),
        ];
    }
}
94