FileTextExtractor::get_extractor()   A
last analyzed

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 1
nc 1
nop 1
dl 0
loc 3
rs 10
c 0
b 0
f 0
1
<?php
2
3
namespace SilverStripe\TextExtraction\Extractor;
4
5
use SilverStripe\Assets\File;
6
use SilverStripe\Core\ClassInfo;
7
use SilverStripe\Core\Config\Config;
8
use SilverStripe\Core\Config\Configurable;
9
use SilverStripe\Core\Injector\Injectable;
10
use SilverStripe\Core\Injector\Injector;
11
use SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception;
12
13
/**
 * Abstract base class for extractors that turn a file's contents into plain text.
 *
 * Concrete subclasses implement isAvailable(), supportsExtension(), supportsMime() and getContent().
 * for_file() walks the subclasses in descending priority order and returns the first available one
 * that claims the given file by extension or by mime type.
 *
 * @author mstephens
 */
abstract class FileTextExtractor
{
    use Configurable;
    use Injectable;

    /**
     * Set priority from 0-100.
     * The highest priority extractor for a given content type will be selected.
     *
     * This value is read per subclass through the config API in get_extractor_classes().
     *
     * @config
     * @var int
     */
    private static $priority = 50;

    /**
     * Cache of extractor class names, sorted by priority (highest first).
     * Stays null until get_extractor_classes() has built the list.
     *
     * @var array|null
     */
    protected static $sorted_extractor_classes = null;

    /**
     * Gets the list of prioritised extractor classes
     *
     * The list is built on first use and cached in self::$sorted_extractor_classes.
     *
     * @return array Extractor class names, highest configured priority first
     */
    protected static function get_extractor_classes()
    {
        // Serve the cached list when there is one. An empty list is not treated as cached,
        // so it is rebuilt on the next call.
        if (!empty(self::$sorted_extractor_classes)) {
            return self::$sorted_extractor_classes;
        }

        // Generate the sorted list of extractors on demand.
        $classes = ClassInfo::subclassesFor(__CLASS__);
        // Drop the first entry - presumably this abstract base class itself; verify against ClassInfo
        array_shift($classes);

        $classPriorities = [];
        foreach ($classes as $class) {
            $classPriorities[$class] = Config::inst()->get($class, 'priority');
        }

        // Sort by priority, highest first, keeping the class names as keys
        arsort($classPriorities);

        // Save classes
        self::$sorted_extractor_classes = array_keys($classPriorities);

        return self::$sorted_extractor_classes;
    }

    /**
     * Get the text file extractor for the given class
     *
     * The instance is obtained from the Injector rather than constructed directly.
     *
     * @param string $class Extractor class name
     * @return FileTextExtractor
     */
    protected static function get_extractor($class)
    {
        return Injector::inst()->get($class);
    }

    /**
     * Given a File object, decide which extractor instance to use to handle it
     *
     * A string argument is treated as a local file path and loaded into a new File object.
     * Extractors are tried in priority order; for each available extractor the file extension
     * is checked first, then the mime type.
     *
     * @param File|string $file Either the File instance, or a path to a local file
     * @return FileTextExtractor|null The first matching extractor, or null if the input is empty,
     *                                the path does not exist, or no available extractor matches
     */
    public static function for_file($file)
    {
        if (!$file || (is_string($file) && !file_exists($file))) {
            return null;
        }

        // Ensure we have a File instance to work with
        if (is_string($file)) {
            /** @var File $fileObject */
            $fileObject = File::create();
            $fileObject->setFromLocalFile($file);
            $file = $fileObject;
        }

        $extension = $file->getExtension();
        $mime = $file->getMimeType();

        foreach (self::get_extractor_classes() as $className) {
            $extractor = self::get_extractor($className);

            // Skip unavailable extractors
            if (!$extractor->isAvailable()) {
                continue;
            }

            // Check extension
            if ($extension && $extractor->supportsExtension($extension)) {
                return $extractor;
            }

            // Check mime
            if ($mime && $extractor->supportsMime($mime)) {
                return $extractor;
            }
        }

        // No available extractor claimed this file
        return null;
    }

    /**
     * Some text extractors (like pdftotext) may require a physical file to read from, so write the current
     * file contents to a temp file and return its path
     *
     * The temp file is created under TEMP_PATH and carries the same extension as $file, if it has one.
     * This method does not delete the returned file.
     *
     * @param File $file
     * @return string Path of the temp file holding the contents of $file
     * @throws Exception If no temp file name could be allocated, or the contents could not be written
     */
    protected static function getPathFromFile(File $file)
    {
        $path = tempnam(TEMP_PATH, 'pdftextextractor_');
        if (false === $path) {
            throw new Exception(static::class . '->getPathFromFile() could not allocate temporary file name');
        }

        // tempnam() has already created an empty file at $path. Once an extension is appended that
        // file is neither written to nor returned, so remember it and clean it up further down.
        $placeholder = null;

        // Append extension to temp file if one is set
        $extension = $file->getExtension();
        if ($extension) {
            $placeholder = $path;
            $path .= '.' . $extension;
        }

        // Remove any existing temp files with this name
        if (file_exists($path)) {
            unlink($path);
        }

        $bytesWritten = file_put_contents($path, $file->getStream());

        // The placeholder kept the base name reserved during the write; remove it now so that
        // every call does not leave an empty file behind, whether or not the write succeeded.
        if (null !== $placeholder && file_exists($placeholder)) {
            unlink($placeholder);
        }

        if (false === $bytesWritten) {
            throw new Exception(static::class . '->getPathFromFile() failed to write temporary file');
        }

        return $path;
    }

    /**
     * Checks if the extractor is supported on the current environment,
     * for example if the correct binaries or libraries are available.
     *
     * for_file() skips any extractor for which this returns false.
     *
     * @return bool
     */
    abstract public function isAvailable();

    /**
     * Determine if this extractor supports the given extension.
     * If support is determined by mime/type only, then this should return false.
     *
     * @param string $extension
     * @return bool
     */
    abstract public function supportsExtension($extension);

    /**
     * Determine if this extractor supports the given mime type.
     * for_file() only consults this when the extension check did not select the extractor,
     * i.e. the file has no extension or supportsExtension() returned false.
     *
     * @param string $mime
     * @return bool
     */
    abstract public function supportsMime($mime);

    /**
     * Given a File instance, extract the contents as text.
     *
     * @param File|string $file Either the File instance, or a file path for a file to load
     * @return string
     */
    abstract public function getContent($file);
}
186