Completed
Pull Request — master (#45)
by Robbie
24:33 queued 17:24
created

FileTextExtractor::getPathFromFile()   A

Complexity

Conditions 5
Paths 9

Size

Total Lines 23

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 5
nc 9
nop 1
dl 0
loc 23
rs 9.2408
c 0
b 0
f 0
1
<?php
2
3
namespace SilverStripe\TextExtraction\Extractor;
4
5
use SilverStripe\Assets\File;
6
use SilverStripe\Core\ClassInfo;
7
use SilverStripe\Core\Config\Config;
8
use SilverStripe\Core\Config\Configurable;
9
use SilverStripe\Core\Injector\Injectable;
10
use SilverStripe\Core\Injector\Injector;
11
use SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception;
12
13
/**
 * Abstract base class for text extractors. Concrete subclasses extract the full-text content of a
 * File (or a File subclass) and declare which extensions and mime types they can handle.
 *
 * @author mstephens
 */
abstract class FileTextExtractor
{
    use Configurable;
    use Injectable;

    /**
     * Set priority from 0-100.
     * The highest priority extractor for a given content type will be selected.
     *
     * This property is not referenced directly; it is read per subclass through the config API
     * in {@see FileTextExtractor::get_extractor_classes()}.
     *
     * @config
     * @var integer
     */
    private static $priority = 50;

    /**
     * Cache of extractor class names, sorted by priority. Null until the list is first built.
     *
     * @var array|null
     */
    protected static $sorted_extractor_classes = null;

    /**
     * Gets the list of prioritised extractor classes, highest priority first.
     *
     * The list is built on first use and cached. An empty list is never served from the cache,
     * so it is rebuilt on the next call.
     *
     * @return array
     */
    protected static function get_extractor_classes()
    {
        // Check cache (explicit emptiness check rather than an implicit array-to-bool conversion)
        if (!empty(self::$sorted_extractor_classes)) {
            return self::$sorted_extractor_classes;
        }

        // Generate the sorted list of extractors on demand.
        $classes = ClassInfo::subclassesFor(__CLASS__);
        // Drop the first entry - presumably this abstract base class itself (TODO confirm against ClassInfo)
        array_shift($classes);

        $classPriorities = [];
        foreach ($classes as $class) {
            $classPriorities[$class] = Config::inst()->get($class, 'priority');
        }

        // Highest priority first, keeping the class names as keys
        arsort($classPriorities);

        // Save classes
        self::$sorted_extractor_classes = array_keys($classPriorities);

        return self::$sorted_extractor_classes;
    }

    /**
     * Get the text file extractor for the given class
     *
     * @param string $class
     * @return FileTextExtractor
     */
    protected static function get_extractor($class)
    {
        return Injector::inst()->get($class);
    }

    /**
     * Attempt to detect mime type for given file
     *
     * @param string $path
     * @return string Mime type if found
     */
    protected static function get_mime($path)
    {
        $file = new \Symfony\Component\HttpFoundation\File\File($path);

        return $file->getMimeType();
    }

    /**
     * Given a File object, decide which extractor instance to use to handle it.
     *
     * Extractors are tried in priority order. The first available extractor that supports either
     * the file's extension or its mime type is returned.
     *
     * @param File $file
     * @return FileTextExtractor|null The matching extractor, or null if none can handle the file
     */
    public static function for_file(File $file)
    {
        // No null guard is needed here: the File type hint already rejects anything but a File instance.
        $extension = $file->getExtension();
        $mime = $file->getMimeType();

        foreach (self::get_extractor_classes() as $className) {
            $extractor = self::get_extractor($className);

            // Skip unavailable extractors
            if (!$extractor->isAvailable()) {
                continue;
            }

            // Check extension
            if ($extension && $extractor->supportsExtension($extension)) {
                return $extractor;
            }

            // Check mime
            if ($mime && $extractor->supportsMime($mime)) {
                return $extractor;
            }
        }

        // No available extractor supports this file
        return null;
    }

    /**
     * Some text extractors (like pdftotext) may require a physical file to read from, so write the current
     * file contents to a temp file and return its path.
     *
     * The caller is responsible for removing the returned file once it is no longer needed.
     *
     * @param File $file
     * @return string Path of the temporary file holding the file's contents
     * @throws Exception If the temporary file cannot be allocated or written
     */
    protected function getPathFromFile(File $file)
    {
        $placeholder = tempnam(TEMP_PATH, 'pdftextextractor_');
        if (false === $placeholder) {
            throw new Exception(static::class . '->getPathFromFile() could not allocate temporary file name');
        }

        $path = $placeholder;

        // Append extension to temp file if one is set
        $extension = $file->getExtension();
        if ($extension) {
            $path .= '.' . $extension;

            // tempnam() has already created an empty file at the un-suffixed name. The contents are
            // written to the suffixed path instead, so remove the placeholder rather than orphaning it.
            if (file_exists($placeholder)) {
                unlink($placeholder);
            }
        }

        // Remove any existing temp files with this name
        if (file_exists($path)) {
            unlink($path);
        }

        $bytesWritten = file_put_contents($path, $file->getStream());
        if (false === $bytesWritten) {
            // Do not leave a partially written temp file behind on failure
            if (file_exists($path)) {
                unlink($path);
            }

            throw new Exception(static::class . '->getPathFromFile() failed to write temporary file');
        }

        return $path;
    }

    /**
     * Checks if the extractor is supported on the current environment,
     * for example if the correct binaries or libraries are available.
     *
     * @return boolean
     */
    abstract public function isAvailable();

    /**
     * Determine if this extractor supports the given extension.
     * If support is determined by mime/type only, then this should return false.
     *
     * @param string $extension
     * @return boolean
     */
    abstract public function supportsExtension($extension);

    /**
     * Determine if this extractor supports the given mime type.
     * Only consulted by for_file() when the extension check did not match
     * (including when the file has no extension at all).
     *
     * @param string $mime
     * @return boolean
     */
    abstract public function supportsMime($mime);

    /**
     * Given a File instance, extract the contents as text.
     *
     * @param File|string $file Either the File instance, or a file path for a file to load
     * @return string
     */
    abstract public function getContent($file);
}
191