Completed
Pull Request — master (#45)
by Robbie
24:33 queued 17:24
created

TikaTextExtractor::runShell()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 27

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 2
nc 2
nop 4
dl 0
loc 27
rs 9.488
c 0
b 0
f 0
1
<?php
2
3
namespace SilverStripe\TextExtraction\Extractor;
4
5
use SilverStripe\Assets\File;
6
7
/**
8
 * Enables text extraction of file content via the Tika CLI
9
 *
10
 * {@link http://tika.apache.org/1.7/gettingstarted.html}
11
 */
12
class TikaTextExtractor extends FileTextExtractor
13
{
14
    /**
     * Text extraction mode passed to the tika command line. Defaults to -t (plain text).
     *
     * The value is looked up by name via config()->get('output_mode') in
     * getContent(), so the property is in use even though nothing references
     * it directly.
     *
     * @var string
     * @config
     */
20
    private static $output_mode = '-t';
0 ignored issues
show
introduced by
The private property $output_mode is not used, and could be removed.
Loading history...
21
22
    /**
     * Ask the tika CLI which version is installed.
     *
     * @return string|int Dotted version number parsed from the output of
     *                    `tika --version`, or 0 if the command failed or its
     *                    output was not recognised
     */
    public function getVersion()
    {
        $exitCode = $this->runShell('tika --version', $output);

        // A non-zero exit code means tika could not be run
        if ($exitCode) {
            return 0;
        }

        // Pull the version number out of the "Apache Tika x.y" banner
        $found = preg_match('/Apache Tika (?<version>[\.\d]+)/', $output, $parts);

        return $found ? $parts['version'] : 0;
    }
38
39
    /**
     * Runs a shell command and captures its output.
     *
     * The command string is passed to the shell as given; callers are
     * responsible for escaping every argument (e.g. with escapeshellarg()).
     *
     * Standard error is collected in a temporary file rather than a pipe.
     * Reading stdout to completion while the child is blocked writing to a
     * full stderr pipe would otherwise hang both processes.
     *
     * @param  string $command Full command including arguments
     * @param  string &$stdout Standard output
     * @param  string &$stderr Standard error
     * @param  string $input   Content to pass via standard input
     * @return int Exit code. 0 is success, 255 if the process could not be started
     */
    protected function runShell($command, &$stdout = '', &$stderr = '', $input = '')
    {
        $stdout = '';
        $stderr = '';

        // Fall back to a plain pipe for stderr if no temporary file is available
        $stderrFile = tmpfile();
        $descriptorSpecs = [
            0 => ['pipe', 'r'],
            1 => ['pipe', 'w'],
            2 => $stderrFile !== false ? $stderrFile : ['pipe', 'w'],
        ];

        // Invoke command
        $pipes = [];
        $proc = proc_open($command, $descriptorSpecs, $pipes);

        if (!is_resource($proc)) {
            if ($stderrFile !== false) {
                fclose($stderrFile);
            }

            return 255;
        }

        // Send content as input, then close stdin so the child sees EOF
        fwrite($pipes[0], (string) $input);
        fclose($pipes[0]);

        // Get standard output
        $stdout = (string) stream_get_contents($pipes[1]);
        fclose($pipes[1]);

        if ($stderrFile === false) {
            // No temporary file: read stderr from its pipe as before
            $stderr = (string) stream_get_contents($pipes[2]);
            fclose($pipes[2]);

            return proc_close($proc);
        }

        // Wait for the process to exit so everything written to stderr is in the file
        $exitCode = proc_close($proc);

        rewind($stderrFile);
        $stderr = (string) stream_get_contents($stderrFile);
        fclose($stderrFile);

        return $exitCode;
    }
76
77
    /**
     * Extract the text content of a file by running it through the tika CLI.
     *
     * @param  File|string $file A File record, or a value used directly as the path
     * @return string|null Standard output of the tika command, or null if it
     *                     exited with a non-zero code
     */
    public function getContent($file)
    {
        $mode = $this->config()->get('output_mode');

        // File records are resolved to a path; anything else is used as the path itself
        $path = $file;
        if ($file instanceof File) {
            $path = $this->getPathFromFile($file);
        }

        $exitCode = $this->runShell(
            sprintf('tika %s %s', $mode, escapeshellarg($path)),
            $text
        );

        if ($exitCode != 0) {
            return null;
        }

        return $text;
    }
88
89
    /**
     * Whether the tika CLI can be used.
     *
     * @return bool True when getVersion() reports a version greater than 0
     */
    public function isAvailable()
    {
        $version = $this->getVersion();

        return $version > 0;
    }
96
97
    /**
     * File extensions are never used to decide support.
     *
     * @param  string $extension Ignored
     * @return bool Always false
     */
    public function supportsExtension($extension)
    {
        // Support is decided by mime type only
        return false;
    }
105
106
107
    /**
     * Check whether the given mime type appears in the output of
     * `tika --list-supported-types`.
     *
     * The type must appear as a complete token. A plain word-boundary match
     * would accept "image/svg" because of "image/svg+xml", since "+", "-" and
     * "." all count as word boundaries.
     *
     * @param  string $mime
     * @return bool False if the mime type is empty, not listed, or the command fails
     */
    public function supportsMime($mime)
    {
        // An empty needle would match anywhere in the listing, so reject it up front
        $mime = (string) $mime;
        if ($mime === '') {
            return false;
        }

        // Get list of supported mime types
        $code = $this->runShell('tika --list-supported-types', $supportedTypes);

        if ($code) {
            // Error case
            return false;
        }

        // Characters that may be part of a mime type name; the match must not
        // be directly preceded or followed by any of them
        $tokenChar = '[\w!#$&^.+\/-]';
        $pattern = sprintf(
            '/(?<!%1$s)%2$s(?!%1$s)/',
            $tokenChar,
            preg_quote($mime, '/')
        );

        return (bool) preg_match($pattern, (string) $supportedTypes);
    }
126
}
127