Completed
Push — master ( bde4cf...61750e )
by Daniel
9s
created

TikaTextExtractor   A

Complexity

Total Complexity 11

Size/Duplication

Total Lines 98
Duplicated Lines 0 %

Coupling/Cohesion

Components 1
Dependencies 1

Importance

Changes 3
Bugs 0 Features 1
Metric Value
c 3
b 0
f 1
dl 0
loc 98
wmc 11
lcom 1
cbo 1
rs 10

6 Methods

Rating   Name   Duplication   Size   Complexity  
A getVersion() 0 11 3
B runShell() 0 27 2
A getContent() 0 9 2
A isAvailable() 0 4 1
A supportsExtension() 0 5 1
A supportsMime() 0 12 2
1
<?php
2
3
/**
4
 * Enables text extraction of file content via the Tika CLI
5
 * 
6
 * {@link http://tika.apache.org/1.7/gettingstarted.html}
7
 */
8
class TikaTextExtractor extends FileTextExtractor
0 ignored issues
show
Coding Style Compatibility introduced by
PSR1 recommends that each class must be in a namespace of at least one level to avoid collisions.

You can fix this by adding a namespace to your class:

namespace YourVendor;

class YourClass { }

When choosing a vendor namespace, try to pick something that is not too generic to avoid conflicts with other libraries.

Loading history...
9
{
10
    /**
11
     * Text extraction mode: the Tika CLI output flag that getContent() reads through the `output_mode` config setting. Defaults to -t (plain text)
12
     *
13
     * @var string
14
     * @config
15
     */
16
    private static $output_mode = '-t';
0 ignored issues
show
Unused Code introduced by
The property $output_mode is not used and could be removed.

This check marks private properties in classes that are never used. Those properties can be removed.

Loading history...
17
18
    /**
     * Reports the version of the Tika CLI found on this system.
     *
     * Runs `tika --version` and pulls the version number out of output of the
     * form "Apache Tika <version>".
     *
     * @return string|int The version exactly as printed by Tika (a string of
     *                    digits and dots), or integer 0 when the command
     *                    returns a non-zero exit code or prints no
     *                    recognisable version
     */
    public function getVersion()
    {
        $exitCode = $this->runShell('tika --version', $stdout);
        if ($exitCode) {
            // Command failed, or the process could not be started at all
            return 0;
        }

        // Parse output
        $found = preg_match('/Apache Tika (?<version>[\.\d]+)/', $stdout, $matches);
        if (!$found) {
            return 0;
        }

        return $matches['version'];
    }
34
35
    /**
     * Executes a shell command and captures its exit code and output streams.
     *
     * The command string is passed to proc_open() unchanged. No escaping is
     * performed here, so callers must escape each argument themselves (for
     * example with escapeshellarg()) before building $command.
     *
     * NOTE(review): all input is written and standard output is read to the
     * end before standard error is read. A command that produces a lot of
     * standard error output early might stall on a full pipe - confirm this
     * is acceptable for the commands run through this method.
     *
     * @param string $command Full command including arguments
     * @param string &$stdout Receives the command's standard output
     * @param string &$stderr Receives the command's standard error
     * @param string $input Content to pass via standard input
     * @return int Exit code as returned by proc_close(); 0 is success. 255 is
     *             returned when proc_open() does not yield a process resource
     */
    protected function runShell($command, &$stdout = '', &$stderr = '', $input = '')
    {
        // One pipe per standard stream: the child reads from 0 and writes to 1 and 2
        $streamSpec = array(
            0 => array("pipe", "r"),
            1 => array("pipe", "w"),
            2 => array("pipe", "w")
        );

        // Launch the process
        $handles = array();
        $process = proc_open($command, $streamSpec, $handles);
        if (!is_resource($process)) {
            return 255;
        }
        list($stdinPipe, $stdoutPipe, $stderrPipe) = $handles;

        // Feed the input, then close stdin so the child sees end-of-input
        fwrite($stdinPipe, $input);
        fclose($stdinPipe);

        // Collect standard output first, then standard error
        $stdout = stream_get_contents($stdoutPipe);
        fclose($stdoutPipe);
        $stderr = stream_get_contents($stderrPipe);
        fclose($stderrPipe);

        // Closing the process handle yields its exit code
        $exitCode = proc_close($process);

        return $exitCode;
    }
71
    
72
    /**
     * Extracts the content of a file by running it through the Tika CLI.
     *
     * The output flag is taken from the `output_mode` config setting and the
     * file path is shell-escaped before being placed on the command line.
     *
     * @param string $path Path of the file to extract
     * @return string|null Whatever runShell() captured from standard output,
     *                     or null when the command returns a non-zero exit code
     */
    public function getContent($path)
    {
        $command = sprintf('tika %s %s', $this->config()->output_mode, escapeshellarg($path));

        $exitCode = $this->runShell($command, $extracted);
        if ($exitCode != 0) {
            // Extraction failed: there is no content to hand back
            return null;
        }

        return $extracted;
    }
81
82
    /**
     * Tells whether the Tika CLI can be used on this system.
     *
     * @return bool True when getVersion() reports a version greater than 0
     */
    public function isAvailable()
    {
        $installedVersion = $this->getVersion();

        return $installedVersion > 0;
    }
86
87
    /**
     * Extension-based detection is not offered by this extractor.
     *
     * @param string $extension File extension being queried (not inspected)
     * @return bool Always false
     */
    public function supportsExtension($extension)
    {
        // Support is decided from the mime type alone, never the extension
        return false;
    }
92
93
    /**
     * Checks whether Tika lists the given mime type among its supported types.
     *
     * Runs `tika --list-supported-types` and looks for $mime as a complete
     * token in the output. Note that the command is executed on every call.
     *
     * @param string $mime Mime type to look up, e.g. "application/pdf"
     * @return bool True when the type is listed; false when it is not listed,
     *              when $mime is empty, or when the command fails
     */
    public function supportsMime($mime)
    {
        // An empty type would reduce the pattern below to a bare boundary
        // test that matches almost any output, so reject it up front
        $mime = trim((string)$mime);
        if ($mime === '') {
            return false;
        }

        // Get list of supported mime types
        $code = $this->runShell('tika --list-supported-types', $supportedTypes);
        if ($code) {
            return false; // Error case
        }

        // Match the type only as a whole token. A plain \b boundary treats
        // "-", "+" and "." as delimiters, so "image/svg" would be found inside
        // "image/svg+xml"; the lookarounds refuse any neighbouring character
        // that can itself be part of a mime type.
        $pattern = sprintf('/(?<![\w.+\/-])%s(?![\w.+\/-])/', preg_quote($mime, '/'));
        return (bool)preg_match($pattern, $supportedTypes);
    }
105
}
106