PDFTextExtractor::supportsExtension()   A
last analyzed

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 1
nc 1
nop 1
dl 0
loc 3
rs 10
c 0
b 0
f 0
1
<?php
2
3
namespace SilverStripe\TextExtraction\Extractor;
4
5
use SilverStripe\Assets\File;
6
use SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception;
7
8
/**
 * Text extractor that calls the `pdftotext` command line tool to do the conversion.
 *
 * The binary is looked up in the directory configured as `binary_location`,
 * or, when that is not set, in each directory of `search_binary_locations`.
 *
 * @author mstephens
 */
class PDFTextExtractor extends FileTextExtractor
{
    /**
     * Directory containing the binary this extractor can execute.
     * When empty, {@link $search_binary_locations} is searched instead.
     *
     * Read through the config API in {@link bin()}.
     *
     * @config
     * @var string
     */
    private static $binary_location = null;

    /**
     * Used if binary_location isn't set.
     * List of directories to search for a given binary in.
     *
     * Read through the config API in {@link bin()}.
     *
     * @config
     * @var array
     */
    private static $search_binary_locations = [
        '/usr/bin',
        '/usr/local/bin',
    ];

    /**
     * Whether a pdftotext binary was found and can be executed.
     *
     * @return bool
     */
    public function isAvailable()
    {
        $bin = $this->bin('pdftotext');

        return $bin && file_exists($bin) && is_executable($bin);
    }

    /**
     * Whether the given file extension (compared case-insensitively) is "pdf".
     *
     * @param string $extension Extension without the leading dot
     * @return bool
     */
    public function supportsExtension($extension)
    {
        // Cast so a null extension yields false instead of a deprecation notice
        return strtolower((string) $extension) === 'pdf';
    }

    /**
     * Whether the given mime type (compared case-insensitively) is one of the
     * PDF mime types handled by this extractor.
     *
     * @param string $mime
     * @return bool
     */
    public function supportsMime($mime)
    {
        return in_array(
            strtolower((string) $mime),
            [
                'application/pdf',
                'application/x-pdf',
                'application/x-bzpdf',
                'application/x-gzpdf',
            ],
            true
        );
    }

    /**
     * Accessor to get the location of the binary
     *
     * Checks each candidate directory for "<dir>/<program>" and then for
     * "<dir>/<program>.exe", returning the first path that exists.
     *
     * @param string $program Name of binary
     * @return string|null Path to the binary, or null when it was not found
     */
    protected function bin($program = '')
    {
        // Get list of allowed search paths
        $configured = $this->config()->get('binary_location');
        if ($configured) {
            $locations = [$configured];
        } else {
            // Cast guards against the list being unset (null) in config
            $locations = (array) $this->config()->get('search_binary_locations');
        }

        // Find program in each path
        foreach ($locations as $location) {
            $path = "{$location}/{$program}";
            if (file_exists($path)) {
                return $path;
            }
            if (file_exists($path . '.exe')) {
                return $path . '.exe';
            }
        }

        // Not found
        return null;
    }

    /**
     * Extract the text of the given PDF, with typographic ligatures replaced
     * by their plain-letter equivalents.
     *
     * @param File|string $file File object, or path to a file on disk
     * @return string Extracted text; empty string when no file was given or
     *                the given path does not exist
     * @throws Exception When the extractor is unavailable or pdftotext fails
     */
    public function getContent($file)
    {
        if (!$file || (is_string($file) && !file_exists($file))) {
            // no file
            return '';
        }
        $content = $this->getRawOutput($file);

        return $this->cleanupLigatures($content);
    }

    /**
     * Invoke pdftotext with the given File object
     *
     * @param  File|string $file
     * @return string Output
     * @throws Exception When the extractor is unavailable, or when pdftotext
     *                   exits with a non-zero status
     */
    protected function getRawOutput($file)
    {
        if (!$this->isAvailable()) {
            throw new Exception("getRawOutput called on unavailable extractor");
        }

        $path = $file instanceof File ? $this->getPathFromFile($file) : $file;

        // Quote the binary as well as the file: a configured directory may
        // contain spaces. "-" makes pdftotext write the text to stdout and
        // "2>&1" folds its error messages into the captured output.
        $command = sprintf(
            '%s %s - 2>&1',
            escapeshellarg($this->bin('pdftotext')),
            escapeshellarg((string) $path)
        );

        $output = [];
        $exitCode = 0;
        exec($command, $output, $exitCode);

        if ($exitCode) {
            // The exit status is an integer, so the captured output is the only
            // usable error text, whatever the non-zero status is. Fall back to
            // the status itself when the tool printed nothing.
            $message = $output
                ? implode(PHP_EOL, $output)
                : sprintf('exit code %d', $exitCode);

            throw new Exception(sprintf(
                'PDFTextExtractor->getContent() failed for %s: %s',
                $path,
                $message
            ));
        }

        return implode(PHP_EOL, $output);
    }

    /**
     * Removes utf-8 ligatures.
     *
     * Each Latin ligature of the Unicode "Alphabetic Presentation Forms"
     * block (U+FB00 to U+FB06) is replaced by the letters it stands for.
     *
     * @link http://en.wikipedia.org/wiki/Typographic_ligature#Computer_typesetting
     *
     * @param string $input
     * @return string
     */
    protected function cleanupLigatures($input)
    {
        // Keys are spelled as UTF-8 byte sequences so that they survive
        // re-encoding or normalisation of this source file.
        $mapping = [
            "\xEF\xAC\x80" => 'ff',  // U+FB00 LATIN SMALL LIGATURE FF
            "\xEF\xAC\x81" => 'fi',  // U+FB01 LATIN SMALL LIGATURE FI
            "\xEF\xAC\x82" => 'fl',  // U+FB02 LATIN SMALL LIGATURE FL
            "\xEF\xAC\x83" => 'ffi', // U+FB03 LATIN SMALL LIGATURE FFI
            "\xEF\xAC\x84" => 'ffl', // U+FB04 LATIN SMALL LIGATURE FFL
            "\xEF\xAC\x85" => 'st',  // U+FB05 LATIN SMALL LIGATURE LONG S T
            "\xEF\xAC\x86" => 'st',  // U+FB06 LATIN SMALL LIGATURE ST
        ];

        return str_replace(array_keys($mapping), array_values($mapping), $input);
    }
}
151