1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace SilverStripe\TextExtraction\Extractor; |
4
|
|
|
|
5
|
|
|
use SilverStripe\Assets\File; |
6
|
|
|
use SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception; |
7
|
|
|
|
8
|
|
|
/**
 * Text extractor that shells out to the `pdftotext` command line tool to
 * convert PDF documents to plain text.
 *
 * @author mstephens
 */
class PDFTextExtractor extends FileTextExtractor
{
    /**
     * Directory containing the pdftotext binary. The program name is appended
     * to this value when the binary is looked up, so this must be the
     * directory rather than the full path to the executable.
     * When empty, the directories in search_binary_locations are searched instead.
     *
     * @config
     * @var string|null
     */
    private static $binary_location = null;

    /**
     * Used if binary_location isn't set.
     * List of directories to search for a given binary in, in order of preference.
     *
     * @config
     * @var array
     */
    private static $search_binary_locations = [
        '/usr/bin',
        '/usr/local/bin',
    ];

    /**
     * Checks whether a pdftotext binary was found and can be executed.
     *
     * @return bool
     */
    public function isAvailable()
    {
        $bin = $this->bin('pdftotext');

        return $bin && file_exists($bin) && is_executable($bin);
    }

    /**
     * Checks whether the given file extension is handled by this extractor.
     *
     * @param string $extension Extension without leading dot; compared case-insensitively
     * @return bool
     */
    public function supportsExtension($extension)
    {
        return strtolower((string) $extension) === 'pdf';
    }

    /**
     * Checks whether the given mime type is handled by this extractor.
     *
     * @param string $mime Mime type; compared case-insensitively
     * @return bool
     */
    public function supportsMime($mime)
    {
        return in_array(
            strtolower((string) $mime),
            [
                'application/pdf',
                'application/x-pdf',
                'application/x-bzpdf',
                'application/x-gzpdf',
            ],
            true
        );
    }

    /**
     * Accessor to get the location of the binary.
     *
     * Looks in the configured binary_location if set, otherwise in each of the
     * search_binary_locations. In every directory the plain program name is
     * tried first, then the name with an ".exe" suffix.
     *
     * @param string $program Name of binary
     * @return string|null Path of the first existing candidate, or null if none was found
     */
    protected function bin($program = '')
    {
        // Get list of allowed search paths; an explicit location replaces the search list
        $configured = $this->config()->get('binary_location');
        if ($configured) {
            $locations = [$configured];
        } else {
            // Fall back to an empty list so an unset/emptied config value doesn't break the loop
            $locations = $this->config()->get('search_binary_locations') ?: [];
        }

        // Find program in each path
        foreach ($locations as $location) {
            $path = "{$location}/{$program}";
            foreach ([$path, $path . '.exe'] as $candidate) {
                if (file_exists($candidate)) {
                    return $candidate;
                }
            }
        }

        // Not found
        return null;
    }

    /**
     * Extracts the text content of the given PDF with ligatures expanded.
     *
     * @param File|string $file File object, or path to a file on disk
     * @return string Extracted text; empty string if no file was given or the path doesn't exist
     * @throws Exception If pdftotext is unavailable or fails
     */
    public function getContent($file)
    {
        if (!$file || (is_string($file) && !file_exists($file))) {
            // no file
            return '';
        }

        $content = $this->getRawOutput($file);

        return $this->cleanupLigatures($content);
    }

    /**
     * Invoke pdftotext with the given File object or path
     *
     * @param File|string $file
     * @return string Output lines joined with PHP_EOL
     * @throws Exception If the extractor is unavailable or pdftotext exits with a non-zero status
     */
    protected function getRawOutput($file)
    {
        if (!$this->isAvailable()) {
            throw new Exception("getRawOutput called on unavailable extractor");
        }

        $path = $file instanceof File ? $this->getPathFromFile($file) : $file;

        // Both the binary and the file are escaped: either may contain spaces
        // or shell metacharacters. "-" sends the text to stdout, and stderr is
        // merged in so diagnostics are available for the exception message.
        $command = sprintf(
            '%s %s - 2>&1',
            escapeshellarg($this->bin('pdftotext')),
            escapeshellarg($path)
        );

        $output = [];
        $exitCode = 0;
        exec($command, $output, $exitCode);

        // exec() reports the exit status as an integer. Any non-zero status is
        // a failure, and the captured output lines carry the error details.
        if ($exitCode !== 0) {
            $details = $output
                ? implode(PHP_EOL, $output)
                : sprintf('pdftotext exited with status %d', $exitCode);

            throw new Exception(sprintf(
                'PDFTextExtractor->getContent() failed for %s: %s',
                $path,
                $details
            ));
        }

        return implode(PHP_EOL, $output);
    }

    /**
     * Replaces the utf-8 typographic ligatures U+FB00 to U+FB06 with their
     * plain letter sequences.
     *
     * @link http://en.wikipedia.org/wiki/Typographic_ligature#Computer_typesetting
     *
     * @param string $input
     * @return string
     */
    protected function cleanupLigatures($input)
    {
        // Keys are spelled as UTF-8 byte escapes so the ligature code points
        // can't be silently decomposed by editors or tools that normalise text.
        $mapping = [
            "\xEF\xAC\x80" => 'ff',  // U+FB00 ff
            "\xEF\xAC\x81" => 'fi',  // U+FB01 fi
            "\xEF\xAC\x82" => 'fl',  // U+FB02 fl
            "\xEF\xAC\x83" => 'ffi', // U+FB03 ffi
            "\xEF\xAC\x84" => 'ffl', // U+FB04 ffl
            "\xEF\xAC\x85" => 'st',  // U+FB05 long s + t: the long s is an "s", not an "f"
            "\xEF\xAC\x86" => 'st',  // U+FB06 st
        ];

        return strtr((string) $input, $mapping);
    }
}
151
|
|
|
|