1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace SilverStripe\TextExtraction\Rest; |
4
|
|
|
|
5
|
|
|
use GuzzleHttp\Client; |
6
|
|
|
use GuzzleHttp\Exception\RequestException; |
7
|
|
|
use GuzzleHttp\Psr7\Response; |
8
|
|
|
use Psr\Log\LoggerInterface; |
9
|
|
|
use SilverStripe\Core\Convert; |
10
|
|
|
use SilverStripe\Core\Environment; |
11
|
|
|
use SilverStripe\Core\Injector\Injector; |
12
|
|
|
|
13
|
|
|
/**
 * Guzzle-based HTTP client for talking to an Apache Tika REST server.
 *
 * Optional basic-auth credentials are read from the SS_TIKA_USERNAME and
 * SS_TIKA_PASSWORD environment variables when the client is constructed.
 */
class TikaRestClient extends Client
{
    /**
     * Authentication options to be sent to the Tika server
     *
     * @var array
     */
    protected $options = ['username' => null, 'password' => null];

    /**
     * Supported mime data as last returned by the server; empty until
     * {@link getSupportedMimes()} has fetched a usable result.
     *
     * @var array
     */
    protected $mimes = [];

    /**
     * Creates the client and picks up credentials from the environment.
     *
     * @param string $baseUrl Base URI of the Tika server; always overrides any 'base_uri' key in $config
     * @param array $config Additional options handed to the parent Guzzle client constructor
     */
    public function __construct($baseUrl = '', $config = [])
    {
        $password = Environment::getEnv('SS_TIKA_PASSWORD');

        // Credentials are only recorded when a password is configured
        if (!empty($password)) {
            $this->options = [
                'username' => Environment::getEnv('SS_TIKA_USERNAME'),
                'password' => $password,
            ];
        }

        $config['base_uri'] = $baseUrl;

        parent::__construct($config);
    }

    /**
     * Detect if the service is available
     *
     * Any Guzzle transfer failure (including a refused/timed-out connection)
     * is logged at info level and reported as "not available".
     *
     * @return bool True only when the server answers the root URL with HTTP 200
     */
    public function isAvailable()
    {
        try {
            /** @var Response $result */
            $result = $this->get('/', $this->getGuzzleOptions());
        } catch (\GuzzleHttp\Exception\TransferException $ex) {
            // TransferException is the common base of RequestException and ConnectException,
            // so connection failures are handled regardless of how Guzzle classifies them.
            $msg = sprintf("Tika unavailable - %s", $ex->getMessage());
            Injector::inst()->get(LoggerInterface::class)->info($msg);

            return false;
        }

        // Always return a boolean, also for non-200 responses that did not raise an exception
        return $result->getStatusCode() === 200;
    }

    /**
     * Get version code
     *
     * Request failures raised by Guzzle are not caught here.
     *
     * @return string The version parsed from the "Apache Tika x.y.z" banner, or '0' if it cannot be determined
     */
    public function getVersion()
    {
        /** @var Response $response */
        $response = $this->get('version', $this->getGuzzleOptions());

        // Parse output; the body is a stream, so convert it to a string explicitly
        if ($response->getStatusCode() === 200
            && preg_match('/Apache Tika (?<version>[\.\d]+)/', (string) $response->getBody(), $matches)
        ) {
            return (string) $matches['version'];
        }

        return '0';
    }

    /**
     * Gets supported mime data. May include aliased mime types.
     *
     * The decoded response is cached on the instance, so the server is only
     * queried again while no non-empty result has been stored.
     * Request failures raised by Guzzle are not caught here.
     *
     * @return array Decoded JSON from the 'mime-types' endpoint, or an empty array if it cannot be decoded
     */
    public function getSupportedMimes()
    {
        if (!empty($this->mimes)) {
            return $this->mimes;
        }

        /** @var Response $response */
        $response = $this->get(
            'mime-types',
            $this->getGuzzleOptions([
                'headers' => [
                    'Accept' => 'application/json',
                ],
            ])
        );

        $decoded = json_decode((string) $response->getBody(), true);

        // Never store or return a non-array (e.g. null from an undecodable body)
        $this->mimes = is_array($decoded) ? $decoded : [];

        return $this->mimes;
    }

    /**
     * Extract text content from a given file.
     * Logs an info-level message if the file can't be read or the document can't be parsed.
     *
     * @param string $file Full filesystem path to a file to post
     * @return string Content of the file extracted as plain text; empty string on failure
     */
    public function tika($file)
    {
        $contents = file_get_contents($file);
        if ($contents === false) {
            // Nothing to send; avoid issuing a request with an empty body
            $msg = sprintf('TikaRestClient was not able to read %s.', $file);
            Injector::inst()->get(LoggerInterface::class)->info($msg);

            return '';
        }

        $text = null;
        try {
            /** @var Response $response */
            $response = $this->put(
                'tika',
                $this->getGuzzleOptions([
                    'headers' => [
                        'Accept' => 'text/plain',
                    ],
                    'body' => $contents,
                ])
            );
            $text = $response->getBody();
        } catch (RequestException $e) {
            $errorResponse = $e->getResponse();

            if ($errorResponse === null) {
                // The request failed before any response was received
                $msg = sprintf(
                    'TikaRestClient was not able to process %s. No response received: %s.',
                    $file,
                    $e->getMessage()
                );
            } else {
                $msg = sprintf(
                    'TikaRestClient was not able to process %s. Response: %s %s.',
                    $file,
                    $errorResponse->getStatusCode(),
                    $errorResponse->getReasonPhrase()
                );
                // Only available if tika-server was started with --includeStack
                $body = (string) $errorResponse->getBody();
                if ($body !== '') {
                    $msg .= ' Body: ' . $body;
                }
            }

            Injector::inst()->get(LoggerInterface::class)->info($msg);
        }

        return (string) $text;
    }

    /**
     * Assembles an array of request options to pass to Guzzle
     *
     * The 'auth' key is only added when both a username and a password are configured.
     *
     * @param array $options Authentication (etc) will be merged into this array and returned
     * @return array
     */
    protected function getGuzzleOptions($options = [])
    {
        if (!empty($this->options['username']) && !empty($this->options['password'])) {
            $options['auth'] = [
                $this->options['username'],
                $this->options['password'],
            ];
        }

        return $options;
    }
}
172
|
|
|
|
This check marks implicit conversions of arrays to boolean values in a comparison. While in PHP an empty array is considered to be equal (but not identical) to false, this is not always apparent.
Consider making the comparison explicit by using `empty(...)` or `!empty(...)` instead.