1
|
|
|
<?php |
2
|
|
|
/** |
3
|
|
|
* nextCloud - ocr |
4
|
|
|
* |
5
|
|
|
* This file is licensed under the Affero General Public License version 3 or |
6
|
|
|
* later. See the COPYING file. |
7
|
|
|
* |
8
|
|
|
* @author Janis Koehr <[email protected]> |
9
|
|
|
* @copyright Janis Koehr 2016 |
10
|
|
|
*/ |
11
|
|
|
|
12
|
|
|
namespace OCA\Ocr\Service; |
13
|
|
|
|
14
|
|
|
use Exception; |
15
|
|
|
use OC\Files\View; |
16
|
|
|
use OCA\Ocr\Db\OcrStatus; |
17
|
|
|
use OCA\Ocr\Db\OcrStatusMapper; |
18
|
|
|
use OCP\AppFramework\Http\JSONResponse; |
19
|
|
|
use OCP\Files; |
20
|
|
|
use OCP\IConfig; |
21
|
|
|
use OCP\ILogger; |
22
|
|
|
use OCP\ITempManager; |
23
|
|
|
|
24
|
|
|
|
25
|
|
|
/**
 * Class OcrService
 *
 * Service behind the ocr app: lists the tesseract languages, queues files
 * for asynchronous OCR via gearman, tracks the per-file status rows and moves
 * finished results from their temp files into the user's file view.
 *
 * @package OCA\Ocr\Service
 */
class OcrService {

	/**
	 * @var ILogger
	 */
	private $logger;

	/**
	 * @var ITempManager
	 */
	private $tempM;

	/**
	 * @var IConfig
	 */
	private $config;

	/**
	 * @var GearmanWorkerService
	 */
	private $workerService;

	/**
	 * @var OcrStatusMapper
	 */
	private $statusMapper;

	/**
	 * @var View
	 */
	private $view;

	/**
	 * Id of the user the service acts for; it is used as a path segment
	 * ("/<userId>/files") and as owner of the status rows.
	 *
	 * @var string
	 */
	private $userId;

	/**
	 * Array of allowed mimetypes for ocr processing
	 */
	const ALLOWED_MIMETYPES = ['application/pdf', 'image/png', 'image/jpeg', 'image/tiff'];

	/**
	 * the correct mimetype for a pdf file
	 */
	const MIMETYPE_PDF = 'application/pdf';

	/**
	 * the only allowed image mimetypes by tesseract
	 */
	const MIMETYPES_IMAGE = ['image/png', 'image/jpeg', 'image/tiff'];

	/**
	 * OcrService constructor.
	 *
	 * @param ITempManager $tempManager creates the temp files the worker writes to
	 * @param IConfig $config used to read the 'datadirectory' system value
	 * @param GearmanWorkerService $workerService used to check that a worker exists
	 * @param OcrStatusMapper $mapper persistence of the OcrStatus rows
	 * @param View $view file view used to read file info and store results
	 * @param string $userId id of the current user
	 * @param ILogger $logger
	 */
	public function __construct(ITempManager $tempManager, IConfig $config, GearmanWorkerService $workerService, OcrStatusMapper $mapper, View $view, $userId, ILogger $logger) {
		$this->logger = $logger;
		$this->tempM = $tempManager;
		$this->config = $config;
		$this->workerService = $workerService;
		$this->statusMapper = $mapper;
		$this->view = $view;
		$this->userId = $userId;
	}

	/**
	 * Gets the list of all available tesseract-ocr languages.
	 *
	 * Runs "tesseract --list-langs" and keeps every output line that is
	 * exactly three characters long after trimming. Longer entries (and the
	 * header line printed by tesseract) are dropped by that filter.
	 *
	 * @return array Languages
	 * @throws NotFoundException if the command fails or prints nothing
	 * @throws Exception
	 */
	public function listLanguages() {
		try {
			$this->logger->debug('Fetching languages. ', ['app' => 'ocr']);

			// exec() appends to the given array, so start from a clean one.
			$output = [];
			$exitCode = -1;
			exec('tesseract --list-langs 2>&1', $output, $exitCode);

			if ($exitCode !== 0 || count($output) === 0) {
				throw new NotFoundException('No languages found.');
			}

			$languages = [];
			foreach ($output as $line) {
				$code = trim($line);
				if (strlen($code) === 3) {
					$languages[] = $code;
				}
			}

			$this->logger->debug('Fetched languages: ' . json_encode($languages), ['app' => 'ocr']);
			return $languages;
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}

	/**
	 * Processes and prepares the files for ocr.
	 * Sends the stuff to the gearman client in order to ocr async.
	 *
	 * For every file a result name and a temp file are created, an OcrStatus
	 * with state PENDING is built and the job is handed to sendGearmanJob().
	 *
	 * @param string $language language code; must be one of listLanguages()
	 * @param array $files file descriptions as accepted by buildFileInfo()
	 * @return string always 'PROCESSING' when every job was handed over
	 * @throws NotFoundException on empty parameters or an unknown language
	 * @throws Exception
	 */
	public function process($language, $files) {
		try {
			$this->logger->debug('Will now process files: ' . json_encode($files) . ' with language: ' . json_encode($language), ['app' => 'ocr']);

			// Strict comparison: the language must literally be one of the listed codes.
			if (empty($files) || empty($language) || !in_array($language, $this->listLanguages(), true)) {
				throw new NotFoundException('Empty parameters.');
			}

			// get the array with full fileinfo
			$fileInfo = $this->buildFileInfo($files);
			// The data directory does not change between files, read it once.
			$datadirectory = $this->config->getSystemValue('datadirectory');

			foreach ($fileInfo as $fInfo) {
				// TODO: FileLock maybe \OC\Files\View::lockFile()
				// name under which the result will be saved later
				$newName = $this->buildNewName($fInfo);

				// temp file the worker writes its result to
				$tempFile = $this->tempM->getTemporaryFile();

				// gearman running type: pdfs and images are handled differently
				$ftype = ($fInfo->getMimetype() === self::MIMETYPE_PDF) ? 'mypdf' : 'tess';

				$status = new OcrStatus('PENDING', $fInfo->getId(), $newName, $tempFile, $ftype, $this->userId);

				$this->sendGearmanJob($ftype, $datadirectory, $fInfo->getPath(), $tempFile, $language, $status, \OC::$SERVERROOT);
			}
			return 'PROCESSING';
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}

	/**
	 * Collects the numbers needed for the status check.
	 *
	 * Finishes all processed jobs, cleans up all failed jobs and counts the
	 * jobs that are still pending for the current user.
	 *
	 * @return array with the integer keys 'processed', 'failed' and 'pending'
	 * @throws Exception
	 */
	public function status() {
		try {
			// TODO: release lock
			$processed = $this->handleProcessed();
			$failed = count($this->handleFailed());
			$pending = count($this->statusMapper->findAllPending($this->userId));

			return ['processed' => $processed, 'failed' => $failed, 'pending' => $pending];
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}

	/**
	 * The command ocr:complete for occ will call this function in order to set the status.
	 * the gearman worker should call it automatically after each processing step.
	 *
	 * @param $statusId id of the status row to update
	 * @param $failed truthy marks the job FAILED, falsy marks it PROCESSED
	 * @throws NotFoundException
	 * @throws Exception
	 */
	public function complete($statusId, $failed) {
		// Stays null when find() itself throws, so the catch block can tell
		// whether there is a status object it may still touch.
		$status = null;
		try {
			$status = $this->statusMapper->find($statusId);
			$status->setStatus($failed ? 'FAILED' : 'PROCESSED');
			$this->statusMapper->update($status);
		} catch (Exception $e) {
			if ($e instanceof NotFoundException && $status !== null) {
				$status->setStatus('FAILED');
				$this->statusMapper->update($status);
			}
			$this->handleException($e);
		}
	}

	/**
	 * Finishes all Processed files by copying them to the right path and deleting the temp files.
	 * Returns the number of processed files.
	 *
	 * @return int
	 * @throws NotFoundException if a result file is missing or unreadable
	 * @throws Exception
	 */
	private function handleProcessed() {
		try {
			$this->logger->debug('Find processed ocr files and put them to the right dirs.', ['app' => 'ocr']);
			$processed = $this->statusMapper->findAllProcessed($this->userId);

			foreach ($processed as $status) {
				$type = $status->getType();
				if ($type === 'tess') {
					// need .txt because tesseract saves it like this
					$resultFile = $status->getTempFile() . '.txt';
				} elseif ($type === 'mypdf') {
					// don't need to extend with .pdf / it uses the tmp file to save
					$resultFile = $status->getTempFile();
				} else {
					$resultFile = null;
				}

				if ($resultFile === null || !file_exists($resultFile)) {
					throw new NotFoundException('Temp file does not exist.');
				}

				// Never store "false" as file content when the read fails.
				$content = file_get_contents($resultFile);
				if ($content === false) {
					throw new NotFoundException('Temp file does not exist.');
				}

				// Save the tmp file with newname, then clean up row and temp file
				$this->view->file_put_contents($status->getNewName(), $content);
				$this->statusMapper->delete($status);
				$this->removeTempFile($resultFile);
			}
			return count($processed);
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}

	/**
	 * Handles all failed orders of ocr processing queue and returns the status objects.
	 *
	 * The temp file of every failed job (and a possibly left over ".txt"
	 * result next to it) is removed and the status row is deleted.
	 *
	 * @return array the failed status objects
	 * @throws Exception
	 */
	private function handleFailed() {
		try {
			$failed = $this->statusMapper->findAllFailed($this->userId);
			foreach ($failed as $status) {
				// clean the tempfile
				$this->removeTempFile($status->getTempFile());
				$this->removeTempFile($status->getTempFile() . '.txt');
				// clean from db
				$this->statusMapper->delete($status);
			}
			$this->logger->debug('Following status objects failed: ' . json_encode($failed), ['app' => 'ocr']);
			return $failed;
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}

	/**
	 * Deletes a temp file without going through a shell.
	 *
	 * Missing files are ignored; a failed deletion is only logged because
	 * the clean up must not abort the status handling.
	 *
	 * @param string $path absolute path of the temp file
	 */
	private function removeTempFile($path) {
		if (is_file($path) && !unlink($path)) {
			$this->logger->debug('Could not remove temp file: ' . $path, ['app' => 'ocr']);
		}
	}

	/**
	 * Returns a not existing file name for pdf or image processing
	 *
	 * The extension of the source file is replaced by "_OCR.pdf" for pdfs and
	 * by "_OCR.txt" for everything else; the directory is taken relative to
	 * the "/<userId>/files" root.
	 *
	 * @param Files\FileInfo $fileInfo
	 * @return string
	 */
	private function buildNewName(Files\FileInfo $fileInfo) {
		// Cut at the last dot so extensions of any length (.pdf, .jpeg, .tiff) work.
		$name = $fileInfo->getName();
		$dot = strrpos($name, '.');
		$baseName = ($dot === false || $dot === 0) ? $name : substr($name, 0, $dot);

		// Directory of the file, without touching equal strings elsewhere in the path.
		$dir = dirname($fileInfo->getPath());

		// Strip the user root only when it really is the leading part of the path.
		$userRoot = '/' . $this->userId . '/files';
		if ($dir === $userRoot) {
			$dir = '';
		} elseif (strpos($dir, $userRoot . '/') === 0) {
			$dir = (string) substr($dir, strlen($userRoot));
		}

		$suffix = ($fileInfo->getMimetype() === self::MIMETYPE_PDF) ? '_OCR.pdf' : '_OCR.txt';
		return Files::buildNotExistingFileName($dir, $baseName . $suffix);
	}

	/**
	 * Returns the fileInfo for each file in files and checks
	 * if it has a allowed mimetype and some other conditions.
	 *
	 * Every entry has to be an array with 'type' => 'file', a 'name' and
	 * either a 'path' or a 'directory' (newer clients send the latter).
	 *
	 * @param array $files
	 * @return array of Files\FileInfo
	 * @throws NotFoundException
	 * @throws Exception
	 */
	private function buildFileInfo(array $files) {
		try {
			$fileArray = [];
			foreach ($files as $file) {
				if (!is_array($file)) {
					throw new NotFoundException('Wrong path parameter.');
				}
				// Because new updated files have the property directory instead of path
				if (empty($file['path']) && !empty($file['directory'])) {
					$file['path'] = $file['directory'];
				}
				$isFile = isset($file['type']) && $file['type'] === 'file';
				if (empty($file['path']) || !$isFile || !isset($file['name'])) {
					throw new NotFoundException('Wrong path parameter.');
				}

				// "/" becomes "" so the root does not produce a double slash.
				$path = rtrim($file['path'], '/') . '/' . $file['name'];

				$fileInfo = $this->view->getFileInfo($path);
				if (!$fileInfo || !in_array($fileInfo->getMimetype(), self::ALLOWED_MIMETYPES, true)) {
					$this->logger->debug('Getting FileInfo did not work or not included in the ALLOWED_MIMETYPES array.', ['app' => 'ocr']);
					throw new NotFoundException('Wrong parameters or wrong mimetype.');
				}
				$fileArray[] = $fileInfo;
			}
			return $fileArray;
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}

	/**
	 * Inits the Gearman client and sends the task to the background worker (async)
	 *
	 * The status is stored before the job is submitted because the job
	 * payload carries the id of the stored status. When gearman does not
	 * accept the job the status is switched to FAILED, so it does not stay
	 * pending forever.
	 *
	 * @param $type 'mypdf' or 'tess'
	 * @param $datadirectory value of the 'datadirectory' system setting
	 * @param $path path of the file to process
	 * @param $tempFile temp file the result is written to
	 * @param $language language code for the ocr run
	 * @param OcrStatus $status status object to insert for this job
	 * @param $occDir server root, passed to the worker as 'occdir'
	 * @throws NotFoundException if no gearman worker exists
	 * @throws Exception
	 */
	private function sendGearmanJob($type, $datadirectory, $path, $tempFile, $language, $status, $occDir) {
		try {
			if (!$this->workerService->workerExists()) {
				throw new NotFoundException('No gearman worker exists.');
			}
			$this->statusMapper->insert($status);

			// Gearman thing
			$client = new \GearmanClient();
			$client->addServer('127.0.0.1', 4730);
			$client->doBackground('ocr', json_encode([
				'type' => $type,
				'datadirectory' => $datadirectory,
				'path' => $path,
				'tempfile' => $tempFile,
				'language' => $language,
				'statusid' => $status->getId(),
				'occdir' => $occDir
			]));

			if ($client->returnCode() !== \GEARMAN_SUCCESS) {
				$this->logger->error('Could not submit the ocr job to gearman (return code ' . $client->returnCode() . ').', ['app' => 'ocr']);
				$status->setStatus('FAILED');
				$this->statusMapper->update($status);
			}
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}

	/**
	 * Handle the possible thrown Exceptions from all methods of this class.
	 *
	 * The exception is logged and then rethrown as the very same object, so
	 * its type, message, code and stack trace reach the caller unchanged.
	 * This method never returns.
	 *
	 * @param Exception $e
	 * @throws Exception
	 * @throws NotFoundException
	 */
	private function handleException($e) {
		$this->logger->logException($e, ['app' => 'ocr', 'message' => 'Exception during ocr service function processing']);
		throw $e;
	}
}
Sometimes obsolete code just ends up commented out instead of removed. In this case it is better to remove the code once you have checked you do not need it.
The code might also have been commented out for debugging purposes. In this case it is vital that someone uncomments it again or your project may behave in very unexpected ways in production.
This check looks for comments that seem to be mostly valid code and reports them.