Completed
Push — master ( 0f7120...6fc9e3 )
by Janis
03:12
created

OcrService::process()   C

Complexity

Conditions 7
Paths 16

Size

Total Lines 38
Code Lines 19

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 56

Importance

Changes 0
Metric Value
dl 0
loc 38
ccs 0
cts 19
cp 0
rs 6.7272
c 0
b 0
f 0
cc 7
eloc 19
nc 16
nop 2
crap 56
1
<?php
2
/**
3
 * nextCloud - ocr
4
 *
5
 * This file is licensed under the Affero General Public License version 3 or
6
 * later. See the COPYING file.
7
 *
8
 * @author Janis Koehr <[email protected]>
9
 * @copyright Janis Koehr 2016
10
 */
11
12
namespace OCA\Ocr\Service;
13
14
use Exception;
15
use OC\Files\View;
16
use OCA\Ocr\Db\OcrStatus;
17
use OCA\Ocr\Db\OcrStatusMapper;
18
use OCP\Files;
19
use OCP\IConfig;
20
use OCP\ILogger;
21
use OCP\ITempManager;
22
23
24
/**
 * Class OcrService
 *
 * Coordinates OCR processing for one user: it validates the requested files,
 * queues one Gearman background job per file, tracks each job through an
 * OcrStatus row and, once a job is reported as finished, moves the result
 * from its temporary file into the user's file view.
 *
 * @package OCA\Ocr\Service
 */
class OcrService {

	/**
	 * @var ILogger
	 */
	private $logger;

	/**
	 * @var ITempManager
	 */
	private $tempM;

	/**
	 * @var IConfig
	 */
	private $config;

	/**
	 * @var GearmanWorkerService
	 */
	private $workerService;

	/**
	 * @var OcrStatusMapper
	 */
	private $statusMapper;

	/**
	 * @var View
	 */
	private $view;

	/**
	 * Id of the user on whose behalf the service runs.
	 *
	 * @var string
	 */
	private $userId;

	/**
	 * Array of allowed mimetypes for ocr processing
	 */
	const ALLOWED_MIMETYPES = ['application/pdf', 'image/png', 'image/jpeg', 'image/tiff'];

	/**
	 * the correct mimetype for a pdf file
	 */
	const MIMETYPE_PDF = 'application/pdf';

	/**
	 * the only allowed image mimetypes by tesseract
	 */
	const MIMETYPES_IMAGE = ['image/png', 'image/jpeg', 'image/tiff'];

	/**
	 * OcrService constructor.
	 *
	 * @param ITempManager $tempManager
	 * @param IConfig $config
	 * @param GearmanWorkerService $workerService
	 * @param OcrStatusMapper $mapper
	 * @param View $view
	 * @param string $userId
	 * @param ILogger $logger
	 */
	public function __construct(ITempManager $tempManager, IConfig $config, GearmanWorkerService $workerService, OcrStatusMapper $mapper, View $view, $userId, ILogger $logger) {
		$this->logger = $logger;
		$this->tempM = $tempManager;
		$this->config = $config;
		$this->workerService = $workerService;
		$this->statusMapper = $mapper;
		$this->view = $view;
		$this->userId = $userId;
	}

	/**
	 * Gets the list of all available tesseract-ocr languages.
	 *
	 * Runs `tesseract --list-langs` and keeps every output line that is
	 * exactly three characters long after trimming; this drops the header
	 * line the command prints before the language codes.
	 *
	 * @return array Languages
	 * @throws NotFoundException if the command fails or prints nothing
	 * @throws Exception
	 */
	public function listLanguages() {
		try {
			$success = -1;
			$result = [];
			$this->logger->debug('Fetching languages. ', ['app' => 'ocr']);
			// exec() always fills $result with one array entry per output line.
			exec('tesseract --list-langs 2>&1', $result, $success);
			if ($success !== 0 || count($result) === 0) {
				throw new NotFoundException('No languages found.');
			}
			$languages = [];
			foreach ($result as $line) {
				$name = trim($line);
				if (strlen($name) === 3) {
					$languages[] = $name;
				}
			}
			$this->logger->debug('Fetched languages: ' . json_encode($languages), ['app' => 'ocr']);
			return $languages;
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}

	/**
	 * Processes and prepares the files for ocr.
	 * Sends the stuff to the gearman client in order to ocr async.
	 *
	 * @param string $language one of the codes returned by listLanguages()
	 * @param array $files list of file descriptors (see buildFileInfo())
	 * @return string 'PROCESSING' once every file has been queued
	 * @throws NotFoundException if a parameter is empty or the language is unknown
	 * @throws Exception
	 */
	public function process($language, $files) {
		try {
			$this->logger->debug('Will now process files: ' . json_encode($files) . ' with language: ' . json_encode($language), ['app' => 'ocr']);
			// Strict comparison: a loosely compared non-string language must not match a language code.
			if (empty($files) || empty($language) || !in_array($language, $this->listLanguages(), true)) {
				throw new NotFoundException('Empty parameters.');
			}
			// get the array with full fileinfo
			$fileInfo = $this->buildFileInfo($files);
			foreach ($fileInfo as $fInfo) {
				// Check if filelock existing
				// TODO: FileLock maybe \OC\Files\View::lockFile()
				// get new name for saving purpose
				$newName = $this->buildNewName($fInfo);

				// create a temp file for ocr processing purposes
				$tempFile = $this->tempM->getTemporaryFile();

				// set the gearman running type
				$ftype = ($fInfo->getMimetype() === self::MIMETYPE_PDF) ? 'mypdf' : 'tess';

				// Create status object
				$status = new OcrStatus('PENDING', $fInfo->getId(), $newName, $tempFile, $ftype, $this->userId);

				// Init Gearman client and send task / job
				// Feed the gearman worker
				$this->sendGearmanJob($ftype, $this->config->getSystemValue('datadirectory'), $fInfo->getPath(), $tempFile, $language, $status, \OC::$SERVERROOT);
			}
			return 'PROCESSING';
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}

	/**
	 * Collects the state of the current user's ocr jobs.
	 * Processed results are moved into place and failed jobs are cleaned up
	 * as a side effect of this call.
	 *
	 * @return array with the integer counts 'processed', 'failed' and 'pending'
	 * @throws Exception
	 */
	public function status() {
		try {
			// TODO: release lock
			$processed = $this->handleProcessed();

			$failed = count($this->handleFailed());

			$pending = count($this->statusMapper->findAllPending($this->userId));

			return ['processed' => $processed, 'failed' => $failed, 'pending' => $pending];
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}

	/**
	 * The command ocr:complete for occ will call this function in order to set the status.
	 * the gearman worker should call it automatically after each processing step.
	 *
	 * @param $statusId
	 * @param boolean $failed
	 * @throws Exception
	 */
	public function complete($statusId, $failed) {
		// Stays null when find() itself throws, so the catch block can tell
		// whether there is a status object it may still mark as failed.
		$status = null;
		try {
			$status = $this->statusMapper->find($statusId);
			$status->setStatus($failed ? 'FAILED' : 'PROCESSED');
			$this->statusMapper->update($status);
		} catch (Exception $e) {
			if ($e instanceof NotFoundException && $status !== null) {
				$status->setStatus('FAILED');
				$this->statusMapper->update($status);
			}
			$this->handleException($e);
		}
	}

	/**
	 * Finishes all processed files by copying them to the right path and deleting the temp files.
	 * Returns the number of processed files.
	 *
	 * @return int
	 * @throws NotFoundException if the expected temp file of a job is missing
	 * @throws Exception
	 */
	private function handleProcessed() {
		try {
			$this->logger->debug('Find processed ocr files and put them to the right dirs.', ['app' => 'ocr']);
			$processed = $this->statusMapper->findAllProcessed($this->userId);
			foreach ($processed as $status) {
				$tempFile = $status->getTempFile();
				if ($status->getType() === 'tess' && file_exists($tempFile . '.txt')) {
					// need .txt because tesseract saves it like this
					$output = $tempFile . '.txt';
				} elseif ($status->getType() === 'mypdf' && file_exists($tempFile)) {
					// don't need to extend with .pdf / it uses the tmp file to save
					$output = $tempFile;
				} else {
					throw new NotFoundException('Temp file does not exist.');
				}
				// Save the tmp file with newname
				$this->view->file_put_contents($status->getNewName(), file_get_contents($output));
				// Cleaning status entry and temp file
				$this->statusMapper->delete($status);
				$this->removeTempFile($output);
			}
			return count($processed);
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}

	/**
	 * Handles all failed orders of ocr processing queue and returns the status objects.
	 *
	 * @return array
	 * @throws Exception
	 */
	private function handleFailed() {
		try {
			$failed = $this->statusMapper->findAllFailed($this->userId);
			foreach ($failed as $status) {
				// clean the tempfile
				$this->removeTempFile($status->getTempFile());
				// clean from db
				$this->statusMapper->delete($status);
			}
			$this->logger->debug('Following status objects failed: ' . json_encode($failed), ['app' => 'ocr']);
			return $failed;
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}

	/**
	 * Deletes a temporary file without going through a shell, so the path
	 * never has to be escaped. A missing file is ignored; a failed delete is
	 * logged and does not abort the caller.
	 *
	 * @param string $path
	 */
	private function removeTempFile($path) {
		if (is_file($path) && !unlink($path)) {
			$this->logger->warning('Could not remove temp file: ' . $path, ['app' => 'ocr']);
		}
	}

	/**
	 * Returns a not existing file name for pdf or image processing
	 *
	 * @param Files\FileInfo $fileInfo
	 * @return string
	 */
	private function buildNewName(Files\FileInfo $fileInfo) {
		$name = $fileInfo->getName();
		// get rid of the .png or .jpeg and so on
		$fileName = $this->stripExtension($name);

		// eliminate the file name from the end of the path only, so a
		// directory that happens to contain the same text stays intact
		$filePath = $fileInfo->getPath();
		$nameLength = strlen($name);
		if ($nameLength > 0 && substr($filePath, -$nameLength) === $name) {
			// cast: substr() returns false instead of '' on older PHP versions
			$filePath = (string) substr($filePath, 0, -$nameLength);
		}

		// and get the path on top of the user/files/ dir (leading part only)
		$userRoot = '/' . $this->userId . '/files';
		if (strpos($filePath, $userRoot) === 0) {
			$filePath = (string) substr($filePath, strlen($userRoot));
		}

		// PDFs stay PDFs, images are turned into text files
		$suffix = ($fileInfo->getMimetype() === self::MIMETYPE_PDF) ? '_OCR.pdf' : '_OCR.txt';
		return Files::buildNotExistingFileName($filePath, $fileName . $suffix);
	}

	/**
	 * Removes the last extension from a file name, whatever its length
	 * ('scan.jpeg' => 'scan'). Names without a dot, or with the only dot in
	 * first position ('.hidden'), are returned unchanged.
	 *
	 * @param string $name
	 * @return string
	 */
	private function stripExtension($name) {
		$dot = strrpos($name, '.');
		if ($dot === false || $dot === 0) {
			return $name;
		}
		return substr($name, 0, $dot);
	}

	/**
	 * Returns the fileInfo for each file in files and checks
	 * if it has an allowed mimetype and some other conditions.
	 *
	 * Each entry must have 'type' set to 'file', a 'name' and a non-empty
	 * 'path' or 'directory'.
	 *
	 * @param array $files
	 * @return array of Files\FileInfo
	 * @throws NotFoundException
	 */
	private function buildFileInfo(array $files) {
		try {
			$fileArray = [];
			foreach ($files as $file) {
				// Check if anything is missing and file type is correct
				$hasLocation = !empty($file['path']) || !empty($file['directory']);
				$isFile = isset($file['type']) && $file['type'] === 'file';
				if (!$hasLocation || !$isFile) {
					throw new NotFoundException('Wrong path parameter.');
				}
				if (empty($file['path'])) {
					// Because new updated files have the property directory instead of path
					$file['path'] = $file['directory'];
				}
				// get correct path
				$path = $this->getCorrectPath($file);
				$fileInfo = $this->view->getFileInfo($path);
				if (!$fileInfo || !in_array($fileInfo->getMimetype(), self::ALLOWED_MIMETYPES, true)) {
					$this->logger->debug('Getting FileInfo did not work or not included in the ALLOWED_MIMETYPES array.', ['app' => 'ocr']);
					throw new NotFoundException('Wrong parameters or wrong mimetype.');
				}
				$fileArray[] = $fileInfo;
			}
			return $fileArray;
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}

	/**
	 * Returns the correct path based on delivered file variable
	 *
	 * @param array $file descriptor with the keys 'path' and 'name'
	 * @return string
	 */
	private function getCorrectPath($file) {
		if ($file['path'] === '/') {
			// avoid a double slash for files in the root directory
			return '/' . $file['name'];
		}
		return $file['path'] . '/' . $file['name'];
	}

	/**
	 * Inits the Gearman client and sends the task to the background worker (async)
	 *
	 * The status object is inserted before the job is sent because the job
	 * payload carries the id read from it afterwards.
	 *
	 * @param string $type
	 * @param $datadirectory
	 * @param $path
	 * @param $tempFile
	 * @param $language
	 * @param OcrStatus $status
	 * @param $occDir
	 * @throws NotFoundException if no gearman worker exists
	 * @throws Exception
	 */
	private function sendGearmanJob($type, $datadirectory, $path, $tempFile, $language, $status, $occDir) {
		try {
			if ($this->workerService->workerExists() === false) {
				throw new NotFoundException('No gearman worker exists.');
			}
			$this->statusMapper->insert($status);
			// Gearman thing
			$client = new \GearmanClient();
			$client->addServer('127.0.0.1', 4730);
			$payload = [
				'type' => $type,
				'datadirectory' => $datadirectory,
				'path' => $path,
				'tempfile' => $tempFile,
				'language' => $language,
				'statusid' => $status->getId(),
				'occdir' => $occDir
			];
			$result = $client->doBackground("ocr", json_encode($payload));
			$this->logger->debug('Gearman Client output: ' . json_encode($result), ['app' => 'ocr']);
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}

	/**
	 * Handle the possible thrown Exceptions from all methods of this class.
	 * Logs the exception and then always throws: a NotFoundException is
	 * re-created with the same message, anything else is rethrown as is.
	 *
	 * @param Exception $e
	 * @throws Exception
	 * @throws NotFoundException
	 */
	private function handleException(Exception $e) {
		$this->logger->logException($e, ['app' => 'ocr', 'message' => 'Exception during ocr service function processing']);
		if ($e instanceof NotFoundException) {
			throw new NotFoundException($e->getMessage());
		}
		throw $e;
	}
}