Completed
Pull Request — master (#12)
by Janis
06:16
created

OcrService::getCorrectPath()   A

Complexity

Conditions 3
Paths 4

Size

Total Lines 12
Code Lines 8

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 12

Importance

Changes 0
Metric Value
dl 0
loc 12
ccs 0
cts 9
cp 0
rs 9.4285
c 0
b 0
f 0
cc 3
eloc 8
nc 4
nop 1
crap 12
1
<?php
2
/**
3
 * nextCloud - ocr
4
 *
5
 * This file is licensed under the Affero General Public License version 3 or
6
 * later. See the COPYING file.
7
 *
8
 * @author Janis Koehr <[email protected]>
9
 * @copyright Janis Koehr 2016
10
 */
11
12
namespace OCA\Ocr\Service;
13
14
use Exception;
15
use OC\Files\View;
16
use OCA\Ocr\Db\OcrStatus;
17
use OCA\Ocr\Db\OcrStatusMapper;
18
use OCP\Files;
19
use OCP\Files\FileInfo;
20
use OCP\IConfig;
21
use OCP\IL10N;
22
use OCP\ILogger;
23
use OCP\ITempManager;
24
25
26
/**
27
 * Class OcrService
28
 * @package OCA\Ocr\Service
29
 */
30
class OcrService {
31
32
	/**
	 * Logger used for debug output and for logging caught exceptions.
	 *
	 * @var ILogger
	 */
	private $logger;

	/**
	 * Temp manager that supplies the temporary file handed to the worker for each job.
	 *
	 * @var ITempManager
	 */
	private $tempM;

	/**
	 * System configuration; read for the 'datadirectory' value.
	 *
	 * @var IConfig
	 */
	private $config;

	/**
	 * Service used to check whether a gearman worker exists.
	 *
	 * @var GearmanWorkerService
	 */
	private $workerService;

	/**
	 * Mapper used to insert, update, delete and query OcrStatus entities.
	 *
	 * @var OcrStatusMapper
	 */
	private $statusMapper;

	/**
	 * View used to read file info and to write the ocr results.
	 *
	 * @var View
	 */
	private $view;

	/**
	 * Id of the user the service acts for; concatenated into '/<userId>/files' paths.
	 *
	 * @var string
	 */
	private $userId;

	/**
	 * Translation service for the messages of thrown exceptions.
	 *
	 * @var IL10N
	 */
	private $l10n;

	/**
	 * Array of allowed mimetypes for ocr processing
	 */
	const ALLOWED_MIMETYPES = ['application/pdf', 'image/png', 'image/jpeg', 'image/tiff'];

	/**
	 * the correct mimetype for a pdf file
	 */
	const MIMETYPE_PDF = 'application/pdf';

	/**
	 * the only allowed image mimetypes by tesseract
	 */
	const MIMETYPES_IMAGE = ['image/png', 'image/jpeg', 'image/tiff'];
86
87
	/**
	 * OcrService constructor.
	 *
	 * Only stores the injected collaborators on the instance.
	 *
	 * @param ITempManager $tempManager
	 * @param IConfig $config
	 * @param GearmanWorkerService $workerService
	 * @param OcrStatusMapper $mapper
	 * @param View $view
	 * @param string $userId id of the user the service acts for
	 * @param IL10N $l10n
	 * @param ILogger $logger
	 */
	public function __construct(ITempManager $tempManager, IConfig $config, GearmanWorkerService $workerService, OcrStatusMapper $mapper, View $view, $userId, IL10N $l10n, ILogger $logger) {
		$this->logger = $logger;
		$this->tempM = $tempManager;
		$this->config = $config;
		$this->workerService = $workerService;
		$this->statusMapper = $mapper;
		$this->view = $view;
		$this->userId = $userId;
		$this->l10n = $l10n;
	}
109
110
	/**
	 * Gets the list of all available tesseract-ocr languages.
	 *
	 * Runs `tesseract --list-langs` and keeps every output line that is exactly
	 * three characters long after trimming.
	 *
	 * @return array Languages
	 * @throws NotFoundException if the command fails or prints nothing
	 * @throws Exception any other exception is logged and rethrown
	 */
	public function listLanguages() {
		try {
			$success = -1;
			$result = [];
			$this->logger->debug('Fetching languages. ', ['app' => 'ocr']);
			// exec() always fills $result with an array of output lines,
			// so no string fallback is needed.
			exec('tesseract --list-langs 2>&1', $result, $success);
			if ($success !== 0 || count($result) === 0) {
				throw new NotFoundException($this->l10n->t('No languages found.'));
			}
			$languages = [];
			foreach ($result as $line) {
				$name = trim($line);
				// NOTE(review): only 3-character names are accepted, so longer
				// traineddata names are dropped and any 3-character non-language
				// entry passes - confirm this is intended.
				if (strlen($name) === 3) {
					$languages[] = $name;
				}
			}
			$this->logger->debug('Fetched languages: ' . json_encode($languages), ['app' => 'ocr']);
			return $languages;
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}
142
143
	/**
	 * Processes and prepares the files for ocr.
	 * Sends the stuff to the gearman client in order to ocr async.
	 *
	 * For every file a new target name and a temp file are determined, a
	 * PENDING OcrStatus is created and a gearman job is sent.
	 *
	 * @param string $language one of the values returned by listLanguages()
	 * @param array $files file descriptions as accepted by buildFileInfo()
	 * @return string 'PROCESSING' once all jobs were sent
	 * @throws NotFoundException if parameters are empty/invalid or the language is unknown
	 * @throws Exception any other exception is logged and rethrown
	 */
	public function process($language, $files) {
		try {
			$this->logger->debug('Will now process files: ' . json_encode($files) . ' with language: ' . json_encode($language), ['app' => 'ocr']);
			// Check if files and language are not empty and the language is known.
			// The comparison is strict so that non-string values (e.g. true)
			// cannot match an arbitrary language entry.
			if (empty($files) || !is_array($files) || empty($language) || !in_array($language, $this->listLanguages(), true)) {
				throw new NotFoundException($this->l10n->t('Empty passed parameters.'));
			}
			// get the array with full fileinfo
			$fileInfo = $this->buildFileInfo($files);
			foreach ($fileInfo as $fInfo) {
				// Check if filelock existing
				// TODO: FileLock maybe \OC\Files\View::lockFile()
				// get new name for saving purpose
				$newName = $this->buildNewName($fInfo);

				// create a temp file for ocr processing purposes
				$tempFile = $this->tempM->getTemporaryFile();

				// set the gearman running type
				$ftype = ($fInfo->getMimetype() === self::MIMETYPE_PDF) ? 'mypdf' : 'tess';

				// Create status object
				$status = new OcrStatus('PENDING', $fInfo->getId(), $newName, $tempFile, $ftype, $this->userId);

				// Init Gearman client and send task / job
				// Feed the gearman worker
				$this->sendGearmanJob($ftype, $this->config->getSystemValue('datadirectory'), $fInfo->getPath(), $tempFile, $language, $status, \OC::$SERVERROOT);
			}
			return 'PROCESSING';
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}
189
190
	/**
	 * Collects the data for all required status checks and tasks.
	 * It handles already processed and failed ocr tasks and counts the pending ones.
	 *
	 * @return array with the integer counts under the keys 'processed', 'failed' and 'pending'
	 * @throws Exception every caught exception is logged and rethrown
	 */
	public function status() {
		try {
			// TODO: release lock
			$processed = $this->handleProcessed();

			$failed = count($this->handleFailed());

			$pending = count($this->statusMapper->findAllPending($this->userId));

			return ['processed' => $processed, 'failed' => $failed, 'pending' => $pending];
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}
210
211
	/**
	 * The command ocr:complete for occ will call this function in order to set the status.
	 * the gearman worker should call it automatically after each processing step.
	 *
	 * Loads the status entity, sets it to 'FAILED' or 'PROCESSED' and stores it.
	 *
	 * @param $statusId id of the OcrStatus entity to update
	 * @param boolean $failed truthy marks the job as FAILED, falsy as PROCESSED
	 * @throws Exception every caught exception is logged and rethrown
	 */
	public function complete($statusId, $failed) {
		try {
			$status = $this->statusMapper->find($statusId);
			$status->setStatus($failed ? 'FAILED' : 'PROCESSED');
			$this->statusMapper->update($status);
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}
232
233
	/**
	 * Finishes all processed files by copying them to the right path and deleting the temp files.
	 * Returns the number of processed files.
	 *
	 * @return int
	 * @throws NotFoundException if the result file of a processed status is missing or unreadable
	 * @throws Exception any other exception is logged and rethrown
	 */
	private function handleProcessed() {
		try {
			$this->logger->debug('Find processed ocr files and put them to the right dirs.', ['app' => 'ocr']);
			$processed = $this->statusMapper->findAllProcessed($this->userId);
			foreach ($processed as $status) {
				$type = $status->getType();
				if ($type === 'tess') {
					// need .txt because tesseract saves it like this
					$resultFile = $status->getTempFile() . '.txt';
				} elseif ($type === 'mypdf') {
					// don't need to extend with .pdf / it uses the tmp file to save
					$resultFile = $status->getTempFile();
				} else {
					$resultFile = null;
				}
				if ($resultFile === null || !file_exists($resultFile)) {
					throw new NotFoundException($this->l10n->t('Temp file does not exist.'));
				}
				$content = file_get_contents($resultFile);
				if ($content === false) {
					// an unreadable result must not be stored as an empty file
					throw new NotFoundException($this->l10n->t('Temp file does not exist.'));
				}
				// Save the tmp file with newname
				$this->view->file_put_contents($status->getNewName(), $content);
				// Cleaning temp files
				$this->statusMapper->delete($status);
				// unlink() instead of shelling out to `rm` with an unescaped path
				unlink($resultFile);
			}
			return count($processed);
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}
264
265
	/**
	 * Handles all failed orders of ocr processing queue and returns the status objects.
	 *
	 * For each failed status the temp file is removed (if present) and the
	 * status entity is deleted from the database.
	 *
	 * @return array the failed status objects that were cleaned up
	 * @throws Exception every caught exception is logged and rethrown
	 */
	private function handleFailed() {
		try {
			$failed = $this->statusMapper->findAllFailed($this->userId);
			foreach ($failed as $status) {
				// clean the tempfile; unlink() instead of shelling out to `rm`
				// with an unescaped path
				$tempFile = $status->getTempFile();
				if (is_file($tempFile)) {
					unlink($tempFile);
				}
				// clean from db
				$this->statusMapper->delete($status);
			}
			$this->logger->debug('Following status objects failed: ' . json_encode($failed), ['app' => 'ocr']);
			return $failed;
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}
285
286
287
	/**
	 * Returns a not existing file name for pdf or image processing
	 *
	 * The name is the original file name without its extension plus '_OCR.pdf'
	 * for pdf files or '_OCR.txt' for everything else, placed in the directory
	 * of the original file (relative to the '/<userId>/files' root).
	 *
	 * @param FileInfo $fileInfo
	 * @return string
	 */
	private function buildNewName(FileInfo $fileInfo) {
		$name = $fileInfo->getName();

		// get rid of the extension, whatever its length (.png, .jpeg, .tiff, ...);
		// a name without a dot or with only a leading dot is kept unchanged
		$dotPos = strrpos($name, '.');
		$fileName = ($dotPos === false || $dotPos === 0) ? $name : substr($name, 0, $dotPos);

		// eliminate the file name from the end of the path only
		$filePath = $fileInfo->getPath();
		$nameLength = strlen($name);
		if ($nameLength > 0 && substr($filePath, -$nameLength) === $name) {
			$filePath = (string) substr($filePath, 0, -$nameLength);
		}

		// and get the path on top of the user/files/ dir (leading prefix only)
		$userRoot = '/' . $this->userId . '/files';
		if (strpos($filePath, $userRoot) === 0) {
			$filePath = (string) substr($filePath, strlen($userRoot));
		}

		// PDFs get a pdf result, images a text result
		$suffix = ($fileInfo->getMimetype() === self::MIMETYPE_PDF) ? '_OCR.pdf' : '_OCR.txt';
		return Files::buildNotExistingFileName($filePath, $fileName . $suffix);
	}
308
309
	/**
	 * Returns the fileInfo for each file in files and checks
	 * if it has a allowed mimetype and some other conditions.
	 *
	 * Each entry must have a non-empty 'path' or 'directory', a 'name' and
	 * the 'type' value 'file'.
	 *
	 * @param array $files
	 * @return array of Files\FileInfo
	 * @throws NotFoundException if an entry is incomplete, cannot be resolved or has a wrong mimetype
	 * @throws Exception any other exception is logged and rethrown
	 */
	private function buildFileInfo(array $files) {
		try {
			$fileArray = [];
			foreach ($files as $file) {
				// Check if anything is missing and file type is correct
				$hasLocation = !empty($file['path']) || !empty($file['directory']);
				$isFile = isset($file['type']) && $file['type'] === 'file';
				$hasName = isset($file['name']) && $file['name'] !== '';
				if (!$hasLocation || !$isFile || !$hasName) {
					throw new NotFoundException($this->l10n->t('Wrong path parameter.'));
				}
				// get correct path
				$path = $this->getCorrectPath($file);
				$fileInfo = $this->view->getFileInfo($path);
				// a failed lookup must not reach the type-hinted helper below
				if (!($fileInfo instanceof FileInfo)) {
					throw new NotFoundException($this->l10n->t('Wrong parameters or wrong mimetype.'));
				}
				$this->checkMimeType($fileInfo);
				$fileArray[] = $fileInfo;
			}
			return $fileArray;
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}
337
338
	/**
	 * Checks a Mimetype for a specific given FileInfo.
	 *
	 * The parameter is deliberately not type-hinted so that a failed file
	 * lookup (anything that is not a FileInfo) ends in the NotFoundException
	 * below instead of a type error.
	 *
	 * @param Files\FileInfo $fileInfo
	 * @throws NotFoundException if no FileInfo was given or its mimetype is not in ALLOWED_MIMETYPES
	 * @throws Exception any other exception is logged and rethrown
	 */
	private function checkMimeType($fileInfo) {
		try {
			if (!($fileInfo instanceof FileInfo) || !in_array($fileInfo->getMimetype(), self::ALLOWED_MIMETYPES, true)) {
				$this->logger->debug('Getting FileInfo did not work or not included in the ALLOWED_MIMETYPES array.', ['app' => 'ocr']);
				throw new NotFoundException($this->l10n->t('Wrong parameters or wrong mimetype.'));
			}
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}
352
353
	/**
	 * Returns the correct path based on delivered file variable
	 *
	 * Uses 'path' if it is not empty and falls back to 'directory' otherwise
	 * (new updated files have the property directory instead of path).
	 * Trailing slashes of the directory are dropped, so '/' results in
	 * '/<name>' and '/docs/' in '/docs/<name>'.
	 *
	 * @param $file array with 'name' and 'path' and/or 'directory'
	 * @return string
	 */
	private function getCorrectPath($file) {
		if (!empty($file['path'])) {
			$directory = $file['path'];
		} elseif (isset($file['directory'])) {
			$directory = $file['directory'];
		} else {
			$directory = '';
		}
		return rtrim($directory, '/') . '/' . $file['name'];
	}
370
371
	/**
	 * Inits the Gearman client and sends the task to the background worker (async)
	 *
	 * The status is inserted first because its id is part of the job payload.
	 * If the job cannot be submitted the status is removed again, so that no
	 * PENDING entry without a job is left behind.
	 *
	 * @param string $type
	 * @param $datadirectory
	 * @param $path
	 * @param $tempFile
	 * @param string $language
	 * @param OcrStatus $status
	 * @param string $occDir
	 * @throws NotFoundException if no worker exists or the job could not be submitted
	 * @throws Exception any other exception is logged and rethrown
	 */
	private function sendGearmanJob($type, $datadirectory, $path, $tempFile, $language, $status, $occDir) {
		try {
			if ($this->workerService->workerExists() === false) {
				throw new NotFoundException($this->l10n->t('No gearman worker exists.'));
			}
			$this->statusMapper->insert($status);
			// Gearman thing
			$client = new \GearmanClient();
			$client->addServer('127.0.0.1', 4730);
			$payload = json_encode([
				'type' => $type,
				'datadirectory' => $datadirectory,
				'path' => $path,
				'tempfile' => $tempFile,
				'language' => $language,
				'statusid' => $status->getId(),
				'occdir' => $occDir
			]);
			$result = $client->doBackground("ocr", $payload);
			$this->logger->debug('Gearman Client output: ' . json_encode($result), ['app' => 'ocr']);
			if ($client->returnCode() !== \GEARMAN_SUCCESS) {
				// the job never reached the server: do not keep a PENDING status
				$this->statusMapper->delete($status);
				throw new NotFoundException($this->l10n->t('Could not send the job to gearman.'));
			}
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}
404
405
	/**
	 * Handle the possible thrown Exceptions from all methods of this class.
	 *
	 * Logs the exception and rethrows the very same object, so callers still
	 * receive a NotFoundException (or whatever was thrown) together with its
	 * original message and stack trace. This method never returns normally.
	 *
	 * @param Exception $e
	 * @throws Exception
	 * @throws NotFoundException
	 */
	private function handleException($e) {
		$this->logger->logException($e, ['app' => 'ocr', 'message' => 'Exception during ocr service function processing']);
		throw $e;
	}
420
}