Completed
Pull Request — master (#17)
by Janis
03:06
created

OcrService::process()   C

Complexity

Conditions 7
Paths 16

Size

Total Lines 38
Code Lines 19

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 6
CRAP Score 22.6944

Importance

Changes 0
Metric Value
dl 0
loc 38
ccs 6
cts 19
cp 0.3158
rs 6.7272
c 0
b 0
f 0
cc 7
eloc 19
nc 16
nop 2
crap 22.6944
1
<?php
2
/**
3
 * nextCloud - ocr
4
 *
5
 * This file is licensed under the Affero General Public License version 3 or
6
 * later. See the COPYING file.
7
 *
8
 * @author Janis Koehr <[email protected]>
9
 * @copyright Janis Koehr 2016
10
 */
11
12
namespace OCA\Ocr\Service;
13
14
use Exception;
15
use OC\Files\View;
16
use OCA\Ocr\Db\OcrStatus;
17
use OCA\Ocr\Db\OcrStatusMapper;
18
use OCP\Files;
19
use OCP\Files\FileInfo;
20
use OCP\IConfig;
21
use OCP\IL10N;
22
use OCP\ILogger;
23
use OCP\ITempManager;
24
25
26
/**
27
 * Class OcrService
28
 * @package OCA\Ocr\Service
29
 */
30
class OcrService {
31
32
	/**
	 * Logger used for debug output and for reporting exceptions.
	 *
	 * @var ILogger
	 */
	private $logger;

	/**
	 * Temp manager that hands out the temporary files the ocr results are written to.
	 *
	 * @var ITempManager
	 */
	private $tempM;

	/**
	 * System configuration, read for the 'datadirectory' value.
	 *
	 * @var IConfig
	 */
	private $config;

	/**
	 * Queue service that receives one job per file to process.
	 *
	 * @var QueueService
	 */
	private $queueService;

	/**
	 * Mapper used to look up, update and delete the ocr status entries.
	 *
	 * @var OcrStatusMapper
	 */
	private $statusMapper;

	/**
	 * View used to read file information and to store the ocr results.
	 *
	 * @var View
	 */
	private $view;

	/**
	 * Id of the user the status entries are created and queried for.
	 * NOTE(review): the type is not declared anywhere in this class - presumably a string, confirm.
	 *
	 * @var mixed
	 */
	private $userId;

	/**
	 * Translation service for the messages of the thrown exceptions.
	 *
	 * @var IL10N
	 */
	private $l10n;

	/**
	 * All mimetypes that are accepted for ocr processing.
	 */
	const ALLOWED_MIMETYPES = [
		'application/pdf',
		'image/png',
		'image/jpeg',
		'image/tiff',
	];

	/**
	 * The mimetype that identifies a pdf file.
	 */
	const MIMETYPE_PDF = 'application/pdf';

	/**
	 * The image mimetypes that are accepted (the ones tesseract is fed with).
	 */
	const MIMETYPES_IMAGE = [
		'image/png',
		'image/jpeg',
		'image/tiff',
	];
86
87
	/**
88
	 * OcrService constructor.
89
	 *
90
	 * @param ITempManager $tempManager
91
	 * @param IConfig $config
92
	 * @param QueueService $queueService
93
	 * @param OcrStatusMapper $mapper
94
	 * @param View $view
95
	 * @param $userId
96
	 * @param IL10N $l10n
97
	 * @param ILogger $logger
98
	 */
99 10
	public function __construct(ITempManager $tempManager, IConfig $config, QueueService $queueService, OcrStatusMapper $mapper, View $view, $userId, IL10N $l10n, ILogger $logger) {
100 10
		$this->logger = $logger;
101 10
		$this->tempM = $tempManager;
102 10
		$this->config = $config;
103 10
		$this->queueService = $queueService;
104 10
		$this->statusMapper = $mapper;
105 10
		$this->view = $view;
106 10
		$this->userId = $userId;
107 10
		$this->l10n = $l10n;
108 10
	}
109
110
	/**
	 * Gets the list of the available tesseract-ocr languages.
	 *
	 * Runs "tesseract --list-langs" and keeps every output line that is exactly
	 * three characters long after trimming (presumably to skip the header line
	 * of the command output).
	 * NOTE(review): language codes longer than three characters (for example
	 * "chi_sim") are dropped by this filter as well - confirm that this is intended.
	 *
	 * @return array Languages
	 * @throws NotFoundException if the command fails or prints nothing
	 * @throws Exception any other exception, logged and re-thrown by handleException()
	 */
	public function listLanguages() {
		try {
			// exec() appends to the output array, so start from an explicit empty one.
			$output = [];
			$exitCode = -1;
			$this->logger->debug('Fetching languages. ', ['app' => 'ocr']);
			exec('tesseract --list-langs 2>&1', $output, $exitCode);

			if ($exitCode !== 0 || count($output) === 0) {
				throw new NotFoundException($this->l10n->t('No languages found.'));
			}

			// exec() always delivers an array of lines, no string splitting is needed.
			$languages = [];
			foreach ($output as $line) {
				$code = trim($line);
				if (strlen($code) === 3) {
					$languages[] = $code;
				}
			}
			$this->logger->debug('Fetched languages: ' . json_encode($languages), ['app' => 'ocr']);
			return $languages;
		} catch (Exception $e) {
			// handleException() always throws, so nothing is returned on this path.
			$this->handleException($e);
		}
	}
142
143
	/**
	 * Validates the request and queues one ocr job per given file.
	 * The jobs are handed to the queue service, so the ocr itself runs asynchronously.
	 *
	 * @param string $language has to be one of the codes returned by listLanguages()
	 * @param array $files file descriptions, resolved by buildFileInfo()
	 * @return string 'PROCESSING' once every file has been queued
	 * @throws NotFoundException if a parameter is empty or the language is not available
	 * @throws Exception any other exception, logged and re-thrown by handleException()
	 */
	public function process($language, $files) {
		try {
			$this->logger->debug('Will now process files: ' . json_encode($files) . ' with language: ' . json_encode($language), ['app' => 'ocr']);

			// Check if files and language are not empty and the language is known.
			// The comparison is strict: a loose in_array() would accept e.g. boolean
			// true for any non-empty language list.
			if (empty($files) || empty($language) || !in_array($language, $this->listLanguages(), true)) {
				throw new NotFoundException($this->l10n->t('Empty passed parameters.'));
			}

			// get the array with full fileinfo (validates every file before anything is queued)
			$fileInfo = $this->buildFileInfo($files);
			foreach ($fileInfo as $fInfo) {
				// Check if filelock existing
				// TODO: FileLock maybe \OC\Files\View::lockFile()

				// get new name for saving purpose
				$newName = $this->buildNewName($fInfo);

				// create a temp file for ocr processing purposes
				$tempFile = $this->tempM->getTemporaryFile();

				// set the running type
				$ftype = ($fInfo->getMimetype() === $this::MIMETYPE_PDF) ? 'mypdf' : 'tess';

				// Create status object
				$status = new OcrStatus('PENDING', $fInfo->getId(), $newName, $tempFile, $ftype, $this->userId);

				// Init client and send task / job
				// Feed the worker
				$this->queueService->clientSend($status, $this->config->getSystemValue('datadirectory'), $fInfo->getPath(), $language, \OC::$SERVERROOT);
			}
			return 'PROCESSING';
		} catch (Exception $e) {
			// handleException() always throws, so nothing is returned on this path.
			$this->handleException($e);
		}
	}
189
190
	/**
	 * Collects the numbers for the status check of the current user.
	 * Processed files are finished and failed jobs are cleaned up on the way;
	 * pending jobs are only counted.
	 *
	 * @return array with the integer counters 'processed', 'failed' and 'pending'
	 * @throws Exception logged and re-thrown by handleException()
	 */
	public function status() {
		try {
			// TODO: release lock
			// The elements are evaluated top down: processed, failed, pending.
			return [
				'processed' => $this->handleProcessed(),
				'failed' => count($this->handleFailed()),
				'pending' => count($this->statusMapper->findAllPending($this->userId)),
			];
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}
210
211
	/**
	 * Marks a status entry as finished, either successfully or as failed.
	 * Called by the occ command ocr:complete; the worker should trigger it
	 * automatically after each processing step.
	 *
	 * @param $statusId id of the status entry to update
	 * @param boolean $failed true stores 'FAILED', false stores 'PROCESSED'
	 * @throws Exception logged and re-thrown by handleException()
	 */
	public function complete($statusId, $failed) {
		try {
			$entry = $this->statusMapper->find($statusId);
			$entry->setStatus($failed ? 'FAILED' : 'PROCESSED');
			$this->statusMapper->update($entry);
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}
232
233
	/**
	 * Finishes all processed files by copying them to the right path and deleting the temp files.
	 * Returns the number of processed files.
	 *
	 * @return int
	 * @throws NotFoundException if the result file of a processed entry is missing
	 *                           or the entry has an unknown type
	 * @throws Exception any other exception, logged and re-thrown by handleException()
	 */
	private function handleProcessed() {
		try {
			$this->logger->debug('Find processed ocr files and put them to the right dirs.', ['app' => 'ocr']);
			$processed = $this->statusMapper->findAllProcessed($this->userId);
			foreach ($processed as $status) {
				$type = $status->getType();
				if ($type === 'tess') {
					// need .txt because tesseract saves it like this
					$resultFile = $status->getTempFile() . '.txt';
				} elseif ($type === 'mypdf') {
					// don't need to extend with .pdf / it uses the tmp file to save
					$resultFile = $status->getTempFile();
				} else {
					$resultFile = null;
				}

				if ($resultFile === null || !file_exists($resultFile)) {
					throw new NotFoundException($this->l10n->t('Temp file does not exist.'));
				}

				// Save the tmp file with newname
				$this->view->file_put_contents($status->getNewName(), file_get_contents($resultFile));

				// Cleaning db entry and temp file. unlink() replaces the former
				// exec('rm ...'), which passed the unescaped path through a shell.
				$this->statusMapper->delete($status);
				unlink($resultFile);
			}
			return count($processed);
		} catch (Exception $e) {
			// handleException() always throws, so nothing is returned on this path.
			$this->handleException($e);
		}
	}
264
265
	/**
	 * Handles all failed orders of the ocr processing queue and returns the status objects.
	 * The temp file and the database entry of each failed order are removed.
	 *
	 * @return array the failed status objects of the current user
	 * @throws Exception logged and re-thrown by handleException()
	 */
	private function handleFailed() {
		try {
			$failed = $this->statusMapper->findAllFailed($this->userId);
			foreach ($failed as $status) {
				// clean the tempfile; unlink() replaces the former exec('rm ...'),
				// which passed the unescaped path through a shell. The file may be
				// gone already, so only remove what is actually there.
				$tempFile = $status->getTempFile();
				if (is_file($tempFile)) {
					unlink($tempFile);
				}
				// clean from db
				$this->statusMapper->delete($status);
			}
			$this->logger->debug('Following status objects failed: ' . json_encode($failed), ['app' => 'ocr']);
			return $failed;
		} catch (Exception $e) {
			// handleException() always throws, so nothing is returned on this path.
			$this->handleException($e);
		}
	}
285
286
287
	/**
	 * Returns a not existing file name for pdf or image processing
	 * protected as of testing issues with static methods. (Actually
	 * it will be mocked partially) FIXME: Change this behaviour as soon as the buildNotExistingFileName function is not static anymore
	 *
	 * The result name is "<name without extension>_OCR.pdf" for pdf files and
	 * "<name without extension>_OCR.txt" for everything else, placed next to the
	 * source file.
	 *
	 * @param FileInfo $fileInfo
	 * @return string
	 */
	protected function buildNewName(FileInfo $fileInfo) {
		$name = $fileInfo->getName();
		$path = $fileInfo->getPath();

		// get rid of the extension, whatever its length is (.pdf, .jpeg, .tiff and so on);
		// a fixed cut of four characters breaks names such as "scan.jpeg"
		$dotPosition = strrpos($name, '.');
		$fileName = ($dotPosition === false) ? $name : (string) substr($name, 0, $dotPosition);

		// eliminate the file name from the path: only its last occurrence, so that a
		// directory which contains the file name in its own name stays intact
		$filePath = $path;
		if ($name !== '') {
			$namePosition = strrpos($path, $name);
			if ($namePosition !== false) {
				$filePath = substr_replace($path, '', $namePosition, strlen($name));
			}
		}

		// and get the path on top of the user/files/ dir: only the first occurrence is
		// removed, deeper folders that repeat the pattern are kept
		$userRoot = '/' . $this->userId . '/files';
		$rootPosition = strpos($filePath, $userRoot);
		if ($rootPosition !== false) {
			$filePath = substr_replace($filePath, '', $rootPosition, strlen($userRoot));
		}

		// PDFs are stored as pdf again, IMAGES end up as text files
		$suffix = ($fileInfo->getMimetype() === $this::MIMETYPE_PDF) ? '_OCR.pdf' : '_OCR.txt';
		return Files::buildNotExistingFileName($filePath, $fileName . $suffix);
	}
310
311
	/**
	 * Returns the fileInfo for each file in files and checks
	 * if it has an allowed mimetype and some other conditions.
	 *
	 * Every entry needs a non-empty 'path' or 'directory' and the 'type' 'file'.
	 *
	 * @param array $files
	 * @return array of Files\FileInfo
	 * @throws NotFoundException if an entry is incomplete, cannot be resolved to
	 *                           a file or has a mimetype that is not allowed
	 * @throws Exception any other exception, logged and re-thrown by handleException()
	 */
	private function buildFileInfo(array $files) {
		try {
			$fileArray = [];
			foreach ($files as $file) {
				// Check if anything is missing and file type is correct
				$hasLocation = !empty($file['path']) || !empty($file['directory']);
				$isFile = isset($file['type']) && $file['type'] === 'file';
				if (!$hasLocation || !$isFile) {
					throw new NotFoundException($this->l10n->t('Wrong path parameter.'));
				}

				// get correct path
				$path = $this->getCorrectPath($file);
				$fileInfo = $this->view->getFileInfo($path);

				// The view does not deliver a FileInfo for a path it cannot resolve.
				// Report that as not found here instead of passing the value on to
				// the type hinted mimetype check.
				if (!($fileInfo instanceof FileInfo)) {
					throw new NotFoundException($this->l10n->t('Wrong parameters or wrong mimetype.'));
				}

				$this->checkMimeType($fileInfo);
				$fileArray[] = $fileInfo;
			}
			return $fileArray;
		} catch (Exception $e) {
			// handleException() always throws, so nothing is returned on this path.
			$this->handleException($e);
		}
	}
339
340
	/**
	 * Checks a Mimetype for a specific given FileInfo.
	 *
	 * The parameter is intentionally not type hinted: with a FileInfo hint a
	 * failed lookup (no FileInfo object) would stop at the hint and never reach
	 * the guard below.
	 *
	 * @param Files\FileInfo|false|null $fileInfo
	 * @throws NotFoundException if no FileInfo was given or its mimetype is not
	 *                           part of ALLOWED_MIMETYPES
	 */
	private function checkMimeType($fileInfo) {
		try {
			$isAllowed = ($fileInfo instanceof FileInfo)
				&& in_array($fileInfo->getMimetype(), $this::ALLOWED_MIMETYPES, true);
			if (!$isAllowed) {
				$this->logger->debug('Getting FileInfo did not work or not included in the ALLOWED_MIMETYPES array.', ['app' => 'ocr']);
				throw new NotFoundException($this->l10n->t('Wrong parameters or wrong mimetype.'));
			}
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}
354
355
	/**
	 * Builds the full path of a file out of the delivered file variable.
	 *
	 * The 'path' entry is used when it is not empty, otherwise 'directory'
	 * (new updated files have the property directory instead of path). The
	 * 'name' entry is appended, with a slash in between unless the directory
	 * is the root '/'.
	 *
	 * @param $file array with 'name' and 'path' or 'directory'
	 * @return string
	 */
	private function getCorrectPath($file) {
		$directory = empty($file['path']) ? $file['directory'] : $file['path'];
		$separator = ($directory === '/') ? '' : '/';
		return $directory . $separator . $file['name'];
	}
372
373
	/**
	 * Central exception handling for all methods of this class.
	 *
	 * Logs the exception and throws afterwards in every case: a
	 * NotFoundException is replaced by a new NotFoundException with the same
	 * message, every other exception is re-thrown as it is.
	 *
	 * @param Exception $e
	 * @throws Exception
	 * @throws NotFoundException
	 */
	private function handleException($e) {
		$this->logger->logException($e, ['app' => 'ocr', 'message' => 'Exception during ocr service function processing']);
		if (!($e instanceof NotFoundException)) {
			throw $e;
		}
		throw new NotFoundException($e->getMessage());
	}
388
}