Completed
Pull Request — master (#11)
by Janis
03:04
created

OcrService::checkMimeType()   A

Complexity

Conditions 4
Paths 4

Size

Total Lines 10
Code Lines 7

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 20

Importance

Changes 0
Metric Value
dl 0
loc 10
ccs 0
cts 7
cp 0
rs 9.2
c 0
b 0
f 0
cc 4
eloc 7
nc 4
nop 1
crap 20
1
<?php
2
/**
3
 * nextCloud - ocr
4
 *
5
 * This file is licensed under the Affero General Public License version 3 or
6
 * later. See the COPYING file.
7
 *
8
 * @author Janis Koehr <[email protected]>
9
 * @copyright Janis Koehr 2016
10
 */
11
12
namespace OCA\Ocr\Service;
13
14
use Exception;
15
use OC\Files\View;
16
use OCA\Ocr\Db\OcrStatus;
17
use OCA\Ocr\Db\OcrStatusMapper;
18
use OCP\Files;
19
use OCP\Files\FileInfo;
20
use OCP\IConfig;
21
use OCP\IL10N;
22
use OCP\ILogger;
23
use OCP\ITempManager;
24
25
26
/**
27
 * Class OcrService
28
 * @package OCA\Ocr\Service
29
 */
30
class OcrService {
31
32
	/**
33
	 * @var ILogger
34
	 */
35
	private $logger;
36
37
	/**
38
	 * @var ITempManager
39
	 */
40
	private $tempM;
41
42
	/**
43
	 * @var IConfig
44
	 */
45
	private $config;
46
47
	/**
48
	 * @var GearmanWorkerService
49
	 */
50
	private $workerService;
51
52
	/**
53
	 * @var OcrStatusMapper
54
	 */
55
	private $statusMapper;
56
57
	/**
58
	 * @var View
59
	 */
60
	private $view;
61
62
	/**
63
	 * @var string user id (concatenated into the '/<userId>/files' path prefix)
64
	 */
65
	private $userId;
66
67
	/**
68
	 * @var IL10N
69
	 */
70
	private $l10n;
71
72
	/**
73
	 * Array of allowed mimetypes for ocr processing
74
	 */
75
	const ALLOWED_MIMETYPES = ['application/pdf', 'image/png', 'image/jpeg', 'image/tiff'];
76
77
	/**
78
	 * the correct mimetype for a pdf file
79
	 */
80
	const MIMETYPE_PDF = 'application/pdf';
81
82
	/**
83
	 * the only allowed image mimetypes by tesseract
84
	 */
85
	const MIMETYPES_IMAGE = ['image/png', 'image/jpeg', 'image/tiff'];
86
87
	/**
88
	 * OcrService constructor.
89
	 *
90
	 * @param ITempManager $tempManager
91
	 * @param IConfig $config
92
	 * @param GearmanWorkerService $workerService
93
	 * @param OcrStatusMapper $mapper
94
	 * @param View $view
95
	 * @param $userId
96
	 * @param IL10N $l10n
97
	 * @param ILogger $logger
98
	 */
99 2
	public function __construct(ITempManager $tempManager, IConfig $config, GearmanWorkerService $workerService, OcrStatusMapper $mapper, View $view, $userId, IL10N $l10n, ILogger $logger) {
100 2
		$this->logger = $logger;
101 2
		$this->tempM = $tempManager;
102 2
		$this->config = $config;
103 2
		$this->workerService = $workerService;
104 2
		$this->statusMapper = $mapper;
105 2
		$this->view = $view;
106 2
		$this->userId = $userId;
107 2
		$this->l10n = $l10n;
108 2
	}
109
110
	/**
	 * Gets the list of all available tesseract-ocr languages.
	 *
	 * Runs "tesseract --list-langs" and keeps every output line that is exactly
	 * three characters long after trimming. Lines of any other length are dropped.
	 *
	 * @return array Languages
	 * @throws NotFoundException if the command exits non-zero or prints nothing
	 * @throws Exception any other exception is logged and rethrown by handleException()
	 */
	public function listLanguages() {
		try {
			$output = [];
			$exitCode = -1;
			$this->logger->debug('Fetching languages. ', ['app' => 'ocr']);
			// exec() always fills $output with an array of lines, so no string handling is needed
			exec('tesseract --list-langs 2>&1', $output, $exitCode);
			if ($exitCode !== 0 || count($output) === 0) {
				throw new NotFoundException($this->l10n->t('No languages found.'));
			}
			$languages = [];
			foreach ($output as $line) {
				$name = trim($line);
				if (strlen($name) === 3) {
					$languages[] = $name;
				}
			}
			$this->logger->debug('Fetched languages: ' . json_encode($languages), ['app' => 'ocr']);
			return $languages;
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}
142
143
	/**
	 * Processes and prepares the files for ocr.
	 * Sends the stuff to the gearman client in order to ocr async.
	 *
	 * For every file a new target name and a temp file are created, a PENDING
	 * OcrStatus is built and the job is handed over to sendGearmanJob().
	 *
	 * @param string $language has to be one of the values of listLanguages()
	 * @param array $files
	 * @return string 'PROCESSING' once all jobs were handed over
	 * @throws NotFoundException if $files or $language is empty or the language is unknown
	 * @throws Exception any other exception is logged and rethrown by handleException()
	 */
	public function process($language, $files) {
		try {
			$this->logger->debug('Will now process files: ' . json_encode($files) . ' with language: ' . json_encode($language), ['app' => 'ocr']);
			// Check if files and language not empty.
			// The strict comparison makes sure that a non-string value (e.g. true)
			// cannot match a language by loose type juggling.
			if (empty($files) || empty($language) || !in_array($language, $this->listLanguages(), true)) {
				throw new NotFoundException($this->l10n->t('Empty passed parameters.'));
			}

			// get the array with full fileinfo
			$fileInfo = $this->buildFileInfo($files);
			foreach ($fileInfo as $fInfo) {
				// Check if filelock existing
				// TODO: FileLock maybe \OC\Files\View::lockFile()
				// get new name for saving purpose
				$newName = $this->buildNewName($fInfo);

				// create a temp file for ocr processing purposes
				$tempFile = $this->tempM->getTemporaryFile();

				// set the gearman running type
				if ($fInfo->getMimetype() === $this::MIMETYPE_PDF) {
					$ftype = 'mypdf';
				} else {
					$ftype = 'tess';
				}

				// Create status object
				$status = new OcrStatus('PENDING', $fInfo->getId(), $newName, $tempFile, $ftype, $this->userId);

				// Init Gearman client and send task / job
				// Feed the gearman worker
				$this->sendGearmanJob($ftype, $this->config->getSystemValue('datadirectory'), $fInfo->getPath(), $tempFile, $language, $status, \OC::$SERVERROOT);
			}
			return 'PROCESSING';
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}
189
190
	/**
	 * Collects the data for all required status checks and tasks.
	 * Finishes the already processed files, cleans up the failed ones and
	 * counts the still pending ones of the current user.
	 *
	 * @return array with the keys 'processed', 'failed' and 'pending'
	 * @throws Exception every exception is logged and rethrown by handleException()
	 */
	public function status() {
		try {
			// TODO: release lock
			$processedCount = $this->handleProcessed();
			$failedCount = count($this->handleFailed());
			$pendingCount = count($this->statusMapper->findAllPending($this->userId));

			return [
				'processed' => $processedCount,
				'failed' => $failedCount,
				'pending' => $pendingCount,
			];
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}
210
211
	/**
	 * The command ocr:complete for occ will call this function in order to set the status.
	 * the gearman worker should call it automatically after each processing step.
	 *
	 * Sets the status to 'FAILED' or 'PROCESSED' and stores it.
	 *
	 * @param $statusId id of the status entity to load via the status mapper
	 * @param boolean $failed true marks the status as 'FAILED', false as 'PROCESSED'
	 * @throws NotFoundException
	 * @throws Exception every exception is logged and rethrown by handleException()
	 */
	public function complete($statusId, $failed) {
		// stays null when find() itself throws, so the catch block can tell
		// whether there is a status entity that could be marked as failed
		$status = null;
		try {
			$status = $this->statusMapper->find($statusId);
			$status->setStatus($failed ? 'FAILED' : 'PROCESSED');
			$this->statusMapper->update($status);
		} catch (Exception $e) {
			if ($e instanceof NotFoundException && $status !== null) {
				$status->setStatus('FAILED');
				$this->statusMapper->update($status);
			}
			// always throws
			$this->handleException($e);
		}
	}
237
238
	/**
	 * Finishes all processed files by copying them to the right path and deleting the temp files.
	 * Returns the number of processed files.
	 *
	 * @return int
	 * @throws NotFoundException if the result file of a processed status does not exist
	 * @throws Exception any other exception is logged and rethrown by handleException()
	 */
	private function handleProcessed() {
		try {
			$this->logger->debug('Find processed ocr files and put them to the right dirs.', ['app' => 'ocr']);
			$processed = $this->statusMapper->findAllProcessed($this->userId);
			foreach ($processed as $status) {
				$type = $status->getType();
				if ($type === 'tess') {
					// need .txt because tesseract saves it like this
					$resultFile = $status->getTempFile() . '.txt';
				} elseif ($type === 'mypdf') {
					// don't need to extend with .pdf / it uses the tmp file to save
					$resultFile = $status->getTempFile();
				} else {
					$resultFile = null;
				}
				if ($resultFile === null || !file_exists($resultFile)) {
					throw new NotFoundException($this->l10n->t('Temp file does not exist.'));
				}

				// Save the tmp file with newname
				$this->view->file_put_contents($status->getNewName(), file_get_contents($resultFile));
				// Cleaning temp files: unlink() instead of a shell 'rm', so the
				// path is never interpreted by a shell
				$this->statusMapper->delete($status);
				unlink($resultFile);
			}
			return count($processed);
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}
269
270
	/**
	 * Handles all failed orders of ocr processing queue and returns the status objects.
	 * The temp file of each failed status is removed and the status is deleted from the db.
	 *
	 * @return array
	 * @throws Exception every exception is logged and rethrown by handleException()
	 */
	private function handleFailed() {
		try {
			$failed = $this->statusMapper->findAllFailed($this->userId);
			foreach ($failed as $status) {
				// clean the tempfile: unlink() instead of a shell 'rm', so the
				// path is never interpreted by a shell
				$tempFile = $status->getTempFile();
				if (is_file($tempFile)) {
					unlink($tempFile);
				}
				// clean from db
				$this->statusMapper->delete($status);
			}
			$this->logger->debug('Following status objects failed: ' . json_encode($failed), ['app' => 'ocr']);
			return $failed;
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}
290
291
292
	/**
	 * Returns a not existing file name for pdf or image processing.
	 * PDFs get the suffix '_OCR.pdf', every other file gets '_OCR.txt'.
	 *
	 * @param FileInfo $fileInfo
	 * @return string
	 */
	private function buildNewName(FileInfo $fileInfo) {
		$name = $fileInfo->getName();
		// get rid of the .png or .pdf and so on: cut at the last dot, because
		// extensions like .jpeg or .tiff are longer than three characters
		$dotPos = strrpos($name, '.');
		$fileName = ($dotPos === false) ? $name : substr($name, 0, $dotPos);
		// eliminate the file name from the path
		$filePath = str_replace($name, '', $fileInfo->getPath());
		// and get the path on top of the user/files/ dir
		$filePath = str_replace('/' . $this->userId . '/files', '', $filePath);
		if ($fileInfo->getMimetype() === $this::MIMETYPE_PDF) {
			// PDFs:
			return Files::buildNotExistingFileName($filePath, $fileName . '_OCR.pdf');
		} else {
			// IMAGES:
			return Files::buildNotExistingFileName($filePath, $fileName . '_OCR.txt');
		}
	}
313
314
	/**
	 * Resolves the file info of every entry in $files.
	 * An entry needs a 'path' or a 'directory' and the type 'file';
	 * its mimetype is verified by checkMimeType().
	 *
	 * @param array $files
	 * @return array of Files\FileInfo
	 * @throws NotFoundException
	 */
	private function buildFileInfo(array $files) {
		try {
			$infos = [];
			foreach ($files as $file) {
				// Reject the entry if the location is missing or the type is wrong
				$hasLocation = !empty($file['path']) || !empty($file['directory']);
				if (!$hasLocation || $file['type'] !== 'file') {
					throw new NotFoundException($this->l10n->t('Wrong path parameter.'));
				}
				// resolve the info for the corrected path and verify it
				$info = $this->view->getFileInfo($this->getCorrectPath($file));
				$this->checkMimeType($info);
				$infos[] = $info;
			}
			return $infos;
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}
342
343
	/**
	 * Checks a Mimetype for a specific given FileInfo.
	 *
	 * The parameter is intentionally not type hinted: the caller passes the
	 * result of the view's getFileInfo() straight through, and a value that is
	 * not a FileInfo has to end up in the NotFoundException below instead of
	 * failing the type check of the signature.
	 *
	 * @param Files\FileInfo|mixed $fileInfo
	 * @throws NotFoundException if $fileInfo is no FileInfo or its mimetype is not allowed
	 * @throws Exception any other exception is logged and rethrown by handleException()
	 */
	private function checkMimeType($fileInfo) {
		try {
			if (!($fileInfo instanceof FileInfo) || !in_array($fileInfo->getMimetype(), $this::ALLOWED_MIMETYPES, true)) {
				$this->logger->debug('Getting FileInfo did not work or not included in the ALLOWED_MIMETYPES array.', ['app' => 'ocr']);
				throw new NotFoundException($this->l10n->t('Wrong parameters or wrong mimetype.'));
			}
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}
357
358
	/**
	 * Builds the path of a file out of the delivered file variable.
	 * Uses 'directory' when 'path' is empty and joins it with 'name' by a slash.
	 *
	 * @param $file array with the keys 'name' and 'path' or 'directory'
	 * @return string
	 */
	private function getCorrectPath($file) {
		// Because new updated files have the property directory instead of path
		$dir = empty($file['path']) ? $file['directory'] : $file['path'];
		// the root dir must not lead to a double slash
		$prefix = ($dir === '/') ? '' : $dir;
		return $prefix . '/' . $file['name'];
	}
375
376
	/**
	 * Stores the status and hands the job over to the gearman background worker (async).
	 * Nothing is stored or sent if the worker service reports that no worker exists.
	 *
	 * @param string $type
	 * @param $datadirectory
	 * @param $path
	 * @param $tempFile
	 * @param string $language
	 * @param OcrStatus $status
	 * @param string $occDir
	 * @throws NotFoundException if no gearman worker exists
	 * @throws Exception any other exception is logged and rethrown by handleException()
	 */
	private function sendGearmanJob($type, $datadirectory, $path, $tempFile, $language, $status, $occDir) {
		try {
			if ($this->workerService->workerExists() === false) {
				throw new NotFoundException($this->l10n->t('No gearman worker exists.'));
			}
			// insert first: the payload below carries the id of the status
			$this->statusMapper->insert($status);

			// Gearman thing
			$gearman = new \GearmanClient();
			$gearman->addServer('127.0.0.1', 4730);
			$payload = json_encode([
				'type' => $type,
				'datadirectory' => $datadirectory,
				'path' => $path,
				'tempfile' => $tempFile,
				'language' => $language,
				'statusid' => $status->getId(),
				'occdir' => $occDir,
			]);
			$jobHandle = $gearman->doBackground("ocr", $payload);
			$this->logger->debug('Gearman Client output: ' . json_encode($jobHandle), ['app' => 'ocr']);
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}
409
410
	/**
	 * Logs the given exception and throws it again. Never returns normally.
	 * A NotFoundException is replaced by a new NotFoundException carrying the
	 * same message, every other exception is rethrown unchanged.
	 *
	 * @param Exception $e
	 * @throws Exception
	 * @throws NotFoundException
	 */
	private function handleException($e) {
		$this->logger->logException($e, ['app' => 'ocr', 'message' => 'Exception during ocr service function processing']);
		if (!($e instanceof NotFoundException)) {
			throw $e;
		}
		throw new NotFoundException($e->getMessage());
	}
425
}