Completed
Pull Request — master (#12)
by Janis
06:16
created

OcrService::__construct()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 10
Code Lines 9

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 10
CRAP Score 1

Importance

Changes 0
Metric Value
dl 0
loc 10
ccs 10
cts 10
cp 1
rs 9.4285
c 0
b 0
f 0
cc 1
eloc 9
nc 1
nop 8
crap 1

How to fix   Many Parameters   

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

There are several approaches to avoid long parameter lists:

1
<?php
2
/**
3
 * nextCloud - ocr
4
 *
5
 * This file is licensed under the Affero General Public License version 3 or
6
 * later. See the COPYING file.
7
 *
8
 * @author Janis Koehr <[email protected]>
9
 * @copyright Janis Koehr 2016
10
 */
11
12
namespace OCA\Ocr\Service;
13
14
use Exception;
15
use OC\Files\View;
16
use OCA\Ocr\Db\OcrStatus;
17
use OCA\Ocr\Db\OcrStatusMapper;
18
use OCP\Files;
19
use OCP\Files\FileInfo;
20
use OCP\IConfig;
21
use OCP\IL10N;
22
use OCP\ILogger;
23
use OCP\ITempManager;
24
25
26
/**
27
 * Class OcrService
28
 * @package OCA\Ocr\Service
29
 */
30
class OcrService {

	/**
	 * @var ILogger
	 */
	private $logger;

	/**
	 * @var ITempManager
	 */
	private $tempM;

	/**
	 * @var IConfig
	 */
	private $config;

	/**
	 * @var GearmanWorkerService
	 */
	private $workerService;

	/**
	 * @var OcrStatusMapper
	 */
	private $statusMapper;

	/**
	 * @var View
	 */
	private $view;

	/**
	 * Identifier of the user this service instance acts for.
	 *
	 * @var mixed NOTE(review): presumably a string user id - confirm against the DI container
	 */
	private $userId;

	/**
	 * @var IL10N
	 */
	private $l10n;

	/**
	 * Array of allowed mimetypes for ocr processing
	 */
	const ALLOWED_MIMETYPES = ['application/pdf', 'image/png', 'image/jpeg', 'image/tiff'];

	/**
	 * the correct mimetype for a pdf file
	 */
	const MIMETYPE_PDF = 'application/pdf';

	/**
	 * the only allowed image mimetypes by tesseract
	 */
	const MIMETYPES_IMAGE = ['image/png', 'image/jpeg', 'image/tiff'];

	/**
	 * OcrService constructor.
	 *
	 * @param ITempManager $tempManager
	 * @param IConfig $config
	 * @param GearmanWorkerService $workerService
	 * @param OcrStatusMapper $mapper
	 * @param View $view
	 * @param $userId
	 * @param IL10N $l10n
	 * @param ILogger $logger
	 */
	public function __construct(ITempManager $tempManager, IConfig $config, GearmanWorkerService $workerService, OcrStatusMapper $mapper, View $view, $userId, IL10N $l10n, ILogger $logger) {
		$this->logger = $logger;
		$this->tempM = $tempManager;
		$this->config = $config;
		$this->workerService = $workerService;
		$this->statusMapper = $mapper;
		$this->view = $view;
		$this->userId = $userId;
		$this->l10n = $l10n;
	}

	/**
	 * Gets the list of all available tesseract-ocr languages.
	 *
	 * Runs `tesseract --list-langs` and keeps every output line that is
	 * exactly three characters long after trimming.
	 *
	 * NOTE(review): the three-character filter presumably exists to skip the
	 * non-language lines of the command output; it also drops any language
	 * name that is longer than three characters - confirm this is intended.
	 *
	 * @return array Languages
	 * @throws NotFoundException if the command fails or prints nothing
	 */
	public function listLanguages() {
		try {
			$output = [];
			$exitCode = -1;
			$this->logger->debug('Fetching languages. ', ['app' => 'ocr']);
			// exec() always fills $output with an array of lines.
			exec('tesseract --list-langs 2>&1', $output, $exitCode);
			if ($exitCode !== 0 || count($output) === 0) {
				throw new NotFoundException($this->l10n->t('No languages found.'));
			}
			$languages = [];
			foreach ($output as $line) {
				$name = trim($line);
				if (strlen($name) === 3) {
					$languages[] = $name;
				}
			}
			$this->logger->debug('Fetched languages: ' . json_encode($languages), ['app' => 'ocr']);
			return $languages;
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}

	/**
	 * Processes and prepares the files for ocr.
	 * Sends the stuff to the gearman client in order to ocr async.
	 *
	 * All files are validated before the first job is sent, so one invalid
	 * entry rejects the whole request.
	 *
	 * @param string $language one of the values returned by listLanguages()
	 * @param array $files list of file descriptors (see buildFileInfo())
	 * @return string 'PROCESSING' once every job has been handed over
	 * @throws NotFoundException on empty/invalid parameters or invalid files
	 */
	public function process($language, $files) {
		try {
			$this->logger->debug('Will now process files: ' . json_encode($files) . ' with language: ' . json_encode($language), ['app' => 'ocr']);
			// Strict comparison: a loose in_array() would accept boolean true
			// as a match for any non-empty language name.
			if (empty($files) || !is_array($files) || empty($language) || !in_array($language, $this->listLanguages(), true)) {
				throw new NotFoundException($this->l10n->t('Empty passed parameters.'));
			}
			// get the array with full fileinfo
			$fileInfo = $this->buildFileInfo($files);
			foreach ($fileInfo as $fInfo) {
				// TODO: FileLock maybe \OC\Files\View::lockFile()
				// get new name for saving purpose
				$newName = $this->buildNewName($fInfo);

				// create a temp file for ocr processing purposes
				$tempFile = $this->tempM->getTemporaryFile();

				// set the gearman running type
				$ftype = ($fInfo->getMimetype() === self::MIMETYPE_PDF) ? 'mypdf' : 'tess';

				// Create status object
				$status = new OcrStatus('PENDING', $fInfo->getId(), $newName, $tempFile, $ftype, $this->userId);

				// Feed the gearman worker
				$this->sendGearmanJob($ftype, $this->config->getSystemValue('datadirectory'), $fInfo->getPath(), $tempFile, $language, $status, \OC::$SERVERROOT);
			}
			return 'PROCESSING';
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}

	/**
	 * Collects the state of all ocr tasks of the current user.
	 * It finishes the already processed tasks, cleans up the failed ones and
	 * counts the pending ones.
	 *
	 * @return array with the integer counts 'processed', 'failed' and 'pending'
	 */
	public function status() {
		try {
			// TODO: release lock
			$processed = $this->handleProcessed();

			$failed = count($this->handleFailed());

			$pending = count($this->statusMapper->findAllPending($this->userId));

			return ['processed' => $processed, 'failed' => $failed, 'pending' => $pending];
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}

	/**
	 * The command ocr:complete for occ will call this function in order to set the status.
	 * the gearman worker should call it automatically after each processing step.
	 *
	 * @param $statusId id of the status entity to update
	 * @param boolean $failed true marks the task 'FAILED', false 'PROCESSED'
	 */
	public function complete($statusId, $failed) {
		try {
			$status = $this->statusMapper->find($statusId);
			$status->setStatus($failed ? 'FAILED' : 'PROCESSED');
			$this->statusMapper->update($status);
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}

	/**
	 * Finishes all processed files by copying them to the right path and deleting the temp files.
	 * Returns the number of processed files.
	 *
	 * NOTE(review): a processed entry whose temp file is missing aborts the
	 * loop and stays in the database, so it will abort the next call as well -
	 * confirm whether such entries should be dropped or marked as failed.
	 *
	 * @return int
	 * @throws NotFoundException if the result file of a task does not exist
	 */
	private function handleProcessed() {
		$this->logger->debug('Find processed ocr files and put them to the right dirs.', ['app' => 'ocr']);
		$processed = $this->statusMapper->findAllProcessed($this->userId);
		foreach ($processed as $status) {
			$type = $status->getType();
			if ($type === 'tess') {
				// need .txt because tesseract saves it like this
				$resultFile = $status->getTempFile() . '.txt';
			} elseif ($type === 'mypdf') {
				// don't need to extend with .pdf / it uses the tmp file to save
				$resultFile = $status->getTempFile();
			} else {
				$resultFile = null;
			}
			if ($resultFile === null || !file_exists($resultFile)) {
				throw new NotFoundException($this->l10n->t('Temp file does not exist.'));
			}
			// Save the tmp file with newname
			$this->view->file_put_contents($status->getNewName(), file_get_contents($resultFile));
			// Cleaning up: db entry first, then the temp file
			$this->statusMapper->delete($status);
			$this->removeTempFile($resultFile);
		}
		return count($processed);
	}

	/**
	 * Handles all failed orders of ocr processing queue and returns the status objects.
	 * Each failed entry loses its temp file and its database row.
	 *
	 * @return array
	 */
	private function handleFailed() {
		$failed = $this->statusMapper->findAllFailed($this->userId);
		foreach ($failed as $status) {
			// clean the tempfile
			$this->removeTempFile($status->getTempFile());
			// clean from db
			$this->statusMapper->delete($status);
		}
		$this->logger->debug('Following status objects failed: ' . json_encode($failed), ['app' => 'ocr']);
		return $failed;
	}

	/**
	 * Deletes a temp file without going through a shell, so paths containing
	 * spaces or shell metacharacters are handled literally.
	 * A missing file is ignored; a failing delete is only logged.
	 *
	 * @param string $path
	 */
	private function removeTempFile($path) {
		if (is_file($path) && !unlink($path)) {
			$this->logger->warning('Could not delete temp file: ' . $path, ['app' => 'ocr']);
		}
	}

	/**
	 * Returns a not existing file name for pdf or image processing
	 *
	 * The result name is "<original name without extension>_OCR.pdf" for pdf
	 * input and "..._OCR.txt" otherwise, placed next to the original file.
	 *
	 * @param FileInfo $fileInfo
	 * @return string
	 */
	private function buildNewName(FileInfo $fileInfo) {
		$name = $fileInfo->getName();
		$dir = $fileInfo->getPath();

		// eliminate the file name from the END of the path only, so that a
		// directory containing the same text is left untouched
		$nameLength = strlen($name);
		if ($nameLength > 0 && substr($dir, -$nameLength) === $name) {
			$dir = (string) substr($dir, 0, -$nameLength);
		}

		// and get the path on top of the user/files/ dir (leading prefix only)
		$userRoot = '/' . $this->userId . '/files';
		if (strpos($dir, $userRoot) === 0) {
			$dir = (string) substr($dir, strlen($userRoot));
		}

		$suffix = ($fileInfo->getMimetype() === self::MIMETYPE_PDF) ? '_OCR.pdf' : '_OCR.txt';
		return Files::buildNotExistingFileName($dir, $this->stripExtension($name) . $suffix);
	}

	/**
	 * Removes the last extension of a file name, whatever its length
	 * ('scan.png' and 'scan.jpeg' both become 'scan').
	 * A name without a dot is returned unchanged.
	 *
	 * @param string $name
	 * @return string
	 */
	private function stripExtension($name) {
		$dot = strrpos($name, '.');
		if ($dot === false) {
			return $name;
		}
		return (string) substr($name, 0, $dot);
	}

	/**
	 * Returns the fileInfo for each file in files and checks
	 * if it has a allowed mimetype and some other conditions.
	 *
	 * Every entry has to be an array with a 'name', a 'type' equal to 'file'
	 * and a non-empty 'path' or 'directory'.
	 *
	 * @param array $files
	 * @return array of Files\FileInfo
	 * @throws NotFoundException
	 */
	private function buildFileInfo(array $files) {
		$fileArray = [];
		foreach ($files as $file) {
			// Check if anything is missing and file type is correct
			$valid = is_array($file)
				&& (!empty($file['path']) || !empty($file['directory']))
				&& isset($file['type']) && $file['type'] === 'file'
				&& isset($file['name']) && $file['name'] !== '';
			if (!$valid) {
				throw new NotFoundException($this->l10n->t('Wrong path parameter.'));
			}
			$fileInfo = $this->view->getFileInfo($this->getCorrectPath($file));
			$this->checkMimeType($fileInfo);
			$fileArray[] = $fileInfo;
		}
		return $fileArray;
	}

	/**
	 * Checks a Mimetype for a specific given FileInfo.
	 *
	 * The parameter is deliberately not type hinted: the lookup result is
	 * passed in unchecked, and anything that is not a FileInfo has to end in
	 * a NotFoundException instead of a type error.
	 *
	 * @param Files\FileInfo|mixed $fileInfo
	 * @throws NotFoundException
	 */
	private function checkMimeType($fileInfo) {
		if (!($fileInfo instanceof FileInfo) || !in_array($fileInfo->getMimetype(), self::ALLOWED_MIMETYPES, true)) {
			$this->logger->debug('Getting FileInfo did not work or not included in the ALLOWED_MIMETYPES array.', ['app' => 'ocr']);
			throw new NotFoundException($this->l10n->t('Wrong parameters or wrong mimetype.'));
		}
	}

	/**
	 * Returns the correct path based on delivered file variable
	 *
	 * @param $file array with 'name' and either 'path' or 'directory'
	 * @return string
	 */
	private function getCorrectPath($file) {
		// Because new updated files have the property directory instead of path
		$dir = empty($file['path']) ? $file['directory'] : $file['path'];
		if ($dir === '/') {
			// avoid a double slash for files in the root directory
			$dir = '';
		}
		return $dir . '/' . $file['name'];
	}

	/**
	 * Inits the Gearman client and sends the task to the background worker (async)
	 *
	 * The status entity is stored before the job is sent because the job
	 * payload carries the id of the stored status.
	 *
	 * @param string $type
	 * @param $datadirectory
	 * @param $path
	 * @param $tempFile
	 * @param string $language
	 * @param OcrStatus $status
	 * @param string $occDir
	 * @throws NotFoundException if no gearman worker is available
	 */
	private function sendGearmanJob($type, $datadirectory, $path, $tempFile, $language, $status, $occDir) {
		if ($this->workerService->workerExists() === false) {
			throw new NotFoundException($this->l10n->t('No gearman worker exists.'));
		}
		$this->statusMapper->insert($status);
		// Gearman thing
		$client = new \GearmanClient();
		$client->addServer('127.0.0.1', 4730);
		$payload = [
			'type' => $type,
			'datadirectory' => $datadirectory,
			'path' => $path,
			'tempfile' => $tempFile,
			'language' => $language,
			'statusid' => $status->getId(),
			'occdir' => $occDir
		];
		$result = $client->doBackground("ocr", json_encode($payload));
		$this->logger->debug('Gearman Client output: ' . json_encode($result), ['app' => 'ocr']);
	}

	/**
	 * Handle the possible thrown Exceptions from all methods of this class.
	 *
	 * Logs the exception and throws the very same object again, so its type
	 * and stack trace are preserved. Only the public methods call this; the
	 * private helpers let their exceptions bubble up so that a failure is not
	 * logged once per nesting level.
	 *
	 * @param Exception $e
	 * @throws Exception
	 * @throws NotFoundException
	 */
	private function handleException($e) {
		$this->logger->logException($e, ['app' => 'ocr', 'message' => 'Exception during ocr service function processing']);
		throw $e;
	}
}