Completed
Pull Request — master (#11)
by Janis
03:05 queued 22s
created

OcrService::__construct()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 10
Code Lines 9

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 10
CRAP Score 1

Importance

Changes 0
Metric Value
dl 0
loc 10
ccs 10
cts 10
cp 1
rs 9.4285
c 0
b 0
f 0
cc 1
eloc 9
nc 1
nop 8
crap 1

How to fix   Many Parameters   

Many Parameters

Methods with many parameters are not only hard to understand; their parameter lists also tend to become inconsistent as soon as you need more or different data.

There are several approaches to avoid long parameter lists:

1
<?php
2
/**
3
 * nextCloud - ocr
4
 *
5
 * This file is licensed under the Affero General Public License version 3 or
6
 * later. See the COPYING file.
7
 *
8
 * @author Janis Koehr <[email protected]>
9
 * @copyright Janis Koehr 2016
10
 */
11
12
namespace OCA\Ocr\Service;
13
14
use Exception;
15
use OC\Files\View;
16
use OCA\Ocr\Db\OcrStatus;
17
use OCA\Ocr\Db\OcrStatusMapper;
18
use OCP\Files;
19
use OCP\Files\FileInfo;
20
use OCP\IConfig;
21
use OCP\IL10N;
22
use OCP\ILogger;
23
use OCP\ITempManager;
24
25
26
/**
27
 * Class OcrService
28
 * @package OCA\Ocr\Service
29
 */
30
class OcrService {
31
32
	/**
	 * Logger used for debug output and for reporting caught exceptions.
	 *
	 * @var ILogger
	 */
	private $logger;

	/**
	 * Temp manager that supplies the temporary file each OCR job writes to.
	 *
	 * @var ITempManager
	 */
	private $tempM;

	/**
	 * Server configuration; read for the 'datadirectory' system value.
	 *
	 * @var IConfig
	 */
	private $config;

	/**
	 * Asked whether a gearman worker exists before a job is queued.
	 *
	 * @var GearmanWorkerService
	 */
	private $workerService;

	/**
	 * Mapper used to insert, look up, update and delete OcrStatus entries.
	 *
	 * @var OcrStatusMapper
	 */
	private $statusMapper;

	/**
	 * View used to read file info and to write the OCR results.
	 *
	 * @var View
	 */
	private $view;

	/**
	 * Id of the user this service instance works for. It is concatenated into
	 * storage paths, so it is treated as a string here.
	 * NOTE(review): the constructor does not enforce a type - confirm against the DI registration.
	 *
	 * @var string
	 */
	private $userId;

	/**
	 * Translation service for the messages of thrown exceptions.
	 *
	 * @var IL10N
	 */
	private $l10n;

	/**
	 * Array of allowed mimetypes for ocr processing
	 */
	const ALLOWED_MIMETYPES = ['application/pdf', 'image/png', 'image/jpeg', 'image/tiff'];

	/**
	 * The mimetype that identifies a pdf file
	 */
	const MIMETYPE_PDF = 'application/pdf';

	/**
	 * The only image mimetypes allowed for tesseract
	 */
	const MIMETYPES_IMAGE = ['image/png', 'image/jpeg', 'image/tiff'];
86
87
	/**
	 * Stores the injected collaborators on the instance.
	 *
	 * @param ITempManager $tempManager source of temporary files
	 * @param IConfig $config server configuration
	 * @param GearmanWorkerService $workerService gearman worker lookup
	 * @param OcrStatusMapper $mapper persistence for OcrStatus entries
	 * @param View $view file system view
	 * @param $userId id of the user the service works for
	 * @param IL10N $l10n translation service
	 * @param ILogger $logger logger
	 */
	public function __construct(ITempManager $tempManager, IConfig $config, GearmanWorkerService $workerService, OcrStatusMapper $mapper, View $view, $userId, IL10N $l10n, ILogger $logger) {
		// Assigned in parameter order; the assignments are independent of each other.
		$this->tempM = $tempManager;
		$this->config = $config;
		$this->workerService = $workerService;
		$this->statusMapper = $mapper;
		$this->view = $view;
		$this->userId = $userId;
		$this->l10n = $l10n;
		$this->logger = $logger;
	}
109
110
	/**
	 * Gets the list of all available tesseract-ocr languages.
	 *
	 * Runs `tesseract --list-langs` and keeps every output line that is exactly
	 * three characters long after trimming. Any other line (for example a header
	 * line, or a longer name such as `chi_sim`) is skipped.
	 *
	 * @return array Languages
	 * @throws NotFoundException if the command fails or prints nothing
	 * @throws Exception any other failure, re-thrown by handleException()
	 */
	public function listLanguages() {
		try {
			$this->logger->debug('Fetching languages. ', ['app' => 'ocr']);

			// exec() appends one entry per output line to $result. Starting from an
			// empty array keeps count() safe even if exec() could not run at all.
			$result = [];
			$success = -1;
			exec('tesseract --list-langs 2>&1', $result, $success);

			if ($success !== 0 || count($result) === 0) {
				throw new NotFoundException($this->l10n->t('No languages found.'));
			}

			$languages = [];
			foreach ($result as $line) {
				$name = trim($line);
				if (strlen($name) === 3) {
					$languages[] = $name;
				}
			}

			$this->logger->debug('Fetched languages: ' . json_encode($languages), ['app' => 'ocr']);
			return $languages;
		} catch (Exception $e) {
			// handleException() always throws, so nothing is returned on this path.
			$this->handleException($e);
		}
	}
142
143
	/**
	 * Processes and prepares the files for ocr.
	 * Sends one job per file to the gearman client so the ocr runs asynchronously.
	 *
	 * @param string $language language to use; must be one of listLanguages()
	 * @param array $files file descriptors as expected by buildFileInfo()
	 * @return string 'PROCESSING' once every file has been queued
	 * @throws NotFoundException if $files or $language is empty or not acceptable
	 * @throws Exception any other failure, re-thrown by handleException()
	 */
	public function process($language, $files) {
		try {
			$this->logger->debug('Will now process files: ' . json_encode($files) . ' with language: ' . json_encode($language), ['app' => 'ocr']);

			// Reject empty input, a non-array file list (buildFileInfo() requires an
			// array) and unknown languages. The comparison is strict so that a
			// non-string value such as `true` cannot match a language name.
			if (empty($files) || !is_array($files) || empty($language) || !in_array($language, $this->listLanguages(), true)) {
				throw new NotFoundException($this->l10n->t('Empty passed parameters.'));
			}

			// get the array with full fileinfo
			foreach ($this->buildFileInfo($files) as $fInfo) {
				// Check if filelock existing
				// TODO: FileLock maybe \OC\Files\View::lockFile()

				// get new name for saving purpose
				$newName = $this->buildNewName($fInfo);

				// create a temp file for ocr processing purposes
				$tempFile = $this->tempM->getTemporaryFile();

				// set the gearman running type
				$ftype = ($fInfo->getMimetype() === $this::MIMETYPE_PDF) ? 'mypdf' : 'tess';

				// Create status object
				$status = new OcrStatus('PENDING', $fInfo->getId(), $newName, $tempFile, $ftype, $this->userId);

				// Feed the gearman worker
				$this->sendGearmanJob($ftype, $this->config->getSystemValue('datadirectory'), $fInfo->getPath(), $tempFile, $language, $status, \OC::$SERVERROOT);
			}

			return 'PROCESSING';
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}
189
190
	/**
	 * Collects the data for the status check.
	 *
	 * Finishes all processed jobs, cleans up all failed jobs and counts the
	 * pending ones, in that order.
	 *
	 * @return array with the integer counts under the keys 'processed', 'failed' and 'pending'
	 * @throws Exception re-thrown by handleException()
	 */
	public function status() {
		try {
			// TODO: release lock
			// Array elements are evaluated left to right, which keeps the order
			// processed -> failed -> pending.
			return [
				'processed' => $this->handleProcessed(),
				'failed' => count($this->handleFailed()),
				'pending' => count($this->statusMapper->findAllPending($this->userId)),
			];
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}
210
211
	/**
	 * The command ocr:complete for occ will call this function in order to set the status.
	 * The gearman worker should call it automatically after each processing step.
	 *
	 * Marks the status entry as 'FAILED' when $failed is truthy, otherwise as 'PROCESSED'.
	 *
	 * @param $statusId id of the status entry to update
	 * @param boolean $failed whether the processing step failed
	 * @throws NotFoundException
	 * @throws Exception any other failure, re-thrown by handleException()
	 */
	public function complete($statusId, $failed) {
		// Stays null when the lookup itself fails, so the catch block can tell
		// whether there is an entry it may touch.
		$status = null;
		try {
			$status = $this->statusMapper->find($statusId);
			$status->setStatus($failed ? 'FAILED' : 'PROCESSED');
			$this->statusMapper->update($status);
		} catch (Exception $e) {
			// Only mark the entry as failed if it was actually loaded; calling
			// setStatus() on a missing entry would be a fatal error.
			if ($e instanceof NotFoundException && $status !== null) {
				$status->setStatus('FAILED');
				$this->statusMapper->update($status);
			}
			$this->handleException($e);
		}
	}
237
238
	/**
	 * Finishes all processed files by copying them to the right path and deleting the temp files.
	 * Returns the number of processed files.
	 *
	 * For type 'tess' the result is read from the temp file path plus '.txt',
	 * for type 'mypdf' from the temp file path itself.
	 *
	 * @return int
	 * @throws NotFoundException if the type is unknown or the result file is missing or unreadable
	 * @throws Exception any other failure, re-thrown by handleException()
	 */
	private function handleProcessed() {
		try {
			$this->logger->debug('Find processed ocr files and put them to the right dirs.', ['app' => 'ocr']);
			$processed = $this->statusMapper->findAllProcessed($this->userId);

			foreach ($processed as $status) {
				$type = $status->getType();
				if ($type === 'tess') {
					// need .txt because tesseract saves it like this
					$source = $status->getTempFile() . '.txt';
				} elseif ($type === 'mypdf') {
					// don't need to extend with .pdf / it uses the tmp file to save
					$source = $status->getTempFile();
				} else {
					$source = null;
				}

				if ($source === null || !file_exists($source)) {
					throw new NotFoundException($this->l10n->t('Temp file does not exist.'));
				}

				// A read failure must not end up as an empty result file.
				$content = file_get_contents($source);
				if ($content === false) {
					throw new NotFoundException($this->l10n->t('Temp file does not exist.'));
				}

				// Save the tmp file with newname
				$this->view->file_put_contents($status->getNewName(), $content);

				// Cleaning: drop the status entry, then the result file. unlink()
				// avoids passing the path through a shell.
				$this->statusMapper->delete($status);
				if (!unlink($source)) {
					$this->logger->warning('Could not remove temp file: ' . $source, ['app' => 'ocr']);
				}
			}

			return count($processed);
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}
269
270
	/**
	 * Handles all failed orders of the ocr processing queue and returns the status objects.
	 *
	 * For every failed entry the temp file is removed (if it exists) and the
	 * entry is deleted from the database.
	 *
	 * @return array the failed status objects
	 * @throws Exception re-thrown by handleException()
	 */
	private function handleFailed() {
		try {
			$failed = $this->statusMapper->findAllFailed($this->userId);

			foreach ($failed as $status) {
				// clean the tempfile; unlink() avoids passing the path through a shell
				$tempFile = $status->getTempFile();
				if (!empty($tempFile) && file_exists($tempFile) && !unlink($tempFile)) {
					$this->logger->warning('Could not remove temp file: ' . $tempFile, ['app' => 'ocr']);
				}
				// clean from db
				$this->statusMapper->delete($status);
			}

			$this->logger->debug('Following status objects failed: ' . json_encode($failed), ['app' => 'ocr']);
			return $failed;
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}
290
291
292
	/**
	 * Returns a not existing file name for pdf or image processing.
	 *
	 * The result name is the original name without its extension plus
	 * '_OCR.pdf' for pdf files or '_OCR.txt' for everything else, placed in the
	 * directory of the original file.
	 *
	 * @param FileInfo $fileInfo
	 * @return string
	 */
	private function buildNewName(FileInfo $fileInfo) {
		// Strip the extension at the last dot instead of cutting a fixed four
		// characters, so '.jpeg' and '.tiff' are handled as well as '.pdf'.
		// A name without a dot, or one that only starts with a dot, is kept whole.
		$name = $fileInfo->getName();
		$dotPos = strrpos($name, '.');
		$fileName = ($dotPos === false || $dotPos === 0) ? $name : substr($name, 0, $dotPos);

		// Directory part of the path including the trailing slash. Cutting at the
		// last slash only removes the final segment, even if a parent directory
		// contains the file name.
		$path = $fileInfo->getPath();
		$slashPos = strrpos($path, '/');
		$filePath = ($slashPos === false) ? '' : substr($path, 0, $slashPos + 1);

		// Get the path on top of the user/files/ dir: drop the prefix only when
		// the path really starts with it.
		$userRoot = '/' . $this->userId . '/files';
		if (strpos($filePath, $userRoot . '/') === 0) {
			$filePath = substr($filePath, strlen($userRoot));
		}

		$suffix = ($fileInfo->getMimetype() === $this::MIMETYPE_PDF) ? '_OCR.pdf' : '_OCR.txt';
		return Files::buildNotExistingFileName($filePath, $fileName . $suffix);
	}
313
314
	/**
	 * Returns the fileInfo for each file in files and checks
	 * if it has an allowed mimetype and some other conditions.
	 *
	 * Every entry needs a non-empty 'path' or 'directory' and a 'type' of 'file'.
	 *
	 * @param array $files
	 * @return array of Files\FileInfo
	 * @throws NotFoundException if an entry is incomplete, cannot be resolved or has a wrong mimetype
	 * @throws Exception any other failure, re-thrown by handleException()
	 */
	private function buildFileInfo(array $files) {
		try {
			$fileArray = [];
			foreach ($files as $file) {
				// Check if anything is missing and file type is correct
				$hasLocation = !empty($file['path']) || !empty($file['directory']);
				$isFile = isset($file['type']) && $file['type'] === 'file';
				if (!$hasLocation || !$isFile) {
					throw new NotFoundException($this->l10n->t('Wrong path parameter.'));
				}

				$fileInfo = $this->view->getFileInfo($this->getCorrectPath($file));

				// A failed lookup must be reported here: passing a non-FileInfo
				// value on would raise a TypeError, which catch (Exception) misses.
				if (!($fileInfo instanceof FileInfo)) {
					throw new NotFoundException($this->l10n->t('Wrong parameters or wrong mimetype.'));
				}

				$this->checkMimeType($fileInfo);
				$fileArray[] = $fileInfo;
			}
			return $fileArray;
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}
342
343
	/**
	 * Checks the mimetype of a given FileInfo against ALLOWED_MIMETYPES.
	 *
	 * The parameter is deliberately not type-hinted: a failed file lookup can
	 * hand in a non-FileInfo value, and that case has to end in a
	 * NotFoundException rather than in a TypeError.
	 *
	 * @param Files\FileInfo|mixed $fileInfo
	 * @throws NotFoundException if $fileInfo is no FileInfo or its mimetype is not allowed
	 * @throws Exception any other failure, re-thrown by handleException()
	 */
	private function checkMimeType($fileInfo) {
		try {
			if (!($fileInfo instanceof FileInfo) || !in_array($fileInfo->getMimetype(), $this::ALLOWED_MIMETYPES, true)) {
				$this->logger->debug('Getting FileInfo did not work or not included in the ALLOWED_MIMETYPES array.', ['app' => 'ocr']);
				throw new NotFoundException($this->l10n->t('Wrong parameters or wrong mimetype.'));
			}
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}
357
358
	/**
	 * Builds the path of a file from its descriptor.
	 *
	 * Uses 'path' when it is non-empty and 'directory' otherwise, then appends
	 * '/' and 'name'. A location of exactly '/' contributes no prefix.
	 *
	 * @param $file descriptor with 'name' and 'path' or 'directory'
	 * @return string
	 */
	private function getCorrectPath($file) {
		// New updated files have the property directory instead of path
		$location = empty($file['path']) ? $file['directory'] : $file['path'];
		$prefix = ($location === '/') ? '' : $location;
		return $prefix . '/' . $file['name'];
	}
375
376
	/**
	 * Inits the Gearman client and sends the task to the background worker (async).
	 *
	 * The status entry is inserted first so that its id can be sent with the
	 * job. If the job cannot be queued, the entry is marked 'FAILED' instead of
	 * being left pending.
	 *
	 * @param string $type
	 * @param $datadirectory
	 * @param $path
	 * @param $tempFile
	 * @param string $language
	 * @param OcrStatus $status
	 * @param string $occDir
	 * @throws NotFoundException if no gearman worker exists
	 * @throws Exception if the job could not be queued, or any other failure
	 */
	private function sendGearmanJob($type, $datadirectory, $path, $tempFile, $language, $status, $occDir) {
		try {
			if ($this->workerService->workerExists() === false) {
				throw new NotFoundException($this->l10n->t('No gearman worker exists.'));
			}
			$this->statusMapper->insert($status);

			// Gearman thing
			$client = new \GearmanClient();
			$client->addServer('127.0.0.1', 4730);
			$payload = json_encode([
				'type' => $type,
				'datadirectory' => $datadirectory,
				'path' => $path,
				'tempfile' => $tempFile,
				'language' => $language,
				'statusid' => $status->getId(),
				'occdir' => $occDir
			]);
			$result = $client->doBackground("ocr", $payload);
			$this->logger->debug('Gearman Client output: ' . json_encode($result), ['app' => 'ocr']);

			// doBackground() reports a failed submission through the return code.
			// Without this check the inserted entry would stay PENDING forever.
			if ($client->returnCode() !== \GEARMAN_SUCCESS) {
				$status->setStatus('FAILED');
				$this->statusMapper->update($status);
				throw new Exception('Gearman job could not be queued: ' . $client->error());
			}
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}
409
410
	/**
	 * Central exception handling for the methods of this class.
	 *
	 * Logs the exception and then always throws: a NotFoundException is
	 * replaced by a new NotFoundException carrying the same message, every
	 * other exception is re-thrown unchanged.
	 *
	 * @param Exception $e
	 * @throws Exception
	 * @throws NotFoundException
	 */
	private function handleException($e) {
		$this->logger->logException($e, ['app' => 'ocr', 'message' => 'Exception during ocr service function processing']);

		if (!($e instanceof NotFoundException)) {
			throw $e;
		}
		throw new NotFoundException($e->getMessage());
	}
425
}