Completed
Pull Request — master (#6)
by Janis
03:08
created

OcrService::status()   A

Complexity

Conditions 2
Paths 4

Size

Total Lines 14
Code Lines 8

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 6

Importance

Changes 0
Metric Value
dl 0
loc 14
ccs 0
cts 8
cp 0
rs 9.4285
c 0
b 0
f 0
cc 2
eloc 8
nc 4
nop 0
crap 6
1
<?php
2
/**
3
 * nextCloud - ocr
4
 *
5
 * This file is licensed under the Affero General Public License version 3 or
6
 * later. See the COPYING file.
7
 *
8
 * @author Janis Koehr <[email protected]>
9
 * @copyright Janis Koehr 2016
10
 */
11
12
namespace OCA\Ocr\Service;
13
14
use Exception;
15
use OC\Files\View;
16
use OCA\Ocr\Db\OcrStatus;
17
use OCA\Ocr\Db\OcrStatusMapper;
18
use OCP\Files;
19
use OCP\IConfig;
20
use OCP\ILogger;
21
use OCP\ITempManager;
22
23
24
/**
25
 * Class OcrService
26
 * @package OCA\Ocr\Service
27
 */
28
class OcrService {
29
30
	/**
31
	 * @var ILogger
32
	 */
33
	private $logger;
34
35
	/**
36
	 * @var ITempManager
37
	 */
38
	private $tempM;
39
40
	/**
41
	 * @var IConfig
42
	 */
43
	private $config;
44
45
	/**
46
	 * @var GearmanWorkerService
47
	 */
48
	private $workerService;
49
50
	/**
51
	 * @var OcrStatusMapper
52
	 */
53
	private $statusMapper;
54
55
	/**
56
	 * @var View
57
	 */
58
	private $view;
59
60
	/**
61
	 * @var
62
	 */
63
	private $userId;
64
65
	/**
66
	 * Array of allowed mimetypes for ocr processing
67
	 */
68
	const ALLOWED_MIMETYPES = ['application/pdf', 'image/png', 'image/jpeg', 'image/tiff'];
69
70
	/**
71
	 * the correct mimetype for a pdf file
72
	 */
73
	const MIMETYPE_PDF = 'application/pdf';
74
75
	/**
76
	 * the only allowed image mimetypes by tesseract
77
	 */
78
	const MIMETYPES_IMAGE = ['image/png', 'image/jpeg', 'image/tiff'];
79
80
	/**
	 * OcrService constructor.
	 *
	 * Only stores the injected collaborators on the instance; no other work
	 * is done here.
	 *
	 * @param ITempManager $tempManager used to obtain temporary files
	 * @param IConfig $config used to read system values
	 * @param GearmanWorkerService $workerService used to check for a running worker
	 * @param OcrStatusMapper $mapper persistence for the ocr status objects
	 * @param View $view file access for reading file info and writing results
	 * @param $userId id of the user the status objects belong to
	 * @param ILogger $logger
	 */
	public function __construct(ITempManager $tempManager, IConfig $config, GearmanWorkerService $workerService, OcrStatusMapper $mapper, View $view, $userId, ILogger $logger) {
		// assigned in the order of the parameter list
		$this->tempM = $tempManager;
		$this->config = $config;
		$this->workerService = $workerService;
		$this->statusMapper = $mapper;
		$this->view = $view;
		$this->userId = $userId;
		$this->logger = $logger;
	}
94
95
	/**
	 * Gets the list of all available tesseract-ocr languages.
	 *
	 * Runs `tesseract --list-langs` and keeps every output line that is exactly
	 * three characters long after trimming; all other lines are ignored.
	 *
	 * @return array Languages
	 * @throws NotFoundException if the command exits non-zero or prints nothing
	 * @throws Exception any other failure, after it has been logged
	 */
	public function listLanguages(){
		try {
			// exec() appends to the array it is handed, so start from a known empty one
			$result = [];
			$success = -1;
			$this->logger->debug('Fetching languages. ', ['app' => 'ocr']);
			exec('tesseract --list-langs 2>&1', $result, $success);
			if ($success !== 0 || count($result) === 0) {
				throw new NotFoundException('No languages found.');
			}
			$languages = [];
			// exec() always delivers one array entry per output line
			foreach ($result as $line) {
				$name = trim($line);
				// NOTE(review): only three-letter identifiers pass this filter; longer
				// ones (tesseract ships some, e.g. `chi_sim`) are dropped - confirm intended.
				if (strlen($name) === 3) {
					$languages[] = $name;
				}
			}
			$this->logger->debug('Fetched languages: '.json_encode($languages), ['app' => 'ocr']);
			return $languages;
		}catch(Exception $e){
			$this->handleException($e);
		}
	}
127
128
	/**
	 * Processes and prepares the files for ocr.
	 * Sends the stuff to the gearman client in order to ocr async.
	 *
	 * For every file a target name and a temporary file are determined, a
	 * PENDING status object is created and a background job is queued.
	 *
	 * @param $language language to use; has to be one of listLanguages()
	 * @param array $files file descriptions as accepted by buildFileInfo()
	 * @return string 'PROCESSING' once every job has been handed over
	 * @throws NotFoundException if a parameter is empty, the language is unknown
	 *                           or a file cannot be used
	 * @throws Exception any other failure, after it has been logged
	 */
	public function process($language, $files) {
		try {
			$this->logger->debug('Will now process files: '.json_encode($files) . ' with language: ' . json_encode($language), ['app' => 'ocr']);
			// Check if files and language not empty. The language is compared strictly
			// so that a non-string value (e.g. true) cannot loosely match a language name.
			if (empty($files) || empty($language) || !in_array($language, $this->listLanguages(), true)) {
				throw new NotFoundException('Empty parameters.');
			}
			// get the array with full fileinfo
			$fileInfo = $this->buildFileInfo($files);
			// Static analysis reported $fileInfo as array|null and therefore not
			// guaranteed to be traversable; make the expectation explicit.
			if (!is_array($fileInfo)) {
				throw new NotFoundException('Wrong parameters or wrong mimetype.');
			}
			foreach ($fileInfo as $fInfo) {
				// Check if filelock existing
				// TODO: FileLock maybe \OC\Files\View::lockFile()
				// get new name for saving purpose
				$newName = $this->buildNewName($fInfo);

				// create a temp file for ocr processing purposes
				$tempFile = $this->tempM->getTemporaryFile();

				// set the gearman running type
				$ftype = ($fInfo->getMimetype() === $this::MIMETYPE_PDF) ? 'mypdf' : 'tess';

				// Create status object
				$status = new OcrStatus('PENDING', $fInfo->getId(), $newName, $tempFile, $ftype, $this->userId);

				// Init Gearman client and send task / job
				// Feed the gearman worker
				$this->sendGearmanJob($ftype, $this->config->getSystemValue('datadirectory'), $fInfo->getPath(), $tempFile, $language, $status, \OC::$SERVERROOT);
			}
			return 'PROCESSING';
		}catch(Exception $e){
			$this->handleException($e);
		}
	}
174
175
	/**
	 * Collects the state of the current user's ocr tasks.
	 *
	 * Finishes the already processed tasks, cleans up the failed ones and
	 * counts the ones still pending - in exactly that order.
	 *
	 * @return array with the keys 'processed', 'failed' and 'pending', each
	 *               holding the respective number of tasks
	 * @throws Exception any failure, after it has been logged
	 */
	public function status(){
		try {
			// TODO: release lock
			$summary = [];
			$summary['processed'] = $this->handleProcessed();
			$summary['failed'] = count($this->handleFailed());
			$summary['pending'] = count($this->statusMapper->findAllPending($this->userId));
			return $summary;
		}catch (Exception $e){
			$this->handleException($e);
		}
	}
195
196
	/**
	 * The command ocr:complete for occ will call this function in order to set the status.
	 * the gearman worker should call it automatically after each processing step.
	 *
	 * @param $statusId id of the status object to update
	 * @param $failed truthy marks the status as FAILED, falsy as PROCESSED
	 * @throws NotFoundException
	 * @throws Exception any other failure, after it has been logged
	 */
	public function complete($statusId, $failed){
		// stays null when the lookup itself fails, so the catch block can tell
		$status = null;
		try{
			$status = $this->statusMapper->find($statusId);
			$status->setStatus($failed ? 'FAILED' : 'PROCESSED');
			$this->statusMapper->update($status);
		} catch (Exception $e){
			// Only mark the entry as failed when it was actually loaded; calling
			// setStatus() on a missing object would hide the original error.
			if ($e instanceof NotFoundException && $status !== null){
				$status->setStatus('FAILED');
				$this->statusMapper->update($status);
			}
			// logs and always rethrows
			$this->handleException($e);
		}
	}
222
223
	/**
	 * Finishes all Processed files by copying them to the right path and deleting the temp files.
	 * Returns the number of processed files.
	 *
	 * For each processed status the result file is written to the status'
	 * new name, the status is removed from the database and the result file
	 * is deleted afterwards.
	 *
	 * @return int
	 * @throws NotFoundException if the type is unknown or the result file is missing
	 * @throws Exception any other failure, after it has been logged
	 */
	private function handleProcessed(){
		try {
			$this->logger->debug('Find processed ocr files and put them to the right dirs.', ['app' => 'ocr']);
			$processed = $this->statusMapper->findAllProcessed($this->userId);
			foreach ($processed as $status) {
				$type = $status->getType();
				if ($type === 'tess') {
					// need .txt because tesseract saves it like this
					$resultFile = $status->getTempFile() . '.txt';
				} elseif ($type === 'mypdf') {
					// don't need to extend with .pdf / it uses the tmp file to save
					$resultFile = $status->getTempFile();
				} else {
					$resultFile = null;
				}
				if ($resultFile === null || !file_exists($resultFile)) {
					throw new NotFoundException('Temp file does not exist.');
				}
				//Save the tmp file with newname
				$this->view->file_put_contents($status->getNewName(), file_get_contents($resultFile));
				// Cleaning temp files
				$this->statusMapper->delete($status);
				// remove the file directly instead of handing the path to a shell
				unlink($resultFile);
			}
			return count($processed);
		}catch (Exception $e){
			$this->handleException($e);
		}
	}
254
255
	/**
	 * Handles all failed orders of ocr processing queue and returns the status objects.
	 *
	 * Every failed status gets its temp file removed (if it still exists) and
	 * is deleted from the database.
	 *
	 * @return array the status objects that had failed
	 * @throws Exception any failure, after it has been logged
	 */
	private function handleFailed(){
		try {
			$failed = $this->statusMapper->findAllFailed($this->userId);
			foreach ($failed as $status) {
				// clean the tempfile - directly, without handing the path to a shell;
				// a file that is already gone must not block the database cleanup
				$tempFile = $status->getTempFile();
				if (is_file($tempFile)) {
					unlink($tempFile);
				}
				// clean from db
				$this->statusMapper->delete($status);
			}
			$this->logger->debug('Following status objects failed: '.json_encode($failed), ['app' => 'ocr']);
			return $failed;
		}catch (Exception $e){
			$this->handleException($e);
		}
	}
275
276
277
	/**
	 * Returns a not existing file name for pdf or image processing
	 *
	 * The result is built from the file's directory (relative to the user's
	 * files dir) and its name without extension, followed by `_OCR.pdf` for
	 * pdf files and `_OCR.txt` for everything else.
	 *
	 * @param Files\FileInfo $fileInfo
	 * @return string
	 */
	private function buildNewName(Files\FileInfo $fileInfo){
		$name = $fileInfo->getName();
		$fullPath = $fileInfo->getPath();

		// get rid of the .png or .pdf and so on - cut at the last dot so that
		// extensions of any length (.jpeg, .tiff) are removed completely
		$dot = strrpos($name, '.');
		if ($dot === false || $dot === 0) {
			$fileName = $name;
		} else {
			$fileName = substr($name, 0, $dot);
		}

		// eliminate the file name from the path - only at the end, so that a
		// directory containing the same text stays untouched
		$nameLength = strlen($name);
		if ($nameLength > 0 && substr($fullPath, -$nameLength) === $name) {
			$filePath = substr($fullPath, 0, strlen($fullPath) - $nameLength);
		} else {
			$filePath = str_replace($name, '', $fullPath);
		}

		// and get the path on top of the user/files/ dir
		$filePath = str_replace('/'.$this->userId.'/files','',$filePath);

		// PDFs keep their type, IMAGES result in a text file
		$suffix = ($fileInfo->getMimetype() === $this::MIMETYPE_PDF) ? '_OCR.pdf' : '_OCR.txt';
		return Files::buildNotExistingFileName($filePath, $fileName.$suffix);
	}
298
299
	/**
	 * Returns the fileInfo for each file in files and checks
	 * if it has a allowed mimetype and some other conditions.
	 *
	 * Each entry has to be an array with the type 'file', a name and either
	 * a 'path' or a 'directory'.
	 *
	 * @param array $files
	 * @return array of Files\FileInfo
	 * @throws NotFoundException if an entry is malformed, cannot be resolved
	 *                           or has a mimetype that is not allowed
	 * @throws Exception any other failure, after it has been logged
	 */
	private function buildFileInfo(array $files){
		try {
			$fileArray = [];
			foreach ($files as $file) {
				// Check if anything is missing and file type is correct. Every key is
				// tested for existence first, so a malformed entry is rejected cleanly.
				$wellFormed = is_array($file)
					&& isset($file['type']) && $file['type'] === 'file'
					&& isset($file['name']) && $file['name'] !== ''
					&& (!empty($file['path']) || !empty($file['directory']));
				if (!$wellFormed) {
					throw new NotFoundException('Wrong path parameter.');
				}
				if (empty($file['path'])) {
					//Because new updated files have the property directory instead of path
					$file['path'] = $file['directory'];
				}
				// get correct path
				$path = $this->getCorrectPath($file);
				$fileInfo = $this->view->getFileInfo($path);
				if (!$fileInfo || !in_array($fileInfo->getMimetype(), $this::ALLOWED_MIMETYPES, true)) {
					$this->logger->debug('Getting FileInfo did not work or not included in the ALLOWED_MIMETYPES array.', ['app' => 'ocr']);
					throw new NotFoundException('Wrong parameters or wrong mimetype.');
				}
				$fileArray[] = $fileInfo;
			}
			return $fileArray;
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}
334
335
	/**
	 * Returns the correct path based on delivered file variable
	 *
	 * Joins the entry's 'path' and 'name' with a slash; a 'path' of '/' is
	 * treated as empty so the result does not start with two slashes.
	 *
	 * @param $file array with the keys 'path' and 'name'
	 * @return string
	 */
	private function getCorrectPath($file){
		$directory = ($file['path'] === '/') ? '' : $file['path'];
		return $directory . '/' . $file['name'];
	}
348
349
	/**
	 * Inits the Gearman client and sends the task to the background worker (async)
	 *
	 * The status object is inserted into the database first; its id is part
	 * of the JSON payload handed to the "ocr" background job.
	 *
	 * @param $type
	 * @param $datadirectory
	 * @param $path
	 * @param $tempFile
	 * @param $language
	 * @param $status status object to persist; has to offer getId()
	 * @param $occDir
	 * @throws NotFoundException if no gearman worker exists
	 * @throws Exception any other failure, after it has been logged
	 */
	private function sendGearmanJob($type, $datadirectory, $path, $tempFile, $language, $status, $occDir){
		try {
			$workerAvailable = $this->workerService->workerExists();
			if ($workerAvailable === false) {
				throw new NotFoundException('No gearman worker exists.');
			}
			$this->statusMapper->insert($status);

			// Gearman thing
			$client = new \GearmanClient();
			$client->addServer('127.0.0.1',4730);
			$payload = [
				'type' => $type,
				'datadirectory' => $datadirectory,
				'path' => $path,
				'tempfile' => $tempFile,
				'language' => $language,
				'statusid' => $status->getId(),
				'occdir' => $occDir
			];
			$result = $client->doBackground("ocr", json_encode($payload));
			$this->logger->debug('Gearman Client output: '.json_encode($result), ['app' => 'ocr']);
		} catch (Exception $e) {
			$this->handleException($e);
		}
	}
381
382
	/**
	 * Handle the possible thrown Exceptions from all methods of this class.
	 *
	 * Logs the exception and then always throws: a NotFoundException is
	 * re-created from the original message, anything else is rethrown as is.
	 *
	 * NOTE: static analysis reports this method as duplicated elsewhere in
	 * the project; extracting it into one shared place would remove that.
	 *
	 * @param Exception $e
	 * @throws Exception
	 * @throws NotFoundException
	 */
	private function handleException($e) {
		$context = ['app' => 'ocr', 'message' => 'Exception during ocr service function processing'];
		$this->logger->logException($e, $context);
		if (!($e instanceof NotFoundException)) {
			throw $e;
		}
		throw new NotFoundException($e->getMessage());
	}
397
}