Completed
Push — master ( 701b8c...6df4c8 )
by Maxence
02:44
created

FilesService::extractContentUsingTesseractOCR()   A

Complexity

Conditions 4
Paths 9

Size

Total Lines 21
Code Lines 13

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 21
rs 9.0534
c 0
b 0
f 0
cc 4
eloc 13
nc 9
nop 2
1
<?php
2
/**
3
 * Files_FullTextSearch - Index the content of your files
4
 *
5
 * This file is licensed under the Affero General Public License version 3 or
6
 * later. See the COPYING file.
7
 *
8
 * @author Maxence Lange <[email protected]>
9
 * @copyright 2018
10
 * @license GNU AGPL version 3 or any later version
11
 *
12
 * This program is free software: you can redistribute it and/or modify
13
 * it under the terms of the GNU Affero General Public License as
14
 * published by the Free Software Foundation, either version 3 of the
15
 * License, or (at your option) any later version.
16
 *
17
 * This program is distributed in the hope that it will be useful,
18
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20
 * GNU Affero General Public License for more details.
21
 *
22
 * You should have received a copy of the GNU Affero General Public License
23
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
24
 *
25
 */
26
27
namespace OCA\Files_FullTextSearch\Service;
28
29
30
use Exception;
31
use OC\App\AppManager;
32
use OCA\Files_FullTextSearch\Exceptions\FileIsNotIndexableException;
33
use OCA\Files_FullTextSearch\Exceptions\KnownFileMimeTypeException;
34
use OCA\Files_FullTextSearch\Exceptions\KnownFileSourceException;
35
use OCA\Files_FullTextSearch\Model\FilesDocument;
36
use OCA\Files_FullTextSearch\Provider\FilesProvider;
37
use OCA\Files_FullTextSearch_Tesseract\Service\TesseractService;
38
use OCA\FullTextSearch\Exceptions\InterruptException;
39
use OCA\FullTextSearch\Exceptions\TickDoesNotExistException;
40
use OCA\FullTextSearch\Model\Index;
41
use OCA\FullTextSearch\Model\IndexDocument;
42
use OCA\FullTextSearch\Model\Runner;
43
use OCP\AppFramework\IAppContainer;
44
use OCP\Files\File;
45
use OCP\Files\FileInfo;
46
use OCP\Files\Folder;
47
use OCP\Files\InvalidPathException;
48
use OCP\Files\IRootFolder;
49
use OCP\Files\Node;
50
use OCP\Files\NotFoundException;
51
use OCP\Files\NotPermittedException;
52
use OCP\Files\StorageNotAvailableException;
53
use OCP\IUserManager;
54
use OCP\Share\IManager;
55
56
class FilesService {
57
58
	/** Category set by parseMimeTypeText(): 'text/*' and 'application/epub+zip'. */
	const MIMETYPE_TEXT = 'files_text';
59
	/** Category set by parseMimeTypePDF(): exactly 'application/pdf'. */
	const MIMETYPE_PDF = 'files_pdf';
60
	/** Category set by parseMimeTypeOffice(): MS Office / OpenDocument style mime prefixes. */
	const MIMETYPE_OFFICE = 'files_office';
61
	/** Not referenced by the methods of this class. */
	const MIMETYPE_OCR = 'files_ocr';
62
	/** Not referenced by the methods of this class. */
	const MIMETYPE_IMAGE = 'files_image';
63
	/** Not referenced by the methods of this class. */
	const MIMETYPE_AUDIO = 'files_audio';
64
65
66
	/** @var IAppContainer */
67
	private $container;
68
69
	/** @var IRootFolder */
70
	private $rootFolder;
71
72
	/** @var IUserManager */
73
	private $userManager;
74
75
	/** @var AppManager */
76
	private $appManager;
77
78
	/** @var IManager */
79
	private $shareManager;
80
81
	/** @var ConfigService */
82
	private $configService;
83
84
	/** @var LocalFilesService */
85
	private $localFilesService;
86
87
	/** @var ExternalFilesService */
88
	private $externalFilesService;
89
90
	/** @var GroupFoldersService */
91
	private $groupFoldersService;
92
93
	/** @var MiscService */
94
	private $miscService;
95
96
97
	/**
	 * Stores the Nextcloud core services and the sibling services of this app
	 * that the indexing methods rely on.
	 *
	 * @param IAppContainer $container
	 * @param IRootFolder $rootFolder
	 * @param AppManager $appManager
	 * @param IUserManager $userManager
	 * @param IManager $shareManager
	 * @param ConfigService $configService
	 * @param LocalFilesService $localFilesService
	 * @param ExternalFilesService $externalFilesService
	 * @param GroupFoldersService $groupFoldersService
	 * @param MiscService $miscService
	 */
	public function __construct(
		IAppContainer $container,
		IRootFolder $rootFolder,
		AppManager $appManager,
		IUserManager $userManager,
		IManager $shareManager,
		ConfigService $configService,
		LocalFilesService $localFilesService,
		ExternalFilesService $externalFilesService,
		GroupFoldersService $groupFoldersService,
		MiscService $miscService
	) {
		// services of this app
		$this->miscService = $miscService;
		$this->configService = $configService;
		$this->localFilesService = $localFilesService;
		$this->externalFilesService = $externalFilesService;
		$this->groupFoldersService = $groupFoldersService;

		// services provided by the Nextcloud core
		$this->container = $container;
		$this->rootFolder = $rootFolder;
		$this->appManager = $appManager;
		$this->userManager = $userManager;
		$this->shareManager = $shareManager;
	}
135
136
137
	/**
	 * Builds a FilesDocument for every node found in the files of a user.
	 *
	 * The extra file systems of the user are initialised first, then the root
	 * of the user folder is handed to getFilesFromDirectory().
	 *
	 * @param Runner $runner
	 * @param string $userId
	 *
	 * @return FilesDocument[]
	 * @throws InterruptException
	 * @throws InvalidPathException
	 * @throws NotFoundException
	 * @throws TickDoesNotExistException
	 */
	public function getFilesFromUser(Runner $runner, $userId) {
		$this->initFileSystems($userId);

		/** @var Folder $root */
		$root = $this->rootFolder->getUserFolder($userId)
								 ->get('/');

		return $this->getFilesFromDirectory($runner, $userId, $root);
	}
158
159
160
	/**
	 * Initialises external storages and group folder shares for a user.
	 * Nothing happens when the user id is the empty string.
	 *
	 * @param string $userId
	 */
	private function initFileSystems($userId) {
		if ($userId !== '') {
			$this->externalFilesService->initExternalFilesForUser($userId);
			$this->groupFoldersService->initGroupSharesForUser($userId);
		}
	}
171
172
173
	/**
	 * Recursively builds the list of FilesDocument for a folder.
	 *
	 * A folder containing a '.noindex' entry, or whose storage is not
	 * available, yields an empty list. Nodes that raise
	 * FileIsNotIndexableException are skipped; a skipped folder is not
	 * descended into.
	 *
	 * @param Runner $runner
	 * @param string $userId
	 * @param Folder $node
	 *
	 * @return FilesDocument[]
	 * @throws InterruptException
	 * @throws InvalidPathException
	 * @throws NotFoundException
	 * @throws TickDoesNotExistException
	 */
	public function getFilesFromDirectory(Runner $runner, $userId, Folder $node) {
		$documents = [];

		try {
			if ($node->nodeExists('.noindex')) {
				return $documents;
			}
		} catch (StorageNotAvailableException $e) {
			// the storage cannot be reached, there is nothing to list in this folder.
			return $documents;
		}

		foreach ($node->getDirectoryListing() as $file) {
			$runner->update('getFilesFromDirectory');

			try {
				$documents[] = $this->generateFilesDocumentFromFile($file, $userId);
			} catch (FileIsNotIndexableException $e) {
				continue;
			}

			if ($file->getType() !== FileInfo::TYPE_FOLDER) {
				continue;
			}

			// Append the children one by one: array_merge() would copy the whole
			// accumulated list again for every sub-folder.
			/** @var Folder $file */
			foreach ($this->getFilesFromDirectory($runner, $userId, $file) as $child) {
				$documents[] = $child;
			}
		}

		return $documents;
	}
214
215
216
	/**
	 * Creates the FilesDocument describing a node, as seen by a given viewer.
	 *
	 * The source of the file is resolved first, so a node that is not
	 * indexable raises before any document is created.
	 *
	 * @param Node $file
	 * @param string $viewerId
	 *
	 * @return FilesDocument
	 * @throws FileIsNotIndexableException
	 * @throws InvalidPathException
	 * @throws NotFoundException
	 * @throws Exception
	 */
	private function generateFilesDocumentFromFile(Node $file, $viewerId) {
		$source = $this->getFileSource($file);
		$document = new FilesDocument(FilesProvider::FILES_PROVIDER_ID, $file->getId());

		// a node without owner is stored with an empty owner id.
		$owner = $file->getOwner();
		$ownerId = ($owner === null) ? '' : $owner->getUID();

		$document->setType($file->getType())
				 ->setSource($source)
				 ->setOwnerId($ownerId)
				 ->setPath($this->getPathFromViewerId($file->getId(), $viewerId))
				 ->setViewerId($viewerId)
				 ->setModifiedTime($file->getMTime())
				 ->setMimetype($file->getMimetype());

		return $document;
	}
248
249
250
	/**
	 * Asks the local, external and group folder services, in that order, for
	 * the source of a node.
	 *
	 * The lookup stops at the first service raising KnownFileSourceException.
	 * If none of them does, the initial empty string is returned unless a
	 * service changed the value it was handed (presumably by reference; the
	 * service signatures are not visible here).
	 *
	 * @param Node $file
	 *
	 * @return string
	 * @throws FileIsNotIndexableException
	 * @throws NotFoundException
	 */
	private function getFileSource(Node $file) {
		$origin = '';

		try {
			$this->localFilesService->getFileSource($file, $origin);
			$this->externalFilesService->getFileSource($file, $origin);
			$this->groupFoldersService->getFileSource($file, $origin);
		} catch (KnownFileSourceException $e) {
			// a service recognised the file, no need to ask the remaining ones.
		}

		return $origin;
	}
270
271
272
	/**
	 * Returns the node stored at a path relative to the files of a user.
	 *
	 * @param string $userId
	 * @param string $path
	 *
	 * @return Node
	 * @throws NotFoundException
	 */
	public function getFileFromPath($userId, $path) {
		$userFolder = $this->rootFolder->getUserFolder($userId);

		return $userFolder->get($path);
	}
283
284
285
	/**
	 * Looks up a node by id inside the files of a user.
	 *
	 * Null is returned when the user id is empty, when the lookup raises any
	 * exception, or when no node matches. If several nodes match, the first
	 * one is returned.
	 *
	 * @param string $userId
	 * @param int $fileId
	 *
	 * @return Node|null
	 */
	public function getFileFromId($userId, $fileId) {
		if ($userId === '') {
			return null;
		}

		try {
			$nodes = $this->rootFolder->getUserFolder($userId)
									  ->getById($fileId);
		} catch (Exception $e) {
			return null;
		}

		return (count($nodes) === 0) ? null : array_shift($nodes);
	}
312
313
314
	/**
	 * Returns the path of a file inside the files of a viewer, or an empty
	 * string when the viewer has no node with that id.
	 *
	 * NOTE(review): this method strips 8 + strlen($viewerId) characters from
	 * the node path, while setDocumentInfo() and setDocumentMore() strip
	 * 7 + strlen($viewerId); confirm which of the two offsets is intended.
	 *
	 * @param int $fileId
	 * @param string $viewerId
	 *
	 * @throws Exception
	 * @return string
	 */
	private function getPathFromViewerId($fileId, $viewerId) {
		$matches = $this->rootFolder->getUserFolder($viewerId)
									->getById($fileId);

		if (count($matches) === 0) {
			return '';
		}

		// only the first matching node is used.
		$node = array_shift($matches);

		// TODO: better way to do this : we remove the '/userid/files/'
		$prefixLength = 8 + strlen($viewerId);

		return MiscService::noEndSlash(substr($node->getPath(), $prefixLength));
	}
337
338
339
	/**
	 * Fills the path and the file name of a document from the node, as seen
	 * by the viewer stored in the access of the document.
	 *
	 * The document is left untouched when the viewer has no node with the id
	 * of the document.
	 *
	 * @param FilesDocument $document
	 */
	public function setDocumentInfo(FilesDocument $document) {
		$viewerId = $document->getAccess()
							 ->getViewerId();

		$matches = $this->rootFolder->getUserFolder($viewerId)
									->getById($document->getId());

		if (count($matches) === 0) {
			return;
		}

		// we only take the first file
		$node = array_shift($matches);

		// TODO: better way to do this : we remove the '/userId/files/'
		$prefixLength = 7 + strlen($viewerId);
		$relativePath = MiscService::noEndSlash(substr($node->getPath(), $prefixLength));

		$document->setPath($relativePath);
		$document->setFileName($node->getName());
	}
362
363
364
	/**
	 * Uses the path of the document as its title.
	 *
	 * @param FilesDocument $document
	 */
	public function setDocumentTitle(FilesDocument $document) {
		$title = $document->getPath();
		$document->setTitle($title);
	}
370
371
372
	/**
	 * Sets the link of a document to the 'files.view.index' route, opened on
	 * the directory of the file and scrolled to the file itself.
	 *
	 * The directory is the path of the document without its trailing file
	 * name. When the file name is empty, the whole path is used as directory.
	 *
	 * @param FilesDocument $document
	 */
	public function setDocumentLink(FilesDocument $document) {
		$path = $document->getPath();
		$filename = $document->getFileName();

		// substr($path, 0, -0) returns an empty string, not $path: only strip the
		// file name when there is one.
		$dir = $path;
		$filenameLength = strlen($filename);
		if ($filenameLength > 0) {
			$dir = substr($path, 0, -$filenameLength);
		}

		$document->setLink(
			\OC::$server->getURLGenerator()
						->linkToRoute(
							'files.view.index',
							[
								'dir'      => $dir,
								'scrollto' => $filename,
							]
						)
		);
	}
392
393
394
	/**
	 * Attaches extra details about the file (webdav id, path, timestamps,
	 * mimetype, etag, permissions, size, favorite flag) to a document.
	 *
	 * Nothing is set when the file cannot be found for the viewer of the
	 * document.
	 *
	 * @param FilesDocument $document
	 *
	 * @throws InvalidPathException
	 * @throws NotFoundException
	 */
	public function setDocumentMore(FilesDocument $document) {
		$viewerId = $document->getAccess()
							 ->getViewerId();
		$file = $this->getFileFromId($viewerId, $document->getId());

		if ($file === null) {
			return;
		}

		// TODO: better way to do this : we remove the '/userid/files/'
		$relativePath = MiscService::noEndSlash(substr($file->getPath(), 7 + strlen($viewerId)));

		$document->setMore(
			[
				'webdav'             => $this->getWebdavId($document->getId()),
				'path'               => $relativePath,
				'timestamp'          => $file->getMTime(), // FIXME: get the creation date of the file
				'mimetype'           => $file->getMimetype(),
				'modified_timestamp' => $file->getMTime(),
				'etag'               => $file->getEtag(),
				'permissions'        => $file->getPermissions(),
				'size'               => $file->getSize(),
				'favorite'           => false // FIXME: get the favorite status
			]
		);
	}
427
428
429
	/**
	 * Updates every FilesDocument of a list and returns them.
	 *
	 * Entries that are not FilesDocument are dropped. A document whose update
	 * raises an exception is flagged Index::INDEX_IGNORE and still returned.
	 *
	 * @param FilesDocument[] $documents
	 *
	 * @return FilesDocument[]
	 */
	public function generateDocuments($documents) {
		$updated = [];

		foreach ($documents as $document) {
			if ($document instanceof FilesDocument) {
				try {
					$this->updateFilesDocument($document);
				} catch (Exception $e) {
					// TODO - update $document with a error status instead of just ignore !
					$document->getIndex()
							 ->setStatus(Index::INDEX_IGNORE);
					// NOTE(review): the trace is written straight to the output; consider a logger.
					echo 'Exception: ' . json_encode($e->getTrace()) . ' - ' . $e->getMessage() . "\n";
				}

				$updated[] = $document;
			}
		}

		return $updated;
	}
457
458
459
	/**
	 * Builds the document matching an index entry.
	 *
	 * When the file still exists for the owner of the index, the document is
	 * generated from the file and updated. Otherwise the index is flagged
	 * Index::INDEX_REMOVE and an otherwise empty document carrying that index
	 * is returned.
	 *
	 * @param Index $index
	 *
	 * @return FilesDocument
	 * @throws FileIsNotIndexableException
	 * @throws InvalidPathException
	 * @throws NotFoundException
	 * @throws NotPermittedException
	 */
	private function generateDocumentFromIndex(Index $index) {
		$ownerId = $index->getOwnerId();
		$file = $this->getFileFromId($ownerId, $index->getDocumentId());

		if ($file !== null) {
			$document = $this->generateFilesDocumentFromFile($file, $ownerId);
			$document->setIndex($index);

			$this->updateFilesDocumentFromFile($document, $file);

			return $document;
		}

		// the file is gone: ask for the removal of the document.
		$index->setStatus(Index::INDEX_REMOVE);
		$removed = new FilesDocument($index->getProviderId(), $index->getDocumentId());
		$removed->setIndex($index);

		return $removed;
	}
486
487
488
	/**
	 * Tells whether the index of a document is still valid.
	 *
	 * When the index options no longer match the configuration, the index is
	 * flagged Index::INDEX_CONTENT, stored back on the document, and false is
	 * returned. Otherwise the document is up to date only if the index status
	 * is Index::INDEX_OK and the last indexing is not older than the last
	 * modification of the document.
	 *
	 * @param IndexDocument $document
	 *
	 * @return bool
	 */
	public function isDocumentUpToDate($document) {
		$index = $document->getIndex();

		if (!$this->configService->compareIndexOptions($index)) {
			$index->setStatus(Index::INDEX_CONTENT);
			$document->setIndex($index);

			return false;
		}

		return $index->getStatus() === Index::INDEX_OK
			   && $index->getLastIndex() >= $document->getModifiedTime();
	}
513
514
515
	/**
	 * Regenerates the document matching an index entry.
	 *
	 * The owner is impersonated and the file systems of the owner are
	 * initialised before the document is generated. Null is returned when the
	 * file is not indexable.
	 *
	 * @param Index $index
	 *
	 * @return FilesDocument|null
	 * @throws InvalidPathException
	 * @throws NotFoundException
	 * @throws NotPermittedException
	 */
	public function updateDocument(Index $index) {
		$this->impersonateOwner($index);
		$this->initFileSystems($index->getOwnerId());

		try {
			return $this->generateDocumentFromIndex($index);
		} catch (FileIsNotIndexableException $e) {
			return null;
		}
	}
535
536
537
	/**
	 * Reloads the node of a document from the files of its viewer and updates
	 * the document from it. A node that is not indexable flags the index of
	 * the document as Index::INDEX_IGNORE.
	 *
	 * @param FilesDocument $document
	 *
	 * @throws InvalidPathException
	 * @throws NotFoundException
	 * @throws NotPermittedException
	 */
	private function updateFilesDocument(FilesDocument $document) {
		$node = $this->rootFolder->getUserFolder($document->getViewerId())
								 ->get($document->getPath());

		try {
			$this->updateFilesDocumentFromFile($document, $node);
		} catch (FileIsNotIndexableException $e) {
			$document->getIndex()
					 ->setStatus(Index::INDEX_IGNORE);
		}
	}
555
556
557
	/**
	 * Copies the source of the document to its index, refreshes access rights
	 * and content from the node, then tags the document with its source.
	 *
	 * @param FilesDocument $document
	 * @param Node $file
	 *
	 * @throws InvalidPathException
	 * @throws NotFoundException
	 * @throws NotPermittedException
	 */
	private function updateFilesDocumentFromFile(FilesDocument $document, Node $file) {
		$index = $document->getIndex();
		$index->setSource($document->getSource());

		$this->updateDocumentAccess($document, $file);
		$this->updateContentFromFile($document, $file);

		$document->addTag($document->getSource());
	}
575
576
577
	/**
	 * Refreshes the access rights and the share names of a document.
	 *
	 * Only done when the index of the document is flagged Index::INDEX_FULL
	 * or FilesDocument::STATUS_FILE_ACCESS.
	 *
	 * @param FilesDocument $document
	 * @param Node $file
	 */
	private function updateDocumentAccess(FilesDocument $document, Node $file) {
		$index = $document->getIndex();

		$accessIsOutdated = $index->isStatus(Index::INDEX_FULL)
							|| $index->isStatus(FilesDocument::STATUS_FILE_ACCESS);
		if (!$accessIsOutdated) {
			return;
		}

		$this->localFilesService->updateDocumentAccess($document, $file);
		$this->externalFilesService->updateDocumentAccess($document, $file);
		$this->groupFoldersService->updateDocumentAccess($document, $file);

		$this->updateShareNames($document, $file);
	}
596
597
598
	/**
	 * Sets the title of a document and, when its content must be indexed,
	 * extracts that content from the file.
	 *
	 * Content is only extracted when the index is flagged Index::INDEX_CONTENT,
	 * the node is a regular file, and the file is smaller than the configured
	 * limit (ConfigService::FILES_SIZE, multiplied by 1024 * 1024). When no
	 * content ends up being set, the Index::INDEX_CONTENT flag is removed.
	 *
	 * @param FilesDocument $document
	 * @param Node $file
	 *
	 * @throws InvalidPathException
	 * @throws NotFoundException
	 * @throws NotPermittedException
	 */
	private function updateContentFromFile(FilesDocument $document, Node $file) {

		$document->setTitle($document->getPath());

		if (!$document->getIndex()
					  ->isStatus(Index::INDEX_CONTENT)
			|| $file->getType() !== FileInfo::TYPE_FILE) {
			return;
		}

		// The app value is presumably stored as a string (other options are
		// compared to '1'); cast it so an empty or non-numeric value counts as 0
		// instead of raising a warning or a TypeError during the multiplication.
		$maxSize = (float)$this->configService->getAppValue(ConfigService::FILES_SIZE) * 1024 * 1024;

		/** @var File $file */
		if ($file->getSize() < $maxSize) {
			$this->extractContentFromFileText($document, $file);
			$this->extractContentFromFileOffice($document, $file);
			$this->extractContentFromFilePDF($document, $file);
			$this->extractContentFromFileOCR($document, $file);
		}

		if ($document->getContent() === null) {
			$document->getIndex()
					 ->unsetStatus(Index::INDEX_CONTENT);
		}
	}
630
631
632
	/**
	 * Stores, under the 'share_names' info of a document, the path of the
	 * file as seen by each user it is shared with.
	 *
	 * Users are collected from the local, external and group folder services.
	 * The map is keyed by MiscService::secureUsername() of the user; users
	 * whose path lookup raises an exception are left out.
	 *
	 * @param FilesDocument $document
	 * @param Node $file
	 *
	 * @return array
	 */
	private function updateShareNames(FilesDocument $document, Node $file) {
		$users = [];

		$this->localFilesService->getShareUsersFromFile($file, $users);
		$this->externalFilesService->getShareUsers($document, $users);
		$this->groupFoldersService->getShareUsers($document, $users);

		$fileId = $file->getId();
		$shareNames = [];
		foreach ($users as $user) {
			try {
				$shareNames[MiscService::secureUsername($user)] =
					$this->getPathFromViewerId($fileId, $user);
			} catch (Exception $e) {
				// no path can be resolved for this user, leave the user out.
			}
		}

		$document->setInfo('share_names', $shareNames);

		return $shareNames;
	}
674
675
	/**
	 * Builds the webdav id of a file: the file id left-padded with zeros to
	 * 8 characters, followed by the 'instanceid' system value.
	 *
	 * @param int $fileId
	 *
	 * @return string
	 */
	private function getWebdavId($fileId) {
		$instanceId = $this->configService->getSystemValue('instanceid');
		$paddedId = str_pad((string)$fileId, 8, '0', STR_PAD_LEFT);

		return $paddedId . $instanceId;
	}
685
686
687
	/**
	 * Maps a mime type to one of the MIMETYPE_* categories handled by this
	 * class (text, PDF, office), or to an empty string when none matches.
	 *
	 * @param string $mimeType
	 *
	 * @return string
	 */
	private function parseMimeType($mimeType) {
		$category = '';

		try {
			$this->parseMimeTypeText($mimeType, $category);
			$this->parseMimeTypePDF($mimeType, $category);
			$this->parseMimeTypeOffice($mimeType, $category);
		} catch (KnownFileMimeTypeException $e) {
			// a parser recognised the mime type and already filled $category.
		}

		return $category;
	}
704
705
706
	/**
	 * Recognises text mime types: anything starting with 'text/' or with
	 * 'application/epub+zip'.
	 *
	 * On a match, $parsed is set to self::MIMETYPE_TEXT and
	 * KnownFileMimeTypeException is thrown to stop the lookup.
	 *
	 * @param string $mimeType
	 * @param string $parsed
	 *
	 * @throws KnownFileMimeTypeException
	 */
	private function parseMimeTypeText($mimeType, &$parsed) {
		$textPrefixes = [
			'text/',
			'application/epub+zip'
		];

		foreach ($textPrefixes as $prefix) {
			if (strpos($mimeType, $prefix) === 0) {
				$parsed = self::MIMETYPE_TEXT;
				throw new KnownFileMimeTypeException();
			}
		}
	}
730
731
732
	/**
	 * Recognises the PDF mime type, which must be exactly 'application/pdf'.
	 *
	 * On a match, $parsed is set to self::MIMETYPE_PDF and
	 * KnownFileMimeTypeException is thrown to stop the lookup.
	 *
	 * @param string $mimeType
	 * @param string $parsed
	 *
	 * @throws KnownFileMimeTypeException
	 */
	private function parseMimeTypePDF($mimeType, &$parsed) {
		if ($mimeType !== 'application/pdf') {
			return;
		}

		$parsed = self::MIMETYPE_PDF;
		throw new KnownFileMimeTypeException();
	}
745
746
747
	/**
	 * Recognises office mime types by prefix (MS Word/Excel/PowerPoint,
	 * OpenDocument, Sun XML, OpenXML).
	 *
	 * On a match, $parsed is set to self::MIMETYPE_OFFICE and
	 * KnownFileMimeTypeException is thrown to stop the lookup.
	 *
	 * @param string $mimeType
	 * @param string $parsed
	 *
	 * @throws KnownFileMimeTypeException
	 */
	private function parseMimeTypeOffice($mimeType, &$parsed) {
		$officePrefixes = [
			'application/msword',
			'application/vnd.oasis.opendocument',
			'application/vnd.sun.xml',
			'application/vnd.openxmlformats-officedocument',
			'application/vnd.ms-word',
			'application/vnd.ms-powerpoint',
			'application/vnd.ms-excel'
		];

		foreach ($officePrefixes as $prefix) {
			if (substr($mimeType, 0, strlen($prefix)) !== $prefix) {
				continue;
			}

			$parsed = self::MIMETYPE_OFFICE;
			throw new KnownFileMimeTypeException();
		}
	}
772
773
774
	/**
	 * Stores the base64 encoded content of a text file in the document, when
	 * the mime type is a text one and the source of the document is indexable.
	 *
	 * @param FilesDocument $document
	 * @param File $file
	 *
	 * @throws NotPermittedException
	 */
	private function extractContentFromFileText(FilesDocument $document, File $file) {
		$isText = ($this->parseMimeType($document->getMimeType()) === self::MIMETYPE_TEXT);

		if ($isText && $this->isSourceIndexable($document)) {
			$encoded = base64_encode($file->getContent());
			$document->setContent($encoded, IndexDocument::ENCODED_BASE64);
		}
	}
792
793
794
	/**
	 * Stores the base64 encoded content of a PDF file in the document.
	 *
	 * Ignored for other mime types. The ConfigService::FILES_PDF option is
	 * recorded on the document; the content is set to an empty string when
	 * the source is not indexable or when that option is not '1'.
	 *
	 * @param FilesDocument $document
	 * @param File $file
	 *
	 * @throws NotPermittedException
	 */
	private function extractContentFromFilePDF(FilesDocument $document, File $file) {
		$category = $this->parseMimeType($document->getMimeType());
		if ($category !== self::MIMETYPE_PDF) {
			return;
		}

		$this->configService->setDocumentIndexOption($document, ConfigService::FILES_PDF);
		if (!$this->isSourceIndexable($document)) {
			return;
		}

		if ($this->configService->getAppValue(ConfigService::FILES_PDF) === '1') {
			$encoded = base64_encode($file->getContent());
			$document->setContent($encoded, IndexDocument::ENCODED_BASE64);

			return;
		}

		// indexing of PDF content is disabled.
		$document->setContent('');
	}
818
819
820
	/**
	 * Stores the base64 encoded content of an office file in the document.
	 *
	 * Ignored for other mime types. The ConfigService::FILES_OFFICE option is
	 * recorded on the document; the content is set to an empty string when
	 * the source is not indexable or when that option is not '1'.
	 *
	 * @param FilesDocument $document
	 * @param File $file
	 *
	 * @throws NotPermittedException
	 */
	private function extractContentFromFileOffice(FilesDocument $document, File $file) {
		$category = $this->parseMimeType($document->getMimeType());
		if ($category !== self::MIMETYPE_OFFICE) {
			return;
		}

		$this->configService->setDocumentIndexOption($document, ConfigService::FILES_OFFICE);
		if (!$this->isSourceIndexable($document)) {
			return;
		}

		if ($this->configService->getAppValue(ConfigService::FILES_OFFICE) === '1') {
			$encoded = base64_encode($file->getContent());
			$document->setContent($encoded, IndexDocument::ENCODED_BASE64);

			return;
		}

		// indexing of office content is disabled.
		$document->setContent('');
	}
844
845
846
	/**
	 * Falls back to OCR for a file that has no content yet.
	 *
	 * Only done when the ConfigService::FILES_OCR option is '1' and the
	 * current content of the document is null or an empty string. The content
	 * is reset to an empty string before Tesseract is tried.
	 *
	 * @param FilesDocument $document
	 * @param File $file
	 */
	private function extractContentFromFileOCR(FilesDocument $document, File $file) {
		if ($this->configService->getAppValue(ConfigService::FILES_OCR) !== '1') {
			return;
		}

		$current = $document->getContent();
		if ($current === null || $current === '') {
			$document->setContent('');
			$this->extractContentUsingTesseractOCR($document, $file);
		}
	}
862
863
864
	/**
	 * Runs the file through the TesseractService and stores the base64
	 * encoded result as content of the document.
	 *
	 * The service is resolved from the container on demand. Any exception
	 * raised while resolving or running it leaves the document unchanged.
	 * Nothing is extracted either when the service does not handle the
	 * mimetype/extension, or when the source of the document is not indexable.
	 *
	 * @param FilesDocument $document
	 * @param File $file
	 */
	private function extractContentUsingTesseractOCR(FilesDocument $document, File $file) {
		try {
			$tesseract = $this->container->query(TesseractService::class);
			$extension = pathinfo($document->getPath(), PATHINFO_EXTENSION);

			$handled = $tesseract->parsedMimeType($document->getMimetype(), $extension);
			if (!$handled) {
				return;
			}

			$this->configService->setDocumentIndexOption($document, ConfigService::FILES_OCR);
			if (!$this->isSourceIndexable($document)) {
				return;
			}

			$text = $tesseract->ocrFile($file);
		} catch (Exception $e) {
			// OCR is best effort: without a result the document stays as it is.
			return;
		}

		$document->setContent(base64_encode($text), IndexDocument::ENCODED_BASE64);
	}
889
890
891
	/**
	 * Tells whether indexing is enabled for the source of a document.
	 *
	 * The option named after the source is recorded on the document. When
	 * its app value is not '1', the content of the document is set to an
	 * empty string and false is returned.
	 *
	 * @param FilesDocument $document
	 *
	 * @return bool
	 */
	private function isSourceIndexable(FilesDocument $document) {
		$this->configService->setDocumentIndexOption($document, $document->getSource());

		$enabled = ($this->configService->getAppValue($document->getSource()) === '1');
		if (!$enabled) {
			$document->setContent('');
		}

		return $enabled;
	}
906
907
908
	/**
	 * Lets the group folders service impersonate an owner for an index that
	 * has no owner id. An index that already has an owner is left untouched.
	 *
	 * @param Index $index
	 */
	private function impersonateOwner(Index $index) {
		if ($index->getOwnerId() === '') {
			$this->groupFoldersService->impersonateOwner($index);
		}
	}
915
916
}
917
918