Passed
Push — master ( 904963...c192c4 )
by
unknown
14:57 queued 05:56
created

DocumentXapianIndexer::resolveCourseSessionAndRootNode()   B

Complexity

Conditions 11
Paths 14

Size

Total Lines 31
Code Lines 17

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 11
eloc 17
c 1
b 0
f 0
nc 14
nop 1
dl 0
loc 31
rs 7.3166

How to fix   Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
declare(strict_types=1);
4
5
/* For licensing terms, see /license.txt */
6
7
namespace Chamilo\CoreBundle\Search\Xapian;
8
9
use Chamilo\CoreBundle\Entity\ResourceFile;
10
use Chamilo\CoreBundle\Entity\ResourceLink;
11
use Chamilo\CoreBundle\Entity\ResourceNode;
12
use Chamilo\CoreBundle\Entity\SearchEngineRef;
13
use Chamilo\CoreBundle\Settings\SettingsManager;
14
use Chamilo\CourseBundle\Entity\CDocument;
15
use Doctrine\DBAL\Connection;
16
use Doctrine\ORM\EntityManagerInterface;
17
use Symfony\Component\HttpFoundation\Request;
18
use Symfony\Component\HttpFoundation\RequestStack;
19
use Symfony\Component\Process\Process;
20
use Throwable;
21
use ZipArchive;
22
23
/**
24
 * Handles Xapian indexing for CDocument entities.
25
 */
26
final class DocumentXapianIndexer
27
{
28
    /**
     * Receive the collaborators used by this indexer.
     *
     * Every argument is stored as a private readonly promoted property; the
     * constructor itself performs no work.
     */
    public function __construct(
        private readonly XapianIndexService $xapianIndexService,
        private readonly EntityManagerInterface $em,
        private readonly SettingsManager $settingsManager,
        private readonly DocumentRawTextExtractor $rawTextExtractor,
        private readonly RequestStack $requestStack,
    ) {
    }
35
36
    /**
     * Index (or re-index) a CDocument into Xapian.
     *
     * Indexing is skipped when the "search.search_enabled" setting is not 'true',
     * when the document has no ResourceNode, or when the document is a folder.
     * A Xapian document previously recorded for the node is deleted first (a
     * failure there is only logged), then the new document is written and the
     * SearchEngineRef row is created or updated with the new Xapian id.
     *
     * @return int|null Xapian document id, or null when indexing is skipped or Xapian indexing fails
     */
    public function indexDocument(CDocument $document): ?int
    {
        $node = $document->getResourceNode();

        $searchSetting = (string) $this->settingsManager->getSetting('search.search_enabled', true);
        if ('true' !== $searchSetting) {
            error_log('[Xapian] indexDocument: search is disabled, skipping indexing');

            return null;
        }

        if (!$node instanceof ResourceNode) {
            error_log('[Xapian] indexDocument: missing ResourceNode, skipping');

            return null;
        }

        if ('folder' === $document->getFiletype()) {
            error_log('[Xapian] indexDocument: skipping folder document, resource_node_id='.$node->getId());

            return null;
        }

        [$courseId, $sessionId, $courseRootNodeId] = $this->resolveCourseSessionAndRootNode($node);

        $content = $this->rawTextExtractor->extract($document);

        // Ids are stored as strings in the Xapian document; a missing id becomes ''.
        $idAsString = static fn ($id): string => null !== $id ? (string) $id : '';

        $fields = [
            'title' => (string) $document->getTitle(),
            'description' => (string) ($document->getComment() ?? ''),
            'content' => $content,
            'filetype' => (string) $document->getFiletype(),
            'resource_node_id' => (string) $node->getId(),
            'course_id' => $idAsString($courseId),
            'session_id' => $idAsString($sessionId),
            'course_root_node_id' => $idAsString($courseRootNodeId),
            'full_path' => $document->getFullPath(),
        ];

        // Boolean terms: document type, then course ("C<id>") and session ("S<id>") when known.
        $terms = ['Tdocument'];
        foreach (['C' => $courseId, 'S' => $sessionId] as $termPrefix => $id) {
            if (null !== $id) {
                $terms[] = $termPrefix.$id;
            }
        }

        $this->applyPrefilterConfigToTerms($terms, $courseId, $sessionId, $document);

        $nodeId = (int) $node->getId();
        $nodeRef = $this->em->getReference(ResourceNode::class, $nodeId);

        /** @var SearchEngineRef|null $ref */
        $ref = $this->em
            ->getRepository(SearchEngineRef::class)
            ->findOneBy(['resourceNode' => $nodeRef]);

        // Drop the previously indexed Xapian document, if any (best effort).
        $previousDocId = $ref?->getSearchDid();
        if (null !== $previousDocId) {
            try {
                $this->xapianIndexService->deleteDocument($previousDocId);
            } catch (Throwable $e) {
                error_log('[Xapian] indexDocument: failed to delete previous docId='.$previousDocId.' error='.$e->getMessage());
            }
        }

        // Request values (keyed by code or field id) are normalized to "code => value"
        // and laid over the stored values, so the request wins per field.
        $requestByCode = $this->normalizeSearchFieldValuesToCode(
            $this->extractSearchFieldValuesFromRequest()
        );
        $storedByCode = $this->fetchStoredSearchFieldValuesByCode($nodeId);
        $fieldValuesByCode = array_replace($storedByCode, $requestByCode);

        // Language ISO code passed to the index service (resource file first, then node).
        $languageIso = $this->resolveLanguageIsoForResourceNode($node);

        try {
            $docId = $this->xapianIndexService->indexDocument($fields, $terms, $languageIso, $fieldValuesByCode);
        } catch (Throwable $e) {
            error_log('[Xapian] indexDocument: Xapian indexing failed: '.$e->getMessage());

            return null;
        }

        // Create the reference row on first indexing, otherwise just repoint it.
        $isNewRef = !$ref instanceof SearchEngineRef;
        if ($isNewRef) {
            $ref = new SearchEngineRef();
            $ref->setResourceNode($nodeRef);
        }
        $ref->setSearchDid($docId);
        if ($isNewRef) {
            $this->em->persist($ref);
        }

        // Persist dynamic search field values (create/update).
        $this->syncSearchEngineFieldValues($nodeId, $document, $content);

        $this->em->flush();

        return $docId;
    }
151
152
    /**
     * Remove the search data recorded for a resource node.
     *
     * Deletes the node's search_engine_field_value rows, then the Xapian
     * document referenced by its SearchEngineRef, then the SearchEngineRef
     * itself. Does nothing when the "search.search_enabled" setting is not
     * 'true'. Failures of the SQL delete or of the Xapian delete are logged
     * and do not stop the remaining steps.
     */
    public function deleteForResourceNodeId(int $resourceNodeId): void
    {
        if ('true' !== (string) $this->settingsManager->getSetting('search.search_enabled', true)) {
            error_log('[Xapian] deleteForResourceNodeId: search is disabled, skipping');

            return;
        }

        try {
            $this->em->getConnection()->executeStatement(
                'DELETE FROM search_engine_field_value WHERE resource_node_id = ?',
                [$resourceNodeId]
            );
        } catch (Throwable $e) {
            error_log('[Xapian] deleteForResourceNodeId: failed to delete field values: '.$e->getMessage());
        }

        $nodeRef = $this->em->getReference(ResourceNode::class, $resourceNodeId);

        /** @var SearchEngineRef|null $searchRef */
        $searchRef = $this->em
            ->getRepository(SearchEngineRef::class)
            ->findOneBy(['resourceNode' => $nodeRef]);

        if (!$searchRef instanceof SearchEngineRef) {
            error_log('[Xapian] deleteForResourceNodeId: no SearchEngineRef found, nothing to delete');

            return;
        }

        $xapianDocId = $searchRef->getSearchDid();
        if (null !== $xapianDocId) {
            try {
                $this->xapianIndexService->deleteDocument($xapianDocId);
            } catch (Throwable $e) {
                error_log('[Xapian] deleteForResourceNodeId: deleteDocument failed for did='.$xapianDocId.' error='.$e->getMessage());
            }
        }

        $this->em->remove($searchRef);
        $this->em->flush();
    }
193
194
    /**
     * Persist search_engine_field_value rows for a resource node from the values sent by the UI/API.
     *
     * Accepted request shapes (as returned by extractSearchFieldValuesFromRequest()):
     * - multipart: searchFieldValues[t]=..., searchFieldValues[d]=...
     * - multipart: searchFieldValues as JSON string {"t":"..."}
     * - legacy/alt: searchFieldValues as array keyed by field id (1,2,3)
     *
     * When the request carries no values and rows already exist for the node, the
     * stored rows are left untouched. Otherwise the node's rows are replaced:
     * request values first, then fallbacks from guessFallbackValue() for fields the
     * request did not mention.
     *
     * The replace (DELETE + INSERTs) runs inside a transaction that this method
     * opens only when no transaction is already active, so a failing INSERT does
     * not leave the node with its previous values deleted. Failures are logged,
     * never thrown.
     */
    private function syncSearchEngineFieldValues(int $resourceNodeId, CDocument $document, string $content): void
    {
        $conn = $this->em->getConnection();

        $maps = $this->fetchSearchEngineFields($conn);
        $byCode = $maps['byCode'];
        $byId = $maps['byId'];

        if (empty($byCode)) {
            error_log('[Xapian] syncSearchEngineFieldValues: no search_engine_field rows found, skipping');

            return;
        }

        // Raw values from request (could be keyed by code OR id)
        $rawValues = $this->extractSearchFieldValuesFromRequest();
        $hasExplicitInput = [] !== $rawValues;

        // If we didn't receive anything, do NOT overwrite existing values on update.
        // This prevents accidental resets when the request does not carry searchFieldValues.
        if (!$hasExplicitInput) {
            try {
                $existingCount = (int) $conn->fetchOne(
                    'SELECT COUNT(*) FROM search_engine_field_value WHERE resource_node_id = ?',
                    [$resourceNodeId]
                );
            } catch (Throwable $e) {
                error_log('[Xapian] syncSearchEngineFieldValues: failed to count existing values: '.$e->getMessage());
                $existingCount = 0;
            }

            if ($existingCount > 0) {
                error_log(
                    '[Xapian] syncSearchEngineFieldValues: no input received, keeping existing values for resource_node_id='.$resourceNodeId
                );

                return;
            }
        }

        // Normalize into field_id => value
        $valuesByFieldId = [];

        foreach ($rawValues as $key => $val) {
            $fieldId = null;

            if (is_numeric((string) $key)) {
                $id = (int) $key;
                if (isset($byId[$id])) {
                    $fieldId = $id;
                }
            } else {
                $code = strtolower(trim((string) $key));
                if (isset($byCode[$code])) {
                    $fieldId = (int) $byCode[$code]['id'];
                }
            }

            if (null === $fieldId) {
                // Unknown field id/code: ignore.
                continue;
            }

            // Explicit empty strings are kept on purpose: they mark the field as
            // handled (no fallback is applied below) and are stored as empty values.
            $valuesByFieldId[$fieldId] = trim((string) $val);
        }

        // Conservative fallback: only fill fields the request did not provide.
        foreach ($byCode as $code => $meta) {
            $fid = (int) $meta['id'];
            if (isset($valuesByFieldId[$fid])) {
                continue;
            }

            $fallback = $this->guessFallbackValue(
                (string) $code,
                (string) ($meta['title'] ?? ''),
                $document,
                $content
            );

            if (null === $fallback) {
                continue;
            }

            $fallback = trim($fallback);
            if ('' !== $fallback) {
                $valuesByFieldId[$fid] = $fallback;
            }
        }

        // Only open (and later roll back) a transaction we own; joining and rolling
        // back a caller's transaction could abort work this method knows nothing about.
        $ownsTransaction = !$conn->isTransactionActive();

        try {
            if ($ownsTransaction) {
                $conn->beginTransaction();
            }

            $conn->executeStatement(
                'DELETE FROM search_engine_field_value WHERE resource_node_id = ?',
                [$resourceNodeId]
            );

            foreach ($valuesByFieldId as $fid => $value) {
                $conn->insert('search_engine_field_value', [
                    'resource_node_id' => $resourceNodeId,
                    'field_id' => (int) $fid,
                    'value' => (string) $value,
                ]);
            }

            if ($ownsTransaction) {
                $conn->commit();
            }
        } catch (Throwable $e) {
            if ($ownsTransaction && $conn->isTransactionActive()) {
                try {
                    $conn->rollBack();
                } catch (Throwable $rollbackError) {
                    error_log('[Xapian] syncSearchEngineFieldValues: rollback failed: '.$rollbackError->getMessage());
                }
            }

            error_log('[Xapian] syncSearchEngineFieldValues: failed: '.$e->getMessage());
        }
    }
305
306
    /**
     * Load the search_engine_field table and index it both by code and by id.
     *
     * Codes are trimmed and lower-cased. Rows with a non-positive id or an empty
     * code are ignored. When the query fails the error is logged and both maps
     * are returned empty.
     *
     * @return array{
     *   byCode: array<string, array{id:int,title:string}>,
     *   byId: array<int, array{code:string,title:string}>
     * }
     */
    private function fetchSearchEngineFields(Connection $conn): array
    {
        $maps = ['byCode' => [], 'byId' => []];

        try {
            $rows = $conn->fetchAllAssociative('SELECT id, code, title FROM search_engine_field');
        } catch (Throwable $e) {
            error_log('[Xapian] fetchSearchEngineFields: query failed: '.$e->getMessage());

            return $maps;
        }

        foreach ($rows as $row) {
            $fieldId = (int) ($row['id'] ?? 0);
            $fieldCode = strtolower(trim((string) ($row['code'] ?? '')));
            $fieldTitle = (string) ($row['title'] ?? '');

            if ($fieldId > 0 && '' !== $fieldCode) {
                $maps['byCode'][$fieldCode] = ['id' => $fieldId, 'title' => $fieldTitle];
                $maps['byId'][$fieldId] = ['code' => $fieldCode, 'title' => $fieldTitle];
            }
        }

        return $maps;
    }
339
340
    /**
     * Normalize request-provided values to a "code => value" map.
     *
     * Input may be keyed by field code (['t' => '...', 'k' => '...']) or by
     * field id ([1 => '...', 3 => '...']); the output is always keyed by the
     * lower-cased field code. Keys that match no known field are dropped.
     * When the field definitions cannot be loaded, only string keys are kept,
     * as-is (trimmed, lower-cased).
     *
     * @param array<string|int, mixed> $rawValues
     *
     * @return array<string, string>
     */
    private function normalizeSearchFieldValuesToCode(array $rawValues): array
    {
        if (empty($rawValues)) {
            return [];
        }

        $fieldMaps = $this->fetchSearchEngineFields($this->em->getConnection());
        $byCode = $fieldMaps['byCode']; // code => ['id' => ..]
        $byId = $fieldMaps['byId'];     // id => ['code' => ..]

        $normalized = [];

        $haveDefinitions = !empty($byCode) && !empty($byId);
        if (!$haveDefinitions) {
            // Safe fallback: without field definitions, keep only string codes as-is.
            foreach ($rawValues as $rawKey => $rawValue) {
                $code = \is_string($rawKey) ? strtolower(trim($rawKey)) : '';
                if ('' !== $code) {
                    $normalized[$code] = trim((string) $rawValue);
                }
            }

            return $normalized;
        }

        foreach ($rawValues as $rawKey => $rawValue) {
            $value = trim((string) $rawValue);

            if (is_numeric((string) $rawKey)) {
                // Numeric key => field id, translated to its code.
                $fieldId = (int) $rawKey;
                $code = isset($byId[$fieldId]) ? strtolower(trim((string) $byId[$fieldId]['code'])) : null;
            } else {
                // Any other key => field code, accepted only when it is a known one.
                $candidate = strtolower(trim((string) $rawKey));
                $code = ('' !== $candidate && isset($byCode[$candidate])) ? $candidate : null;
            }

            if (null !== $code && '' !== $code) {
                // Empty values are kept (allows "clear"); the indexer skips empties anyway.
                $normalized[$code] = $value;
            }
        }

        return $normalized;
    }
413
414
    /**
     * Extract search field values from the current HTTP request.
     *
     * Sources, tried in this order:
     * - request parameter "searchFieldValues" already parsed as an array (searchFieldValues[t]=...)
     * - request parameter "searchFieldValues" holding a JSON string {"t":"..."}
     * - JSON request body: { "searchFieldValues": {...} }
     *
     * Values are cast to string. Nested (array) values are dropped rather than
     * cast, because casting an array yields the literal string "Array" plus a
     * PHP warning, and that text would end up in the index.
     *
     * @return array<string|int, string> values keyed by field code or field id; empty when nothing usable was sent
     */
    private function extractSearchFieldValuesFromRequest(): array
    {
        $req = $this->requestStack->getCurrentRequest();
        if (!$req instanceof Request) {
            return [];
        }

        // Standard multipart parsed array: searchFieldValues[t]=...
        $fromForm = $req->get('searchFieldValues');
        if (\is_array($fromForm)) {
            return $this->stringifySearchFieldValues($fromForm);
        }

        // If it's a string, it might be JSON (or broken "[object Object]")
        if (\is_string($fromForm) && '' !== trim($fromForm)) {
            $raw = trim($fromForm);

            if ('[object Object]' === $raw) {
                error_log(
                    '[Xapian] extractSearchFieldValuesFromRequest: searchFieldValues arrived as "[object Object]". '.
                    'Frontend must JSON.stringify() or send searchFieldValues[code]=...'
                );

                return [];
            }

            $decoded = json_decode($raw, true);
            if (\is_array($decoded)) {
                return $this->stringifySearchFieldValues($decoded);
            }
        }

        // JSON body
        $contentType = (string) $req->headers->get('Content-Type', '');
        if (!str_contains($contentType, 'application/json')) {
            return [];
        }

        $body = $req->getContent();
        if (!\is_string($body) || '' === trim($body)) {
            return [];
        }

        $decoded = json_decode($body, true);
        $blob = \is_array($decoded) ? ($decoded['searchFieldValues'] ?? null) : null;

        return \is_array($blob) ? $this->stringifySearchFieldValues($blob) : [];
    }

    /**
     * Cast scalar and null values to string (null becomes ''), preserving keys; array values are skipped.
     *
     * @param array<string|int, mixed> $values
     *
     * @return array<string|int, string>
     */
    private function stringifySearchFieldValues(array $values): array
    {
        $out = [];

        foreach ($values as $key => $value) {
            if (null !== $value && !\is_scalar($value)) {
                continue;
            }

            $out[$key] = (string) $value;
        }

        return $out;
    }
484
485
    /**
     * Pick a default value for a search field the request did not provide.
     *
     * The field code is checked first ('t' title, 'd' description, 'c' content);
     * when the code is none of those, the field's title label is checked
     * ('title', 'description', 'content'). Both are compared trimmed and
     * lower-cased.
     *
     * @return string|null the fallback value, or null when the field matches neither convention
     */
    private function guessFallbackValue(string $code, string $title, CDocument $document, string $content): ?string
    {
        // Map the code onto the same vocabulary as the title label; unknown codes
        // fall through to the normalized label itself.
        $semantic = match (strtolower(trim($code))) {
            't' => 'title',
            'd' => 'description',
            'c' => 'content',
            default => strtolower(trim($title)),
        };

        return match ($semantic) {
            'title' => (string) $document->getTitle(),
            'description' => (string) ($document->getComment() ?? ''),
            'content' => $content,
            default => null,
        };
    }
518
519
    /**
     * Resolve course id, session id and course root node id from the node's resource links.
     *
     * Links are scanned in order. The course id (and, when the course has a
     * ResourceNode, the root node id) comes from the first link yielding a
     * course id; the session id comes from the first link yielding a session
     * id. Scanning stops early once all three values are known.
     *
     * @return array{0: int|null, 1: int|null, 2: int|null} [courseId, sessionId, courseRootNodeId]
     */
    private function resolveCourseSessionAndRootNode(ResourceNode $resourceNode): array
    {
        $foundCourseId = null;
        $foundSessionId = null;
        $foundRootNodeId = null;

        foreach ($resourceNode->getResourceLinks() as $resourceLink) {
            if (!$resourceLink instanceof ResourceLink) {
                continue;
            }

            if (null === $foundCourseId) {
                $course = $resourceLink->getCourse();
                if ($course) {
                    $foundCourseId = $course->getId();

                    $rootNode = $course->getResourceNode();
                    if ($rootNode instanceof ResourceNode) {
                        $foundRootNodeId = $rootNode->getId();
                    }
                }
            }

            if (null === $foundSessionId) {
                $foundSessionId = $resourceLink->getSession()?->getId();
            }

            $stillMissing = null === $foundCourseId || null === $foundSessionId || null === $foundRootNodeId;
            if (!$stillMissing) {
                break;
            }
        }

        return [$foundCourseId, $foundSessionId, $foundRootNodeId];
    }
556
557
    /**
     * Append prefilter terms to $terms according to the "search.search_prefilter_prefix" setting.
     *
     * The setting is read as a JSON object keyed by 'course', 'session' or
     * 'filetype'; each entry may carry a 'prefix' and otherwise the upper-cased
     * key is used as prefix. Course and session terms are only added when the
     * corresponding id is known. Other keys are ignored. An empty or
     * non-JSON-object setting leaves $terms untouched.
     *
     * @param array<int, string> $terms term list, extended in place
     */
    private function applyPrefilterConfigToTerms(
        array &$terms,
        ?int $courseId,
        ?int $sessionId,
        CDocument $document
    ): void {
        $rawConfig = (string) $this->settingsManager->getSetting('search.search_prefilter_prefix', true);
        $config = '' === $rawConfig ? null : json_decode($rawConfig, true);

        if (!\is_array($config)) {
            return;
        }

        foreach ($config as $key => $item) {
            if (!\is_array($item)) {
                continue;
            }

            $configuredPrefix = (string) ($item['prefix'] ?? '');
            $prefix = '' !== $configuredPrefix ? $configuredPrefix : strtoupper((string) $key);

            // null means "add nothing for this key".
            $suffix = match ($key) {
                'course' => null !== $courseId ? (string) $courseId : null,
                'session' => null !== $sessionId ? (string) $sessionId : null,
                'filetype' => (string) $document->getFiletype(),
                default => null, // Unknown key: ignore for now
            };

            if (null !== $suffix) {
                $terms[] = $prefix.$suffix;
            }
        }
    }
612
613
    /**
     * Extract plain text from a document for indexing.
     *
     * Content stored directly on the ResourceNode is preferred. Otherwise the
     * first ResourceFile is read according to its extension: html/htm (tags
     * removed), txt/md/csv/log (raw), docx/odt/pptx (XML parts of the zip) and
     * pdf (through the pdftotext command). Any other case yields ''.
     */
    private function extractRawTextContent(CDocument $document): string
    {
        $node = $document->getResourceNode();
        if (!$node instanceof ResourceNode) {
            return '';
        }

        // Prefer content stored directly on the node (if any)
        $inlineContent = (string) ($node->getContent() ?? '');
        if ('' !== trim($inlineContent)) {
            return $this->toPlainText($inlineContent);
        }

        // Fallback to file content from ResourceFile (most documents are stored as files)
        $file = $node->getFirstResourceFile();
        if (!$file instanceof ResourceFile) {
            return '';
        }

        $path = $this->resolveResourceFilePath($file);
        $usable = null !== $path && is_file($path) && is_readable($path);
        if (!$usable) {
            error_log('[Xapian] extractRawTextContent: file path not resolved or not readable');

            return '';
        }

        $extension = strtolower((string) pathinfo($path, PATHINFO_EXTENSION));

        return match ($extension) {
            // HTML: read, then strip markup (an empty read stays empty)
            'html', 'htm' => '' !== ($html = $this->safeReadFile($path)) ? $this->toPlainText($html) : '',
            // Plain text-like
            'txt', 'md', 'csv', 'log' => $this->safeReadFile($path),
            // Zip-based office formats (no external tools needed)
            'docx' => $this->extractTextFromZipXml($path, [
                'word/document.xml',
                'word/footnotes.xml',
                'word/endnotes.xml',
            ]),
            'odt' => $this->extractTextFromZipXml($path, [
                'content.xml',
            ]),
            'pptx' => $this->extractTextFromPptx($path),
            // PDF: depends on an OS tool; yields '' when unavailable
            'pdf' => $this->extractPdfWithPdftotext($path),
            default => '',
        };
    }
685
686
    /**
     * Reduce HTML/XML markup to whitespace-normalized plain text.
     *
     * Tags are removed first and entities decoded afterwards. Decoding first
     * would turn escaped text such as "a&lt;b ... c&gt;d" into something that
     * looks like a tag, which strip_tags() would then delete along with the
     * text between the brackets.
     *
     * @return string text with tags removed, entities decoded, whitespace runs collapsed to one space, trimmed
     */
    private function toPlainText(string $input): string
    {
        $text = strip_tags($input);
        $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');
        // preg_replace() returns null on failure (e.g. invalid UTF-8 with /u); keep the text as-is then.
        $text = preg_replace('/\s+/u', ' ', $text) ?? $text;

        return trim($text);
    }
694
695
    /**
     * Try to find a filesystem path for a ResourceFile.
     *
     * First choice is getFile() when it returns an SplFileInfo (Symfony's File
     * class is one). Otherwise the first of getPathname(), getPath(),
     * getFilePath(), getAbsolutePath() that exists and returns a non-blank
     * string is used.
     *
     * @return string|null the path, or null when none of the accessors provides one
     */
    private function resolveResourceFilePath(ResourceFile $resourceFile): ?string
    {
        // Most common in Symfony/Vich setups: getFile() returns a Symfony File instance,
        // which is itself an SplFileInfo.
        if (method_exists($resourceFile, 'getFile')) {
            $fileObject = $resourceFile->getFile();
            if ($fileObject instanceof \SplFileInfo) {
                return $fileObject->getPathname();
            }
        }

        // Some entities store a direct path/string field (depending on implementation)
        $pathAccessors = ['getPathname', 'getPath', 'getFilePath', 'getAbsolutePath'];
        foreach ($pathAccessors as $accessor) {
            if (!method_exists($resourceFile, $accessor)) {
                continue;
            }

            $candidate = $resourceFile->{$accessor}();
            if (\is_string($candidate) && '' !== trim($candidate)) {
                return $candidate;
            }
        }

        return null;
    }
722
723
    /**
     * Read at most $maxBytes bytes from the start of a file.
     *
     * A file larger than the limit is truncated and the truncation is logged.
     * The read goes through file_get_contents(), so no file handle is left
     * open when the read fails.
     *
     * @param string $path     file to read
     * @param int    $maxBytes upper bound on the number of bytes returned
     *
     * @return string the bytes read, or '' when the file cannot be read
     */
    private function safeReadFile(string $path, int $maxBytes = 2_000_000): string
    {
        try {
            $size = filesize($path);
            if (\is_int($size) && $size > $maxBytes) {
                error_log('[Xapian] safeReadFile: file too large, truncating. size='.$size.' max='.$maxBytes);
            }

            // offset 0, length $maxBytes: returns false on failure, throws ValueError for a negative length.
            $data = file_get_contents($path, false, null, 0, $maxBytes);

            return \is_string($data) ? $data : '';
        } catch (Throwable $e) {
            error_log('[Xapian] safeReadFile: read failed: '.$e->getMessage());

            return '';
        }
    }
746
747
    /**
     * Extract plain text from selected XML entries of a zip archive (docx, odt, ...).
     *
     * Entries that are missing or blank are skipped; the text of the others is
     * joined with single spaces, in the order given.
     *
     * @param string        $zipPath       path of the archive
     * @param array<string> $xmlCandidates entry names to read from the archive
     *
     * @return string whitespace-normalized text, or '' when the archive cannot be opened
     */
    private function extractTextFromZipXml(string $zipPath, array $xmlCandidates): string
    {
        $archive = new ZipArchive();

        if (true !== $archive->open($zipPath)) {
            error_log('[Xapian] extractTextFromZipXml: failed to open zip');

            return '';
        }

        $parts = [];
        foreach ($xmlCandidates as $entryName) {
            $entryXml = $archive->getFromName($entryName);
            if (!\is_string($entryXml) || '' === trim($entryXml)) {
                continue;
            }

            $parts[] = $this->toPlainText($entryXml);
        }

        $archive->close();

        $joined = trim(implode(' ', $parts));

        return trim((string) (preg_replace('/\s+/u', ' ', $joined) ?? $joined));
    }
774
775
    /**
     * Extract plain text from the slides of a pptx archive.
     *
     * Every entry named ppt/slides/slide*.xml is read. Slides are processed in
     * natural order (slide2 before slide10) so the extracted text follows the
     * slide numbering instead of the lexicographic order of the entry names.
     *
     * @return string whitespace-normalized text of all slides, or '' when the archive cannot be opened
     */
    private function extractTextFromPptx(string $zipPath): string
    {
        $zip = new ZipArchive();
        $opened = $zip->open($zipPath);

        if (true !== $opened) {
            error_log('[Xapian] extractTextFromPptx: failed to open pptx zip');

            return '';
        }

        $slideFiles = [];

        for ($i = 0; $i < $zip->numFiles; $i++) {
            $name = (string) $zip->getNameIndex($i);
            if (str_starts_with($name, 'ppt/slides/slide') && str_ends_with($name, '.xml')) {
                $slideFiles[] = $name;
            }
        }

        // Natural ordering keeps "slide10.xml" after "slide2.xml".
        sort($slideFiles, SORT_NATURAL);

        $chunks = [];

        foreach ($slideFiles as $slideName) {
            $xml = $zip->getFromName($slideName);
            if (\is_string($xml) && '' !== trim($xml)) {
                $chunks[] = $this->toPlainText($xml);
            }
        }

        $zip->close();

        $text = trim(implode(' ', $chunks));
        $text = preg_replace('/\s+/u', ' ', $text) ?? $text;

        return trim((string) $text);
    }
812
813
    /**
     * Extract text from a PDF with the external "pdftotext" command.
     *
     * Best effort: returns '' when the command is not found, when it fails or
     * times out (10 seconds), or when any exception is raised while running
     * it. The availability probe runs inside the try block as well, so an
     * exception raised while starting it is logged instead of escaping. The
     * temporary output file is always removed.
     *
     * @return string extracted text (at most 2,000,000 bytes, trimmed), or '' on any failure
     */
    private function extractPdfWithPdftotext(string $path): string
    {
        $tmp = sys_get_temp_dir().'/xapian_'.uniqid('', true).'.txt';

        try {
            // Check if command exists
            $probe = new Process(['which', 'pdftotext']);
            $probe->run();

            if (!$probe->isSuccessful()) {
                return '';
            }

            $process = new Process(['pdftotext', '-enc', 'UTF-8', $path, $tmp]);
            $process->setTimeout(10);
            $process->run();

            if (!$process->isSuccessful() || !is_file($tmp)) {
                return '';
            }

            $text = $this->safeReadFile($tmp, 2_000_000);

            return trim($text);
        } catch (Throwable $e) {
            error_log('[Xapian] extractPdfWithPdftotext: failed: '.$e->getMessage());

            return '';
        } finally {
            if (is_file($tmp)) {
                @unlink($tmp);
            }
        }
    }
847
848
    /**
     * Load the search field values stored for a resource node, keyed by field code.
     *
     * Codes are trimmed and lower-cased, values are trimmed. Rows with an empty
     * code are skipped; empty values are kept. A failing query is logged and
     * yields an empty array.
     *
     * @return array<string, string>
     */
    private function fetchStoredSearchFieldValuesByCode(int $resourceNodeId): array
    {
        try {
            $rows = $this->em->getConnection()->fetchAllAssociative(
                'SELECT f.code, v.value
             FROM search_engine_field_value v
             INNER JOIN search_engine_field f ON f.id = v.field_id
             WHERE v.resource_node_id = ?',
                [$resourceNodeId]
            );
        } catch (Throwable $e) {
            error_log('[Xapian] fetchStoredSearchFieldValuesByCode: query failed: '.$e->getMessage());

            return [];
        }

        $valuesByCode = [];
        foreach ($rows as $row) {
            $fieldCode = strtolower(trim((string) ($row['code'] ?? '')));
            $storedValue = trim((string) ($row['value'] ?? ''));

            if ('' !== $fieldCode) {
                // Keep empty string too (allows clear), but indexer will skip empties
                $valuesByCode[$fieldCode] = $storedValue;
            }
        }

        return $valuesByCode;
    }
879
880
    /**
     * Resolve the language ISO code to use for a resource node.
     *
     * The language of the node's first ResourceFile is preferred; the node's
     * own language is the fallback. A missing language or a blank ISO code
     * counts as "not set".
     *
     * @return string|null trimmed ISO code, or null when no language is known
     */
    private function resolveLanguageIsoForResourceNode(ResourceNode $resourceNode): ?string
    {
        // Prefer ResourceFile language when possible
        $firstFile = $resourceNode->getFirstResourceFile();
        if ($firstFile instanceof ResourceFile) {
            $fileIso = trim((string) $firstFile->getLanguage()?->getIsocode());
            if ('' !== $fileIso) {
                return $fileIso;
            }
        }

        // Fallback to ResourceNode language; null when that is unknown too
        $nodeIso = trim((string) $resourceNode->getLanguage()?->getIsocode());

        return '' !== $nodeIso ? $nodeIso : null;
    }
906
}
907