Scrutinizer GitHub App not installed

We could not synchronize checks via GitHub's checks API since Scrutinizer's GitHub App is not installed for this repository.

Install GitHub App

GitHub Access Token became invalid

It seems like the GitHub access token used for retrieving details about this repository from GitHub became invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.

Issues (214)

Classes/Common/AbstractDocument.php (3 issues)

1
<?php
2
3
/**
4
 * (c) Kitodo. Key to digital objects e.V. <[email protected]>
5
 *
6
 * This file is part of the Kitodo and TYPO3 projects.
7
 *
8
 * @license GNU General Public License version 3 or later.
9
 * For the full copyright and license information, please read the
10
 * LICENSE.txt file that was distributed with this source code.
11
 */
12
13
namespace Kitodo\Dlf\Common;
14
15
use TYPO3\CMS\Core\Cache\CacheManager;
16
use TYPO3\CMS\Core\Configuration\ExtensionConfiguration;
17
use TYPO3\CMS\Core\Database\ConnectionPool;
18
use TYPO3\CMS\Core\Log\Logger;
19
use TYPO3\CMS\Core\Utility\GeneralUtility;
20
use Ubl\Iiif\Presentation\Common\Model\Resources\IiifResourceInterface;
21
use Ubl\Iiif\Tools\IiifHelper;
22
23
/**
24
 * Document class for the 'dlf' extension
25
 *
26
 * @package TYPO3
27
 * @subpackage dlf
28
 *
29
 * @access public
30
 *
31
 * @abstract
32
 *
33
 * @property int $cPid this holds the PID for the configuration
34
 * @property-read array $formats this holds the configuration for all supported metadata encodings
35
 * @property bool $formatsLoaded flag with information if the available metadata formats are loaded
36
 * @property-read bool $hasFulltext flag with information if there are any fulltext files available
37
 * @property array $lastSearchedPhysicalPage the last searched logical and physical page
38
 * @property array $logicalUnits this holds the logical units
39
 * @property-read array $metadataArray this holds the documents' parsed metadata array
40
 * @property bool $metadataArrayLoaded flag with information if the metadata array is loaded
41
 * @property-read int $numPages this holds the total number of pages
42
 * @property-read int $parentId this holds the UID of the parent document or zero if not multi-volumed
43
 * @property-read array $physicalStructure this holds the physical structure
44
 * @property-read array $physicalStructureInfo this holds the physical structure metadata
45
 * @property bool $physicalStructureLoaded flag with information if the physical structure is loaded
46
 * @property-read int $pid this holds the PID of the document or zero if not in database
47
 * @property array $rawTextArray this holds the documents' raw text pages with their corresponding structMap//div's ID (METS) or Range / Manifest / Sequence ID (IIIF) as array key
48
 * @property-read bool $ready Is the document instantiated successfully?
49
 * @property-read string $recordId the METS file's / IIIF manifest's record identifier
50
 * @property-read int $rootId this holds the UID of the root document or zero if not multi-volumed
51
 * @property-read array $smLinks this holds the smLinks between logical and physical structMap
52
 * @property bool $smLinksLoaded flag with information if the smLinks are loaded
53
 * @property-read array $tableOfContents this holds the logical structure
54
 * @property bool $tableOfContentsLoaded flag with information if the table of contents is loaded
55
 * @property-read string $thumbnail this holds the document's thumbnail location
56
 * @property bool $thumbnailLoaded flag with information if the thumbnail is loaded
57
 * @property-read string $toplevelId this holds the toplevel structure's "@ID" (METS) or the manifest's "@id" (IIIF)
58
 * @property \SimpleXMLElement $xml this holds the whole XML file as \SimpleXMLElement object
59
 */
60
abstract class AbstractDocument
61
{
62
    /**
63
     * @access protected
64
     * @var Logger This holds the logger
65
     */
66
    protected Logger $logger;
67
68
    /**
69
     * @access protected
70
     * @var int This holds the PID for the configuration
71
     */
72
    protected int $cPid = 0;
73
74
    /**
75
     * @access public
76
     * @static
77
     * @var string The extension key
78
     */
79
    public static string $extKey = 'dlf';
80
81
    /**
82
     * @access protected
83
     * @var array Additional information about files (e.g., ADMID), indexed by ID.
84
     */
85
    protected array $fileInfos = [];
86
87
    /**
88
     * @access protected
89
     * @var array This holds the configuration for all supported metadata encodings
90
     *
91
     * @see loadFormats()
92
     */
93
    protected array $formats = [
94
        'OAI' => [
95
            'rootElement' => 'OAI-PMH',
96
            'namespaceURI' => 'http://www.openarchives.org/OAI/2.0/',
97
        ],
98
        'METS' => [
99
            'rootElement' => 'mets',
100
            'namespaceURI' => 'http://www.loc.gov/METS/',
101
        ],
102
        'XLINK' => [
103
            'rootElement' => 'xlink',
104
            'namespaceURI' => 'http://www.w3.org/1999/xlink',
105
        ]
106
    ];
107
108
    /**
109
     * @access protected
110
     * @var bool Are the available metadata formats loaded?
111
     *
112
     * @see $formats
113
     */
114
    protected bool $formatsLoaded = false;
115
116
    /**
117
     * Are there any fulltext files available? This also includes IIIF text annotations
118
     * with motivation 'painting' if Kitodo.Presentation is configured to store text
119
     * annotations as fulltext.
120
     *
121
     * @access protected
122
     * @var bool
123
     */
124
    protected bool $hasFulltext = false;
125
126
    /**
127
     * @access protected
128
     * @var array Last searched logical and physical page
129
     */
130
    protected array $lastSearchedPhysicalPage = ['logicalPage' => null, 'physicalPage' => null];
131
132
    /**
133
     * @access protected
134
     * @var array This holds the logical units
135
     */
136
    protected array $logicalUnits = [];
137
138
    /**
139
     * This holds the documents' parsed metadata array with their corresponding
140
     * structMap//div's ID (METS) or Range / Manifest / Sequence ID (IIIF) as array key
141
     *
142
     * @access protected
143
     * @var array
144
     */
145
    protected array $metadataArray = [];
146
147
    /**
148
     * @access protected
149
     * @var bool Is the metadata array loaded?
150
     *
151
     * @see $metadataArray
152
     */
153
    protected bool $metadataArrayLoaded = false;
154
155
    /**
156
     * @access protected
157
     * @var int This holds the total number of pages
158
     */
159
    protected int $numPages = 0;
160
161
    /**
162
     * @access protected
163
     * @var int This holds the UID of the parent document or zero if not multi-volumed
164
     */
165
    protected int $parentId = 0;
166
167
    /**
168
     * @access protected
169
     * @var array This holds the physical structure
170
     */
171
    protected array $physicalStructure = [];
172
173
    /**
174
     * @access protected
175
     * @var array This holds the physical structure metadata
176
     */
177
    protected array $physicalStructureInfo = [];
178
179
    /**
180
     * @access protected
181
     * @var bool Is the physical structure loaded?
182
     *
183
     * @see $physicalStructure
184
     */
185
    protected bool $physicalStructureLoaded = false;
186
187
    /**
188
     * @access protected
189
     * @var int This holds the PID of the document or zero if not in database
190
     */
191
    protected int $pid = 0;
192
193
    /**
194
     * This holds the documents' raw text pages with their corresponding
195
     * structMap//div's ID (METS) or Range / Manifest / Sequence ID (IIIF) as array key
196
     *
197
     * @access protected
198
     * @var array
199
     */
200
    protected array $rawTextArray = [];
201
202
    /**
203
     * @access protected
204
     * @var bool Is the document instantiated successfully?
205
     */
206
    protected bool $ready = false;
207
208
    /**
209
     * @access protected
210
     * @var string The METS file's / IIIF manifest's record identifier
211
     */
212
    protected string $recordId = '';
213
214
    /**
215
     * @access protected
216
     * @var int This holds the UID of the root document or zero if not multi-volumed
217
     */
218
    protected int $rootId = 0;
219
220
    /**
221
     * @access protected
222
     * @var bool Is the root id loaded?
223
     *
224
     * @see $rootId
225
     */
226
    protected bool $rootIdLoaded = false;
227
228
    /**
229
     * @access protected
230
     * @var array This holds the smLinks between logical and physical structMap
231
     */
232
    protected array $smLinks = ['l2p' => [], 'p2l' => []];
233
234
    /**
235
     * @access protected
236
     * @var bool Are the smLinks loaded?
237
     *
238
     * @see $smLinks
239
     */
240
    protected bool $smLinksLoaded = false;
241
242
    /**
243
     * This holds the logical structure
244
     *
245
     * @access protected
246
     * @var array
247
     */
248
    protected array $tableOfContents = [];
249
250
    /**
251
     * @access protected
252
     * @var bool Is the table of contents loaded?
253
     *
254
     * @see $tableOfContents
255
     */
256
    protected bool $tableOfContentsLoaded = false;
257
258
    /**
259
     * @access protected
260
     * @var string This holds the document's thumbnail location
261
     */
262
    protected string $thumbnail = '';
263
264
    /**
265
     * @access protected
266
     * @var bool Is the document's thumbnail location loaded?
267
     *
268
     * @see $thumbnail
269
     */
270
    protected bool $thumbnailLoaded = false;
271
272
    /**
273
     * @access protected
274
     * @var string This holds the toplevel structure's "@ID" (METS) or the manifest's "@id" (IIIF)
275
     */
276
    protected string $toplevelId = '';
277
278
    /**
279
     * @access protected
280
     * @var \SimpleXMLElement This holds the whole XML file as \SimpleXMLElement object
281
     */
282
    protected \SimpleXMLElement $xml;
283
284
    /**
285
     * This gets the location of a downloadable file for a physical page or track
286
     *
287
     * @access public
288
     *
289
     * @abstract
290
     *
291
     * @param string $id The "@ID" attribute of the file node (METS) or the "@id" property of the IIIF resource
292
     *
293
     * @return string The file's location as URL
294
     */
295
    abstract public function getDownloadLocation(string $id): string;
296
297
    /**
     * This gets all file information stored in a single array.
     *
     * The parameter is declared as string like in the sibling accessors
     * (getDownloadLocation(), getFileLocation(), getFileMimeType()); implementations
     * may still omit the type, since widening a parameter type is allowed.
     *
     * @access public
     *
     * @abstract
     *
     * @param string $id The "@ID" attribute of the file node (METS) or the "@id" property of the IIIF resource
     *
     * @return array|null The set of file information
     */
    abstract public function getFileInfo(string $id): ?array;
309
310
    /**
311
     * This gets the location of a file representing a physical page or track
312
     *
313
     * @access public
314
     *
315
     * @abstract
316
     *
317
     * @param string $id The "@ID" attribute of the file node (METS) or the "@id" property of the IIIF resource
318
     *
319
     * @return string The file's location as URL
320
     */
321
    abstract public function getFileLocation(string $id): string;
322
323
    /**
324
     * This gets the MIME type of a file representing a physical page or track
325
     *
326
     * @access public
327
     *
328
     * @abstract
329
     *
330
     * @param string $id The "@ID" attribute of the file node
331
     *
332
     * @return string The file's MIME type
333
     */
334
    abstract public function getFileMimeType(string $id): string;
335
336
    /**
337
     * This extracts the OCR full text for a physical structure node / IIIF Manifest / Canvas. Text might be
338
     * given as ALTO for METS or as annotations or ALTO for IIIF resources.
339
     *
340
     * @access public
341
     *
342
     * @abstract
343
     *
344
     * @param string $id The "@ID" attribute of the physical structure node (METS) or the "@id" property
345
     * of the Manifest / Range (IIIF)
346
     *
347
     * @return string The OCR full text
348
     */
349
    abstract public function getFullText(string $id): string;
350
351
    /**
352
     * This gets details about a logical structure element
353
     *
354
     * @access public
355
     *
356
     * @abstract
357
     *
358
     * @param string $id The "@ID" attribute of the logical structure node (METS) or
359
     * the "@id" property of the Manifest / Range (IIIF)
360
     * @param bool $recursive Whether to include the child elements / resources
361
     *
362
     * @return array Array of the element's id, label, type and physical page indexes/mptr link
363
     */
364
    abstract public function getLogicalStructure(string $id, bool $recursive = false): array;
365
366
    /**
367
     * This extracts all the metadata for a logical structure node
368
     *
369
     * @access public
370
     *
371
     * @abstract
372
     *
373
     * @param string $id The "@ID" attribute of the logical structure node (METS) or the "@id" property
374
     * of the Manifest / Range (IIIF)
375
     * @param int $cPid The PID for the metadata definitions (defaults to $this->cPid or $this->pid)
376
     *
377
     * @return array The logical structure node's / the IIIF resource's parsed metadata array
378
     */
379
    abstract public function getMetadata(string $id, int $cPid = 0): array;
380
381
    /**
382
     * Analyze the document if it contains any fulltext that needs to be indexed.
383
     *
384
     * @access protected
385
     *
386
     * @abstract
387
     *
388
     * @return void
389
     */
390
    abstract protected function ensureHasFulltextIsSet(): void;
391
392
    /**
393
     * This ensures that the recordId, if existent, is retrieved from the document
394
     *
395
     * @access protected
396
     *
397
     * @abstract
398
     *
399
     * @param int $pid ID of the configuration page with the recordId config
400
     *
401
     * @return void
402
     */
403
    abstract protected function establishRecordId(int $pid): void;
404
405
    /**
406
     * Source document PHP object which is represented by a Document instance
407
     *
408
     * @access protected
409
     *
410
     * @abstract
411
     *
412
     * @return \SimpleXMLElement|IiifResourceInterface A PHP object representation of
413
     * the current document. SimpleXMLElement for METS, IiifResourceInterface for IIIF
414
     */
415
    abstract protected function getDocument();
416
417
    /**
418
     * This builds an array of the document's physical structure
419
     *
420
     * @access protected
421
     *
422
     * @abstract
423
     *
424
     * @return array Array of physical elements' id, type, label and file representations ordered
425
     * by "@ORDER" attribute / IIIF Sequence's Canvases
426
     */
427
    abstract protected function magicGetPhysicalStructure(): array;
428
429
    /**
430
     * This returns the smLinks between logical and physical structMap (METS) and models the
431
     * relation between IIIF Canvases and Manifests / Ranges in the same way
432
     *
433
     * @access protected
434
     *
435
     * @abstract
436
     *
437
     * @return array The links between logical and physical nodes / Range, Manifest and Canvas
438
     */
439
    abstract protected function magicGetSmLinks(): array;
440
441
    /**
442
     * This returns the document's thumbnail location
443
     *
444
     * @access protected
445
     *
446
     * @abstract
447
     *
448
     * @param bool $forceReload Force reloading the thumbnail instead of returning the cached value
449
     *
450
     * @return string The document's thumbnail location
451
     */
452
    abstract protected function magicGetThumbnail(bool $forceReload = false): string;
453
454
    /**
455
     * This returns the ID of the toplevel logical structure node
456
     *
457
     * @access protected
458
     *
459
     * @abstract
460
     *
461
     * @return string The logical structure node's ID
462
     */
463
    abstract protected function magicGetToplevelId(): string;
464
465
    /**
466
     * This sets some basic class properties
467
     *
468
     * @access protected
469
     *
470
     * @abstract
471
     *
472
     * @param string $location The location URL of the XML file to parse
473
     * @param array $settings The extension settings
474
     *
475
     * @return void
476
     */
477
    abstract protected function init(string $location, array $settings): void;
478
479
    /**
480
     * METS/IIIF specific part of loading a location
481
     *
482
     * @access protected
483
     *
484
     * @abstract
485
     *
486
     * @param string $location The URL of the file to load
487
     *
488
     * @return bool true on success or false on failure
489
     */
490
    abstract protected function loadLocation(string $location): bool;
491
492
    /**
493
     * Format specific part of building the document's metadata array
494
     *
495
     * @access protected
496
     *
497
     * @abstract
498
     *
499
     * @param int $cPid
500
     *
501
     * @return void
502
     */
503
    abstract protected function prepareMetadataArray(int $cPid): void;
504
505
    /**
506
     * Reuse any document object that might have been already loaded to determine whether document is METS or IIIF
507
     *
508
     * @access protected
509
     *
510
     * @abstract
511
     *
512
     * @param \SimpleXMLElement|IiifResourceInterface $preloadedDocument any instance that has already been loaded
513
     *
514
     * @return bool true if $preloadedDocument can actually be reused, false if it has to be loaded again
515
     */
516
    abstract protected function setPreloadedDocument($preloadedDocument): bool;
517
518
    /**
     * Factory for document objects: returns the cached instance for a location
     * or builds a new one and caches it.
     *
     * The content behind $location is fetched and probed. If it parses as XML and
     * contains a "mets:mets" element, a MetsDocument is created. If it does not parse
     * as XML but decodes as JSON and loads as an IIIF resource, an IiifManifest is created.
     *
     * @access public
     *
     * @static
     *
     * @param string $location The URL of XML file or the IRI of the IIIF resource
     * @param array $settings The settings; "storagePid" is read as PID, the whole array is handed to MetsDocument
     * @param bool $forceReload Force reloading the document instead of returning the cached instance
     *
     * @return AbstractDocument|null Instance of MetsDocument or IiifManifest, null if no supported format was detected
     */
    public static function &getInstance(string $location, array $settings = [], bool $forceReload = false)
    {
        // Serve from the document cache unless the caller insists on a fresh load.
        if (!$forceReload) {
            $instance = self::getDocumentCache($location);
            if ($instance !== false) {
                return $instance;
            }
        }

        $instance = null;
        $documentFormat = null;
        $xml = null;
        $iiif = null;

        // Fetch the resource and detect its format (METS or IIIF).
        if (GeneralUtility::isValidUrl($location)) {
            $extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey);

            $content = Helper::getUrl($location);
            if ($content !== false) {
                $xml = Helper::getXmlFileAsString($content);
                if ($xml === false) {
                    // Not XML: try to interpret the content as IIIF (JSON) resource instead.
                    $decodedJson = json_decode($content, true);
                    if ($decodedJson !== null) {
                        IiifHelper::setUrlReader(IiifUrlReader::getInstance());
                        IiifHelper::setMaxThumbnailHeight($extConf['iiif']['thumbnailHeight']);
                        IiifHelper::setMaxThumbnailWidth($extConf['iiif']['thumbnailWidth']);
                        $iiif = IiifHelper::loadIiifResource($decodedJson);
                        if ($iiif instanceof IiifResourceInterface) {
                            $documentFormat = 'IIIF';
                        }
                    }
                } else {
                    /* @var $xml \SimpleXMLElement */
                    $xml->registerXPathNamespace('mets', 'http://www.loc.gov/METS/');
                    $metsNodes = $xml->xpath('//mets:mets');
                    if (!empty($metsNodes)) {
                        $documentFormat = 'METS';
                    }
                }
            }
        }

        // Sanitize input: a missing or negative storage PID becomes 0.
        $pid = 0;
        if (array_key_exists('storagePid', $settings)) {
            $pid = max((int) $settings['storagePid'], 0);
        }

        // Create the format specific instance.
        if ($documentFormat == 'METS') {
            $instance = new MetsDocument($pid, $location, $xml, $settings);
        } elseif ($documentFormat == 'IIIF') {
            // TODO: Parameter $preloadedDocument of class Kitodo\Dlf\Common\IiifManifest constructor expects SimpleXMLElement|Ubl\Iiif\Presentation\Common\Model\Resources\IiifResourceInterface, Ubl\Iiif\Presentation\Common\Model\AbstractIiifEntity|null given.
            // @phpstan-ignore-next-line
            $instance = new IiifManifest($pid, $location, $iiif);
        }

        // Only successfully created instances are cached.
        if ($instance !== null) {
            self::setDocumentCache($location, $instance);
        }

        return $instance;
    }
592
593
    /**
     * Flush the 'tx_dlf_doc' cache which holds the document instances.
     *
     * @access public
     *
     * @static
     *
     * @return void
     */
    public static function clearDocumentCache(): void
    {
        GeneralUtility::makeInstance(CacheManager::class)
            ->getCache('tx_dlf_doc')
            ->flush();
    }
607
608
    /**
     * This returns the first corresponding physical page number of a given logical page label
     *
     * The entries of $this->physicalStructureInfo are scanned in order, counting from 0;
     * the counter of the first entry whose "orderlabel" contains $logicalPage is returned.
     * The last successful lookup is remembered in $this->lastSearchedPhysicalPage and
     * answered without scanning again.
     *
     * @access public
     *
     * @param string $logicalPage The label (or a part of the label) of the logical page
     *
     * @return int The physical page number, or 1 if no entry matches
     */
    public function getPhysicalPage(string $logicalPage): int
    {
        // Answer a repeated lookup from the remembered result.
        if (
            !empty($this->lastSearchedPhysicalPage['logicalPage'])
            && $this->lastSearchedPhysicalPage['logicalPage'] == $logicalPage
        ) {
            return $this->lastSearchedPhysicalPage['physicalPage'];
        }

        $physicalPage = 0;
        foreach ($this->physicalStructureInfo as $page) {
            // An entry without (or with a null) "orderlabel" is treated as having an empty
            // label instead of handing null / an undefined index to strpos().
            $orderLabel = (string) ($page['orderlabel'] ?? '');
            if (strpos($orderLabel, $logicalPage) !== false) {
                $this->lastSearchedPhysicalPage['logicalPage'] = $logicalPage;
                $this->lastSearchedPhysicalPage['physicalPage'] = $physicalPage;
                return $physicalPage;
            }
            $physicalPage++;
        }

        // No matching label: fall back to the first page.
        return 1;
    }
637
638
    /**
     * This extracts the OCR full text for a physical structure node / IIIF Manifest / Canvas from an
     * XML full text representation (currently only ALTO). For IIIF manifests, ALTO documents have
     * to be given in the Canvas' / Manifest's "seeAlso" property.
     *
     * The configured full text file groups ("fileGrpFulltext") are tried in order; the first
     * group for which the node has a file is loaded and its format is detected from the file's
     * content. The text is then produced by the class registered for that format.
     *
     * @param string $id The "@ID" attribute of the physical structure node (METS) or the "@id" property
     * of the Manifest / Range (IIIF)
     *
     * @return string The OCR full text, or an empty string if it could not be determined
     */
    protected function getFullTextFromXml(string $id): string
    {
        // Make sure formats, physical structure and configuration are available.
        $this->loadFormats();
        $this->magicGetPhysicalStructure();
        $filesConfiguration = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey, 'files');
        $fulltextFileGroups = GeneralUtility::trimExplode(',', $filesConfiguration['fileGrpFulltext']);

        if (empty($this->physicalStructureInfo[$id])) {
            $this->logger->warning('Invalid structure node @ID "' . $id . '"');
            return '';
        }

        $textFormat = "";
        $fileContent = '';
        foreach ($fulltextFileGroups as $fileGroup) {
            // The list is only processed up to the first empty / falsy group name.
            if (!$fileGroup) {
                break;
            }
            if (empty($this->physicalStructureInfo[$id]['files'][$fileGroup])) {
                continue;
            }
            // Get full text file of the first matching file group.
            $fileContent = GeneralUtility::getUrl($this->getFileLocation($this->physicalStructureInfo[$id]['files'][$fileGroup]));
            if ($fileContent === false) {
                $this->logger->warning('Couldn\'t load full text file for structure node @ID "' . $id . '"');
                return '';
            }
            $textFormat = $this->getTextFormat($fileContent);
            break;
        }

        // Is this text format supported?
        if (empty($fileContent) || empty($this->formats[$textFormat])) {
            $this->logger->warning('Unsupported text format "' . $textFormat . '" in physical node with @ID "' . $id . '"');
            return '';
        }

        // A known format without a registered class yields no text.
        if (empty($this->formats[$textFormat]['class'])) {
            return '';
        }

        return $this->getRawTextFromClass($id, $fileContent, $textFormat);
    }
690
691
    /**
     * Get the text of a full text file as MiniOCR using the class registered for the given format.
     *
     * The class name is read from $this->formats[$textFormat]['class']. It has to exist and
     * to implement FulltextInterface; the resulting text is also stored in $this->rawTextArray[$id].
     *
     * @access private
     *
     * @param string $id The ID of the structure node the text belongs to (used as key of the raw text array)
     * @param string $fileContent The content of the full text XML file
     * @param string $textFormat The key of the text format in $this->formats
     *
     * @return string The text as returned by the format class, or an empty string on failure
     */
    private function getRawTextFromClass($id, $fileContent, $textFormat): string
    {
        $class = $this->formats[$textFormat]['class'];

        if (!class_exists($class)) {
            $this->logger->warning('Class "' . $class . '" does not exist for "' . $textFormat . '" text format');
            return '';
        }

        $obj = GeneralUtility::makeInstance($class);
        if (!($obj instanceof FulltextInterface)) {
            $this->logger->warning('Invalid class/method "' . $class . '->getTextAsMiniOcr()" for text format "' . $textFormat . '"');
            return '';
        }

        // Load XML from file. The helper signals unparsable content with false (it is
        // checked the same way elsewhere in this class), so do not hand that on to the
        // format class.
        $ocrTextXml = Helper::getXmlFileAsString($fileContent);
        if ($ocrTextXml === false) {
            $this->logger->warning('Couldn\'t parse full text file for structure node @ID "' . $id . '"');
            return '';
        }

        $textMiniOcr = $obj->getTextAsMiniOcr($ocrTextXml);
        $this->rawTextArray[$id] = $textMiniOcr;

        return $textMiniOcr;
    }
722
723
    /**
     * Determine the format of an OCR full text file from its XML root element.
     *
     * @access private
     *
     * @param string $fileContent content of the XML file
     *
     * @return string The upper-cased name of the root element, or an empty string if the content is not parsable
     */
    private function getTextFormat(string $fileContent): string
    {
        $parsedXml = Helper::getXmlFileAsString($fileContent);

        if ($parsedXml === false) {
            return '';
        }

        // The root element's name serves as text format.
        return strtoupper($parsedXml->getName());
    }
743
744
    /**
     * This determines a title for the given document
     *
     * The title is read from table "tx_dlf_documents". With $recursive set, an empty title
     * is looked up in the document referenced by "partof" (and so on), unless the record
     * references itself.
     *
     * @access public
     *
     * @static
     *
     * @param int $uid The UID of the document
     * @param bool $recursive Search superior documents for a title, too?
     *
     * @return string The title of the document itself or a parent document, empty string if none was found
     */
    public static function getTitle(int $uid, bool $recursive = false): string
    {
        // Sanitize input.
        $uid = max($uid, 0);
        if ($uid === 0) {
            Helper::log('Invalid UID ' . $uid . ' for document', LOG_SEVERITY_ERROR);
            return '';
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('tx_dlf_documents');

        $result = $queryBuilder
            ->select(
                'tx_dlf_documents.title',
                'tx_dlf_documents.partof'
            )
            ->from('tx_dlf_documents')
            ->where(
                $queryBuilder->expr()->eq('tx_dlf_documents.uid', $uid),
                Helper::whereExpression('tx_dlf_documents')
            )
            ->setMaxResults(1)
            ->execute();

        $resArray = $result->fetchAssociative();
        if (!$resArray) {
            Helper::log('No document with UID ' . $uid . ' found or document not accessible', LOG_SEVERITY_WARNING);
            return '';
        }

        // Normalize the raw database values once, so that they match the declared
        // return type (string) and parameter type (int) regardless of what the
        // database driver delivers (strings, integers or NULL).
        $title = (string) $resArray['title'];
        $partof = (int) $resArray['partof'];

        // Search parent documents recursively for a title?
        if (
            $recursive
            && empty($title)
            && $partof !== 0
            && $partof !== $uid
        ) {
            $title = self::getTitle($partof, true);
        }

        return $title;
    }
800
801
    /**
     * This extracts all the metadata for the toplevel logical structure node / resource
     *
     * For METS documents the information from the structural map is added. If the metadata
     * has a "record_id" entry and the document's own record identifier is not among its
     * values, the identifier is prepended.
     *
     * @access public
     *
     * @param int $cPid The PID for the metadata definitions
     *
     * @return array The logical structure node's / resource's parsed metadata array
     */
    public function getToplevelMetadata(int $cPid = 0): array
    {
        $toplevelMetadata = $this->getMetadata($this->magicGetToplevelId(), $cPid);
        // Add information from METS structural map to toplevel metadata array.
        if ($this instanceof MetsDocument) {
            $this->addMetadataFromMets($toplevelMetadata, $this->magicGetToplevelId());
        }
        // Set record identifier for METS file / IIIF manifest if not present.
        // Identifiers are compared strictly: with loose comparison numeric looking
        // identifiers such as "0123" and "123" would be considered equal.
        if (
            array_key_exists('record_id', $toplevelMetadata)
            && !empty($this->recordId)
            && !in_array($this->recordId, $toplevelMetadata['record_id'], true)
        ) {
            array_unshift($toplevelMetadata['record_id'], $this->recordId);
        }
        return $toplevelMetadata;
    }
828
829
    /**
     * Search a logical (sub-) structure tree depth-first for the element with the
     * requested logical id and report the depth at which it was found.
     *
     * @access protected
     *
     * @param array $structure logical structure array; each element has an "id" and optionally "children"
     * @param int $depth depth to report for the elements of $structure
     * @param string $logId ID of the logical structure whose depth is requested
     *
     * @return int|bool false if structure with $logId is not a child of this substructure,
     * or the actual depth.
     */
    protected function getTreeDepth(array $structure, int $depth, string $logId)
    {
        foreach ($structure as $node) {
            // Found on the current level.
            if ($node['id'] == $logId) {
                return $depth;
            }
            // Leaf without children: nothing more to search here.
            if (!array_key_exists('children', $node)) {
                continue;
            }
            // Descend; the children are one level deeper.
            $depthInSubtree = $this->getTreeDepth($node['children'], $depth + 1, $logId);
            if ($depthInSubtree !== false) {
                return $depthInSubtree;
            }
        }
        return false;
    }
855
856
    /**
     * Determine how deep a logical structure element is nested within the table of contents
     *
     * @access public
     *
     * @param string $logId The id of the logical structure element whose depth is requested
     *
     * @return int|bool tree depth as integer or false if no element with $logId exists within the TOC.
     */
    public function getStructureDepth(string $logId)
    {
        $tableOfContents = $this->magicGetTableOfContents();
        // The first level of the table of contents is counted as depth 1.
        return $this->getTreeDepth($tableOfContents, 1, $logId);
    }
869
870
    /**
     * Load XML file / IIIF resource from URL
     *
     * The location is validated as URL first; an invalid location is logged as error.
     *
     * @access protected
     *
     * @param string $location The URL of the file to load
     *
     * @return bool true on success or false on failure
     */
    protected function load(string $location): bool
    {
        if (!GeneralUtility::isValidUrl($location)) {
            $this->logger->error('Invalid file location "' . $location . '" for document loading');
            return false;
        }
        // Loading the XML / JSON-LD file itself is format specific and delegated to loadLocation().
        return $this->loadLocation($location);
    }
890
891
    /**
     * Register all available data formats
     *
     * Reads the table "tx_dlf_formats" once and fills the format registry, keyed by
     * format type. Subsequent calls return immediately.
     *
     * @access protected
     *
     * @return void
     */
    protected function loadFormats(): void
    {
        // Nothing to do if the registry has been filled before.
        if ($this->formatsLoaded) {
            return;
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('tx_dlf_formats');

        // Only format records with pid 0 are taken into account.
        $pidConstraint = $queryBuilder->expr()->eq('tx_dlf_formats.pid', 0);

        $statement = $queryBuilder
            ->select(
                'tx_dlf_formats.type AS type',
                'tx_dlf_formats.root AS root',
                'tx_dlf_formats.namespace AS namespace',
                'tx_dlf_formats.class AS class'
            )
            ->from('tx_dlf_formats')
            ->where($pidConstraint)
            ->execute();

        // One registry entry per format type.
        while ($row = $statement->fetchAssociative()) {
            $this->formats[$row['type']] = [
                'rootElement' => $row['root'],
                'namespaceURI' => $row['namespace'],
                'class' => $row['class']
            ];
        }

        $this->formatsLoaded = true;
    }
929
930
    /**
     * Register the namespaces of all known metadata formats on an XML object
     *
     * Each format's lower-cased type is used as prefix for its namespace URI.
     * Objects of any other class are rejected with an error log entry.
     *
     * @access public
     *
     * @param \SimpleXMLElement|\DOMXPath &$obj \SimpleXMLElement or \DOMXPath object
     *
     * @return void
     */
    public function registerNamespaces(&$obj): void
    {
        // TODO Check usage. XML specific method does not seem to be used anywhere outside this class within the project, but it is public and may be used by extensions.
        $this->loadFormats();

        $isSimpleXml = $obj instanceof \SimpleXMLElement;
        if (!$isSimpleXml && !($obj instanceof \DOMXPath)) {
            $this->logger->error('Given object is neither a SimpleXMLElement nor a DOMXPath instance');
            return;
        }

        // The two supported classes name their registration method differently.
        $method = $isSimpleXml ? 'registerXPathNamespace' : 'registerNamespace';

        foreach ($this->formats as $enc => $conf) {
            $obj->$method(strtolower($enc), $conf['namespaceURI']);
        }
    }
957
958
    /**
     * Build the initial metadata array: every known field is an empty list, except
     * "document_format" which holds the given format.
     *
     * @access protected
     *
     * @param string $format of the document eg. METS
     *
     * @return array
     */
    protected function initializeMetadata(string $format): array
    {
        // Field order is kept exactly as listed here.
        $emptyFields = [
            'title',
            'title_sorting',
            'description',
            'author',
            'holder',
            'place',
            'year',
            'prod_id',
            'record_id',
            'opac_id',
            'union_id',
            'urn',
            'purl',
            'type',
            'volume',
            'volume_sorting',
            'date',
            'license',
            'terms',
            'restrictions',
            'out_of_print',
            'rights_info',
            'collection',
            'owner',
            'mets_label',
            'mets_orderlabel'
        ];

        $metadata = array_fill_keys($emptyFields, []);
        // The document format is the only field with an initial value and comes last.
        $metadata['document_format'] = [$format];

        return $metadata;
    }
999
1000
    /**
     * Getter behind the virtual property "cPid" (invoked by __get())
     *
     * @access protected
     *
     * @return int The PID of the metadata definitions
     */
    protected function magicGetCPid(): int
    {
        // Plain read access, nothing is computed here.
        return $this->cPid;
    }
1011
1012
    /**
     * Getter behind the virtual property "hasFulltext" (invoked by __get())
     *
     * @access protected
     *
     * @return bool Are there any fulltext files available?
     */
    protected function magicGetHasFulltext(): bool
    {
        // ensureHasFulltextIsSet() runs before the flag is read.
        $this->ensureHasFulltextIsSet();

        return $this->hasFulltext;
    }
1024
1025
    /**
     * Getter behind the virtual property "metadataArray" (invoked by __get())
     *
     * The array is (re)built whenever it has not been loaded yet or was built for a
     * different metadata definition PID. Index 0 of the array stores that PID.
     *
     * @access protected
     *
     * @return array Array of metadata with their corresponding logical structure node ID as key,
     * or an empty array if no PID for the metadata definitions is available
     */
    protected function magicGetMetadataArray(): array
    {
        // Prefer the configuration PID, fall back to the document's own PID.
        $cPid = $this->cPid ?: $this->pid;
        if (!$cPid) {
            $this->logger->error('Invalid PID ' . $cPid . ' for metadata definitions');
            return [];
        }

        $isUpToDate = $this->metadataArrayLoaded && $this->metadataArray[0] == $cPid;
        if (!$isUpToDate) {
            $this->prepareMetadataArray($cPid);
            // Remember which PID the array was built for.
            $this->metadataArray[0] = $cPid;
            $this->metadataArrayLoaded = true;
        }

        return $this->metadataArray;
    }
1051
1052
    /**
     * Getter behind the virtual property "numPages" (invoked by __get())
     *
     * @access protected
     *
     * @return int The total number of pages and/or tracks
     */
    protected function magicGetNumPages(): int
    {
        // The physical structure is requested first; presumably building it is what
        // fills numPages - confirm in magicGetPhysicalStructure().
        $this->magicGetPhysicalStructure();

        return $this->numPages;
    }
1064
1065
    /**
     * Getter behind the virtual property "parentId" (invoked by __get())
     *
     * @access protected
     *
     * @return int The UID of the parent document or zero if not applicable
     */
    protected function magicGetParentId(): int
    {
        // Plain read access, nothing is computed here.
        return $this->parentId;
    }
1076
1077
    /**
     * Getter behind the virtual property "physicalStructureInfo" (invoked by __get())
     *
     * @access protected
     *
     * @return array Array of elements' type, label and file representations ordered by "@ID" attribute / Canvas order
     */
    protected function magicGetPhysicalStructureInfo(): array
    {
        // Trigger building of the physical structure if that has not happened yet.
        if (!$this->physicalStructureLoaded) {
            $this->magicGetPhysicalStructure();
        }

        return $this->physicalStructureInfo;
    }
1093
1094
    /**
     * Getter behind the virtual property "pid" (invoked by __get())
     *
     * @access protected
     *
     * @return int The PID of the document or zero if not in database
     */
    protected function magicGetPid(): int
    {
        // Plain read access, nothing is computed here.
        return $this->pid;
    }
1105
1106
    /**
     * Getter behind the virtual property "ready" (invoked by __get())
     *
     * @access protected
     *
     * @return bool Is the document instantiated successfully?
     */
    protected function magicGetReady(): bool
    {
        // Plain read access, nothing is computed here.
        return $this->ready;
    }
1117
1118
    /**
     * Getter behind the virtual property "recordId" (invoked by __get())
     *
     * @access protected
     *
     * @return mixed The METS file's / IIIF manifest's record identifier
     */
    protected function magicGetRecordId()
    {
        // Plain read access, nothing is computed here.
        return $this->recordId;
    }
1129
1130
    /**
     * Getter behind the virtual property "rootId" (invoked by __get())
     *
     * On first access of a document with a parent, the parent document is instantiated
     * and its root id is copied. The lookup is attempted only once per instance.
     *
     * @access protected
     *
     * @return int The UID of the root document or zero if not applicable
     */
    protected function magicGetRootId(): int
    {
        if (!$this->rootIdLoaded) {
            if ($this->parentId) {
                // getInstance() expects its location argument as string, hence the explicit cast of the UID.
                $parent = self::getInstance((string) $this->parentId, ['storagePid' => $this->pid]);
                // Only read from an actual document instance; otherwise the current root id is kept.
                // NOTE(review): this reads the parent's raw property, so the parent's own
                // lazy root lookup is presumably not triggered here - confirm this is intended.
                if ($parent instanceof self) {
                    $this->rootId = $parent->rootId;
                }
            }
            $this->rootIdLoaded = true;
        }
        return $this->rootId;
    }
1150
1151
    /**
     * Getter behind the virtual property "tableOfContents" (invoked by __get())
     *
     * @access protected
     *
     * @return array Array of structure nodes' id, label, type and physical page indexes/mptr / Canvas link with original hierarchy preserved
     */
    protected function magicGetTableOfContents(): array
    {
        // On first access all logical structures are requested (empty id, second argument true).
        if (!$this->tableOfContentsLoaded) {
            $this->getLogicalStructure('', true);
            $this->tableOfContentsLoaded = true;
        }

        return $this->tableOfContents;
    }
1168
1169
    /**
     * Setter behind the virtual property "cPid" (invoked by __set())
     *
     * @access protected
     *
     * @param int $value The new PID for the metadata definitions
     *
     * @return void
     */
    protected function _setCPid(int $value): void
    {
        // Negative values are stored as 0.
        $this->cPid = $value > 0 ? $value : 0;
    }
1182
1183
    /**
     * This is a singleton class, thus the constructor should be private/protected
     * (Get an instance of this class by calling AbstractDocument::getInstance())
     *
     * @access protected
     *
     * @param int $pid If > 0, then only document with this PID gets loaded
     * @param string $location The location URL of the XML file to parse
     * @param \SimpleXMLElement|IiifResourceInterface $preloadedDocument Either null or the \SimpleXMLElement
     * or IiifResourceInterface that has been loaded to determine the basic document format.
     * @param array $settings Settings handed over to init() together with the location
     *
     * @return void
     */
    protected function __construct(int $pid, string $location, $preloadedDocument, array $settings = [])
    {
        // The steps below run in exactly this order.
        $this->pid = $pid;
        $this->setPreloadedDocument($preloadedDocument);
        $this->init($location, $settings);
        $this->establishRecordId($pid);
    }
1203
1204
    /**
     * This magic method is called each time an invisible property is referenced from the object
     *
     * The read is forwarded to the method "magicGet<Property>". If the property or
     * that method does not exist, a warning is logged and null is returned.
     *
     * @access public
     *
     * @param string $var Name of variable to get
     *
     * @return mixed Value of $this->$var
     */
    public function __get(string $var)
    {
        $getter = 'magicGet' . ucfirst($var);

        if (property_exists($this, $var) && method_exists($this, $getter)) {
            return $this->$getter();
        }

        $this->logger->warning('There is no getter function for property "' . $var . '"');
        return null;
    }
1226
1227
    /**
     * This magic method is called each time an invisible property is checked for isset() or empty()
     *
     * @access public
     *
     * @param string $var Name of variable to check
     *
     * @return bool true if variable is set and not empty, false otherwise
     */
    public function __isset(string $var): bool
    {
        // The value is resolved through __get(), so the property's getter runs as part of the check.
        $value = $this->__get($var);

        return !empty($value);
    }
1240
1241
    /**
     * This magic method is called each time an invisible property is assigned from outside the object
     *
     * The write is forwarded to the method "_set<Property>". If the property or that
     * method does not exist, a warning is logged and the value is discarded.
     *
     * @access public
     *
     * @param string $var Name of variable to set
     * @param mixed $value New value of variable
     *
     * @return void
     */
    public function __set(string $var, $value): void
    {
        $setter = '_set' . ucfirst($var);

        if (property_exists($this, $var) && method_exists($this, $setter)) {
            $this->$setter($value);
            return;
        }

        $this->logger->warning('There is no setter function for property "' . $var . '"');
    }
1263
1264
    /**
     * Look up a document instance in the "tx_dlf_doc" cache
     *
     * @access private
     *
     * @static
     *
     * @param string $location Location of the document; its MD5 hash is the cache identifier
     *
     * @return AbstractDocument|false
     */
    private static function getDocumentCache(string $location)
    {
        $cache = GeneralUtility::makeInstance(CacheManager::class)->getCache('tx_dlf_doc');

        // Whatever the cache returns for the identifier is passed on unchanged.
        return $cache->get(hash('md5', $location));
    }
1283
1284
    /**
     * Store a document instance in the "tx_dlf_doc" cache
     *
     * @access private
     *
     * @static
     *
     * @param string $location Location of the document; its MD5 hash is the cache identifier
     * @param AbstractDocument $currentDocument The document instance to store
     *
     * @return void
     */
    private static function setDocumentCache(string $location, AbstractDocument $currentDocument): void
    {
        // Same identifier scheme as used for reading from the cache.
        GeneralUtility::makeInstance(CacheManager::class)
            ->getCache('tx_dlf_doc')
            ->set(hash('md5', $location), $currentDocument);
    }
1304
}
1305