Scrutinizer GitHub App not installed

We could not synchronize checks via GitHub's checks API since Scrutinizer's GitHub App is not installed for this repository.

Install GitHub App

GitHub Access Token became invalid

It seems like the GitHub access token used for retrieving details about this repository from GitHub became invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.
Passed
Push — master ( cb42f3...1ced4e )
by
unknown
04:06 queued 45s
created

AbstractDocument::_getReady()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 1
nc 1
nop 0
dl 0
loc 3
rs 10
c 0
b 0
f 0
1
<?php
2
3
/**
4
 * (c) Kitodo. Key to digital objects e.V. <[email protected]>
5
 *
6
 * This file is part of the Kitodo and TYPO3 projects.
7
 *
8
 * @license GNU General Public License version 3 or later.
9
 * For the full copyright and license information, please read the
10
 * LICENSE.txt file that was distributed with this source code.
11
 */
12
13
namespace Kitodo\Dlf\Common;
14
15
use TYPO3\CMS\Core\Cache\CacheManager;
16
use TYPO3\CMS\Core\Configuration\ExtensionConfiguration;
17
use TYPO3\CMS\Core\Database\ConnectionPool;
18
use TYPO3\CMS\Core\Log\Logger;
19
use TYPO3\CMS\Core\Utility\GeneralUtility;
20
use Ubl\Iiif\Presentation\Common\Model\Resources\IiifResourceInterface;
21
use Ubl\Iiif\Tools\IiifHelper;
22
23
/**
24
 * Document class for the 'dlf' extension
25
 *
26
 * @package TYPO3
27
 * @subpackage dlf
28
 *
29
 * @access public
30
 *
31
 * @abstract
32
 *
33
 * @property int $cPid this holds the PID for the configuration
34
 * @property-read array $formats this holds the configuration for all supported metadata encodings
35
 * @property bool $formatsLoaded flag with information if the available metadata formats are loaded
36
 * @property-read bool $hasFulltext flag with information if there are any fulltext files available
37
 * @property array $lastSearchedPhysicalPage the last searched logical and physical page
38
 * @property array $logicalUnits this holds the logical units
39
 * @property-read array $metadataArray this holds the documents' parsed metadata array
40
 * @property bool $metadataArrayLoaded flag with information if the metadata array is loaded
41
 * @property-read int $numPages this holds the total number of pages
42
 * @property-read int $parentId this holds the UID of the parent document or zero if not multi-volumed
43
 * @property-read array $physicalStructure this holds the physical structure
44
 * @property-read array $physicalStructureInfo this holds the physical structure metadata
45
 * @property bool $physicalStructureLoaded flag with information if the physical structure is loaded
46
 * @property-read int $pid this holds the PID of the document or zero if not in database
47
 * @property array $rawTextArray this holds the documents' raw text pages with their corresponding structMap//div's ID (METS) or Range / Manifest / Sequence ID (IIIF) as array key
48
 * @property-read bool $ready Is the document instantiated successfully?
49
 * @property-read string $recordId the METS file's / IIIF manifest's record identifier
50
 * @property array $registry this holds the singleton object of the document
51
 * @property-read int $rootId this holds the UID of the root document or zero if not multi-volumed
52
 * @property-read array $smLinks this holds the smLinks between logical and physical structMap
53
 * @property bool $smLinksLoaded flag with information if the smLinks are loaded
54
 * @property-read array $tableOfContents this holds the logical structure
55
 * @property bool $tableOfContentsLoaded flag with information if the table of contents is loaded
56
 * @property-read string $thumbnail this holds the document's thumbnail location
57
 * @property bool $thumbnailLoaded flag with information if the thumbnail is loaded
58
 * @property-read string $toplevelId this holds the toplevel structure's "@ID" (METS) or the manifest's "@id" (IIIF)
59
 * @property \SimpleXMLElement $xml this holds the whole XML file as \SimpleXMLElement object
60
 */
61
abstract class AbstractDocument
62
{
63
    /**
64
     * @access protected
65
     * @var Logger This holds the logger
66
     */
67
    protected Logger $logger;
68
69
    /**
70
     * @access protected
71
     * @var int This holds the PID for the configuration
72
     */
73
    protected int $cPid = 0;
74
75
    /**
76
     * @access public
77
     * @static
78
     * @var string The extension key
79
     */
80
    public static string $extKey = 'dlf';
81
82
    /**
83
     * @access protected
84
     * @var array Additional information about files (e.g., ADMID), indexed by ID.
85
     */
86
    protected $fileInfos = [];
87
88
    /**
89
     * @access protected
90
     * @var array This holds the configuration for all supported metadata encodings
91
     *
92
     * @see loadFormats()
93
     */
94
    protected array $formats = [
95
        'OAI' => [
96
            'rootElement' => 'OAI-PMH',
97
            'namespaceURI' => 'http://www.openarchives.org/OAI/2.0/',
98
        ],
99
        'METS' => [
100
            'rootElement' => 'mets',
101
            'namespaceURI' => 'http://www.loc.gov/METS/',
102
        ],
103
        'XLINK' => [
104
            'rootElement' => 'xlink',
105
            'namespaceURI' => 'http://www.w3.org/1999/xlink',
106
        ]
107
    ];
108
109
    /**
110
     * @access protected
111
     * @var bool Are the available metadata formats loaded?
112
     *
113
     * @see $formats
114
     */
115
    protected bool $formatsLoaded = false;
116
117
    /**
118
     * Are there any fulltext files available? This also includes IIIF text annotations
119
     * with motivation 'painting' if Kitodo.Presentation is configured to store text
120
     * annotations as fulltext.
121
     *
122
     * @access protected
123
     * @var bool
124
     */
125
    protected bool $hasFulltext = false;
126
127
    /**
128
     * @access protected
129
     * @var array Last searched logical and physical page
130
     */
131
    protected array $lastSearchedPhysicalPage = ['logicalPage' => null, 'physicalPage' => null];
132
133
    /**
134
     * @access protected
135
     * @var array This holds the logical units
136
     */
137
    protected array $logicalUnits = [];
138
139
    /**
140
     * This holds the documents' parsed metadata array with their corresponding
141
     * structMap//div's ID (METS) or Range / Manifest / Sequence ID (IIIF) as array key
142
     *
143
     * @access protected
144
     * @var array
145
     */
146
    protected array $metadataArray = [];
147
148
    /**
149
     * @access protected
150
     * @var bool Is the metadata array loaded?
151
     *
152
     * @see $metadataArray
153
     */
154
    protected bool $metadataArrayLoaded = false;
155
156
    /**
157
     * @access protected
158
     * @var int This holds the total number of pages
159
     */
160
    protected int $numPages = 0;
161
162
    /**
163
     * @access protected
164
     * @var int This holds the UID of the parent document or zero if not multi-volumed
165
     */
166
    protected int $parentId = 0;
167
168
    /**
169
     * @access protected
170
     * @var array This holds the physical structure
171
     */
172
    protected array $physicalStructure = [];
173
174
    /**
175
     * @access protected
176
     * @var array This holds the physical structure metadata
177
     */
178
    protected array $physicalStructureInfo = [];
179
180
    /**
181
     * @access protected
182
     * @var bool Is the physical structure loaded?
183
     *
184
     * @see $physicalStructure
185
     */
186
    protected bool $physicalStructureLoaded = false;
187
188
    /**
189
     * @access protected
190
     * @var int This holds the PID of the document or zero if not in database
191
     */
192
    protected int $pid = 0;
193
194
    /**
195
     * This holds the documents' raw text pages with their corresponding
196
     * structMap//div's ID (METS) or Range / Manifest / Sequence ID (IIIF) as array key
197
     *
198
     * @access protected
199
     * @var array
200
     */
201
    protected array $rawTextArray = [];
202
203
    /**
204
     * @access protected
205
     * @var bool Is the document instantiated successfully?
206
     */
207
    protected bool $ready = false;
208
209
    /**
210
     * @access protected
211
     * @var string The METS file's / IIIF manifest's record identifier
212
     */
213
    protected ?string $recordId;
214
215
    /**
216
     * @access protected
217
     * @static
218
     * @var array (AbstractDocument) This holds the singleton object of the document
219
     */
220
    protected static array $registry = [];
221
222
    /**
223
     * @access protected
224
     * @var int This holds the UID of the root document or zero if not multi-volumed
225
     */
226
    protected int $rootId = 0;
227
228
    /**
229
     * @access protected
230
     * @var bool Is the root id loaded?
231
     *
232
     * @see $rootId
233
     */
234
    protected bool $rootIdLoaded = false;
235
236
    /**
237
     * @access protected
238
     * @var array This holds the smLinks between logical and physical structMap
239
     */
240
    protected array $smLinks = ['l2p' => [], 'p2l' => []];
241
242
    /**
243
     * @access protected
244
     * @var bool Are the smLinks loaded?
245
     *
246
     * @see $smLinks
247
     */
248
    protected bool $smLinksLoaded = false;
249
250
    /**
251
     * This holds the logical structure
252
     *
253
     * @access protected
254
     * @var array
255
     */
256
    protected array $tableOfContents = [];
257
258
    /**
259
     * @access protected
260
     * @var bool Is the table of contents loaded?
261
     *
262
     * @see $tableOfContents
263
     */
264
    protected bool $tableOfContentsLoaded = false;
265
266
    /**
267
     * @access protected
268
     * @var string This holds the document's thumbnail location
269
     */
270
    protected string $thumbnail = '';
271
272
    /**
273
     * @access protected
274
     * @var bool Is the document's thumbnail location loaded?
275
     *
276
     * @see $thumbnail
277
     */
278
    protected bool $thumbnailLoaded = false;
279
280
    /**
281
     * @access protected
282
     * @var string This holds the toplevel structure's "@ID" (METS) or the manifest's "@id" (IIIF)
283
     */
284
    protected string $toplevelId = '';
285
286
    /**
287
     * @access protected
288
     * @var \SimpleXMLElement This holds the whole XML file as \SimpleXMLElement object
289
     */
290
    protected \SimpleXMLElement $xml;
291
292
    /**
293
     * This clears the static registry to prevent memory exhaustion
294
     *
295
     * @access public
296
     *
297
     * @static
298
     *
299
     * @return void
300
     */
301
    public static function clearRegistry(): void
    {
        // Forget every registered document instance by starting over with an empty registry.
        self::$registry = [];
    }
306
307
    /**
308
     * This ensures that the recordId, if existent, is retrieved from the document
309
     *
310
     * @access protected
311
     *
312
     * @abstract
313
     *
314
     * @param int $pid: ID of the configuration page with the recordId config
315
     *
316
     * @return void
317
     */
318
    protected abstract function establishRecordId(int $pid);
319
320
    /**
321
     * Source document PHP object which is represented by a Document instance
322
     *
323
     * @access protected
324
     *
325
     * @abstract
326
     *
327
     * @return \SimpleXMLElement|IiifResourceInterface A PHP object representation of
328
     * the current document. SimpleXMLElement for METS, IiifResourceInterface for IIIF
329
     */
330
    protected abstract function getDocument();
331
332
    /**
333
     * This gets the location of a downloadable file for a physical page or track
334
     *
335
     * @access public
336
     *
337
     * @abstract
338
     *
339
     * @param string $id The "@ID" attribute of the file node (METS) or the "@id" property of the IIIF resource
340
     *
341
     * @return string The file's location as URL
342
     */
343
    public abstract function getDownloadLocation(string $id): string;
344
345
    /**
346
     * This gets all file information stored in single array.
347
     *
348
     * @access public
349
     *
350
     * @abstract
351
     *
352
     * @param string $id The "@ID" attribute of the file node (METS) or the "@id" property of the IIIF resource
353
     * 
354
     * @return array|null The set of file information
355
     */
356
    public abstract function getFileInfo($id);
357
358
    /**
359
     * This gets the location of a file representing a physical page or track
360
     *
361
     * @access public
362
     *
363
     * @abstract
364
     *
365
     * @param string $id The "@ID" attribute of the file node (METS) or the "@id" property of the IIIF resource
366
     *
367
     * @return string The file's location as URL
368
     */
369
    public abstract function getFileLocation(string $id): string;
370
371
    /**
372
     * This gets the MIME type of a file representing a physical page or track
373
     *
374
     * @access public
375
     *
376
     * @abstract
377
     *
378
     * @param string $id The "@ID" attribute of the file node
379
     *
380
     * @return string The file's MIME type
381
     */
382
    public abstract function getFileMimeType(string $id): string;
383
384
    /**
385
     * This is a singleton class, thus an instance must be created by this method
386
     *
387
     * @access public
388
     *
389
     * @static
390
     *
391
     * @param string $location The URL of XML file or the IRI of the IIIF resource
392
     * @param array $settings
393
     * @param bool $forceReload Force reloading the document instead of returning the cached instance
394
     *
395
     * @return AbstractDocument|null Instance of this class, either MetsDocument or IiifManifest
396
     */
397
    public static function &getInstance(string $location, array $settings = [], bool $forceReload = false): ?AbstractDocument
    {
        // Serve the cached instance unless the caller explicitly asked for a reload.
        if (!$forceReload) {
            $instance = self::getDocumentCache($location);
            if ($instance !== false) {
                return $instance;
            }
        }

        // Create new instance depending on format (METS or IIIF) ...
        $instance = null;
        $documentFormat = null;
        $xml = null;
        $iiif = null;

        // Try to load a file from the url
        if (GeneralUtility::isValidUrl($location)) {
            // Load extension configuration
            $extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey);

            $content = Helper::getUrl($location);
            if ($content !== false) {
                $xml = Helper::getXmlFileAsString($content);
                if ($xml !== false) {
                    /* @var $xml \SimpleXMLElement */
                    // Only documents containing a <mets:mets> element are treated as METS.
                    $xml->registerXPathNamespace('mets', 'http://www.loc.gov/METS/');
                    $xpathResult = $xml->xpath('//mets:mets');
                    $documentFormat = !empty($xpathResult) ? 'METS' : null;
                } else {
                    // Try to load file as IIIF resource instead.
                    $contentAsJsonArray = json_decode($content, true);
                    if ($contentAsJsonArray !== null) {
                        IiifHelper::setUrlReader(IiifUrlReader::getInstance());
                        IiifHelper::setMaxThumbnailHeight($extConf['iiifThumbnailHeight']);
                        IiifHelper::setMaxThumbnailWidth($extConf['iiifThumbnailWidth']);
                        $iiif = IiifHelper::loadIiifResource($contentAsJsonArray);
                        if ($iiif instanceof IiifResourceInterface) {
                            $documentFormat = 'IIIF';
                        }
                    }
                }
            }
        }

        // Sanitize input. "storagePid" is optional because $settings defaults to an empty
        // array, so a missing key must fall back to 0 instead of raising a warning.
        $pid = max((int) ($settings['storagePid'] ?? 0), 0);
        if ($documentFormat === 'METS') {
            $instance = new MetsDocument($location, $pid, $xml);
        } elseif ($documentFormat === 'IIIF') {
            // $iiif was verified to be an IiifResourceInterface when the format was detected.
            $instance = new IiifManifest($location, $pid, /** @scrutinizer ignore-type */ $iiif);
        }

        // Only successfully created documents are cached; failures return null uncached.
        if ($instance !== null) {
            self::setDocumentCache($location, $instance);
        }

        return $instance;
    }
456
457
    /**
458
     * This gets details about a logical structure element
459
     *
460
     * @access public
461
     *
462
     * @abstract
463
     *
464
     * @param string $id The "@ID" attribute of the logical structure node (METS) or
465
     * the "@id" property of the Manifest / Range (IIIF)
466
     * @param bool $recursive Whether to include the child elements / resources
467
     *
468
     * @return array Array of the element's id, label, type and physical page indexes/mptr link
469
     */
470
    public abstract function getLogicalStructure(string $id, bool $recursive = false): array;
471
472
    /**
473
     * This extracts all the metadata for a logical structure node
474
     *
475
     * @access public
476
     *
477
     * @abstract
478
     *
479
     * @param string $id The "@ID" attribute of the logical structure node (METS) or the "@id" property
480
     * of the Manifest / Range (IIIF)
481
     * @param int $cPid The PID for the metadata definitions (defaults to $this->cPid or $this->pid)
482
     *
483
     * @return array The logical structure node's / the IIIF resource's parsed metadata array
484
     */
485
    public abstract function getMetadata(string $id, int $cPid = 0): array;
486
487
    /**
488
     * This returns the first corresponding physical page number of a given logical page label
489
     *
490
     * @access public
491
     *
492
     * @param string $logicalPage The label (or a part of the label) of the logical page
493
     *
494
     * @return int The physical page number
495
     */
496
    public function getPhysicalPage(string $logicalPage): int
    {
        // Answer a repeated lookup of the same label from the one-entry cache.
        // The comparison is strict on purpose: loose "==" compares numeric strings as
        // numbers, so labels such as "1" and "01" would wrongly share a cache entry.
        if (
            !empty($this->lastSearchedPhysicalPage['logicalPage'])
            && $this->lastSearchedPhysicalPage['logicalPage'] === $logicalPage
        ) {
            return $this->lastSearchedPhysicalPage['physicalPage'];
        }

        // NOTE(review): this reads $physicalStructureInfo as-is and does not trigger loading
        // of the physical structure itself - confirm callers load it beforehand.
        $physicalPage = 0;
        foreach ($this->physicalStructureInfo as $page) {
            // Entries without an order label cannot match; skipping them also avoids
            // handing null to strpos().
            if (
                isset($page['orderlabel'])
                && strpos((string) $page['orderlabel'], $logicalPage) !== false
            ) {
                $this->lastSearchedPhysicalPage['logicalPage'] = $logicalPage;
                $this->lastSearchedPhysicalPage['physicalPage'] = $physicalPage;
                return $physicalPage;
            }
            $physicalPage++;
        }

        // No label matched: fall back to page 1.
        return 1;
    }
516
517
    /**
518
     * This extracts the OCR full text for a physical structure node / IIIF Manifest / Canvas. Text might be
519
     * given as ALTO for METS or as annotations or ALTO for IIIF resources.
520
     *
521
     * @access public
522
     *
523
     * @abstract
524
     *
525
     * @param string $id The "@ID" attribute of the physical structure node (METS) or the "@id" property
526
     * of the Manifest / Range (IIIF)
527
     *
528
     * @return string The OCR full text
529
     */
530
    public abstract function getFullText(string $id): string;
531
532
    /**
533
     * This extracts the OCR full text for a physical structure node / IIIF Manifest / Canvas from an
534
     * XML full text representation (currently only ALTO). For IIIF manifests, ALTO documents have
535
     * to be given in the Canvas' / Manifest's "seeAlso" property.
536
     *
537
     * @param string $id The "@ID" attribute of the physical structure node (METS) or the "@id" property
538
     * of the Manifest / Range (IIIF)
539
     *
540
     * @return string The OCR full text
541
     */
542
    protected function getFullTextFromXml(string $id): string
    {
        $fullText = '';
        // Load available text formats, ...
        $this->loadFormats();
        // ... physical structure ...
        $this->_getPhysicalStructure();
        // ... and extension configuration.
        $extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey);
        $fileGrpsFulltext = GeneralUtility::trimExplode(',', $extConf['fileGrpFulltext']);

        if (empty($this->physicalStructureInfo[$id])) {
            $this->logger->warning('Invalid structure node @ID "' . $id . '"');
            return $fullText;
        }

        $textFormat = '';
        $fileContent = '';
        // Use the first configured file group that has a file for this node. Iterating with
        // foreach (instead of shifting until a falsy value) keeps an empty entry in the
        // configured list from hiding the groups that follow it.
        foreach ($fileGrpsFulltext as $fileGrpFulltext) {
            if (empty($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext])) {
                continue;
            }
            // Get full text file.
            $fileContent = GeneralUtility::getUrl($this->getFileLocation($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext]));
            if ($fileContent === false) {
                $this->logger->warning('Couldn\'t load full text file for structure node @ID "' . $id . '"');
                return $fullText;
            }
            $textFormat = $this->getTextFormat($fileContent);
            break;
        }

        // Is this text format supported?
        // This part actually differs from previous version of indexed OCR
        if (empty($fileContent) || empty($this->formats[$textFormat])) {
            $this->logger->warning('Unsupported text format "' . $textFormat . '" in physical node with @ID "' . $id . '"');
            return $fullText;
        }

        // A registered format without a handler class yields no text (and no warning).
        if (empty($this->formats[$textFormat]['class'])) {
            return $fullText;
        }

        $class = $this->formats[$textFormat]['class'];
        $obj = class_exists($class) ? GeneralUtility::makeInstance($class) : null;
        if (!($obj instanceof FulltextInterface)) {
            $this->logger->warning('Invalid class/method "' . $class . '->getTextAsMiniOcr()" for text format "' . $textFormat . '"');
            return $fullText;
        }

        // Load XML from file. The parser signals failure with false, which must not be
        // passed on to the format handler.
        $ocrTextXml = Helper::getXmlFileAsString($fileContent);
        if ($ocrTextXml === false) {
            $this->logger->warning('Couldn\'t parse full text file for structure node @ID "' . $id . '"');
            return $fullText;
        }

        // Get the text from the handler class and remember it per structure node.
        $fullText = $obj->getTextAsMiniOcr($ocrTextXml);
        $this->rawTextArray[$id] = $fullText;

        return $fullText;
    }
596
597
    /**
598
     * Get format of the OCR full text
599
     *
600
     * @access private
601
     *
602
     * @param string $fileContent content of the XML file
603
     *
604
     * @return string The format of the OCR full text
605
     */
606
    private function getTextFormat(string $fileContent): string
    {
        $parsedXml = Helper::getXmlFileAsString($fileContent);

        // The upper-cased name of the root element serves as the text format;
        // content that cannot be parsed has no format.
        return $parsedXml === false ? '' : strtoupper($parsedXml->getName());
    }
617
618
    /**
619
     * This determines a title for the given document
620
     *
621
     * @access public
622
     *
623
     * @static
624
     *
625
     * @param int $uid The UID of the document
626
     * @param bool $recursive Search superior documents for a title, too?
627
     *
628
     * @return string The title of the document itself or a parent document
629
     */
630
    public static function getTitle(int $uid, bool $recursive = false): string
    {
        $title = '';
        // Sanitize input.
        $uid = max($uid, 0);
        if ($uid === 0) {
            Helper::log('Invalid UID ' . $uid . ' for document', LOG_SEVERITY_ERROR);
            return $title;
        }

        // Walk up the "partof" chain iteratively. Remembering the visited UIDs stops the
        // walk on cyclic parent references (a -> b -> a), which would otherwise never end.
        $visited = [];
        while (true) {
            $visited[$uid] = true;
            $title = '';

            $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
                ->getQueryBuilderForTable('tx_dlf_documents');

            $result = $queryBuilder
                ->select(
                    'tx_dlf_documents.title',
                    'tx_dlf_documents.partof'
                )
                ->from('tx_dlf_documents')
                ->where(
                    $queryBuilder->expr()->eq('tx_dlf_documents.uid', $uid),
                    Helper::whereExpression('tx_dlf_documents')
                )
                ->setMaxResults(1)
                ->execute();

            $resArray = $result->fetchAssociative();
            if (!$resArray) {
                Helper::log('No document with UID ' . $uid . ' found or document not accessible', LOG_SEVERITY_WARNING);
                break;
            }

            // Get title information. The casts keep the declared string return type and the
            // integer UID handling intact whatever the database driver hands back.
            $title = (string) $resArray['title'];
            $partof = (int) $resArray['partof'];

            // Search parent documents for a title only if requested and still needed.
            if (!$recursive || !empty($title) || $partof === 0) {
                break;
            }
            if ($partof < 0) {
                // A negative parent reference is sanitized to 0 and rejected like an invalid UID.
                Helper::log('Invalid UID 0 for document', LOG_SEVERITY_ERROR);
                $title = '';
                break;
            }
            if (isset($visited[$partof])) {
                // Self-reference or cycle: there is no further parent to ask.
                break;
            }
            $uid = $partof;
        }

        return $title;
    }
673
674
    /**
675
     * This extracts all the metadata for the toplevel logical structure node / resource
676
     *
677
     * @access public
678
     *
679
     * @param int $cPid The PID for the metadata definitions
680
     *
681
     * @return array The logical structure node's / resource's parsed metadata array
682
     */
683
    public function getTitledata(int $cPid = 0): array
    {
        $titledata = $this->getMetadata($this->_getToplevelId(), $cPid);
        // Add information from METS structural map to titledata array.
        if ($this instanceof MetsDocument) {
            $this->addMetadataFromMets($titledata, $this->_getToplevelId());
        }
        // Set record identifier for METS file / IIIF manifest if not present.
        // The list is only touched when it really is an array, and the membership test is
        // strict so that numeric-looking identifiers ("007" vs. "7") are not mistaken for
        // one another.
        if (
            !empty($this->recordId)
            && isset($titledata['record_id'])
            && is_array($titledata['record_id'])
            && !in_array($this->recordId, $titledata['record_id'], true)
        ) {
            array_unshift($titledata['record_id'], $this->recordId);
        }
        return $titledata;
    }
704
705
    /**
706
     * Traverse a logical (sub-) structure tree to find the structure with the requested logical id and return its depth.
707
     *
708
     * @access protected
709
     *
710
     * @param array $structure logical structure array
711
     * @param int $depth current tree depth
712
     * @param string $logId ID of the logical structure whose depth is requested
713
     *
714
     * @return int|bool false if structure with $logId is not a child of this substructure,
715
     * or the actual depth.
716
     */
717
    protected function getTreeDepth(array $structure, int $depth, string $logId)
    {
        foreach ($structure as $element) {
            // Compare IDs as strings: loose "==" would treat numeric-looking IDs as numbers.
            if (isset($element['id']) && (string) $element['id'] === $logId) {
                return $depth;
            }
            // Descend only into real child lists; a "children" key holding a non-array
            // value would break the typed recursive call.
            if (isset($element['children']) && is_array($element['children'])) {
                $foundInChildren = $this->getTreeDepth($element['children'], $depth + 1, $logId);
                if ($foundInChildren !== false) {
                    return $foundInChildren;
                }
            }
        }
        // $logId does not occur anywhere in this (sub-)structure.
        return false;
    }
731
732
    /**
733
     * Get the tree depth of a logical structure element within the table of content
734
     *
735
     * @access public
736
     *
737
     * @param string $logId The id of the logical structure element whose depth is requested
738
     *
739
     * @return int|bool tree depth as integer or false if no element with $logId exists within the TOC.
740
     */
741
    public function getStructureDepth(string $logId)
    {
        // Entries on the top level of the table of contents count as depth 1.
        $tableOfContents = $this->_getTableOfContents();

        return $this->getTreeDepth($tableOfContents, 1, $logId);
    }
745
746
    /**
747
     * This sets some basic class properties
748
     *
749
     * @access protected
750
     *
751
     * @abstract
752
     *
753
     * @param string $location The location URL of the XML file to parse
754
     *
755
     * @return void
756
     */
757
    protected abstract function init(string $location): void;
758
759
    /**
760
     * Reuse any document object that might have been already loaded to determine whether document is METS or IIIF
761
     *
762
     * @access protected
763
     *
764
     * @abstract
765
     *
766
     * @param \SimpleXMLElement|IiifResourceInterface $preloadedDocument any instance that has already been loaded
767
     *
768
     * @return bool true if $preloadedDocument can actually be reused, false if it has to be loaded again
769
     */
770
    protected abstract function setPreloadedDocument($preloadedDocument): bool;
771
772
    /**
773
     * METS/IIIF specific part of loading a location
774
     *
775
     * @access protected
776
     *
777
     * @abstract
778
     *
779
     * @param string $location The URL of the file to load
780
     *
781
     * @return bool true on success or false on failure
782
     */
783
    protected abstract function loadLocation(string $location): bool;
784
785
    /**
786
     * Load XML file / IIIF resource from URL
787
     *
788
     * @access protected
789
     *
790
     * @param string $location The URL of the file to load
791
     *
792
     * @return bool true on success or false on failure
793
     */
794
    protected function load(string $location): bool
    {
        // Reject anything that is not a valid URL before attempting to load it.
        if (!GeneralUtility::isValidUrl($location)) {
            $this->logger->error('Invalid file location "' . $location . '" for document loading');
            return false;
        }

        // Load XML / JSON-LD file; the actual loading is format specific.
        return $this->loadLocation($location);
    }
805
806
    /**
807
     * Analyze the document if it contains any fulltext that needs to be indexed.
808
     *
809
     * @access protected
810
     *
811
     * @abstract
812
     *
813
     * @return void
814
     */
815
    abstract protected function ensureHasFulltextIsSet();
816
817
    /**
818
     * Register all available data formats
819
     *
820
     * @access protected
821
     *
822
     * @return void
823
     */
824
    protected function loadFormats(): void
    {
        // The registry only needs to be filled once per instance.
        if ($this->formatsLoaded) {
            return;
        }

        $formatQuery = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('tx_dlf_formats');

        // Read the data formats stored in the database with pid 0.
        $formatRows = $formatQuery
            ->select(
                'tx_dlf_formats.type AS type',
                'tx_dlf_formats.root AS root',
                'tx_dlf_formats.namespace AS namespace',
                'tx_dlf_formats.class AS class'
            )
            ->from('tx_dlf_formats')
            ->where(
                $formatQuery->expr()->eq('tx_dlf_formats.pid', 0)
            )
            ->execute();

        // Register every row in the format registry, keyed by the format type.
        while ($formatRow = $formatRows->fetchAssociative()) {
            $this->formats[$formatRow['type']] = [
                'rootElement' => $formatRow['root'],
                'namespaceURI' => $formatRow['namespace'],
                'class' => $formatRow['class']
            ];
        }

        $this->formatsLoaded = true;
    }
855
856
    /**
857
     * Register all available namespaces for a \SimpleXMLElement object
858
     *
859
     * @access public
860
     *
861
     * @param \SimpleXMLElement|\DOMXPath &$obj \SimpleXMLElement or \DOMXPath object
862
     *
863
     * @return void
864
     */
865
    public function registerNamespaces(&$obj): void
    {
        // TODO Check usage. XML specific method does not seem to be used anywhere outside this class within the project, but it is public and may be used by extensions.
        // The format registry has to be populated before it can be iterated below.
        $this->loadFormats();

        // Only \SimpleXMLElement and \DOMXPath objects are supported.
        $isSimpleXml = $obj instanceof \SimpleXMLElement;
        if (!$isSimpleXml && !($obj instanceof \DOMXPath)) {
            $this->logger->error('Given object is neither a SimpleXMLElement nor a DOMXPath instance');
            return;
        }

        // Both classes offer namespace registration, but under different method names.
        $registerMethod = $isSimpleXml ? 'registerXPathNamespace' : 'registerNamespace';

        // Register each metadata format's namespace under its lowercased type as prefix.
        foreach ($this->formats as $encoding => $configuration) {
            $obj->$registerMethod(strtolower($encoding), $configuration['namespaceURI']);
        }
    }
883
884
    /**
885
     * Initialize metadata array with empty values.
886
     *
887
     * @access protected
888
     *
889
     * @param string $format of the document eg. METS
890
     *
891
     * @return array
892
     */
893
    protected function initializeMetadata(string $format): array
    {
        // All regular metadata fields start out as empty lists; the order of the
        // keys below is the order of the returned array.
        $metadata = array_fill_keys(
            [
                'title',
                'title_sorting',
                'description',
                'author',
                'holder',
                'place',
                'year',
                'prod_id',
                'record_id',
                'opac_id',
                'union_id',
                'urn',
                'purl',
                'type',
                'volume',
                'volume_sorting',
                'date',
                'license',
                'terms',
                'restrictions',
                'out_of_print',
                'rights_info',
                'collection',
                'owner',
                'mets_label',
                'mets_orderlabel'
            ],
            []
        );

        // The document format is the only field that is known up front.
        $metadata['document_format'] = [$format];

        return $metadata;
    }
924
925
    /**
926
     * This returns $this->cPid via __get()
927
     *
928
     * @access protected
929
     *
930
     * @return int The PID of the metadata definitions
931
     */
932
    protected function _getCPid(): int
    {
        // Plain accessor: the stored value is handed back without further processing.
        return $this->cPid;
    }
936
937
    /**
938
     * This returns $this->hasFulltext via __get()
939
     *
940
     * @access protected
941
     *
942
     * @return bool Are there any fulltext files available?
943
     */
944
    protected function _getHasFulltext(): bool
    {
        // Let the format specific implementation analyze the document first;
        // presumably this is what populates $this->hasFulltext (confirm in subclasses).
        $this->ensureHasFulltextIsSet();

        return $this->hasFulltext;
    }
949
950
    /**
951
     * Format specific part of building the document's metadata array
952
     *
953
     * @access protected
954
     *
955
     * @abstract
956
     *
957
     * @param int $cPid
958
     *
959
     * @return void
960
     */
961
    abstract protected function prepareMetadataArray(int $cPid): void;
962
963
    /**
964
     * This builds an array of the document's metadata
965
     *
966
     * @access protected
967
     *
968
     * @return array Array of metadata with their corresponding logical structure node ID as key
969
     */
970
    protected function _getMetadataArray(): array
    {
        // Use the configured PID for the metadata definitions, falling back to the document's own PID.
        $definitionsPid = $this->cPid ?: $this->pid;
        if (!$definitionsPid) {
            $this->logger->error('Invalid PID ' . $definitionsPid . ' for metadata definitions');
            return [];
        }

        // Index 0 of the metadata array remembers the PID it was built for, so an
        // already loaded array can be reused as long as that PID still matches.
        if ($this->metadataArrayLoaded && $this->metadataArray[0] == $definitionsPid) {
            return $this->metadataArray;
        }

        // (Re)build the array through the format specific implementation.
        $this->prepareMetadataArray($definitionsPid);
        $this->metadataArray[0] = $definitionsPid;
        $this->metadataArrayLoaded = true;

        return $this->metadataArray;
    }
988
989
    /**
990
     * This returns $this->numPages via __get()
991
     *
992
     * @access protected
993
     *
994
     * @return int The total number of pages and/or tracks
995
     */
996
    protected function _getNumPages(): int
    {
        // Trigger building of the physical structure before reading the counter;
        // presumably that is where $this->numPages gets set (confirm in subclasses).
        $this->_getPhysicalStructure();

        return $this->numPages;
    }
1001
1002
    /**
1003
     * This returns $this->parentId via __get()
1004
     *
1005
     * @access protected
1006
     *
1007
     * @return int The UID of the parent document or zero if not applicable
1008
     */
1009
    protected function _getParentId(): int
    {
        // Plain accessor: the stored value is handed back without further processing.
        return $this->parentId;
    }
1013
1014
    /**
1015
     * This builds an array of the document's physical structure
1016
     *
1017
     * @access protected
1018
     *
1019
     * @abstract
1020
     *
1021
     * @return array Array of physical elements' id, type, label and file representations ordered
1022
     * by "@ORDER" attribute / IIIF Sequence's Canvases
1023
     */
1024
    abstract protected function _getPhysicalStructure(): array;
1025
1026
    /**
1027
     * This gives an array of the document's physical structure metadata
1028
     *
1029
     * @access protected
1030
     *
1031
     * @return array Array of elements' type, label and file representations ordered by "@ID" attribute / Canvas order
1032
     */
1033
    protected function _getPhysicalStructureInfo(): array
    {
        // Build the physical structure on first access only.
        // NOTE(review): presumably _getPhysicalStructure() also fills $this->physicalStructureInfo - confirm in subclasses.
        if (!$this->physicalStructureLoaded) {
            $this->_getPhysicalStructure();
        }

        return $this->physicalStructureInfo;
    }
1042
1043
    /**
1044
     * This returns $this->pid via __get()
1045
     *
1046
     * @access protected
1047
     *
1048
     * @return int The PID of the document or zero if not in database
1049
     */
1050
    protected function _getPid(): int
    {
        // Plain accessor: the stored value is handed back without further processing.
        return $this->pid;
    }
1054
1055
    /**
1056
     * This returns $this->ready via __get()
1057
     *
1058
     * @access protected
1059
     *
1060
     * @return bool Is the document instantiated successfully?
1061
     */
1062
    protected function _getReady(): bool
    {
        // Plain accessor: the stored flag is handed back without further processing.
        return $this->ready;
    }
1066
1067
    /**
1068
     * This returns $this->recordId via __get()
1069
     *
1070
     * @access protected
1071
     *
1072
     * @return mixed The METS file's / IIIF manifest's record identifier
1073
     */
1074
    protected function _getRecordId()
    {
        // Plain accessor: the stored identifier is handed back without further processing.
        return $this->recordId;
    }
1078
1079
    /**
1080
     * This returns $this->rootId via __get()
1081
     *
1082
     * @access protected
1083
     *
1084
     * @return int The UID of the root document or zero if not applicable
1085
     */
1086
    protected function _getRootId(): int
    {
        // The root ID is resolved once and then served from the property.
        if ($this->rootIdLoaded) {
            return $this->rootId;
        }

        if ($this->parentId) {
            // A document with a parent takes over the root ID of that parent.
            // NOTE(review): assumes getInstance() always yields a document object here - confirm it cannot return null.
            $parentDocument = self::getInstance($this->parentId, ['storagePid' => $this->pid]);
            $this->rootId = $parentDocument->rootId;
        }
        $this->rootIdLoaded = true;

        return $this->rootId;
    }
1097
1098
    /**
1099
     * This returns the smLinks between logical and physical structMap (METS) and models the
1100
     * relation between IIIF Canvases and Manifests / Ranges in the same way
1101
     *
1102
     * @access protected
1103
     *
1104
     * @abstract
1105
     *
1106
     * @return array The links between logical and physical nodes / Range, Manifest and Canvas
1107
     */
1108
    abstract protected function _getSmLinks(): array;
1109
1110
    /**
1111
     * This builds an array of the document's logical structure
1112
     *
1113
     * @access protected
1114
     *
1115
     * @return array Array of structure nodes' id, label, type and physical page indexes/mptr / Canvas link with original hierarchy preserved
1116
     */
1117
    protected function _getTableOfContents(): array
    {
        // Serve the already built table of contents if there is one.
        if ($this->tableOfContentsLoaded) {
            return $this->tableOfContents;
        }

        // Request all logical structures (empty ID, second argument true);
        // presumably this call is what fills $this->tableOfContents (confirm in getLogicalStructure()).
        $this->getLogicalStructure('', true);
        $this->tableOfContentsLoaded = true;

        return $this->tableOfContents;
    }
1127
1128
    /**
1129
     * This returns the document's thumbnail location
1130
     *
1131
     * @access protected
1132
     *
1133
     * @abstract
1134
     *
1135
     * @param bool $forceReload Force reloading the thumbnail instead of returning the cached value
1136
     *
1137
     * @return string The document's thumbnail location
1138
     */
1139
    abstract protected function _getThumbnail(bool $forceReload = false): string;
1140
1141
    /**
1142
     * This returns the ID of the toplevel logical structure node
1143
     *
1144
     * @access protected
1145
     *
1146
     * @abstract
1147
     *
1148
     * @return string The logical structure node's ID
1149
     */
1150
    abstract protected function _getToplevelId(): string;
1151
1152
    /**
1153
     * This sets $this->cPid via __set()
1154
     *
1155
     * @access protected
1156
     *
1157
     * @param int $value The new PID for the metadata definitions
1158
     *
1159
     * @return void
1160
     */
1161
    protected function _setCPid(int $value): void
    {
        // Negative values are clamped to zero. The parameter type already
        // guarantees an integer, so no additional cast is required.
        $this->cPid = max($value, 0);
    }
1165
1166
    /**
1167
     * This is a singleton class, thus the constructor should be private/protected
1168
     * (Get an instance of this class by calling AbstractDocument::getInstance())
1169
     *
1170
     * @access protected
1171
     *
1172
     * @param string $location The location URL of the XML file to parse
1173
     * @param int $pid If > 0, then only document with this PID gets loaded
1174
     * @param \SimpleXMLElement|IiifResourceInterface $preloadedDocument Either null or the \SimpleXMLElement
1175
     * or IiifResourceInterface that has been loaded to determine the basic document format.
1176
     *
1177
     * @return void
1178
     */
1179
    protected function __construct(string $location, int $pid, $preloadedDocument)
    {
        // Store the PID before the format specific setup runs.
        $this->pid = $pid;

        // Offer the already loaded document for reuse; the boolean result is not evaluated here.
        $this->setPreloadedDocument($preloadedDocument);

        // Set the basic class properties for the given location.
        $this->init($location);

        $this->establishRecordId($pid);
    }
1187
1188
    /**
1189
     * This magic method is called each time an invisible property is referenced from the object
1190
     *
1191
     * @access public
1192
     *
1193
     * @param string $var Name of variable to get
1194
     *
1195
     * @return mixed Value of $this->$var
1196
     */
1197
    public function __get(string $var)
    {
        // Getters follow the naming scheme "_get" + property name with an uppercase first letter.
        $getter = '_get' . ucfirst($var);

        // Only declared properties that have a matching getter can be read.
        if (property_exists($this, $var) && method_exists($this, $getter)) {
            return $this->$getter();
        }

        $this->logger->warning('There is no getter function for property "' . $var . '"');
        return null;
    }
1210
1211
    /**
1212
     * This magic method is called each time an invisible property is checked for isset() or empty()
1213
     *
1214
     * @access public
1215
     *
1216
     * @param string $var Name of variable to check
1217
     *
1218
     * @return bool true if variable is set and not empty, false otherwise
1219
     */
1220
    public function __isset(string $var): bool
    {
        // Resolve the value through the regular getter mechanism, which also
        // logs a warning when no getter exists for the property.
        $value = $this->__get($var);

        return !empty($value);
    }
1224
1225
    /**
1226
     * This magic method is called each time a value is assigned to an invisible property of the object
1227
     *
1228
     * @access public
1229
     *
1230
     * @param string $var Name of variable to set
1231
     * @param mixed $value New value of variable
1232
     *
1233
     * @return void
1234
     */
1235
    public function __set(string $var, $value): void
    {
        // Setters follow the naming scheme "_set" + property name with an uppercase first letter.
        $setter = '_set' . ucfirst($var);

        // Only declared properties that have a matching setter can be written.
        if (property_exists($this, $var) && method_exists($this, $setter)) {
            $this->$setter($value);
            return;
        }

        $this->logger->warning('There is no setter function for property "' . $var . '"');
    }
1247
1248
    /**
1249
     * Get Cache Hit for document instance
1250
     *
1251
     * @access private
1252
     *
1253
     * @static
1254
     *
1255
     * @param string $location
1256
     *
1257
     * @return AbstractDocument|false
1258
     */
1259
    private static function getDocumentCache(string $location)
    {
        $documentCache = GeneralUtility::makeInstance(CacheManager::class)->getCache('tx_dlf_doc');

        // Entries of the 'tx_dlf_doc' cache are keyed by the MD5 hash of the location.
        return $documentCache->get(md5($location));
    }
1267
1268
    /**
1269
     * Set Cache for document instance
1270
     *
1271
     * @access private
1272
     *
1273
     * @static
1274
     *
1275
     * @param string $location
1276
     * @param AbstractDocument $currentDocument
1277
     *
1278
     * @return void
1279
     */
1280
    private static function setDocumentCache(string $location, AbstractDocument $currentDocument): void
    {
        $documentCache = GeneralUtility::makeInstance(CacheManager::class)->getCache('tx_dlf_doc');

        // Store the document under the MD5 hash of its location, the same key getDocumentCache() looks up.
        $documentCache->set(md5($location), $currentDocument);
    }
1288
1289
}
1290