Completed
Push — master ( ce3674...ce3674 )
by
unknown
15s queued 12s
created

Doc::__get()   A

Complexity

Conditions 3
Paths 2

Size

Total Lines 11
Code Lines 8

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 3
eloc 8
nc 2
nop 1
dl 0
loc 11
rs 10
c 0
b 0
f 0
1
<?php
2
3
/**
4
 * (c) Kitodo. Key to digital objects e.V. <[email protected]>
5
 *
6
 * This file is part of the Kitodo and TYPO3 projects.
7
 *
8
 * @license GNU General Public License version 3 or later.
9
 * For the full copyright and license information, please read the
10
 * LICENSE.txt file that was distributed with this source code.
11
 */
12
13
namespace Kitodo\Dlf\Common;
14
15
use TYPO3\CMS\Core\Cache\CacheManager;
16
use TYPO3\CMS\Core\Configuration\ExtensionConfiguration;
17
use TYPO3\CMS\Core\Database\ConnectionPool;
18
use TYPO3\CMS\Core\Database\Query\Restriction\HiddenRestriction;
19
use TYPO3\CMS\Core\Log\LogManager;
20
use TYPO3\CMS\Core\Utility\GeneralUtility;
21
use TYPO3\CMS\Core\Utility\MathUtility;
22
use TYPO3\CMS\Extbase\Configuration\ConfigurationManager;
23
use Ubl\Iiif\Presentation\Common\Model\Resources\IiifResourceInterface;
24
use Ubl\Iiif\Tools\IiifHelper;
25
26
/**
27
 * Document class for the 'dlf' extension
28
 *
29
 * @author Sebastian Meyer <[email protected]>
30
 * @author Henrik Lochmann <[email protected]>
31
 * @package TYPO3
32
 * @subpackage dlf
33
 * @access public
34
 * @property int $cPid This holds the PID for the configuration
35
 * @property-read bool $hasFulltext Are there any fulltext files available?
36
 * @property-read string $location This holds the documents location
37
 * @property-read array $metadataArray This holds the documents' parsed metadata array
38
 * @property-read int $numPages This holds the total number of pages
39
 * @property-read int $parentId This holds the UID of the parent document or zero if not multi-volumed
40
 * @property-read array $physicalStructure This holds the physical structure
41
 * @property-read array $physicalStructureInfo This holds the physical structure metadata
42
 * @property-read int $pid This holds the PID of the document or zero if not in database
43
 * @property-read bool $ready Is the document instantiated successfully?
44
 * @property-read string $recordId The METS file's / IIIF manifest's record identifier
45
 * @property-read int $rootId This holds the UID of the root document or zero if not multi-volumed
46
 * @property-read array $smLinks This holds the smLinks between logical and physical structMap
47
 * @property-read array $tableOfContents This holds the logical structure
48
 * @property-read string $thumbnail This holds the document's thumbnail location
49
 * @property-read string $toplevelId This holds the toplevel structure's @ID (METS) or the manifest's @id (IIIF)
50
 * @abstract
51
 */
52
abstract class Doc
53
{
54
    /**
55
     * This holds the logger
56
     *
57
     * @var \TYPO3\CMS\Core\Log\Logger
58
     * @access protected
59
     */
60
    protected $logger;
61
62
    /**
63
     * This holds the PID for the configuration
64
     *
65
     * @var int
66
     * @access protected
67
     */
68
    protected $cPid = 0;
69
70
    /**
71
     * The extension key
72
     *
73
     * @var string
74
     * @access public
75
     */
76
    public static $extKey = 'dlf';
77
78
    /**
79
     * This holds the configuration for all supported metadata encodings
80
     * @see loadFormats()
81
     *
82
     * @var array
83
     * @access protected
84
     */
85
    protected $formats = [
86
        'OAI' => [
87
            'rootElement' => 'OAI-PMH',
88
            'namespaceURI' => 'http://www.openarchives.org/OAI/2.0/',
89
        ],
90
        'METS' => [
91
            'rootElement' => 'mets',
92
            'namespaceURI' => 'http://www.loc.gov/METS/',
93
        ],
94
        'XLINK' => [
95
            'rootElement' => 'xlink',
96
            'namespaceURI' => 'http://www.w3.org/1999/xlink',
97
        ]
98
    ];
99
100
    /**
101
     * Are the available metadata formats loaded?
102
     * @see $formats
103
     *
104
     * @var bool
105
     * @access protected
106
     */
107
    protected $formatsLoaded = false;
108
109
    /**
110
     * Are there any fulltext files available? This also includes IIIF text annotations
111
     * with motivation 'painting' if Kitodo.Presentation is configured to store text
112
     * annotations as fulltext.
113
     *
114
     * @var bool
115
     * @access protected
116
     */
117
    protected $hasFulltext = false;
118
119
    /**
120
     * Last searched logical and physical page
121
     *
122
     * @var array
123
     * @access protected
124
     */
125
    protected $lastSearchedPhysicalPage = ['logicalPage' => null, 'physicalPage' => null];
126
127
    /**
128
     * This holds the logical units
129
     *
130
     * @var array
131
     * @access protected
132
     */
133
    protected $logicalUnits = [];
134
135
    /**
136
     * This holds the documents' parsed metadata array with their corresponding
137
     * structMap//div's ID (METS) or Range / Manifest / Sequence ID (IIIF) as array key
138
     *
139
     * @var array
140
     * @access protected
141
     */
142
    protected $metadataArray = [];
143
144
    /**
145
     * Is the metadata array loaded?
146
     * @see $metadataArray
147
     *
148
     * @var bool
149
     * @access protected
150
     */
151
    protected $metadataArrayLoaded = false;
152
153
    /**
154
     * This holds the total number of pages
155
     *
156
     * @var int
157
     * @access protected
158
     */
159
    protected $numPages = 0;
160
161
    /**
162
     * This holds the UID of the parent document or zero if not multi-volumed
163
     *
164
     * @var int
165
     * @access protected
166
     */
167
    protected $parentId = 0;
168
169
    /**
170
     * This holds the physical structure
171
     *
172
     * @var array
173
     * @access protected
174
     */
175
    protected $physicalStructure = [];
176
177
    /**
178
     * This holds the physical structure metadata
179
     *
180
     * @var array
181
     * @access protected
182
     */
183
    protected $physicalStructureInfo = [];
184
185
    /**
186
     * Is the physical structure loaded?
187
     * @see $physicalStructure
188
     *
189
     * @var bool
190
     * @access protected
191
     */
192
    protected $physicalStructureLoaded = false;
193
194
    /**
195
     * This holds the PID of the document or zero if not in database
196
     *
197
     * @var int
198
     * @access protected
199
     */
200
    protected $pid = 0;
201
202
    /**
203
     * This holds the documents' raw text pages with their corresponding
204
     * structMap//div's ID (METS) or Range / Manifest / Sequence ID (IIIF) as array key
205
     *
206
     * @var array
207
     * @access protected
208
     */
209
    protected $rawTextArray = [];
210
211
    /**
212
     * Is the document instantiated successfully?
213
     *
214
     * @var bool
215
     * @access protected
216
     */
217
    protected $ready = false;
218
219
    /**
220
     * The METS file's / IIIF manifest's record identifier
221
     *
222
     * @var string
223
     * @access protected
224
     */
225
    protected $recordId;
226
227
    /**
228
     * This holds the singleton object of the document
229
     *
230
     * @var array (\Kitodo\Dlf\Common\Doc)
231
     * @static
232
     * @access protected
233
     */
234
    protected static $registry = [];
235
236
    /**
237
     * This holds the UID of the root document or zero if not multi-volumed
238
     *
239
     * @var int
240
     * @access protected
241
     */
242
    protected $rootId = 0;
243
244
    /**
245
     * Is the root id loaded?
246
     * @see $rootId
247
     *
248
     * @var bool
249
     * @access protected
250
     */
251
    protected $rootIdLoaded = false;
252
253
    /**
254
     * This holds the smLinks between logical and physical structMap
255
     *
256
     * @var array
257
     * @access protected
258
     */
259
    protected $smLinks = ['l2p' => [], 'p2l' => []];
260
261
    /**
262
     * Are the smLinks loaded?
263
     * @see $smLinks
264
     *
265
     * @var bool
266
     * @access protected
267
     */
268
    protected $smLinksLoaded = false;
269
270
    /**
271
     * This holds the logical structure
272
     *
273
     * @var array
274
     * @access protected
275
     */
276
    protected $tableOfContents = [];
277
278
    /**
279
     * Is the table of contents loaded?
280
     * @see $tableOfContents
281
     *
282
     * @var bool
283
     * @access protected
284
     */
285
    protected $tableOfContentsLoaded = false;
286
287
    /**
288
     * This holds the document's thumbnail location
289
     *
290
     * @var string
291
     * @access protected
292
     */
293
    protected $thumbnail = '';
294
295
    /**
296
     * Is the document's thumbnail location loaded?
297
     * @see $thumbnail
298
     *
299
     * @var bool
300
     * @access protected
301
     */
302
    protected $thumbnailLoaded = false;
303
304
    /**
305
     * This holds the toplevel structure's @ID (METS) or the manifest's @id (IIIF)
306
     *
307
     * @var string
308
     * @access protected
309
     */
310
    protected $toplevelId = '';
311
312
    /**
313
     * This holds the whole XML file as \SimpleXMLElement object
314
     *
315
     * @var \SimpleXMLElement
316
     * @access protected
317
     */
318
    protected $xml;
319
320
    /**
321
     * This clears the static registry to prevent memory exhaustion
322
     *
323
     * @access public
324
     *
325
     * @static
326
     *
327
     * @return void
328
     */
329
    public static function clearRegistry()
    {
        // Replace the static registry with an empty array so the
        // previously registered entries are no longer referenced from here.
        self::$registry = [];
    }
334
335
    /**
336
     * This ensures that the recordId, if existent, is retrieved from the document
337
     *
338
     * @access protected
339
     *
340
     * @abstract
341
     *
342
     * @param int $pid: ID of the configuration page with the recordId config
343
     *
344
     */
345
    abstract protected function establishRecordId($pid);
346
347
    /**
348
     * Source document PHP object which is represented by a Document instance
349
     *
350
     * @access protected
351
     *
352
     * @abstract
353
     *
354
     * @return \SimpleXMLElement|IiifResourceInterface A PHP object representation of
355
     * the current document. SimpleXMLElement for METS, IiifResourceInterface for IIIF
356
     */
357
    abstract protected function getDocument();
358
359
    /**
360
     * This gets the location of a downloadable file for a physical page or track
361
     *
362
     * @access public
363
     *
364
     * @abstract
365
     *
366
     * @param string $id: The @ID attribute of the file node (METS) or the @id property of the IIIF resource
367
     *
368
     * @return string    The file's location as URL
369
     */
370
    abstract public function getDownloadLocation($id);
371
372
    /**
373
     * This gets the location of a file representing a physical page or track
374
     *
375
     * @access public
376
     *
377
     * @abstract
378
     *
379
     * @param string $id: The @ID attribute of the file node (METS) or the @id property of the IIIF resource
380
     *
381
     * @return string The file's location as URL
382
     */
383
    abstract public function getFileLocation($id);
384
385
    /**
386
     * This gets the MIME type of a file representing a physical page or track
387
     *
388
     * @access public
389
     *
390
     * @abstract
391
     *
392
     * @param string $id: The @ID attribute of the file node
393
     *
394
     * @return string The file's MIME type
395
     */
396
    abstract public function getFileMimeType($id);
397
398
    /**
399
     * This is a singleton class, thus an instance must be created by this method
400
     *
401
     * @access public
402
     *
403
     * @static
404
     *
405
     * @param string $location: The URL of XML file or the IRI of the IIIF resource
406
     * @param array $settings
407
     * @param bool $forceReload: Force reloading the document instead of returning the cached instance
408
     *
409
     * @return \Kitodo\Dlf\Common\Doc|null Instance of this class, either MetsDocument or IiifManifest
410
     */
411
    public static function &getInstance($location, $settings = [], $forceReload = false)
    {
        // Serve the cached instance unless the caller explicitly requested a reload.
        if (!$forceReload) {
            $cachedInstance = self::getDocCache($location);
            if ($cachedInstance) {
                return $cachedInstance;
            }
        }

        // Create new instance depending on format (METS or IIIF) ...
        $instance = null;
        $documentFormat = null;
        $xml = null;
        $iiif = null;

        // Try to load a file from the url
        if (GeneralUtility::isValidUrl($location)) {
            // Load extension configuration
            $extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey);

            $content = Helper::getUrl($location);
            if ($content !== false) {
                $xml = Helper::getXmlFileAsString($content);
                if ($xml !== false) {
                    /* @var $xml \SimpleXMLElement */
                    // The content parsed as XML: it is METS if a mets:mets element exists.
                    $xml->registerXPathNamespace('mets', 'http://www.loc.gov/METS/');
                    $xpathResult = $xml->xpath('//mets:mets');
                    $documentFormat = !empty($xpathResult) ? 'METS' : null;
                } else {
                    // Try to load file as IIIF resource instead.
                    $contentAsJsonArray = json_decode($content, true);
                    if ($contentAsJsonArray !== null) {
                        IiifHelper::setUrlReader(IiifUrlReader::getInstance());
                        IiifHelper::setMaxThumbnailHeight($extConf['iiifThumbnailHeight']);
                        IiifHelper::setMaxThumbnailWidth($extConf['iiifThumbnailWidth']);
                        $iiif = IiifHelper::loadIiifResource($contentAsJsonArray);
                        if ($iiif instanceof IiifResourceInterface) {
                            $documentFormat = 'IIIF';
                        }
                    }
                }
            }
        }

        // Sanitize input. $settings defaults to an empty array, so a missing
        // 'storagePid' must fall back to 0 instead of raising a notice.
        $pid = max(intval($settings['storagePid'] ?? 0), 0);
        if ($documentFormat === 'METS') {
            $instance = new MetsDocument($location, $pid, $xml);
        } elseif ($documentFormat === 'IIIF') {
            $instance = new IiifManifest($location, $pid, $iiif);
        }

        // Only successfully created documents are written to the cache;
        // on failure null is returned and the cache is left untouched.
        if ($instance) {
            self::setDocCache($location, $instance);
        }

        return $instance;
    }
467
468
    /**
469
     * This gets details about a logical structure element
470
     *
471
     * @access public
472
     *
473
     * @abstract
474
     *
475
     * @param string $id: The @ID attribute of the logical structure node (METS) or
476
     * the @id property of the Manifest / Range (IIIF)
477
     * @param bool $recursive: Whether to include the child elements / resources
478
     *
479
     * @return array Array of the element's id, label, type and physical page indexes/mptr link
480
     */
481
    abstract public function getLogicalStructure($id, $recursive = false);
482
483
    /**
484
     * This extracts all the metadata for a logical structure node
485
     *
486
     * @access public
487
     *
488
     * @abstract
489
     *
490
     * @param string $id: The @ID attribute of the logical structure node (METS) or the @id property
491
     * of the Manifest / Range (IIIF)
492
     * @param int $cPid: The PID for the metadata definitions
493
     *                       (defaults to $this->cPid or $this->pid)
494
     *
495
     * @return array The logical structure node's / the IIIF resource's parsed metadata array
496
     */
497
    abstract public function getMetadata($id, $cPid = 0);
498
499
    /**
500
     * This returns the first corresponding physical page number of a given logical page label
501
     *
502
     * @access public
503
     *
504
     * @param string $logicalPage: The label (or a part of the label) of the logical page
505
     *
506
     * @return int The physical page number
507
     */
508
    public function getPhysicalPage($logicalPage)
    {
        // Repeated look-ups of the same label are answered from the last search result.
        $isRepeatedSearch = !empty($this->lastSearchedPhysicalPage['logicalPage'])
            && $this->lastSearchedPhysicalPage['logicalPage'] == $logicalPage;
        if ($isRepeatedSearch) {
            return $this->lastSearchedPhysicalPage['physicalPage'];
        }

        // Walk the physical structure in order and count positions from zero.
        $position = 0;
        foreach ($this->physicalStructureInfo as $structureNode) {
            $labelMatches = strpos($structureNode['orderlabel'], $logicalPage) !== false;
            if ($labelMatches) {
                // Remember the hit so the next identical request can skip the scan.
                $this->lastSearchedPhysicalPage['logicalPage'] = $logicalPage;
                $this->lastSearchedPhysicalPage['physicalPage'] = $position;
                return $position;
            }
            $position++;
        }

        // No order label contains the requested text: fall back to 1.
        return 1;
    }
528
529
    /**
530
     * This extracts the OCR full text for a physical structure node / IIIF Manifest / Canvas. Text might be
531
     * given as ALTO for METS or as annotations or ALTO for IIIF resources.
532
     *
533
     * @access public
534
     *
535
     * @abstract
536
     *
537
     * @param string $id: The @ID attribute of the physical structure node (METS) or the @id property
538
     * of the Manifest / Range (IIIF)
539
     *
540
     * @return string The OCR full text
541
     */
542
    abstract public function getFullText($id);
543
544
    /**
545
     * This extracts the OCR full text for a physical structure node / IIIF Manifest / Canvas from an
546
     * XML full text representation (currently only ALTO). For IIIF manifests, ALTO documents have
547
     * to be given in the Canvas' / Manifest's "seeAlso" property.
548
     *
549
     * @param string $id: The @ID attribute of the physical structure node (METS) or the @id property
550
     * of the Manifest / Range (IIIF)
551
     *
552
     * @return string The OCR full text
553
     */
554
    protected function getFullTextFromXml($id)
    {
        $fullText = '';
        // Initialised up front: if none of the configured file groups holds a
        // file for this node, the loop below never assigns these variables and
        // the later checks would otherwise read undefined values.
        $fileContent = '';
        $textFormat = '';
        // Load available text formats, ...
        $this->loadFormats();
        // ... physical structure ...
        $this->_getPhysicalStructure();
        // ... and extension configuration.
        $extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey);
        $fileGrpsFulltext = GeneralUtility::trimExplode(',', $extConf['fileGrpFulltext']);

        if (empty($this->physicalStructureInfo[$id])) {
            $this->logger->warning('Invalid structure node @ID "' . $id . '"');
            return $fullText;
        }

        // Use the first configured file group that has a file for this node.
        while ($fileGrpFulltext = array_shift($fileGrpsFulltext)) {
            if (!empty($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext])) {
                // Get full text file.
                $fileContent = GeneralUtility::getUrl($this->getFileLocation($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext]));
                if ($fileContent === false) {
                    $this->logger->warning('Couldn\'t load full text file for structure node @ID "' . $id . '"');
                    return $fullText;
                }
                $textFormat = $this->getTextFormat($fileContent);
                break;
            }
        }

        // Is this text format supported?
        // This part actually differs from previous version of indexed OCR
        if (empty($fileContent) || empty($this->formats[$textFormat])) {
            $this->logger->warning('Unsupported text format "' . $textFormat . '" in physical node with @ID "' . $id . '"');
            return $fullText;
        }

        // A format without a configured class yields an empty full text.
        if (!empty($this->formats[$textFormat]['class'])) {
            $class = $this->formats[$textFormat]['class'];
            // Get the raw text from class.
            if (
                class_exists($class)
                && ($obj = GeneralUtility::makeInstance($class)) instanceof FulltextInterface
            ) {
                // Load XML from file.
                $ocrTextXml = Helper::getXmlFileAsString($fileContent);
                $fullText = $obj->getTextAsMiniOcr($ocrTextXml);
                // Keep the converted text in the per-node raw text array.
                $this->rawTextArray[$id] = $fullText;
            } else {
                // The message names the method that is actually invoked above.
                $this->logger->warning('Invalid class/method "' . $class . '->getTextAsMiniOcr()" for text format "' . $textFormat . '"');
            }
        }

        return $fullText;
    }
607
608
    /**
609
     * Get format of the OCR full text
610
     *
611
     * @access private
612
     *
613
     * @param string $fileContent: content of the XML file
614
     *
615
     * @return string The format of the OCR full text
616
     */
617
    private function getTextFormat($fileContent)
    {
        $parsedXml = Helper::getXmlFileAsString($fileContent);

        // The upper-cased name of the root element is used as the format;
        // content that cannot be parsed yields an empty string.
        return $parsedXml !== false ? strtoupper($parsedXml->getName()) : '';
    }
628
629
    /**
630
     * This determines a title for the given document
631
     *
632
     * @access public
633
     *
634
     * @static
635
     *
636
     * @param int $uid: The UID of the document
637
     * @param bool $recursive: Search superior documents for a title, too?
638
     *
639
     * @return string The title of the document itself or a parent document
640
     */
641
    public static function getTitle($uid, $recursive = false)
    {
        // Sanitize input.
        $uid = max(intval($uid), 0);
        if (!$uid) {
            Helper::log('Invalid UID ' . $uid . ' for document', LOG_SEVERITY_ERROR);
            return '';
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('tx_dlf_documents');

        // Fetch the title and the parent reference of the requested document.
        $statement = $queryBuilder
            ->select(
                'tx_dlf_documents.title',
                'tx_dlf_documents.partof'
            )
            ->from('tx_dlf_documents')
            ->where(
                $queryBuilder->expr()->eq('tx_dlf_documents.uid', $uid),
                Helper::whereExpression('tx_dlf_documents')
            )
            ->setMaxResults(1)
            ->execute();

        $row = $statement->fetch();
        if (!$row) {
            Helper::log('No document with UID ' . $uid . ' found or document not accessible', LOG_SEVERITY_WARNING);
            return '';
        }

        // Get title information.
        $title = $row['title'];
        $parentUid = $row['partof'];

        // Ask the parent only when requested, when the own title is empty and
        // when the parent reference is set and does not point to this document.
        $askParent = $recursive
            && empty($title)
            && intval($parentUid)
            && $parentUid != $uid;
        if ($askParent) {
            $title = self::getTitle($parentUid, true);
        }

        return $title;
    }
684
685
    /**
686
     * This extracts all the metadata for the toplevel logical structure node / resource
687
     *
688
     * @access public
689
     *
690
     * @param int $cPid: The PID for the metadata definitions
691
     *
692
     * @return array The logical structure node's / resource's parsed metadata array
693
     */
694
    public function getTitledata($cPid = 0)
    {
        $toplevelMetadata = $this->getMetadata($this->_getToplevelId(), $cPid);

        // METS documents additionally contribute data from their structural map.
        if ($this instanceof MetsDocument) {
            $this->addMetadataFromMets($toplevelMetadata, $this->_getToplevelId());
        }

        // Put the document's own record identifier first in the list
        // when the list exists and does not contain it yet.
        $recordIdIsMissing = is_array($toplevelMetadata)
            && array_key_exists('record_id', $toplevelMetadata)
            && !empty($this->recordId)
            && !in_array($this->recordId, $toplevelMetadata['record_id']);
        if ($recordIdIsMissing) {
            array_unshift($toplevelMetadata['record_id'], $this->recordId);
        }

        return $toplevelMetadata;
    }
715
716
    /**
717
     * Traverse a logical (sub-) structure tree to find the structure with the requested logical id and return its depth.
718
     *
719
     * @access protected
720
     *
721
     * @param array $structure: logical structure array
722
     * @param int $depth: current tree depth
723
     * @param string $logId: ID of the logical structure whose depth is requested
724
     *
725
     * @return int|bool: false if structure with $logId is not a child of this substructure,
726
     * or the actual depth.
727
     */
728
    protected function getTreeDepth($structure, $depth, $logId)
    {
        foreach ($structure as $node) {
            // The node itself is the one we are looking for.
            if ($node['id'] == $logId) {
                return $depth;
            }
            // Leaves have nothing further to search.
            if (!array_key_exists('children', $node)) {
                continue;
            }
            // Descend one level; the first hit in any subtree wins.
            $depthInSubtree = $this->getTreeDepth($node['children'], $depth + 1, $logId);
            if ($depthInSubtree !== false) {
                return $depthInSubtree;
            }
        }

        // Nothing in this (sub-)structure carries the requested id.
        return false;
    }
742
743
    /**
744
     * Get the tree depth of a logical structure element within the table of content
745
     *
746
     * @access public
747
     *
748
     * @param string $logId: The id of the logical structure element whose depth is requested
749
     * @return int|bool tree depth as integer or false if no element with $logId exists within the TOC.
750
     */
751
    public function getStructureDepth($logId)
    {
        // Top-level entries of the table of contents count as depth 1.
        $tableOfContents = $this->_getTableOfContents();

        return $this->getTreeDepth($tableOfContents, 1, $logId);
    }
755
756
    /**
757
     * This sets some basic class properties
758
     *
759
     * @access protected
760
     *
761
     * @abstract
762
     *
763
     * @param string $location:The location URL of the XML file to parse
764
     *
765
     * @return void
766
     */
767
    abstract protected function init($location);
768
769
    /**
770
     * Reuse any document object that might have been already loaded to determine whether document is METS or IIIF
771
     *
772
     * @access protected
773
     *
774
     * @abstract
775
     *
776
     * @param \SimpleXMLElement|IiifResourceInterface $preloadedDocument: any instance that has already been loaded
777
     *
778
     * @return bool true if $preloadedDocument can actually be reused, false if it has to be loaded again
779
     */
780
    abstract protected function setPreloadedDocument($preloadedDocument);
781
782
    /**
783
     * METS/IIIF specific part of loading a location
784
     *
785
     * @access protected
786
     *
787
     * @abstract
788
     *
789
     * @param string $location: The URL of the file to load
790
     *
791
     * @return bool true on success or false on failure
792
     */
793
    abstract protected function loadLocation($location);
794
795
    /**
796
     * Load XML file / IIIF resource from URL
797
     *
798
     * @access protected
799
     *
800
     * @param string $location: The URL of the file to load
801
     *
802
     * @return bool true on success or false on failure
803
     */
804
    protected function load($location)
    {
        // Reject anything that is not a valid URL before any loading is attempted.
        if (!GeneralUtility::isValidUrl($location)) {
            $this->logger->error('Invalid file location "' . $location . '" for document loading');
            return false;
        }

        // The actual loading of the XML / JSON-LD file is format specific.
        return $this->loadLocation($location);
    }
815
816
    /**
817
     * Analyze the document if it contains any fulltext that needs to be indexed.
818
     *
819
     * @access protected
820
     *
821
     * @abstract
822
     */
823
    abstract protected function ensureHasFulltextIsSet();
824
825
    /**
826
     * Register all available data formats
827
     *
828
     * @access protected
829
     *
830
     * @return void
831
     */
832
    protected function loadFormats()
    {
        // The database is queried only once per instance.
        if ($this->formatsLoaded) {
            return;
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('tx_dlf_formats');

        // Get available data formats from database.
        $statement = $queryBuilder
            ->select(
                'tx_dlf_formats.type AS type',
                'tx_dlf_formats.root AS root',
                'tx_dlf_formats.namespace AS namespace',
                'tx_dlf_formats.class AS class'
            )
            ->from('tx_dlf_formats')
            ->where(
                $queryBuilder->expr()->eq('tx_dlf_formats.pid', 0)
            )
            ->execute();

        // Add each row to the format registry, replacing an entry of the same type.
        while ($row = $statement->fetch()) {
            $formatConfiguration = [
                'rootElement' => $row['root'],
                'namespaceURI' => $row['namespace'],
                'class' => $row['class']
            ];
            $this->formats[$row['type']] = $formatConfiguration;
        }

        $this->formatsLoaded = true;
    }
863
864
    /**
865
     * Register all available namespaces for a \SimpleXMLElement object
866
     *
867
     * @access public
868
     *
869
     * @param \SimpleXMLElement|\DOMXPath &$obj: \SimpleXMLElement or \DOMXPath object
870
     *
871
     * @return void
872
     */
873
    public function registerNamespaces(&$obj)
    {
        // TODO Check usage. XML specific method does not seem to be used anywhere outside this class within the project, but it is public and may be used by extensions.
        $this->loadFormats();

        // Choose the namespace registration method that matches the object's type.
        $registrar = null;
        if ($obj instanceof \SimpleXMLElement) {
            $registrar = 'registerXPathNamespace';
        } elseif ($obj instanceof \DOMXPath) {
            $registrar = 'registerNamespace';
        }

        // Anything else cannot take namespaces: log and leave the object untouched.
        if ($registrar === null) {
            $this->logger->error('Given object is neither a SimpleXMLElement nor a DOMXPath instance');
            return;
        }

        // Each known format contributes one namespace; the prefix is the lower-cased format type.
        foreach ($this->formats as $formatType => $formatConf) {
            $prefix = strtolower($formatType);
            $obj->$registrar($prefix, $formatConf['namespaceURI']);
        }
    }
891
892
    /**
893
     * This returns $this->cPid via __get()
894
     *
895
     * @access protected
896
     *
897
     * @return int The PID of the metadata definitions
898
     */
899
    protected function _getCPid()
    {
        // Plain accessor: the stored value is handed back unchanged.
        return $this->cPid;
    }
903
904
    /**
905
     * This returns $this->hasFulltext via __get()
906
     *
907
     * @access protected
908
     *
909
     * @return bool Are there any fulltext files available?
910
     */
911
    protected function _getHasFulltext()
    {
        // The format-specific subclass decides whether fulltext exists;
        // presumably it stores the result in $this->hasFulltext (implementation is abstract here).
        $this->ensureHasFulltextIsSet();

        return $this->hasFulltext;
    }
916
917
    /**
918
     * This returns $this->location via __get()
919
     *
920
     * @access protected
921
     *
922
     * @return string The location of the document
923
     */
924
    protected function _getLocation()
    {
        // Plain accessor: the stored location is handed back unchanged.
        return $this->location;
    }
928
929
    /**
930
     * Format specific part of building the document's metadata array
931
     *
932
     * @access protected
933
     *
934
     * @abstract
935
     *
936
     * @param int $cPid
937
     */
938
    abstract protected function prepareMetadataArray($cPid);
939
940
    /**
941
     * This builds an array of the document's metadata
942
     *
943
     * @access protected
944
     *
945
     * @return array Array of metadata with their corresponding logical structure node ID as key
946
     */
947
    protected function _getMetadataArray()
    {
        // Prefer the explicitly configured PID and fall back to the document's own PID.
        $definitionsPid = $this->cPid ?: $this->pid;
        if (!$definitionsPid) {
            $this->logger->error('Invalid PID ' . $definitionsPid . ' for metadata definitions');
            return [];
        }

        // Index 0 of the array records the PID it was built for, so a cached array
        // is only reused when it has been loaded AND belongs to the requested PID.
        $isUpToDate = $this->metadataArrayLoaded
            && $this->metadataArray[0] == $definitionsPid;
        if (!$isUpToDate) {
            $this->prepareMetadataArray($definitionsPid);
            $this->metadataArray[0] = $definitionsPid;
            $this->metadataArrayLoaded = true;
        }

        return $this->metadataArray;
    }
965
966
    /**
967
     * This returns $this->numPages via __get()
968
     *
969
     * @access protected
970
     *
971
     * @return int The total number of pages and/or tracks
972
     */
973
    protected function _getNumPages()
    {
        // Trigger the physical structure getter before reading the counter;
        // presumably it populates $this->numPages (the getter is abstract here).
        $this->_getPhysicalStructure();

        return $this->numPages;
    }
978
979
    /**
980
     * This returns $this->parentId via __get()
981
     *
982
     * @access protected
983
     *
984
     * @return int The UID of the parent document or zero if not applicable
985
     */
986
    protected function _getParentId()
    {
        // Plain accessor: the stored parent UID is handed back unchanged.
        return $this->parentId;
    }
990
991
    /**
992
     * This builds an array of the document's physical structure
993
     *
994
     * @access protected
995
     *
996
     * @abstract
997
     *
998
     * @return array Array of physical elements' id, type, label and file representations ordered
999
     * by @ORDER attribute / IIIF Sequence's Canvases
1000
     */
1001
    abstract protected function _getPhysicalStructure();
1002
1003
    /**
1004
     * This gives an array of the document's physical structure metadata
1005
     *
1006
     * @access protected
1007
     *
1008
     * @return array Array of elements' type, label and file representations ordered by @ID attribute / Canvas order
1009
     */
1010
    protected function _getPhysicalStructureInfo()
    {
        // Already built: return the cached info directly.
        if ($this->physicalStructureLoaded) {
            return $this->physicalStructureInfo;
        }

        // Otherwise build the physical structure first, then return the info.
        $this->_getPhysicalStructure();

        return $this->physicalStructureInfo;
    }
1019
1020
    /**
1021
     * This returns $this->pid via __get()
1022
     *
1023
     * @access protected
1024
     *
1025
     * @return int The PID of the document or zero if not in database
1026
     */
1027
    protected function _getPid()
    {
        // Plain accessor: the stored PID is handed back unchanged.
        return $this->pid;
    }
1031
1032
    /**
1033
     * This returns $this->ready via __get()
1034
     *
1035
     * @access protected
1036
     *
1037
     * @return bool Is the document instantiated successfully?
1038
     */
1039
    protected function _getReady()
    {
        // Plain accessor: the stored readiness flag is handed back unchanged.
        return $this->ready;
    }
1043
1044
    /**
1045
     * This returns $this->recordId via __get()
1046
     *
1047
     * @access protected
1048
     *
1049
     * @return mixed The METS file's / IIIF manifest's record identifier
1050
     */
1051
    protected function _getRecordId()
    {
        // Plain accessor: the stored record identifier is handed back unchanged.
        return $this->recordId;
    }
1055
1056
    /**
1057
     * This returns $this->rootId via __get()
1058
     *
1059
     * @access protected
1060
     *
1061
     * @return int The UID of the root document or zero if not applicable
1062
     */
1063
    protected function _getRootId()
    {
        // Resolve the root only once; later calls return the cached value.
        if (!$this->rootIdLoaded) {
            if ($this->parentId) {
                // Load the parent document from the same storage PID as this one.
                $parent = self::getInstance($this->parentId, ['storagePid' => $this->pid]);
                // Only read from the parent if it was actually instantiated. Without this
                // guard a failed lookup means a property read on a non-object, and the
                // root ID is overwritten with null.
                // NOTE(review): getInstance() is defined outside this excerpt; this assumes
                // it can return a non-object when the parent cannot be loaded - confirm.
                if ($parent instanceof self) {
                    $this->rootId = $parent->rootId;
                }
            }
            // Mark as resolved even if no parent was found, so the lookup is not retried.
            $this->rootIdLoaded = true;
        }

        return $this->rootId;
    }
1074
1075
    /**
1076
     * This returns the smLinks between logical and physical structMap (METS) and models the
1077
     * relation between IIIF Canvases and Manifests / Ranges in the same way
1078
     *
1079
     * @access protected
1080
     *
1081
     * @abstract
1082
     *
1083
     * @return array The links between logical and physical nodes / Range, Manifest and Canvas
1084
     */
1085
    abstract protected function _getSmLinks();
1086
1087
    /**
1088
     * This builds an array of the document's logical structure
1089
     *
1090
     * @access protected
1091
     *
1092
     * @return array Array of structure nodes' id, label, type and physical page indexes/mptr / Canvas link with original hierarchy preserved
1093
     */
1094
    protected function _getTableOfContents()
    {
        // Already built: return the cached table of contents directly.
        if ($this->tableOfContentsLoaded) {
            return $this->tableOfContents;
        }

        // Request the logical structure with an empty ID and the second argument set to true
        // (presumably "all nodes, recursively" - the method is defined outside this excerpt).
        $this->getLogicalStructure('', true);
        $this->tableOfContentsLoaded = true;

        return $this->tableOfContents;
    }
1104
1105
    /**
1106
     * This returns the document's thumbnail location
1107
     *
1108
     * @access protected
1109
     *
1110
     * @abstract
1111
     *
1112
     * @param bool $forceReload: Force reloading the thumbnail instead of returning the cached value
1113
     *
1114
     * @return string The document's thumbnail location
1115
     */
1116
    abstract protected function _getThumbnail($forceReload = false);
1117
1118
    /**
1119
     * This returns the ID of the toplevel logical structure node
1120
     *
1121
     * @access protected
1122
     *
1123
     * @abstract
1124
     *
1125
     * @return string The logical structure node's ID
1126
     */
1127
    abstract protected function _getToplevelId();
1128
1129
    /**
1130
     * This returns $this->uid via __get()
1131
     *
1132
     * @access protected
1133
     *
1134
     * @return mixed The UID or the URL of the document
1135
     */
1136
    protected function _getUid()
    {
        // Plain accessor: the stored UID (or URL) is handed back unchanged.
        return $this->uid;
    }
1140
1141
    /**
1142
     * This sets $this->cPid via __set()
1143
     *
1144
     * @access protected
1145
     *
1146
     * @param int $value: The new PID for the metadata definitions
1147
     *
1148
     * @return void
1149
     */
1150
    protected function _setCPid($value)
    {
        // Coerce to integer and clamp at zero so a negative PID can never be stored.
        $requestedPid = (int) $value;
        $this->cPid = max($requestedPid, 0);
    }
1154
1155
    /**
1156
     * This is a singleton class, thus the constructor should be private/protected
1157
     * (Get an instance of this class by calling \Kitodo\Dlf\Common\Doc::getInstance())
1158
     *
1159
     * @access protected
1160
     *
1161
     * @param string $location: The location URL of the XML file to parse
1162
     * @param int $pid: If > 0, then only document with this PID gets loaded
1163
     * @param \SimpleXMLElement|IiifResourceInterface $preloadedDocument: Either null or the \SimpleXMLElement
1164
     * or IiifResourceInterface that has been loaded to determine the basic document format.
1165
     *
1166
     * @return void
1167
     */
1168
    protected function __construct($location, $pid, $preloadedDocument)
    {
        // Order matters: the preloaded document is registered before init() runs,
        // and the record ID is established only after initialisation.
        $this->setPreloadedDocument($preloadedDocument);
        $this->init($location);
        $this->establishRecordId($pid);
    }
1175
1176
    /**
1177
     * This magic method is called each time an invisible property is referenced from the object
1178
     *
1179
     * @access public
1180
     *
1181
     * @param string $var: Name of variable to get
1182
     *
1183
     * @return mixed Value of $this->$var
1184
     */
1185
    public function __get($var)
    {
        // Getters follow the naming scheme "_get" + property name with an upper-cased first letter.
        $getter = '_get' . ucfirst($var);

        // Dispatch only when both the property and its getter exist.
        if (property_exists($this, $var) && method_exists($this, $getter)) {
            return $this->$getter();
        }

        // Unknown property or missing getter: warn and yield null.
        $this->logger->warning('There is no getter function for property "' . $var . '"');
        return null;
    }
1198
1199
    /**
1200
     * This magic method is called each time an invisible property is checked for isset() or empty()
1201
     *
1202
     * @access public
1203
     *
1204
     * @param string $var: Name of variable to check
1205
     *
1206
     * @return bool true if variable is set and not empty, false otherwise
1207
     */
1208
    public function __isset($var)
    {
        // Resolve through __get() so lazily built properties are computed before the check.
        $resolved = $this->__get($var);

        return !empty($resolved);
    }
1212
1213
    /**
1214
     * This magic method is called each time an invisible property is referenced from the object
1215
     *
1216
     * @access public
1217
     *
1218
     * @param string $var: Name of variable to set
1219
     * @param mixed $value: New value of variable
1220
     *
1221
     * @return void
1222
     */
1223
    public function __set($var, $value)
    {
        // Setters follow the naming scheme "_set" + property name with an upper-cased first letter.
        $setter = '_set' . ucfirst($var);

        // Dispatch only when both the property and its setter exist.
        if (property_exists($this, $var) && method_exists($this, $setter)) {
            $this->$setter($value);
            return;
        }

        // Unknown property or missing setter: warn and leave the object unchanged.
        $this->logger->warning('There is no setter function for property "' . $var . '"');
    }
1235
1236
    /**
1237
     * get Cache Hit for $doc
1238
     *
1239
     * @param string $location
1240
     * @return Doc|false
1241
     */
1242
    private static function getDocCache(string $location)
    {
        // Entries in the 'tx_dlf_doc' cache are keyed by the MD5 hash of the location.
        return GeneralUtility::makeInstance(CacheManager::class)
            ->getCache('tx_dlf_doc')
            ->get(md5($location));
    }
1250
1251
    /**
1252
     * set Cache for $doc
1253
     *
1254
     * @param string $location
1255
     * @param Doc $doc
1256
     * @return void
1257
     */
1258
    private static function setDocCache(string $location, Doc $doc)
    {
        // Store the document in the 'tx_dlf_doc' cache under the MD5 hash of its location.
        GeneralUtility::makeInstance(CacheManager::class)
            ->getCache('tx_dlf_doc')
            ->set(md5($location), $doc);
    }
1266
1267
}
1268