Scrutinizer GitHub App not installed

We could not synchronize checks via GitHub's checks API since Scrutinizer's GitHub App is not installed for this repository.

Install GitHub App

GitHub Access Token became invalid

It seems like the GitHub access token used for retrieving details about this repository from GitHub became invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.
Passed
Pull Request — master (#715)
by Alexander
08:35 queued 03:45
created

Doc::__construct()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 6
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 1
eloc 4
nc 1
nop 3
dl 0
loc 6
rs 10
c 1
b 0
f 0
1
<?php
2
3
/**
4
 * (c) Kitodo. Key to digital objects e.V. <[email protected]>
5
 *
6
 * This file is part of the Kitodo and TYPO3 projects.
7
 *
8
 * @license GNU General Public License version 3 or later.
9
 * For the full copyright and license information, please read the
10
 * LICENSE.txt file that was distributed with this source code.
11
 */
12
13
namespace Kitodo\Dlf\Common;
14
15
use TYPO3\CMS\Core\Cache\CacheManager;
16
use TYPO3\CMS\Core\Configuration\ExtensionConfiguration;
17
use TYPO3\CMS\Core\Database\ConnectionPool;
18
use TYPO3\CMS\Core\Database\Query\Restriction\HiddenRestriction;
19
use TYPO3\CMS\Core\Log\LogManager;
20
use TYPO3\CMS\Core\Utility\GeneralUtility;
21
use TYPO3\CMS\Core\Utility\MathUtility;
22
use TYPO3\CMS\Extbase\Configuration\ConfigurationManager;
23
use Ubl\Iiif\Presentation\Common\Model\Resources\IiifResourceInterface;
24
use Ubl\Iiif\Tools\IiifHelper;
25
26
/**
27
 * Document class for the 'dlf' extension
28
 *
29
 * @author Sebastian Meyer <[email protected]>
30
 * @author Henrik Lochmann <[email protected]>
31
 * @package TYPO3
32
 * @subpackage dlf
33
 * @access public
34
 * @property int $cPid This holds the PID for the configuration
35
 * @property-read bool $hasFulltext Are there any fulltext files available?
36
 * @property-read string $location This holds the documents location
37
 * @property-read array $metadataArray This holds the documents' parsed metadata array
38
 * @property-read int $numPages This holds the total number of pages
39
 * @property-read int $parentId This holds the UID of the parent document or zero if not multi-volumed
40
 * @property-read array $physicalStructure This holds the physical structure
41
 * @property-read array $physicalStructureInfo This holds the physical structure metadata
42
 * @property-read int $pid This holds the PID of the document or zero if not in database
43
 * @property-read bool $ready Is the document instantiated successfully?
44
 * @property-read string $recordId The METS file's / IIIF manifest's record identifier
45
 * @property-read int $rootId This holds the UID of the root document or zero if not multi-volumed
46
 * @property-read array $smLinks This holds the smLinks between logical and physical structMap
47
 * @property-read array $tableOfContents This holds the logical structure
48
 * @property-read string $thumbnail This holds the document's thumbnail location
49
 * @property-read string $toplevelId This holds the toplevel structure's @ID (METS) or the manifest's @id (IIIF)
50
 * @abstract
51
 */
52
abstract class Doc
53
{
54
    /**
55
     * This holds the logger
56
     *
57
     * @var LogManager
58
     * @access protected
59
     */
60
    protected $logger;
61
62
    /**
63
     * This holds the PID for the configuration
64
     *
65
     * @var int
66
     * @access protected
67
     */
68
    protected $cPid = 0;
69
70
    /**
71
     * The extension key
72
     *
73
     * @var string
74
     * @access public
75
     */
76
    public static $extKey = 'dlf';
77
78
    /**
79
     * This holds the configuration for all supported metadata encodings
80
     * @see loadFormats()
81
     *
82
     * @var array
83
     * @access protected
84
     */
85
    protected $formats = [
86
        'OAI' => [
87
            'rootElement' => 'OAI-PMH',
88
            'namespaceURI' => 'http://www.openarchives.org/OAI/2.0/',
89
        ],
90
        'METS' => [
91
            'rootElement' => 'mets',
92
            'namespaceURI' => 'http://www.loc.gov/METS/',
93
        ],
94
        'XLINK' => [
95
            'rootElement' => 'xlink',
96
            'namespaceURI' => 'http://www.w3.org/1999/xlink',
97
        ]
98
    ];
99
100
    /**
101
     * Are the available metadata formats loaded?
102
     * @see $formats
103
     *
104
     * @var bool
105
     * @access protected
106
     */
107
    protected $formatsLoaded = false;
108
109
    /**
110
     * Are there any fulltext files available? This also includes IIIF text annotations
111
     * with motivation 'painting' if Kitodo.Presentation is configured to store text
112
     * annotations as fulltext.
113
     *
114
     * @var bool
115
     * @access protected
116
     */
117
    protected $hasFulltext = false;
118
119
    /**
120
     * Last searched logical and physical page
121
     *
122
     * @var array
123
     * @access protected
124
     */
125
    protected $lastSearchedPhysicalPage = ['logicalPage' => null, 'physicalPage' => null];
126
127
    /**
128
     * This holds the logical units
129
     *
130
     * @var array
131
     * @access protected
132
     */
133
    protected $logicalUnits = [];
134
135
    /**
136
     * This holds the documents' parsed metadata array with their corresponding
137
     * structMap//div's ID (METS) or Range / Manifest / Sequence ID (IIIF) as array key
138
     *
139
     * @var array
140
     * @access protected
141
     */
142
    protected $metadataArray = [];
143
144
    /**
145
     * Is the metadata array loaded?
146
     * @see $metadataArray
147
     *
148
     * @var bool
149
     * @access protected
150
     */
151
    protected $metadataArrayLoaded = false;
152
153
    /**
154
     * This holds the total number of pages
155
     *
156
     * @var int
157
     * @access protected
158
     */
159
    protected $numPages = 0;
160
161
    /**
162
     * This holds the UID of the parent document or zero if not multi-volumed
163
     *
164
     * @var int
165
     * @access protected
166
     */
167
    protected $parentId = 0;
168
169
    /**
170
     * This holds the physical structure
171
     *
172
     * @var array
173
     * @access protected
174
     */
175
    protected $physicalStructure = [];
176
177
    /**
178
     * This holds the physical structure metadata
179
     *
180
     * @var array
181
     * @access protected
182
     */
183
    protected $physicalStructureInfo = [];
184
185
    /**
186
     * Is the physical structure loaded?
187
     * @see $physicalStructure
188
     *
189
     * @var bool
190
     * @access protected
191
     */
192
    protected $physicalStructureLoaded = false;
193
194
    /**
195
     * This holds the PID of the document or zero if not in database
196
     *
197
     * @var int
198
     * @access protected
199
     */
200
    protected $pid = 0;
201
202
    /**
203
     * This holds the documents' raw text pages with their corresponding
204
     * structMap//div's ID (METS) or Range / Manifest / Sequence ID (IIIF) as array key
205
     *
206
     * @var array
207
     * @access protected
208
     */
209
    protected $rawTextArray = [];
210
211
    /**
212
     * Is the document instantiated successfully?
213
     *
214
     * @var bool
215
     * @access protected
216
     */
217
    protected $ready = false;
218
219
    /**
220
     * The METS file's / IIIF manifest's record identifier
221
     *
222
     * @var string
223
     * @access protected
224
     */
225
    protected $recordId;
226
227
    /**
228
     * This holds the singleton object of the document
229
     *
230
     * @var array (\Kitodo\Dlf\Common\Doc)
231
     * @static
232
     * @access protected
233
     */
234
    protected static $registry = [];
235
236
    /**
237
     * This holds the UID of the root document or zero if not multi-volumed
238
     *
239
     * @var int
240
     * @access protected
241
     */
242
    protected $rootId = 0;
243
244
    /**
245
     * Is the root id loaded?
246
     * @see $rootId
247
     *
248
     * @var bool
249
     * @access protected
250
     */
251
    protected $rootIdLoaded = false;
252
253
    /**
254
     * This holds the smLinks between logical and physical structMap
255
     *
256
     * @var array
257
     * @access protected
258
     */
259
    protected $smLinks = ['l2p' => [], 'p2l' => []];
260
261
    /**
262
     * Are the smLinks loaded?
263
     * @see $smLinks
264
     *
265
     * @var bool
266
     * @access protected
267
     */
268
    protected $smLinksLoaded = false;
269
270
    /**
271
     * This holds the logical structure
272
     *
273
     * @var array
274
     * @access protected
275
     */
276
    protected $tableOfContents = [];
277
278
    /**
279
     * Is the table of contents loaded?
280
     * @see $tableOfContents
281
     *
282
     * @var bool
283
     * @access protected
284
     */
285
    protected $tableOfContentsLoaded = false;
286
287
    /**
288
     * This holds the document's thumbnail location
289
     *
290
     * @var string
291
     * @access protected
292
     */
293
    protected $thumbnail = '';
294
295
    /**
296
     * Is the document's thumbnail location loaded?
297
     * @see $thumbnail
298
     *
299
     * @var bool
300
     * @access protected
301
     */
302
    protected $thumbnailLoaded = false;
303
304
    /**
305
     * This holds the toplevel structure's @ID (METS) or the manifest's @id (IIIF)
306
     *
307
     * @var string
308
     * @access protected
309
     */
310
    protected $toplevelId = '';
311
312
    /**
313
     * This holds the whole XML file as \SimpleXMLElement object
314
     *
315
     * @var \SimpleXMLElement
316
     * @access protected
317
     */
318
    protected $xml;
319
320
    /**
     * Resets the static document registry to an empty array.
     *
     * Intended to be called to keep the registry from growing until memory is exhausted.
     *
     * @access public
     *
     * @static
     *
     * @return void
     */
    public static function clearRegistry()
    {
        // Replace the registry with a fresh, empty array.
        self::$registry = [];
    }
334
335
    /**
     * Makes sure the record identifier, if the document has one, is retrieved from the document
     *
     * @access protected
     *
     * @abstract
     *
     * @param int $pid: ID of the configuration page holding the recordId configuration
     *
     */
    abstract protected function establishRecordId($pid);
346
347
    /**
     * Returns the source document PHP object that a Document instance represents
     *
     * @access protected
     *
     * @abstract
     *
     * @return \SimpleXMLElement|IiifResourceInterface A PHP object representation of
     * the current document. SimpleXMLElement for METS, IiifResourceInterface for IIIF
     */
    abstract protected function getDocument();
358
359
    /**
     * Returns the location of a downloadable file for a physical page or track
     *
     * @access public
     *
     * @abstract
     *
     * @param string $id: The @ID attribute of the file node (METS) or the @id property of the IIIF resource
     *
     * @return string The file's location as URL
     */
    abstract public function getDownloadLocation($id);
371
372
    /**
     * Returns the location of a file that represents a physical page or track
     *
     * @access public
     *
     * @abstract
     *
     * @param string $id: The @ID attribute of the file node (METS) or the @id property of the IIIF resource
     *
     * @return string The file's location as URL
     */
    abstract public function getFileLocation($id);
384
385
    /**
     * Returns the MIME type of a file that represents a physical page or track
     *
     * @access public
     *
     * @abstract
     *
     * @param string $id: The @ID attribute of the file node
     *
     * @return string The file's MIME type
     */
    abstract public function getFileMimeType($id);
397
398
    /**
     * Factory for document objects: returns the cached instance for a location or
     * builds a new MetsDocument / IiifManifest from the content found there.
     *
     * The content is fetched via Helper::getUrl(). If it parses as XML and contains a
     * mets:mets element, it is treated as METS. If it does not parse as XML but decodes
     * as JSON and IiifHelper yields an IiifResourceInterface, it is treated as IIIF.
     * In every other case no instance is created and null is returned.
     *
     * @access public
     *
     * @static
     *
     * @param string $location: The URL of XML file or the IRI of the IIIF resource
     * @param array $settings: Optional settings; only the key 'storagePid' is read here
     * (a missing or negative value is treated as 0)
     * @param bool $forceReload: Force reloading the document instead of returning the cached instance
     *
     * @return \Kitodo\Dlf\Common\Doc|null Instance of this class, either MetsDocument or IiifManifest
     */
    public static function &getInstance($location, $settings = [], $forceReload = false)
    {
        // Honour $forceReload: the document cache is only consulted when no reload is requested.
        $instance = null;
        if (!$forceReload) {
            $instance = self::getDocCache($location);
            if ($instance) {
                return $instance;
            }
            // Normalize any falsy cache result to null for the return-by-reference below.
            $instance = null;
        }

        // Create new instance depending on format (METS or IIIF) ...
        $documentFormat = null;
        $xml = null;
        $iiif = null;

        // Try to load a file from the url
        if (GeneralUtility::isValidUrl($location)) {
            // Load extension configuration
            $extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey);

            $content = Helper::getUrl($location);
            if ($content !== false) {
                $xml = Helper::getXmlFileAsString($content);
                if ($xml !== false) {
                    /* @var $xml \SimpleXMLElement */
                    $xml->registerXPathNamespace('mets', 'http://www.loc.gov/METS/');
                    $xpathResult = $xml->xpath('//mets:mets');
                    $documentFormat = !empty($xpathResult) ? 'METS' : null;
                } else {
                    // Try to load file as IIIF resource instead.
                    $contentAsJsonArray = json_decode($content, true);
                    if ($contentAsJsonArray !== null) {
                        IiifHelper::setUrlReader(IiifUrlReader::getInstance());
                        IiifHelper::setMaxThumbnailHeight($extConf['iiifThumbnailHeight']);
                        IiifHelper::setMaxThumbnailWidth($extConf['iiifThumbnailWidth']);
                        $iiif = IiifHelper::loadIiifResource($contentAsJsonArray);
                        if ($iiif instanceof IiifResourceInterface) {
                            $documentFormat = 'IIIF';
                        }
                    }
                }
            }
        }

        // Sanitize input. $settings defaults to [], so the key may legitimately be absent.
        $pid = max(intval($settings['storagePid'] ?? 0), 0);
        if ($documentFormat == 'METS') {
            $instance = new MetsDocument($location, $pid, $xml);
        } elseif ($documentFormat == 'IIIF') {
            $instance = new IiifManifest($location, $pid, $iiif);
        }

        // Only successfully created instances are written to the cache.
        if ($instance) {
            self::setDocCache($location, $instance);
        }

        return $instance;
    }
467
468
    /**
     * Returns details about a logical structure element
     *
     * @access public
     *
     * @abstract
     *
     * @param string $id: The @ID attribute of the logical structure node (METS) or
     * the @id property of the Manifest / Range (IIIF)
     * @param bool $recursive: Whether to include the child elements / resources
     *
     * @return array Array of the element's id, label, type and physical page indexes/mptr link
     */
    abstract public function getLogicalStructure($id, $recursive = false);
482
483
    /**
     * Extracts all the metadata for a logical structure node
     *
     * @access public
     *
     * @abstract
     *
     * @param string $id: The @ID attribute of the logical structure node (METS) or the @id property
     * of the Manifest / Range (IIIF)
     * @param int $cPid: The PID for the metadata definitions
     *                       (defaults to $this->cPid or $this->pid)
     *
     * @return array The logical structure node's / the IIIF resource's parsed metadata array
     */
    abstract public function getMetadata($id, $cPid = 0);
498
499
    /**
     * Looks up the first physical page whose order label contains the given logical page label
     *
     * The last successful lookup is remembered in $this->lastSearchedPhysicalPage and
     * answered from there when the same label is requested again. The returned number
     * is the zero-based position of the matching entry in $this->physicalStructureInfo.
     * If no entry matches, 1 is returned.
     *
     * @access public
     *
     * @param string $logicalPage: The label (or a part of the label) of the logical page
     *
     * @return int The physical page number
     */
    public function getPhysicalPage($logicalPage)
    {
        // Shortcut: same label as in the previous successful search.
        if (
            !empty($this->lastSearchedPhysicalPage['logicalPage'])
            && $this->lastSearchedPhysicalPage['logicalPage'] == $logicalPage
        ) {
            return $this->lastSearchedPhysicalPage['physicalPage'];
        }

        $position = 0;
        foreach ($this->physicalStructureInfo as $pageInfo) {
            if (strpos($pageInfo['orderlabel'], $logicalPage) !== false) {
                // Remember the hit for subsequent calls with the same label.
                $this->lastSearchedPhysicalPage['logicalPage'] = $logicalPage;
                $this->lastSearchedPhysicalPage['physicalPage'] = $position;
                return $position;
            }
            $position++;
        }

        // Nothing matched: fall back to page 1.
        return 1;
    }
528
529
    /**
     * Extracts the OCR full text for a physical structure node / IIIF Manifest / Canvas. The text may be
     * provided as ALTO for METS or as annotations or ALTO for IIIF resources.
     *
     * @access public
     *
     * @abstract
     *
     * @param string $id: The @ID attribute of the physical structure node (METS) or the @id property
     * of the Manifest / Range (IIIF)
     *
     * @return string The OCR full text
     */
    abstract public function getFullText($id);
543
544
    /**
     * This extracts the OCR full text for a physical structure node / IIIF Manifest / Canvas from an
     * XML full text representation (currently only ALTO). For IIIF manifests, ALTO documents have
     * to be given in the Canvas' / Manifest's "seeAlso" property.
     *
     * The configured full text file groups ('fileGrpFulltext' of the extension configuration)
     * are tried in order; the first group that has a file for the node is used. The file's
     * root element name (upper-cased) selects the format handler from $this->formats.
     * On success the result is also stored in $this->rawTextArray[$id].
     *
     * @param string $id: The @ID attribute of the physical structure node (METS) or the @id property
     * of the Manifest / Range (IIIF)
     *
     * @return string The OCR full text as returned by the handler's getTextAsMiniOcr(), or an
     * empty string if the node is unknown, the file cannot be loaded or the format is unsupported
     */
    protected function getFullTextFromXml($id)
    {
        $fullText = '';
        // Defined up front so both are set on every path, even if no file group matches.
        $fileContent = '';
        $textFormat = '';
        // Load available text formats, ...
        $this->loadFormats();
        // ... physical structure ...
        $this->_getPhysicalStructure();
        // ... and extension configuration.
        $extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey);
        $fileGrpsFulltext = GeneralUtility::trimExplode(',', $extConf['fileGrpFulltext']);
        if (!empty($this->physicalStructureInfo[$id])) {
            while ($fileGrpFulltext = array_shift($fileGrpsFulltext)) {
                if (!empty($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext])) {
                    // Get full text file.
                    $fileContent = GeneralUtility::getUrl($this->getFileLocation($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext]));
                    if ($fileContent !== false) {
                        $textFormat = $this->getTextFormat($fileContent);
                    } else {
                        $this->logger->warning('Couldn\'t load full text file for structure node @ID "' . $id . '"');
                        return $fullText;
                    }
                    // Only the first file group that provides a file is considered.
                    break;
                }
            }
        } else {
            $this->logger->warning('Invalid structure node @ID "' . $id . '"');
            return $fullText;
        }
        // Is this text format supported?
        // This part actually differs from previous version of indexed OCR
        if (!empty($fileContent) && !empty($this->formats[$textFormat])) {
            $textMiniOcr = '';
            if (!empty($this->formats[$textFormat]['class'])) {
                $class = $this->formats[$textFormat]['class'];
                // Get the raw text from class.
                if (
                    class_exists($class)
                    && ($obj = GeneralUtility::makeInstance($class)) instanceof FulltextInterface
                ) {
                    // Load XML from file.
                    $ocrTextXml = Helper::getXmlFileAsString($fileContent);
                    $textMiniOcr = $obj->getTextAsMiniOcr($ocrTextXml);
                    $this->rawTextArray[$id] = $textMiniOcr;
                } else {
                    // The message names the method that is actually invoked above.
                    $this->logger->warning('Invalid class/method "' . $class . '->getTextAsMiniOcr()" for text format "' . $textFormat . '"');
                }
            }
            $fullText = $textMiniOcr;
        } else {
            $this->logger->warning('Unsupported text format "' . $textFormat . '" in physical node with @ID "' . $id . '"');
        }
        return $fullText;
    }
607
608
    /**
     * Determines the format of an OCR full text from its XML root element
     *
     * @access private
     *
     * @param string $fileContent: content of the XML file
     *
     * @return string The upper-cased name of the root element, or an empty string
     * if Helper::getXmlFileAsString() returns false for the content
     */
    private function getTextFormat($fileContent)
    {
        $parsedXml = Helper::getXmlFileAsString($fileContent);

        // Without a parsed document there is no root element to inspect.
        if ($parsedXml === false) {
            return '';
        }

        // The root element's name serves as the text format.
        return strtoupper($parsedXml->getName());
    }
628
629
    /**
     * Determines a title for the document with the given UID
     *
     * Reads 'title' and 'partof' of the matching tx_dlf_documents record. With
     * $recursive set, an empty title is looked up on the record referenced by
     * 'partof' (as long as that differs from the current UID). Problems are
     * reported through Helper::log() and yield an empty string.
     *
     * @access public
     *
     * @static
     *
     * @param int $uid: The UID of the document
     * @param bool $recursive: Search superior documents for a title, too?
     *
     * @return string The title of the document itself or a parent document
     */
    public static function getTitle($uid, $recursive = false)
    {
        // Sanitize input.
        $uid = max(intval($uid), 0);
        if (!$uid) {
            Helper::log('Invalid UID ' . $uid . ' for document', LOG_SEVERITY_ERROR);
            return '';
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('tx_dlf_documents');

        $queryBuilder
            ->select(
                'tx_dlf_documents.title',
                'tx_dlf_documents.partof'
            )
            ->from('tx_dlf_documents')
            ->where(
                $queryBuilder->expr()->eq('tx_dlf_documents.uid', $uid),
                Helper::whereExpression('tx_dlf_documents')
            )
            ->setMaxResults(1);
        $statement = $queryBuilder->execute();

        $record = $statement->fetch();
        if (!$record) {
            Helper::log('No document with UID ' . $uid . ' found or document not accessible', LOG_SEVERITY_WARNING);
            return '';
        }

        // Get title information.
        $title = $record['title'];
        $parentUid = $record['partof'];

        // Search parent documents recursively for a title?
        $searchParent = $recursive
            && empty($title)
            && intval($parentUid)
            && $parentUid != $uid;
        if ($searchParent) {
            $title = self::getTitle($parentUid, true);
        }

        return $title;
    }
684
685
    /**
     * Extracts all the metadata for the toplevel logical structure node / resource
     *
     * For METS documents the result is additionally passed to addMetadataFromMets().
     * If the result has a 'record_id' entry and $this->recordId is set but not yet
     * contained in it, the record identifier is prepended to that entry.
     *
     * @access public
     *
     * @param int $cPid: The PID for the metadata definitions
     *
     * @return array The logical structure node's / resource's parsed metadata array
     */
    public function getTitledata($cPid = 0)
    {
        $titledata = $this->getMetadata($this->_getToplevelId(), $cPid);

        // Add information from METS structural map to titledata array.
        if ($this instanceof MetsDocument) {
            $this->addMetadataFromMets($titledata, $this->_getToplevelId());
        }

        // Set record identifier for METS file / IIIF manifest if not present.
        $recordIdMissing = is_array($titledata)
            && array_key_exists('record_id', $titledata)
            && !empty($this->recordId)
            && !in_array($this->recordId, $titledata['record_id']);
        if ($recordIdMissing) {
            array_unshift($titledata['record_id'], $this->recordId);
        }

        return $titledata;
    }
715
716
    /**
     * Traverses a logical (sub-) structure tree depth-first to find the structure with
     * the requested logical id and returns its depth.
     *
     * @access protected
     *
     * @param array $structure: logical structure array
     * @param int $depth: current tree depth
     * @param string $logId: ID of the logical structure whose depth is requested
     *
     * @return int|bool: false if structure with $logId is not a child of this substructure,
     * or the actual depth.
     */
    protected function getTreeDepth($structure, $depth, $logId)
    {
        foreach ($structure as $node) {
            // Direct hit on this level.
            if ($node['id'] == $logId) {
                return $depth;
            }
            // Leaves have nothing further to search.
            if (!array_key_exists('children', $node)) {
                continue;
            }
            // Descend one level; only a real match ends the search.
            $depthInSubtree = $this->getTreeDepth($node['children'], $depth + 1, $logId);
            if ($depthInSubtree !== false) {
                return $depthInSubtree;
            }
        }
        return false;
    }
742
743
    /**
     * Returns the tree depth of a logical structure element within the table of contents
     *
     * The search starts at depth 1 on the structure returned by _getTableOfContents().
     *
     * @access public
     *
     * @param string $logId: The id of the logical structure element whose depth is requested
     * @return int|bool tree depth as integer or false if no element with $logId exists within the TOC.
     */
    public function getStructureDepth($logId)
    {
        $tableOfContents = $this->_getTableOfContents();
        return $this->getTreeDepth($tableOfContents, 1, $logId);
    }
755
756
    /**
     * Sets some basic class properties
     *
     * @access protected
     *
     * @abstract
     *
     * @param string $location: The location URL of the XML file to parse
     *
     * @return void
     */
    abstract protected function init($location);
768
769
    /**
     * Reuses any document object that might already have been loaded to determine whether the document is METS or IIIF
     *
     * @access protected
     *
     * @abstract
     *
     * @param \SimpleXMLElement|IiifResourceInterface $preloadedDocument: any instance that has already been loaded
     *
     * @return bool true if $preloadedDocument can actually be reused, false if it has to be loaded again
     */
    abstract protected function setPreloadedDocument($preloadedDocument);
781
782
    /**
     * METS/IIIF specific part of loading a location
     *
     * @access protected
     *
     * @abstract
     *
     * @param string $location: The URL of the file to load
     *
     * @return bool true on success or false on failure
     */
    abstract protected function loadLocation($location);
794
795
    /**
     * Loads the XML file / IIIF resource from a URL
     *
     * Validates the location with GeneralUtility::isValidUrl() and delegates the
     * actual, format specific loading to loadLocation(). An invalid location is
     * logged as an error.
     *
     * @access protected
     *
     * @param string $location: The URL of the file to load
     *
     * @return bool true on success or false on failure
     */
    protected function load($location)
    {
        // Reject anything that is not a valid URL before attempting to load.
        if (!GeneralUtility::isValidUrl($location)) {
            $this->logger->error('Invalid file location "' . $location . '" for document loading');
            return false;
        }

        // Load XML / JSON-LD file; the actual loading is format specific.
        return $this->loadLocation($location);
    }
815
816
    /**
     * Analyzes the document to find out whether it contains any fulltext that needs to be indexed.
     *
     * @access protected
     *
     * @abstract
     */
    abstract protected function ensureHasFulltextIsSet();
824
825
    /**
     * Registers all available data formats
     *
     * Reads the records of table tx_dlf_formats with pid 0 and stores each one in
     * $this->formats under its type (adding to or overriding the built-in entries).
     * The query is executed only once per instance; $this->formatsLoaded guards it.
     *
     * @access protected
     *
     * @return void
     */
    protected function loadFormats()
    {
        // Nothing to do if the formats have been loaded before.
        if ($this->formatsLoaded) {
            return;
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('tx_dlf_formats');

        // Get available data formats from database.
        $queryBuilder
            ->select(
                'tx_dlf_formats.type AS type',
                'tx_dlf_formats.root AS root',
                'tx_dlf_formats.namespace AS namespace',
                'tx_dlf_formats.class AS class'
            )
            ->from('tx_dlf_formats')
            ->where(
                $queryBuilder->expr()->eq('tx_dlf_formats.pid', 0)
            );
        $statement = $queryBuilder->execute();

        while ($row = $statement->fetch()) {
            // Update format registry.
            $this->formats[$row['type']] = [
                'rootElement' => $row['root'],
                'namespaceURI' => $row['namespace'],
                'class' => $row['class']
            ];
        }

        $this->formatsLoaded = true;
    }
863
864
    /**
     * Registers the namespaces of all known formats on a \SimpleXMLElement or \DOMXPath object
     *
     * Each key of $this->formats is lower-cased and used as prefix for the format's
     * 'namespaceURI'. Objects of any other type are rejected with an error log entry.
     *
     * @access public
     *
     * @param \SimpleXMLElement|\DOMXPath &$obj: \SimpleXMLElement or \DOMXPath object
     *
     * @return void
     */
    public function registerNamespaces(&$obj)
    {
        // TODO Check usage. XML specific method does not seem to be used anywhere outside this class within the project, but it is public and may be used by extensions.
        $this->loadFormats();

        // Pick the registration method matching the object's type.
        if ($obj instanceof \SimpleXMLElement) {
            $registerMethod = 'registerXPathNamespace';
        } elseif ($obj instanceof \DOMXPath) {
            $registerMethod = 'registerNamespace';
        } else {
            $this->logger->error('Given object is neither a SimpleXMLElement nor a DOMXPath instance');
            return;
        }

        // Register metadata format's namespaces.
        foreach ($this->formats as $encoding => $formatConfig) {
            $prefix = strtolower($encoding);
            $obj->$registerMethod($prefix, $formatConfig['namespaceURI']);
        }
    }
891
892
    /**
     * Getter behind the magic property "cPid" (dispatched by __get())
     *
     * @access protected
     *
     * @return int The PID of the metadata definitions
     */
    protected function _getCPid()
    {
        return $this->cPid;
    }
903
904
    /**
     * Getter behind the magic property "hasFulltext" (dispatched by __get())
     *
     * @access protected
     *
     * @return bool Are there any fulltext files available?
     */
    protected function _getHasFulltext()
    {
        // Let ensureHasFulltextIsSet() run before the flag is read.
        $this->ensureHasFulltextIsSet();

        return $this->hasFulltext;
    }
916
917
    /**
     * Getter behind the magic property "location" (dispatched by __get())
     *
     * @access protected
     *
     * @return string The location of the document
     */
    protected function _getLocation()
    {
        return $this->location;
    }
928
929
    /**
     * Format specific part of building the document's metadata array
     *
     * Called by _getMetadataArray() whenever the metadata array has not been
     * loaded yet or was built for a different PID.
     *
     * @access protected
     *
     * @abstract
     *
     * @param int $cPid: The PID of the metadata definitions to use
     *
     * @return void
     */
    abstract protected function prepareMetadataArray($cPid);
939
940
    /**
     * Getter behind the magic property "metadataArray" (dispatched by __get())
     *
     * The array is (re)built through prepareMetadataArray() on first access and
     * whenever the PID stored at index 0 differs from the PID currently in effect.
     *
     * @access protected
     *
     * @return array Array of metadata with their corresponding logical structure node ID as key
     */
    protected function _getMetadataArray()
    {
        // Prefer the metadata definitions' PID and fall back to the document's own PID.
        $cPid = $this->cPid ?: $this->pid;
        if (!$cPid) {
            $this->logger->error('Invalid PID ' . $cPid . ' for metadata definitions');
            return [];
        }
        // Index 0 of the metadata array remembers the PID the array was built for.
        $isUpToDate = $this->metadataArrayLoaded && $this->metadataArray[0] == $cPid;
        if (!$isUpToDate) {
            $this->prepareMetadataArray($cPid);
            $this->metadataArray[0] = $cPid;
            $this->metadataArrayLoaded = true;
        }

        return $this->metadataArray;
    }
965
966
    /**
     * Getter behind the magic property "numPages" (dispatched by __get())
     *
     * @access protected
     *
     * @return int The total number of pages and/or tracks
     */
    protected function _getNumPages()
    {
        // Run the physical structure getter before reading the counter.
        // NOTE(review): presumably that call is what populates $this->numPages - confirm in the subclasses.
        $this->_getPhysicalStructure();

        return $this->numPages;
    }
978
979
    /**
     * Getter behind the magic property "parentId" (dispatched by __get())
     *
     * @access protected
     *
     * @return int The UID of the parent document or zero if not applicable
     */
    protected function _getParentId()
    {
        return $this->parentId;
    }
990
991
    /**
     * This builds an array of the document's physical structure
     *
     * Used inside this class by _getNumPages() and _getPhysicalStructureInfo().
     *
     * @access protected
     *
     * @abstract
     *
     * @return array Array of physical elements' id, type, label and file representations ordered
     * by @ORDER attribute / IIIF Sequence's Canvases
     */
    abstract protected function _getPhysicalStructure();
1002
1003
    /**
     * Getter behind the magic property "physicalStructureInfo" (dispatched by __get())
     *
     * @access protected
     *
     * @return array Array of elements' type, label and file representations ordered by @ID attribute / Canvas order
     */
    protected function _getPhysicalStructureInfo()
    {
        // Trigger the physical structure getter only while it is not flagged as loaded.
        if (!$this->physicalStructureLoaded) {
            $this->_getPhysicalStructure();
        }

        return $this->physicalStructureInfo;
    }
1019
1020
    /**
     * Getter behind the magic property "pid" (dispatched by __get())
     *
     * @access protected
     *
     * @return int The PID of the document or zero if not in database
     */
    protected function _getPid()
    {
        return $this->pid;
    }
1031
1032
    /**
     * Getter behind the magic property "ready" (dispatched by __get())
     *
     * @access protected
     *
     * @return bool Is the document instantiated successfully?
     */
    protected function _getReady()
    {
        return $this->ready;
    }
1043
1044
    /**
     * Getter behind the magic property "recordId" (dispatched by __get())
     *
     * @access protected
     *
     * @return mixed The METS file's / IIIF manifest's record identifier
     */
    protected function _getRecordId()
    {
        return $this->recordId;
    }
1055
1056
    /**
     * Getter behind the magic property "rootId" (dispatched by __get())
     *
     * On first access the root ID is taken over from the parent document, if
     * there is one; afterwards the stored value is returned directly.
     *
     * @access protected
     *
     * @return int The UID of the root document or zero if not applicable
     */
    protected function _getRootId()
    {
        // Already resolved once: nothing left to do.
        if ($this->rootIdLoaded) {
            return $this->rootId;
        }
        if ($this->parentId) {
            // NOTE(review): assumes getInstance() always yields a document object here - confirm.
            $parentDoc = self::getInstance($this->parentId, ['storagePid' => $this->pid]);
            $this->rootId = $parentDoc->rootId;
        }
        $this->rootIdLoaded = true;

        return $this->rootId;
    }
1074
1075
    /**
     * This returns the smLinks between logical and physical structMap (METS) and models the
     * relation between IIIF Canvases and Manifests / Ranges in the same way
     *
     * @access protected
     *
     * @abstract
     *
     * @return array The links between logical and physical nodes / Range, Manifest and Canvas
     */
    abstract protected function _getSmLinks();
1086
1087
    /**
     * Getter behind the magic property "tableOfContents" (dispatched by __get())
     *
     * @access protected
     *
     * @return array Array of structure nodes' id, label, type and physical page indexes/mptr / Canvas link with original hierarchy preserved
     */
    protected function _getTableOfContents()
    {
        // The logical structure has been collected before: return it as is.
        if ($this->tableOfContentsLoaded) {
            return $this->tableOfContents;
        }
        // Get all logical structures.
        $this->getLogicalStructure('', true);
        $this->tableOfContentsLoaded = true;

        return $this->tableOfContents;
    }
1104
1105
    /**
     * This returns the document's thumbnail location
     *
     * @access protected
     *
     * @abstract
     *
     * @param bool $forceReload: Force reloading the thumbnail instead of returning the cached value
     *
     * @return string The document's thumbnail location
     */
    abstract protected function _getThumbnail($forceReload = false);
1117
1118
    /**
     * This returns the ID of the toplevel logical structure node
     *
     * @access protected
     *
     * @abstract
     *
     * @return string The logical structure node's ID
     */
    abstract protected function _getToplevelId();
1128
1129
    /**
     * Getter behind the magic property "uid" (dispatched by __get())
     *
     * @access protected
     *
     * @return mixed The UID or the URL of the document
     */
    protected function _getUid()
    {
        return $this->uid;
    }
1140
1141
    /**
     * Setter behind the magic property "cPid" (dispatched by __set())
     *
     * The value is converted to an integer and negative numbers are raised to zero.
     *
     * @access protected
     *
     * @param int $value: The new PID for the metadata definitions
     *
     * @return void
     */
    protected function _setCPid($value)
    {
        $this->cPid = max((int) $value, 0);
    }
1154
1155
    /**
     * This is a singleton class, thus the constructor should be private/protected
     * (Get an instance of this class by calling \Kitodo\Dlf\Common\Doc::getInstance())
     *
     * Construction runs three steps in this order: setPreloadedDocument(),
     * init() and establishRecordId().
     *
     * @access protected
     *
     * @param string $location: The location URL of the XML file to parse
     * @param int $pid: If > 0, then only document with this PID gets loaded
     * @param \SimpleXMLElement|IiifResourceInterface $preloadedDocument: Either null or the \SimpleXMLElement
     * or IiifResourceInterface that has been loaded to determine the basic document format.
     */
    protected function __construct($location, $pid, $preloadedDocument)
    {
        $this->setPreloadedDocument($preloadedDocument);
        $this->init($location);
        $this->establishRecordId($pid);
    }
1175
1176
    /**
     * This magic method is called each time an invisible property is referenced from the object
     *
     * The read is forwarded to the method "_get" + ucfirst($var). Both the
     * property and that method have to exist; otherwise a warning is logged.
     *
     * @access public
     *
     * @param string $var: Name of variable to get
     *
     * @return mixed Value returned by the getter or null if there is no getter
     */
    public function __get($var)
    {
        $getter = '_get' . ucfirst($var);
        if (property_exists($this, $var) && method_exists($this, $getter)) {
            return $this->$getter();
        }
        $this->logger->warning('There is no getter function for property "' . $var . '"');

        return null;
    }
1198
1199
    /**
     * This magic method is called each time an invisible property is checked for isset() or empty()
     *
     * The check goes through __get(), so the property's getter is executed.
     *
     * @access public
     *
     * @param string $var: Name of variable to check
     *
     * @return bool true if variable is set and not empty, false otherwise
     */
    public function __isset($var)
    {
        $value = $this->__get($var);

        return !empty($value);
    }
1212
1213
    /**
     * This magic method is called each time an invisible property is assigned from outside the object
     *
     * The write is forwarded to the method "_set" + ucfirst($var). Both the
     * property and that method have to exist; otherwise a warning is logged
     * and the value is discarded.
     *
     * @access public
     *
     * @param string $var: Name of variable to set
     * @param mixed $value: New value of variable
     *
     * @return void
     */
    public function __set($var, $value)
    {
        $setter = '_set' . ucfirst($var);
        if (property_exists($this, $var) && method_exists($this, $setter)) {
            $this->$setter($value);
            return;
        }
        $this->logger->warning('There is no setter function for property "' . $var . '"');
    }
1235
1236
    /**
     * Looks up a document in the "tx_dlf_doc" cache
     *
     * The cache entry identifier is the MD5 hash of the given location.
     *
     * @param string $location
     * @return Doc|false
     */
    private static function getDocCache(string $location)
    {
        $cache = GeneralUtility::makeInstance(CacheManager::class)->getCache('tx_dlf_doc');

        return $cache->get(md5($location));
    }
1250
1251
    /**
     * Stores a document in the "tx_dlf_doc" cache
     *
     * The cache entry identifier is the MD5 hash of the given location.
     *
     * @param string $location
     * @param Doc $doc
     * @return void
     */
    private static function setDocCache(string $location, Doc $doc)
    {
        $cache = GeneralUtility::makeInstance(CacheManager::class)->getCache('tx_dlf_doc');
        // Save value in cache
        $cache->set(md5($location), $doc);
    }
1266
1267
}
1268