Scrutinizer GitHub App not installed

We could not synchronize checks via GitHub's checks API since Scrutinizer's GitHub App is not installed for this repository.

Install GitHub App

GitHub Access Token became invalid

It seems like the GitHub access token used for retrieving details about this repository from GitHub became invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.
Passed
Pull Request — dev-extbase-fluid (#746)
by Alexander
03:31
created

Doc::getPhysicalPage()   A

Complexity

Conditions 5
Paths 4

Size

Total Lines 19
Code Lines 13

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 5
eloc 13
nc 4
nop 1
dl 0
loc 19
rs 9.5222
c 0
b 0
f 0
1
<?php
2
3
/**
4
 * (c) Kitodo. Key to digital objects e.V. <[email protected]>
5
 *
6
 * This file is part of the Kitodo and TYPO3 projects.
7
 *
8
 * @license GNU General Public License version 3 or later.
9
 * For the full copyright and license information, please read the
10
 * LICENSE.txt file that was distributed with this source code.
11
 */
12
13
namespace Kitodo\Dlf\Common;
14
15
use TYPO3\CMS\Core\Configuration\ExtensionConfiguration;
16
use TYPO3\CMS\Core\Database\ConnectionPool;
17
use TYPO3\CMS\Core\Database\Query\Restriction\HiddenRestriction;
18
use TYPO3\CMS\Core\Log\LogManager;
19
use TYPO3\CMS\Core\Utility\GeneralUtility;
20
use TYPO3\CMS\Core\Utility\MathUtility;
21
use TYPO3\CMS\Extbase\Configuration\ConfigurationManager;
22
use Ubl\Iiif\Presentation\Common\Model\Resources\IiifResourceInterface;
23
use Ubl\Iiif\Tools\IiifHelper;
24
25
/**
26
 * Document class for the 'dlf' extension
27
 *
28
 * @author Sebastian Meyer <[email protected]>
29
 * @author Henrik Lochmann <[email protected]>
30
 * @package TYPO3
31
 * @subpackage dlf
32
 * @access public
33
 * @property int $cPid This holds the PID for the configuration
34
 * @property-read bool $hasFulltext Are there any fulltext files available?
35
 * @property-read string $location This holds the documents location
36
 * @property-read array $metadataArray This holds the documents' parsed metadata array
37
 * @property-read int $numPages This holds the total number of pages
38
 * @property-read int $parentId This holds the UID of the parent document or zero if not multi-volumed
39
 * @property-read array $physicalStructure This holds the physical structure
40
 * @property-read array $physicalStructureInfo This holds the physical structure metadata
41
 * @property-read int $pid This holds the PID of the document or zero if not in database
42
 * @property-read bool $ready Is the document instantiated successfully?
43
 * @property-read string $recordId The METS file's / IIIF manifest's record identifier
44
 * @property-read int $rootId This holds the UID of the root document or zero if not multi-volumed
45
 * @property-read array $smLinks This holds the smLinks between logical and physical structMap
46
 * @property-read array $tableOfContents This holds the logical structure
47
 * @property-read string $thumbnail This holds the document's thumbnail location
48
 * @property-read string $toplevelId This holds the toplevel structure's @ID (METS) or the manifest's @id (IIIF)
49
 * @property-read mixed $uid This holds the UID or the URL of the document
50
 * @abstract
51
 */
52
abstract class Doc
53
{
54
    /**
55
     * This holds the logger
56
     *
57
     * @var LogManager
58
     * @access protected
59
     */
60
    protected $logger;
61
62
    /**
63
     * This holds the PID for the configuration
64
     *
65
     * @var int
66
     * @access protected
67
     */
68
    protected $cPid = 0;
69
70
    /**
71
     * The extension key
72
     *
73
     * @var string
74
     * @access public
75
     */
76
    public static $extKey = 'dlf';
77
78
    /**
79
     * This holds the configuration for all supported metadata encodings
80
     * @see loadFormats()
81
     *
82
     * @var array
83
     * @access protected
84
     */
85
    protected $formats = [
86
        'OAI' => [
87
            'rootElement' => 'OAI-PMH',
88
            'namespaceURI' => 'http://www.openarchives.org/OAI/2.0/',
89
        ],
90
        'METS' => [
91
            'rootElement' => 'mets',
92
            'namespaceURI' => 'http://www.loc.gov/METS/',
93
        ],
94
        'XLINK' => [
95
            'rootElement' => 'xlink',
96
            'namespaceURI' => 'http://www.w3.org/1999/xlink',
97
        ]
98
    ];
99
100
    /**
101
     * Are the available metadata formats loaded?
102
     * @see $formats
103
     *
104
     * @var bool
105
     * @access protected
106
     */
107
    protected $formatsLoaded = false;
108
109
    /**
110
     * Are there any fulltext files available? This also includes IIIF text annotations
111
     * with motivation 'painting' if Kitodo.Presentation is configured to store text
112
     * annotations as fulltext.
113
     *
114
     * @var bool
115
     * @access protected
116
     */
117
    protected $hasFulltext = false;
118
119
    /**
120
     * Last searched logical and physical page
121
     *
122
     * @var array
123
     * @access protected
124
     */
125
    protected $lastSearchedPhysicalPage = ['logicalPage' => null, 'physicalPage' => null];
126
127
    /**
128
     * This holds the documents location
129
     *
130
     * @var string
131
     * @access protected
132
     */
133
    protected $location = '';
134
135
    /**
136
     * This holds the logical units
137
     *
138
     * @var array
139
     * @access protected
140
     */
141
    protected $logicalUnits = [];
142
143
    /**
144
     * This holds the documents' parsed metadata array with their corresponding
145
     * structMap//div's ID (METS) or Range / Manifest / Sequence ID (IIIF) as array key
146
     *
147
     * @var array
148
     * @access protected
149
     */
150
    protected $metadataArray = [];
151
152
    /**
153
     * Is the metadata array loaded?
154
     * @see $metadataArray
155
     *
156
     * @var bool
157
     * @access protected
158
     */
159
    protected $metadataArrayLoaded = false;
160
161
    /**
162
     * This holds the total number of pages
163
     *
164
     * @var int
165
     * @access protected
166
     */
167
    protected $numPages = 0;
168
169
    /**
170
     * This holds the UID of the parent document or zero if not multi-volumed
171
     *
172
     * @var int
173
     * @access protected
174
     */
175
    protected $parentId = 0;
176
177
    /**
178
     * This holds the physical structure
179
     *
180
     * @var array
181
     * @access protected
182
     */
183
    protected $physicalStructure = [];
184
185
    /**
186
     * This holds the physical structure metadata
187
     *
188
     * @var array
189
     * @access protected
190
     */
191
    protected $physicalStructureInfo = [];
192
193
    /**
194
     * Is the physical structure loaded?
195
     * @see $physicalStructure
196
     *
197
     * @var bool
198
     * @access protected
199
     */
200
    protected $physicalStructureLoaded = false;
201
202
    /**
203
     * This holds the PID of the document or zero if not in database
204
     *
205
     * @var int
206
     * @access protected
207
     */
208
    protected $pid = 0;
209
210
    /**
211
     * This holds the documents' raw text pages with their corresponding
212
     * structMap//div's ID (METS) or Range / Manifest / Sequence ID (IIIF) as array key
213
     *
214
     * @var array
215
     * @access protected
216
     */
217
    protected $rawTextArray = [];
218
219
    /**
220
     * Is the document instantiated successfully?
221
     *
222
     * @var bool
223
     * @access protected
224
     */
225
    protected $ready = false;
226
227
    /**
228
     * The METS file's / IIIF manifest's record identifier
229
     *
230
     * @var string
231
     * @access protected
232
     */
233
    protected $recordId;
234
235
    /**
236
     * This holds the singleton object of the document
237
     *
238
     * @var array (\Kitodo\Dlf\Common\Doc)
239
     * @static
240
     * @access protected
241
     */
242
    protected static $registry = [];
243
244
    /**
245
     * This holds the UID of the root document or zero if not multi-volumed
246
     *
247
     * @var int
248
     * @access protected
249
     */
250
    protected $rootId = 0;
251
252
    /**
253
     * Is the root id loaded?
254
     * @see $rootId
255
     *
256
     * @var bool
257
     * @access protected
258
     */
259
    protected $rootIdLoaded = false;
260
261
    /**
262
     * This holds the smLinks between logical and physical structMap
263
     *
264
     * @var array
265
     * @access protected
266
     */
267
    protected $smLinks = ['l2p' => [], 'p2l' => []];
268
269
    /**
270
     * Are the smLinks loaded?
271
     * @see $smLinks
272
     *
273
     * @var bool
274
     * @access protected
275
     */
276
    protected $smLinksLoaded = false;
277
278
    /**
279
     * This holds the logical structure
280
     *
281
     * @var array
282
     * @access protected
283
     */
284
    protected $tableOfContents = [];
285
286
    /**
287
     * Is the table of contents loaded?
288
     * @see $tableOfContents
289
     *
290
     * @var bool
291
     * @access protected
292
     */
293
    protected $tableOfContentsLoaded = false;
294
295
    /**
296
     * This holds the document's thumbnail location
297
     *
298
     * @var string
299
     * @access protected
300
     */
301
    protected $thumbnail = '';
302
303
    /**
304
     * Is the document's thumbnail location loaded?
305
     * @see $thumbnail
306
     *
307
     * @var bool
308
     * @access protected
309
     */
310
    protected $thumbnailLoaded = false;
311
312
    /**
313
     * This holds the toplevel structure's @ID (METS) or the manifest's @id (IIIF)
314
     *
315
     * @var string
316
     * @access protected
317
     */
318
    protected $toplevelId = '';
319
320
    /**
321
     * This holds the UID or the URL of the document
322
     *
323
     * @var mixed
324
     * @access protected
325
     */
326
    protected $uid = 0;
327
328
    /**
329
     * This holds the whole XML file as \SimpleXMLElement object
330
     *
331
     * @var \SimpleXMLElement
332
     * @access protected
333
     */
334
    protected $xml;
335
336
    /**
337
     * This clears the static registry to prevent memory exhaustion
338
     *
339
     * @access public
340
     *
341
     * @static
342
     *
343
     * @return void
344
     */
345
    public static function clearRegistry()
    {
        // Swap the static registry for an empty array, discarding every entry
        // that was stored in it so far.
        self::$registry = [];
    }
350
351
    /**
352
     * This ensures that the recordId, if existent, is retrieved from the document
353
     *
354
     * @access protected
355
     *
356
     * @abstract
357
     *
358
     * @param int $pid: ID of the configuration page with the recordId config
359
     *
360
     */
361
    abstract protected function establishRecordId($pid);
362
363
    /**
364
     * Source document PHP object which is represented by a Document instance
365
     *
366
     * @access protected
367
     *
368
     * @abstract
369
     *
370
     * @return \SimpleXMLElement|IiifResourceInterface A PHP object representation of
371
     * the current document. SimpleXMLElement for METS, IiifResourceInterface for IIIF
372
     */
373
    abstract protected function getDocument();
374
375
    /**
376
     * This gets the location of a downloadable file for a physical page or track
377
     *
378
     * @access public
379
     *
380
     * @abstract
381
     *
382
     * @param string $id: The @ID attribute of the file node (METS) or the @id property of the IIIF resource
383
     *
384
     * @return string    The file's location as URL
385
     */
386
    abstract public function getDownloadLocation($id);
387
388
    /**
389
     * This gets the location of a file representing a physical page or track
390
     *
391
     * @access public
392
     *
393
     * @abstract
394
     *
395
     * @param string $id: The @ID attribute of the file node (METS) or the @id property of the IIIF resource
396
     *
397
     * @return string The file's location as URL
398
     */
399
    abstract public function getFileLocation($id);
400
401
    /**
402
     * This gets the MIME type of a file representing a physical page or track
403
     *
404
     * @access public
405
     *
406
     * @abstract
407
     *
408
     * @param string $id: The @ID attribute of the file node
409
     *
410
     * @return string The file's MIME type
411
     */
412
    abstract public function getFileMimeType($id);
413
414
    /**
415
     * This is a singleton class, thus an instance must be created by this method
416
     *
417
     * @access public
418
     *
419
     * @static
420
     *
421
     * @param string $location: The URL of XML file or the IRI of the IIIF resource
422
     * @param array $settings
423
     * @param bool $forceReload: Force reloading the document instead of returning the cached instance
424
     *
425
     * @return \Kitodo\Dlf\Common\Doc Instance of this class, either MetsDocument or IiifManifest
426
     */
427
    public static function &getInstance($location, $settings = [], $forceReload = false)
    {
        // Factory: fetches $location, detects whether it is a METS file or an
        // IIIF resource and builds the matching document object.
        // Returns null (by reference) when the location is not a valid URL,
        // cannot be fetched, or is neither METS nor IIIF.
        // NOTE: $forceReload is kept for interface compatibility only; this
        // method never reads the static registry and always builds a new instance.
        $instance = null;
        $documentFormat = null;
        $xml = null;
        $iiif = null;

        // Try to load a file from the url
        if (GeneralUtility::isValidUrl($location)) {
            // Load extension configuration once; it is needed for the user agent
            // and, for IIIF resources, for the thumbnail limits.
            $extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey);
            // Set user-agent to identify self when fetching XML data.
            if (!empty($extConf['useragent'])) {
                @ini_set('user_agent', $extConf['useragent']);
            }
            $content = GeneralUtility::getUrl($location);
            if ($content !== false) {
                $xml = Helper::getXmlFileAsString($content);
                if ($xml !== false) {
                    /* @var $xml \SimpleXMLElement */
                    // Parsable XML only counts as METS if it contains a mets:mets element.
                    $xml->registerXPathNamespace('mets', 'http://www.loc.gov/METS/');
                    $xpathResult = $xml->xpath('//mets:mets');
                    $documentFormat = !empty($xpathResult) ? 'METS' : null;
                } else {
                    // Not XML: try to load file as IIIF resource instead.
                    $contentAsJsonArray = json_decode($content, true);
                    if ($contentAsJsonArray !== null) {
                        IiifHelper::setUrlReader(IiifUrlReader::getInstance());
                        IiifHelper::setMaxThumbnailHeight($extConf['iiifThumbnailHeight']);
                        IiifHelper::setMaxThumbnailWidth($extConf['iiifThumbnailWidth']);
                        $iiif = IiifHelper::loadIiifResource($contentAsJsonArray);
                        if ($iiif instanceof IiifResourceInterface) {
                            $documentFormat = 'IIIF';
                        }
                    }
                }
            }
        }

        // Sanitize input. $settings defaults to an empty array, so a missing
        // 'storagePid' must fall back to 0 instead of raising an undefined-index notice.
        $pid = max(intval($settings['storagePid'] ?? 0), 0);
        if ($documentFormat === 'METS') {
            $instance = new MetsDocument($location, $pid, $xml);
        } elseif ($documentFormat === 'IIIF') {
            $instance = new IiifManifest($location, $pid, $iiif);
        }

        // Must return a variable because the method returns by reference.
        return $instance;
    }
479
480
    /**
481
     * This gets details about a logical structure element
482
     *
483
     * @access public
484
     *
485
     * @abstract
486
     *
487
     * @param string $id: The @ID attribute of the logical structure node (METS) or
488
     * the @id property of the Manifest / Range (IIIF)
489
     * @param bool $recursive: Whether to include the child elements / resources
490
     *
491
     * @return array Array of the element's id, label, type and physical page indexes/mptr link
492
     */
493
    abstract public function getLogicalStructure($id, $recursive = false);
494
495
    /**
496
     * This extracts all the metadata for a logical structure node
497
     *
498
     * @access public
499
     *
500
     * @abstract
501
     *
502
     * @param string $id: The @ID attribute of the logical structure node (METS) or the @id property
503
     * of the Manifest / Range (IIIF)
504
     * @param int $cPid: The PID for the metadata definitions
505
     *                       (defaults to $this->cPid or $this->pid)
506
     *
507
     * @return array The logical structure node's / the IIIF resource's parsed metadata array
508
     */
509
    abstract public function getMetadata($id, $cPid = 0);
510
511
    /**
512
     * This returns the first corresponding physical page number of a given logical page label
513
     *
514
     * @access public
515
     *
516
     * @param string $logicalPage: The label (or a part of the label) of the logical page
517
     *
518
     * @return int The physical page number
519
     */
520
    public function getPhysicalPage($logicalPage)
    {
        // Looks up the first entry of $this->physicalStructureInfo whose
        // 'orderlabel' contains $logicalPage and returns its zero-based position
        // in iteration order. Returns 1 when nothing matches. The last successful
        // lookup is cached in $this->lastSearchedPhysicalPage.
        // NOTE(review): presumably position 0 is the physical sequence itself,
        // so real pages start at 1 — confirm against the subclasses.

        // Labels are compared as strings. A loose "==" treats distinct numeric
        // labels such as "1" and "01" as equal and would serve a stale cache
        // entry; a non-string needle is also not a valid substring for strpos().
        $logicalPage = (string) $logicalPage;

        // An empty label cannot identify a page, and an empty strpos() needle
        // behaves differently across PHP versions, so use the fallback directly.
        if ($logicalPage === '') {
            return 1;
        }

        $lastSearch = $this->lastSearchedPhysicalPage;
        if (
            $lastSearch['logicalPage'] !== null
            && (string) $lastSearch['logicalPage'] === $logicalPage
        ) {
            return $lastSearch['physicalPage'];
        }

        $physicalPage = 0;
        foreach ($this->physicalStructureInfo as $page) {
            // Entries without an order label can never match; skip them instead
            // of handing null to strpos().
            if (
                isset($page['orderlabel'])
                && strpos((string) $page['orderlabel'], $logicalPage) !== false
            ) {
                $this->lastSearchedPhysicalPage['logicalPage'] = $logicalPage;
                $this->lastSearchedPhysicalPage['physicalPage'] = $physicalPage;
                return $physicalPage;
            }
            $physicalPage++;
        }

        // No label matched: fall back to page 1 (result is not cached).
        return 1;
    }
540
541
    /**
542
     * This extracts the OCR full text for a physical structure node / IIIF Manifest / Canvas. Text might be
543
     * given as ALTO for METS or as annotations or ALTO for IIIF resources.
544
     *
545
     * @access public
546
     *
547
     * @abstract
548
     *
549
     * @param string $id: The @ID attribute of the physical structure node (METS) or the @id property
550
     * of the Manifest / Range (IIIF)
551
     *
552
     * @return string The OCR full text
553
     */
554
    abstract public function getFullText($id);
555
556
    /**
557
     * This extracts the OCR full text for a physical structure node / IIIF Manifest / Canvas from an
558
     * XML full text representation (currently only ALTO). For IIIF manifests, ALTO documents have
559
     * to be given in the Canvas' / Manifest's "seeAlso" property.
560
     *
561
     * @param string $id: The @ID attribute of the physical structure node (METS) or the @id property
562
     * of the Manifest / Range (IIIF)
563
     *
564
     * @return string The OCR full text
565
     */
566
    protected function getFullTextFromXml($id)
    {
        // Resolves the first configured full text file group that has a file for
        // the physical node $id, downloads it, and converts it with the class
        // registered for its text format. Returns '' on any failure (each
        // failure is logged as a warning).
        $fullText = '';
        // Define both up front: if no file group matches, the checks and the log
        // message below would otherwise read undefined variables.
        $fileContent = '';
        $textFormat = '';
        // Load available text formats, ...
        $this->loadFormats();
        // ... physical structure ...
        $this->_getPhysicalStructure();
        // ... and extension configuration.
        $extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey);
        $fileGrpsFulltext = GeneralUtility::trimExplode(',', $extConf['fileGrpFulltext']);

        if (empty($this->physicalStructureInfo[$id])) {
            $this->logger->warning('Invalid structure node @ID "' . $id . '"');
            return $fullText;
        }

        // Use the first file group (in configured order) that has a file for this
        // node. foreach is used so an empty or "0" group name does not end the
        // search early, which a `while ($x = array_shift(...))` loop would do.
        foreach ($fileGrpsFulltext as $fileGrpFulltext) {
            if (empty($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext])) {
                continue;
            }
            // Get full text file.
            $fileContent = GeneralUtility::getUrl($this->getFileLocation($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext]));
            if ($fileContent === false) {
                $this->logger->warning('Couldn\'t load full text file for structure node @ID "' . $id . '"');
                return $fullText;
            }
            $textFormat = $this->getTextFormat($fileContent);
            break;
        }

        // Is this text format supported?
        if (empty($fileContent) || empty($this->formats[$textFormat])) {
            $this->logger->warning('Unsupported text format "' . $textFormat . '" in physical node with @ID "' . $id . '"');
            return $fullText;
        }

        // A known format without a converter class yields no text (and no warning).
        if (empty($this->formats[$textFormat]['class'])) {
            return $fullText;
        }

        $class = $this->formats[$textFormat]['class'];
        // Get the text from the format's class.
        if (
            class_exists($class)
            && ($obj = GeneralUtility::makeInstance($class)) instanceof FulltextInterface
        ) {
            // Load XML from file.
            $ocrTextXml = Helper::getXmlFileAsString($fileContent);
            $fullText = $obj->getTextAsMiniOcr($ocrTextXml);
            $this->rawTextArray[$id] = $fullText;
        } else {
            // The message names the method that is actually invoked above.
            $this->logger->warning('Invalid class/method "' . $class . '->getTextAsMiniOcr()" for text format "' . $textFormat . '"');
        }
        return $fullText;
    }
619
620
    /**
621
     * Get format of the OCR full text
622
     *
623
     * @access private
624
     *
625
     * @param string $fileContent: content of the XML file
626
     *
627
     * @return string The format of the OCR full text
628
     */
629
    private function getTextFormat($fileContent)
    {
        // Returns the upper-cased name of the XML root element, which serves as
        // the key into $this->formats. Returns '' when the content is not
        // parsable XML, so callers see an unknown format instead of a fatal error.
        $xml = Helper::getXmlFileAsString($fileContent);
        // The helper's result is checked against false elsewhere in this class
        // (see getInstance()), so it cannot be dereferenced unconditionally.
        if (!($xml instanceof \SimpleXMLElement)) {
            return '';
        }
        // Get the root element's name as text format.
        return strtoupper($xml->getName());
    }
634
635
    /**
636
     * This determines a title for the given document
637
     *
638
     * @access public
639
     *
640
     * @static
641
     *
642
     * @param int $uid: The UID of the document
643
     * @param bool $recursive: Search superior documents for a title, too?
644
     *
645
     * @return string The title of the document itself or a parent document
646
     */
647
    public static function getTitle($uid, $recursive = false)
    {
        // Normalize the UID to a non-negative integer.
        $uid = max(intval($uid), 0);
        if (!$uid) {
            Helper::log('Invalid UID ' . $uid . ' for document', LOG_SEVERITY_ERROR);
            return '';
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('tx_dlf_documents');

        // Fetch the title and the parent reference of the requested document.
        $row = $queryBuilder
            ->select(
                'tx_dlf_documents.title',
                'tx_dlf_documents.partof'
            )
            ->from('tx_dlf_documents')
            ->where(
                $queryBuilder->expr()->eq('tx_dlf_documents.uid', $uid),
                Helper::whereExpression('tx_dlf_documents')
            )
            ->setMaxResults(1)
            ->execute()
            ->fetch();

        if (!$row) {
            Helper::log('No document with UID ' . $uid . ' found or document not accessible', LOG_SEVERITY_WARNING);
            return '';
        }

        $title = $row['title'];
        $parentUid = $row['partof'];
        // Only climb to the parent when asked to, when this document has no
        // title of its own, and when the parent is a different document.
        $useParentTitle = $recursive
            && empty($title)
            && intval($parentUid)
            && $parentUid != $uid;
        if ($useParentTitle) {
            $title = self::getTitle($parentUid, true);
        }
        return $title;
    }
690
691
    /**
692
     * This extracts all the metadata for the toplevel logical structure node / resource
693
     *
694
     * @access public
695
     *
696
     * @param int $cPid: The PID for the metadata definitions
697
     *
698
     * @return array The logical structure node's / resource's parsed metadata array
699
     */
700
    public function getTitledata($cPid = 0)
    {
        // Metadata of the toplevel logical structure node / resource.
        $titledata = $this->getMetadata($this->_getToplevelId(), $cPid);
        // METS documents additionally contribute data from the structural map.
        if ($this instanceof MetsDocument) {
            $this->addMetadataFromMets($titledata, $this->_getToplevelId());
        }
        // Put the document's own record identifier first in the list unless it
        // is empty or already listed.
        $hasRecordIdEntry = is_array($titledata) && array_key_exists('record_id', $titledata);
        if (
            $hasRecordIdEntry
            && !empty($this->recordId)
            && !in_array($this->recordId, $titledata['record_id'])
        ) {
            array_unshift($titledata['record_id'], $this->recordId);
        }
        return $titledata;
    }
721
722
    /**
723
     * Traverse a logical (sub-) structure tree to find the structure with the requested logical id and return its depth.
724
     *
725
     * @access protected
726
     *
727
     * @param array $structure: logical structure array
728
     * @param int $depth: current tree depth
729
     * @param string $logId: ID of the logical structure whose depth is requested
730
     *
731
     * @return int|bool: false if structure with $logId is not a child of this substructure,
732
     * or the actual depth.
733
     */
734
    protected function getTreeDepth($structure, $depth, $logId)
    {
        // Depth-first search: a node is checked before its children, and the
        // first hit wins.
        foreach ($structure as $node) {
            if ($node['id'] == $logId) {
                return $depth;
            }
            if (!array_key_exists('children', $node)) {
                continue;
            }
            $depthInSubtree = $this->getTreeDepth($node['children'], $depth + 1, $logId);
            if ($depthInSubtree !== false) {
                return $depthInSubtree;
            }
        }
        // Not found anywhere in this (sub-)structure.
        return false;
    }
748
749
    /**
750
     * Get the tree depth of a logical structure element within the table of content
751
     *
752
     * @access public
753
     *
754
     * @param string $logId: The id of the logical structure element whose depth is requested
755
     * @return int|bool tree depth as integer or false if no element with $logId exists within the TOC.
756
     */
757
    public function getStructureDepth($logId)
    {
        // Top-level entries of the table of contents are counted as depth 1.
        $tableOfContents = $this->_getTableOfContents();
        return $this->getTreeDepth($tableOfContents, 1, $logId);
    }
761
762
    /**
763
     * This sets some basic class properties
764
     *
765
     * @access protected
766
     *
767
     * @abstract
768
     *
769
     * @return void
770
     */
771
    abstract protected function init();
772
773
    /**
774
     * Reuse any document object that might have been already loaded to determine whether document is METS or IIIF
775
     *
776
     * @access protected
777
     *
778
     * @abstract
779
     *
780
     * @param \SimpleXMLElement|IiifResourceInterface $preloadedDocument: any instance that has already been loaded
781
     *
782
     * @return bool true if $preloadedDocument can actually be reused, false if it has to be loaded again
783
     */
784
    abstract protected function setPreloadedDocument($preloadedDocument);
785
786
    /**
787
     * METS/IIIF specific part of loading a location
788
     *
789
     * @access protected
790
     *
791
     * @abstract
792
     *
793
     * @param string $location: The URL of the file to load
794
     *
795
     * @return bool true on success or false on failure
796
     */
797
    abstract protected function loadLocation($location);
798
799
    /**
800
     * Load XML file / IIIF resource from URL
801
     *
802
     * @access protected
803
     *
804
     * @param string $location: The URL of the file to load
805
     *
806
     * @return bool true on success or false on failure
807
     */
808
    protected function load($location)
    {
        // Refuse anything that is not a valid URL.
        if (!GeneralUtility::isValidUrl($location)) {
            $this->logger->error('Invalid file location "' . $location . '" for document loading');
            return false;
        }

        // Identify ourselves with the configured user agent (if any) when
        // fetching XML / JSON-LD data.
        $extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey);
        if (!empty($extConf['useragent'])) {
            @ini_set('user_agent', $extConf['useragent']);
        }

        // The actual loading is delegated to the format-specific implementation.
        return $this->loadLocation($location);
    }
825
826
    /**
827
     * Analyze the document if it contains any fulltext that needs to be indexed.
828
     *
829
     * @access protected
830
     *
831
     * @abstract
832
     */
833
    abstract protected function ensureHasFulltextIsSet();
834
835
    /**
836
     * Register all available data formats
837
     *
838
     * @access protected
839
     *
840
     * @return void
841
     */
842
    protected function loadFormats()
    {
        // The format registry only needs to be filled once per instance.
        if ($this->formatsLoaded) {
            return;
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('tx_dlf_formats');

        // Read all format records stored on pid 0.
        $statement = $queryBuilder
            ->select(
                'tx_dlf_formats.type AS type',
                'tx_dlf_formats.root AS root',
                'tx_dlf_formats.namespace AS namespace',
                'tx_dlf_formats.class AS class'
            )
            ->from('tx_dlf_formats')
            ->where(
                $queryBuilder->expr()->eq('tx_dlf_formats.pid', 0)
            )
            ->execute();

        // Add each record to the registry, overwriting a built-in entry of the
        // same type.
        while ($format = $statement->fetch()) {
            $this->formats[$format['type']] = [
                'rootElement' => $format['root'],
                'namespaceURI' => $format['namespace'],
                'class' => $format['class']
            ];
        }

        $this->formatsLoaded = true;
    }
873
874
    /**
875
     * Register all available namespaces for a \SimpleXMLElement object
876
     *
877
     * @access public
878
     *
879
     * @param \SimpleXMLElement|\DOMXPath &$obj: \SimpleXMLElement or \DOMXPath object
880
     *
881
     * @return void
882
     */
883
    public function registerNamespaces(&$obj)
    {
        // TODO Check usage. This XML specific method does not seem to be used outside this class within the project, but it is public and may be used by extensions.
        $this->loadFormats();

        // Only \SimpleXMLElement and \DOMXPath objects are supported.
        $isSimpleXml = $obj instanceof \SimpleXMLElement;
        if (!$isSimpleXml && !($obj instanceof \DOMXPath)) {
            $this->logger->error('Given object is neither a SimpleXMLElement nor a DOMXPath instance');
            return;
        }

        // The two supported classes name their registration method differently.
        $registrar = $isSimpleXml ? 'registerXPathNamespace' : 'registerNamespace';

        // Register each known format's namespace, using the lowercased format type as prefix.
        foreach ($this->formats as $formatType => $formatConfig) {
            $obj->$registrar(strtolower($formatType), $formatConfig['namespaceURI']);
        }
    }
901
902
    /**
903
     * This returns $this->cPid via __get()
904
     *
905
     * @access protected
906
     *
907
     * @return int The PID of the metadata definitions
908
     */
909
    protected function _getCPid()
    {
        // Plain accessor: the stored value is handed out unchanged.
        return $this->cPid;
    }
913
914
    /**
915
     * This returns $this->hasFulltext via __get()
916
     *
917
     * @access protected
918
     *
919
     * @return bool Are there any fulltext files available?
920
     */
921
    protected function _getHasFulltext()
    {
        // The concrete document class gets the chance to determine the flag before it is read.
        $this->ensureHasFulltextIsSet();

        return $this->hasFulltext;
    }
926
927
    /**
928
     * This returns $this->location via __get()
929
     *
930
     * @access protected
931
     *
932
     * @return string The location of the document
933
     */
934
    protected function _getLocation()
    {
        // Plain accessor: the stored value is handed out unchanged.
        return $this->location;
    }
938
939
    /**
940
     * Format specific part of building the document's metadata array
941
     *
942
     * @access protected
943
     *
944
     * @abstract
945
     *
946
     * @param int $cPid
947
     */
948
    abstract protected function prepareMetadataArray($cPid);
949
950
    /**
951
     * This builds an array of the document's metadata
952
     *
953
     * @access protected
954
     *
955
     * @return array Array of metadata with their corresponding logical structure node ID as key
956
     */
957
    protected function _getMetadataArray()
    {
        // Use the configuration PID if one is set, otherwise fall back to the document's own PID.
        $cPid = $this->cPid ?: $this->pid;
        if (!$cPid) {
            $this->logger->error('Invalid PID ' . $cPid . ' for metadata definitions');
            return [];
        }

        // Key 0 of the array remembers the PID it was built for, so the array
        // is rebuilt when it was never loaded or was loaded for another PID.
        $isUpToDate = $this->metadataArrayLoaded && $this->metadataArray[0] == $cPid;
        if (!$isUpToDate) {
            $this->prepareMetadataArray($cPid);
            $this->metadataArray[0] = $cPid;
            $this->metadataArrayLoaded = true;
        }

        return $this->metadataArray;
    }
975
976
    /**
977
     * This returns $this->numPages via __get()
978
     *
979
     * @access protected
980
     *
981
     * @return int The total number of pages and/or tracks
982
     */
983
    protected function _getNumPages()
    {
        // The physical structure is requested first; presumably building it
        // is what fills $this->numPages - confirm in the concrete classes.
        $this->_getPhysicalStructure();

        return $this->numPages;
    }
988
989
    /**
990
     * This returns $this->parentId via __get()
991
     *
992
     * @access protected
993
     *
994
     * @return int The UID of the parent document or zero if not applicable
995
     */
996
    protected function _getParentId()
    {
        // Plain accessor: the stored value is handed out unchanged.
        return $this->parentId;
    }
1000
1001
    /**
1002
     * This builds an array of the document's physical structure
1003
     *
1004
     * @access protected
1005
     *
1006
     * @abstract
1007
     *
1008
     * @return array Array of physical elements' id, type, label and file representations ordered
1009
     * by @ORDER attribute / IIIF Sequence's Canvases
1010
     */
1011
    abstract protected function _getPhysicalStructure();
1012
1013
    /**
1014
     * This gives an array of the document's physical structure metadata
1015
     *
1016
     * @access protected
1017
     *
1018
     * @return array Array of elements' type, label and file representations ordered by @ID attribute / Canvas order
1019
     */
1020
    protected function _getPhysicalStructureInfo()
    {
        // Already built: hand out the stored info right away.
        if ($this->physicalStructureLoaded) {
            return $this->physicalStructureInfo;
        }

        // Otherwise trigger building the physical structure before reading the info.
        $this->_getPhysicalStructure();

        return $this->physicalStructureInfo;
    }
1029
1030
    /**
1031
     * This returns $this->pid via __get()
1032
     *
1033
     * @access protected
1034
     *
1035
     * @return int The PID of the document or zero if not in database
1036
     */
1037
    protected function _getPid()
    {
        // Plain accessor: the stored value is handed out unchanged.
        return $this->pid;
    }
1041
1042
    /**
1043
     * This returns $this->ready via __get()
1044
     *
1045
     * @access protected
1046
     *
1047
     * @return bool Is the document instantiated successfully?
1048
     */
1049
    protected function _getReady()
    {
        // Plain accessor: the stored value is handed out unchanged.
        return $this->ready;
    }
1053
1054
    /**
1055
     * This returns $this->recordId via __get()
1056
     *
1057
     * @access protected
1058
     *
1059
     * @return mixed The METS file's / IIIF manifest's record identifier
1060
     */
1061
    protected function _getRecordId()
    {
        // Plain accessor: the stored value is handed out unchanged.
        return $this->recordId;
    }
1065
1066
    /**
1067
     * This returns $this->rootId via __get()
1068
     *
1069
     * @access protected
1070
     *
1071
     * @return int The UID of the root document or zero if not applicable
1072
     */
1073
    protected function _getRootId()
    {
        // The lookup is done at most once per instance.
        if ($this->rootIdLoaded) {
            return $this->rootId;
        }

        if ($this->parentId) {
            // Take over the root ID of the parent document, loaded from the same storage PID.
            // NOTE(review): if rootId is a protected property, this read happens inside
            // the class scope and bypasses __get(), so the parent's _getRootId() is not
            // triggered here - confirm this is intended.
            // NOTE(review): assumes getInstance() always returns an object - confirm.
            $parentDocument = self::getInstance($this->parentId, ['storagePid' => $this->pid]);
            $this->rootId = $parentDocument->rootId;
        }
        $this->rootIdLoaded = true;

        return $this->rootId;
    }
1084
1085
    /**
1086
     * This returns the smLinks between logical and physical structMap (METS) and models the
1087
     * relation between IIIF Canvases and Manifests / Ranges in the same way
1088
     *
1089
     * @access protected
1090
     *
1091
     * @abstract
1092
     *
1093
     * @return array The links between logical and physical nodes / Range, Manifest and Canvas
1094
     */
1095
    abstract protected function _getSmLinks();
1096
1097
    /**
1098
     * This builds an array of the document's logical structure
1099
     *
1100
     * @access protected
1101
     *
1102
     * @return array Array of structure nodes' id, label, type and physical page indexes/mptr / Canvas link with original hierarchy preserved
1103
     */
1104
    protected function _getTableOfContents()
    {
        // Already built: hand out the stored table of contents right away.
        if ($this->tableOfContentsLoaded) {
            return $this->tableOfContents;
        }

        // Request all logical structures; presumably this call is what
        // fills $this->tableOfContents - confirm in getLogicalStructure().
        $this->getLogicalStructure('', true);
        $this->tableOfContentsLoaded = true;

        return $this->tableOfContents;
    }
1114
1115
    /**
1116
     * This returns the document's thumbnail location
1117
     *
1118
     * @access protected
1119
     *
1120
     * @abstract
1121
     *
1122
     * @param bool $forceReload: Force reloading the thumbnail instead of returning the cached value
1123
     *
1124
     * @return string The document's thumbnail location
1125
     */
1126
    abstract protected function _getThumbnail($forceReload = false);
1127
1128
    /**
1129
     * This returns the ID of the toplevel logical structure node
1130
     *
1131
     * @access protected
1132
     *
1133
     * @abstract
1134
     *
1135
     * @return string The logical structure node's ID
1136
     */
1137
    abstract protected function _getToplevelId();
1138
1139
    /**
1140
     * This returns $this->uid via __get()
1141
     *
1142
     * @access protected
1143
     *
1144
     * @return mixed The UID or the URL of the document
1145
     */
1146
    protected function _getUid()
    {
        // Plain accessor: the stored value is handed out unchanged.
        return $this->uid;
    }
1150
1151
    /**
1152
     * This sets $this->cPid via __set()
1153
     *
1154
     * @access protected
1155
     *
1156
     * @param int $value: The new PID for the metadata definitions
1157
     *
1158
     * @return void
1159
     */
1160
    protected function _setCPid($value)
    {
        // Store the value as an integer and never below zero.
        $requestedPid = (int) $value;
        $this->cPid = $requestedPid > 0 ? $requestedPid : 0;
    }
1164
1165
    /**
1166
     * This is a singleton class, thus the constructor should be private/protected
1167
     * (Get an instance of this class by calling \Kitodo\Dlf\Common\Doc::getInstance())
1168
     *
1169
     * @access protected
1170
     *
1171
     * @param string $location: The location URL of the XML file to parse
1172
     * @param int $pid: If > 0, then only document with this PID gets loaded
1173
     * @param \SimpleXMLElement|IiifResourceInterface $preloadedDocument: Either null or the \SimpleXMLElement
1174
     * or IiifResourceInterface that has been loaded to determine the basic document format.
1175
     *
1176
     * @return void
1177
     */
1178
    protected function __construct($location, $pid, $preloadedDocument)
    {
        // Create the logger before anything else. Methods of this class report
        // problems through $this->logger (e.g. registerNamespaces() and __get()),
        // so it has to exist before the initialization steps below can log.
        $this->logger = GeneralUtility::makeInstance(LogManager::class)->getLogger();

        // NOTE(review): $location is not used in this constructor; presumably
        // the concrete document classes handle it elsewhere - confirm.
        $this->setPreloadedDocument($preloadedDocument);
        $this->init();
        $this->establishRecordId($pid);
    }
1187
1188
    /**
1189
     * This magic method is called each time an invisible property is referenced from the object
1190
     *
1191
     * @access public
1192
     *
1193
     * @param string $var: Name of variable to get
1194
     *
1195
     * @return mixed Value of $this->$var
1196
     */
1197
    public function __get($var)
    {
        // Getters follow the naming scheme _get<PropertyName>().
        $getter = '_get' . ucfirst($var);

        // The property must be declared and a matching getter must exist.
        if (property_exists($this, $var) && method_exists($this, $getter)) {
            return $this->$getter();
        }

        $this->logger->warning('There is no getter function for property "' . $var . '"');
        // Unknown properties yield null.
        return;
    }
1210
1211
    /**
1212
     * This magic method is called each time an invisible property is checked for isset() or empty()
1213
     *
1214
     * @access public
1215
     *
1216
     * @param string $var: Name of variable to check
1217
     *
1218
     * @return bool true if variable is set and not empty, false otherwise
1219
     */
1220
    public function __isset($var)
    {
        // Resolve the value through __get(), so a property without a getter
        // counts as not set (and __get() logs its warning for it).
        $resolvedValue = $this->__get($var);

        return !empty($resolvedValue);
    }
1224
1225
    /**
1226
     * This magic method is called each time a value is assigned to an invisible property of the object
1227
     *
1228
     * @access public
1229
     *
1230
     * @param string $var: Name of variable to set
1231
     * @param mixed $value: New value of variable
1232
     *
1233
     * @return void
1234
     */
1235
    public function __set($var, $value)
    {
        // Setters follow the naming scheme _set<PropertyName>().
        $setter = '_set' . ucfirst($var);

        // The property must be declared and a matching setter must exist.
        if (property_exists($this, $var) && method_exists($this, $setter)) {
            $this->$setter($value);
            return;
        }

        $this->logger->warning('There is no setter function for property "' . $var . '"');
    }
1247
}
1248