Scrutinizer GitHub App not installed

We could not synchronize checks via GitHub's checks API since Scrutinizer's GitHub App is not installed for this repository.

Install GitHub App

GitHub Access Token became invalid

It seems like the GitHub access token used for retrieving details about this repository from GitHub became invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.
Passed
Pull Request — master (#878)
by Beatrycze
03:37
created

Doc::getPageLink()   A

Complexity

Conditions 3
Paths 3

Size

Total Lines 11
Code Lines 6

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 3
eloc 6
c 0
b 0
f 0
nc 3
nop 1
dl 0
loc 11
rs 10
1
<?php
2
3
/**
4
 * (c) Kitodo. Key to digital objects e.V. <[email protected]>
5
 *
6
 * This file is part of the Kitodo and TYPO3 projects.
7
 *
8
 * @license GNU General Public License version 3 or later.
9
 * For the full copyright and license information, please read the
10
 * LICENSE.txt file that was distributed with this source code.
11
 */
12
13
namespace Kitodo\Dlf\Common;
14
15
use TYPO3\CMS\Core\Cache\CacheManager;
16
use TYPO3\CMS\Core\Configuration\ExtensionConfiguration;
17
use TYPO3\CMS\Core\Database\ConnectionPool;
18
use TYPO3\CMS\Core\Database\Query\Restriction\HiddenRestriction;
19
use TYPO3\CMS\Core\Log\LogManager;
20
use TYPO3\CMS\Core\Utility\GeneralUtility;
21
use TYPO3\CMS\Core\Utility\MathUtility;
22
use TYPO3\CMS\Extbase\Configuration\ConfigurationManager;
23
use Ubl\Iiif\Presentation\Common\Model\Resources\IiifResourceInterface;
24
use Ubl\Iiif\Tools\IiifHelper;
25
26
/**
27
 * Document class for the 'dlf' extension
28
 *
29
 * @author Sebastian Meyer <[email protected]>
30
 * @author Henrik Lochmann <[email protected]>
31
 * @package TYPO3
32
 * @subpackage dlf
33
 * @access public
34
 * @property int $cPid This holds the PID for the configuration
35
 * @property-read bool $hasFulltext Are there any fulltext files available?
36
 * @property-read string $location This holds the documents location
37
 * @property-read array $metadataArray This holds the documents' parsed metadata array
38
 * @property-read int $numPages This holds the total number of pages
39
 * @property-read int $parentId This holds the UID of the parent document or zero if not multi-volumed
40
 * @property-read array $physicalStructure This holds the physical structure
41
 * @property-read array $physicalStructureInfo This holds the physical structure metadata
42
 * @property-read int $pid This holds the PID of the document or zero if not in database
43
 * @property-read bool $ready Is the document instantiated successfully?
44
 * @property-read string $recordId The METS file's / IIIF manifest's record identifier
45
 * @property-read int $rootId This holds the UID of the root document or zero if not multi-volumed
46
 * @property-read array $smLinks This holds the smLinks between logical and physical structMap
47
 * @property-read array $tableOfContents This holds the logical structure
48
 * @property-read string $thumbnail This holds the document's thumbnail location
49
 * @property-read string $toplevelId This holds the toplevel structure's "@ID" (METS) or the manifest's "@id" (IIIF)
50
 * @abstract
51
 */
52
abstract class Doc
53
{
54
    /**
55
     * This holds the logger
56
     *
57
     * @var LogManager
58
     * @access protected
59
     */
60
    protected $logger;
61
62
    /**
63
     * This holds the PID for the configuration
64
     *
65
     * @var int
66
     * @access protected
67
     */
68
    protected $cPid = 0;
69
70
    /**
71
     * The extension key
72
     *
73
     * @var string
74
     * @access public
75
     */
76
    public static $extKey = 'dlf';
77
78
    /**
79
     * MIME types that are excluded from PageViewProxy.
80
     *
81
     * TODO: Consider moving this to extension configuration
82
     *
83
     * @var string[]
84
     * @access public
85
     */
86
    public static $nonProxyMimeTypes = [
87
        'application/vnd.kitodo.iiif',
88
        'application/vnd.netfpx',
89
        'application/vnd.kitodo.zoomify',
90
    ];
91
92
    /**
93
     * This holds the configuration for all supported metadata encodings
94
     * @see loadFormats()
95
     *
96
     * @var array
97
     * @access protected
98
     */
99
    protected $formats = [
100
        'OAI' => [
101
            'rootElement' => 'OAI-PMH',
102
            'namespaceURI' => 'http://www.openarchives.org/OAI/2.0/',
103
        ],
104
        'METS' => [
105
            'rootElement' => 'mets',
106
            'namespaceURI' => 'http://www.loc.gov/METS/',
107
        ],
108
        'XLINK' => [
109
            'rootElement' => 'xlink',
110
            'namespaceURI' => 'http://www.w3.org/1999/xlink',
111
        ]
112
    ];
113
114
    /**
115
     * Are the available metadata formats loaded?
116
     * @see $formats
117
     *
118
     * @var bool
119
     * @access protected
120
     */
121
    protected $formatsLoaded = false;
122
123
    /**
124
     * Are there any fulltext files available? This also includes IIIF text annotations
125
     * with motivation 'painting' if Kitodo.Presentation is configured to store text
126
     * annotations as fulltext.
127
     *
128
     * @var bool
129
     * @access protected
130
     */
131
    protected $hasFulltext = false;
132
133
    /**
134
     * Last searched logical and physical page
135
     *
136
     * @var array
137
     * @access protected
138
     */
139
    protected $lastSearchedPhysicalPage = ['logicalPage' => null, 'physicalPage' => null];
140
141
    /**
142
     * This holds the logical units
143
     *
144
     * @var array
145
     * @access protected
146
     */
147
    protected $logicalUnits = [];
148
149
    /**
150
     * This holds the documents' parsed metadata array with their corresponding
151
     * structMap//div's ID (METS) or Range / Manifest / Sequence ID (IIIF) as array key
152
     *
153
     * @var array
154
     * @access protected
155
     */
156
    protected $metadataArray = [];
157
158
    /**
159
     * Is the metadata array loaded?
160
     * @see $metadataArray
161
     *
162
     * @var bool
163
     * @access protected
164
     */
165
    protected $metadataArrayLoaded = false;
166
167
    /**
168
     * This holds the total number of pages
169
     *
170
     * @var int
171
     * @access protected
172
     */
173
    protected $numPages = 0;
174
175
    /**
176
     * This holds the UID of the parent document or zero if not multi-volumed
177
     *
178
     * @var int
179
     * @access protected
180
     */
181
    protected $parentId = 0;
182
183
    /**
184
     * This holds the physical structure
185
     *
186
     * @var array
187
     * @access protected
188
     */
189
    protected $physicalStructure = [];
190
191
    /**
192
     * This holds the physical structure metadata
193
     *
194
     * @var array
195
     * @access protected
196
     */
197
    protected $physicalStructureInfo = [];
198
199
    /**
200
     * Is the physical structure loaded?
201
     * @see $physicalStructure
202
     *
203
     * @var bool
204
     * @access protected
205
     */
206
    protected $physicalStructureLoaded = false;
207
208
    /**
209
     * This holds the PID of the document or zero if not in database
210
     *
211
     * @var int
212
     * @access protected
213
     */
214
    protected $pid = 0;
215
216
    /**
217
     * This holds the documents' raw text pages with their corresponding
218
     * structMap//div's ID (METS) or Range / Manifest / Sequence ID (IIIF) as array key
219
     *
220
     * @var array
221
     * @access protected
222
     */
223
    protected $rawTextArray = [];
224
225
    /**
226
     * Is the document instantiated successfully?
227
     *
228
     * @var bool
229
     * @access protected
230
     */
231
    protected $ready = false;
232
233
    /**
234
     * The METS file's / IIIF manifest's record identifier
235
     *
236
     * @var string
237
     * @access protected
238
     */
239
    protected $recordId;
240
241
    /**
242
     * This holds the singleton object of the document
243
     *
244
     * @var array (\Kitodo\Dlf\Common\Doc)
245
     * @static
246
     * @access protected
247
     */
248
    protected static $registry = [];
249
250
    /**
251
     * This holds the UID of the root document or zero if not multi-volumed
252
     *
253
     * @var int
254
     * @access protected
255
     */
256
    protected $rootId = 0;
257
258
    /**
259
     * Is the root id loaded?
260
     * @see $rootId
261
     *
262
     * @var bool
263
     * @access protected
264
     */
265
    protected $rootIdLoaded = false;
266
267
    /**
268
     * This holds the smLinks between logical and physical structMap
269
     *
270
     * @var array
271
     * @access protected
272
     */
273
    protected $smLinks = ['l2p' => [], 'p2l' => []];
274
275
    /**
276
     * Are the smLinks loaded?
277
     * @see $smLinks
278
     *
279
     * @var bool
280
     * @access protected
281
     */
282
    protected $smLinksLoaded = false;
283
284
    /**
285
     * This holds the logical structure
286
     *
287
     * @var array
288
     * @access protected
289
     */
290
    protected $tableOfContents = [];
291
292
    /**
293
     * Is the table of contents loaded?
294
     * @see $tableOfContents
295
     *
296
     * @var bool
297
     * @access protected
298
     */
299
    protected $tableOfContentsLoaded = false;
300
301
    /**
302
     * This holds the document's thumbnail location
303
     *
304
     * @var string
305
     * @access protected
306
     */
307
    protected $thumbnail = '';
308
309
    /**
310
     * Is the document's thumbnail location loaded?
311
     * @see $thumbnail
312
     *
313
     * @var bool
314
     * @access protected
315
     */
316
    protected $thumbnailLoaded = false;
317
318
    /**
319
     * This holds the toplevel structure's "@ID" (METS) or the manifest's "@id" (IIIF)
320
     *
321
     * @var string
322
     * @access protected
323
     */
324
    protected $toplevelId = '';
325
326
    /**
327
     * This holds the whole XML file as \SimpleXMLElement object
328
     *
329
     * @var \SimpleXMLElement
330
     * @access protected
331
     */
332
    protected $xml;
333
334
    /**
     * Empties the static registry of document instances to prevent memory exhaustion
     *
     * @access public
     *
     * @static
     *
     * @return void
     */
    public static function clearRegistry()
    {
        // Replace the registry with a fresh, empty array.
        self::$registry = [];
    }
348
349
    /**
     * Makes sure that the recordId, if one exists, is retrieved from the document
     *
     * @access protected
     *
     * @abstract
     *
     * @param int $pid: ID of the configuration page with the recordId config
     */
    abstract protected function establishRecordId($pid);
360
361
    /**
     * Returns the source document PHP object which is represented by a Document instance
     *
     * @access protected
     *
     * @abstract
     *
     * @return \SimpleXMLElement|IiifResourceInterface A PHP object representation of
     * the current document: SimpleXMLElement for METS, IiifResourceInterface for IIIF
     */
    abstract protected function getDocument();
372
373
    /**
     * Returns the location of a downloadable file for a physical page or track
     *
     * @access public
     *
     * @abstract
     *
     * @param string $id: The "@ID" attribute of the file node (METS) or the "@id" property of the IIIF resource
     *
     * @return string The file's location as URL
     */
    abstract public function getDownloadLocation($id);
385
386
    /**
     * Returns the location of a file representing a physical page or track
     *
     * @access public
     *
     * @abstract
     *
     * @param string $id: The "@ID" attribute of the file node (METS) or the "@id" property of the IIIF resource
     *
     * @return string The file's location as URL
     */
    abstract public function getFileLocation($id);
398
399
    /**
     * Returns the MIME type of a file representing a physical page or track
     *
     * @access public
     *
     * @abstract
     *
     * @param string $id: The "@ID" attribute of the file node
     *
     * @return string The file's MIME type
     */
    abstract public function getFileMimeType($id);
411
412
    /**
     * Returns information about all files contained in the document,
     * or null if this information is not available.
     *
     * When available, the result is an associative array keyed by file ID:
     *
     * ```php
     * [
     *     '#FILE_ID' => [
     *         'url' => '...',
     *         'mimetype' => '...',
     *     ],
     *     // ...
     * ]
     * ```
     *
     * This base implementation always returns null.
     *
     * @access public
     *
     * @return array|null
     */
    public function getAllFiles()
    {
        // TODO: Implement for IiifManifest
        return null;
    }
436
437
    /**
     * This is a singleton class, thus an instance must be created by this method
     *
     * Lookup order: the static registry, then the document cache, then a fresh
     * download of $location. A downloaded file is first parsed as XML (METS);
     * if that fails it is decoded as JSON and loaded as IIIF resource.
     *
     * @access public
     *
     * @static
     *
     * @param string $location: The URL of XML file or the IRI of the IIIF resource
     * @param array $settings: Only the key 'storagePid' is read here (defaults to 0 if missing)
     * @param bool $forceReload: Force reloading the document instead of returning the cached instance
     *
     * @return \Kitodo\Dlf\Common\Doc|null Instance of this class, either MetsDocument or IiifManifest,
     * or null if the location could not be loaded or its format was not recognized
     */
    public static function &getInstance($location, $settings = [], $forceReload = false)
    {
        // Create new instance depending on format (METS or IIIF) ...
        $documentFormat = null;
        $xml = null;
        $iiif = null;

        // Honor $forceReload: only consult registry and cache when no reload is requested.
        if (!$forceReload) {
            if (isset(self::$registry[$location])) {
                return self::$registry[$location];
            }
            $instance = self::getDocCache($location);
            if ($instance) {
                self::$registry[$location] = $instance;
                return $instance;
            }
        }
        $instance = null;

        // Try to load a file from the url
        if (GeneralUtility::isValidUrl($location)) {
            // Load extension configuration
            $extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey);

            $content = Helper::getUrl($location);
            if ($content !== false) {
                $xml = Helper::getXmlFileAsString($content);
                if ($xml !== false) {
                    /* @var $xml \SimpleXMLElement */
                    $xml->registerXPathNamespace('mets', 'http://www.loc.gov/METS/');
                    $xpathResult = $xml->xpath('//mets:mets');
                    $documentFormat = !empty($xpathResult) ? 'METS' : null;
                } else {
                    // Try to load file as IIIF resource instead.
                    $contentAsJsonArray = json_decode($content, true);
                    if ($contentAsJsonArray !== null) {
                        IiifHelper::setUrlReader(IiifUrlReader::getInstance());
                        IiifHelper::setMaxThumbnailHeight($extConf['iiifThumbnailHeight']);
                        IiifHelper::setMaxThumbnailWidth($extConf['iiifThumbnailWidth']);
                        $iiif = IiifHelper::loadIiifResource($contentAsJsonArray);
                        if ($iiif instanceof IiifResourceInterface) {
                            $documentFormat = 'IIIF';
                        }
                    }
                }
            }
        }

        // Sanitize input. $settings defaults to [], so 'storagePid' may be absent.
        $pid = max(intval($settings['storagePid'] ?? 0), 0);
        if ($documentFormat === 'METS') {
            $instance = new MetsDocument($location, $pid, $xml);
        } elseif ($documentFormat === 'IIIF') {
            $instance = new IiifManifest($location, $pid, $iiif);
        }

        // Remember a successfully created instance in registry and cache.
        if ($instance) {
            self::$registry[$location] = $instance;
            self::setDocCache($location, $instance);
        }

        return $instance;
    }
510
511
    /**
     * Returns details about a logical structure element
     *
     * @access public
     *
     * @abstract
     *
     * @param string $id: The "@ID" attribute of the logical structure node (METS) or
     * the "@id" property of the Manifest / Range (IIIF)
     * @param bool $recursive: Whether to include the child elements / resources
     *
     * @return array Array of the element's id, label, type and physical page indexes/mptr link
     */
    abstract public function getLogicalStructure($id, $recursive = false);
525
526
    /**
     * Extracts all the metadata for a logical structure node
     *
     * @access public
     *
     * @abstract
     *
     * @param string $id: The "@ID" attribute of the logical structure node (METS) or the "@id" property
     * of the Manifest / Range (IIIF)
     * @param int $cPid: The PID for the metadata definitions
     *                       (defaults to $this->cPid or $this->pid)
     *
     * @return array The logical structure node's / the IIIF resource's parsed metadata array
     */
    abstract public function getMetadata($id, $cPid = 0);
541
542
    /**
     * This returns the first corresponding physical page number of a given logical page label
     *
     * The most recent successful lookup is remembered in $this->lastSearchedPhysicalPage
     * and answered from there when the same label is requested again.
     *
     * @access public
     *
     * @param string $logicalPage: The label (or a part of the label) of the logical page
     *
     * @return int The physical page number; 1 if the label is empty or no page matches
     */
    public function getPhysicalPage($logicalPage)
    {
        $logicalPage = (string) $logicalPage;
        // An empty label matches nothing; strpos() with an empty needle behaves
        // differently between PHP 7 (warning, false) and PHP 8 (always 0).
        if ($logicalPage === '') {
            return 1;
        }
        // Strict string comparison: loose "==" would treat numeric labels such as
        // '1' and '01' as the same page.
        if (
            !empty($this->lastSearchedPhysicalPage['logicalPage'])
            && (string) $this->lastSearchedPhysicalPage['logicalPage'] === $logicalPage
        ) {
            return $this->lastSearchedPhysicalPage['physicalPage'];
        }
        // NOTE(review): the counter starts at 0 although the fallback is 1 - presumably
        // the first entry is not a page itself; confirm against the structure loader.
        $physicalPage = 0;
        foreach ($this->physicalStructureInfo as $page) {
            // Entries without an order label cannot match but still count as a position.
            if (
                isset($page['orderlabel'])
                && strpos((string) $page['orderlabel'], $logicalPage) !== false
            ) {
                $this->lastSearchedPhysicalPage['logicalPage'] = $logicalPage;
                $this->lastSearchedPhysicalPage['physicalPage'] = $physicalPage;
                return $physicalPage;
            }
            $physicalPage++;
        }
        return 1;
    }
571
572
    /**
     * Extracts the OCR full text for a physical structure node / IIIF Manifest / Canvas.
     * Text might be given as ALTO for METS or as annotations or ALTO for IIIF resources.
     *
     * @access public
     *
     * @abstract
     *
     * @param string $id: The "@ID" attribute of the physical structure node (METS) or the "@id" property
     * of the Manifest / Range (IIIF)
     *
     * @return string The OCR full text
     */
    abstract public function getFullText($id);
586
587
    /**
     * This extracts the OCR full text for a physical structure node / IIIF Manifest / Canvas from an
     * XML full text representation (currently only ALTO). For IIIF manifests, ALTO documents have
     * to be given in the Canvas' / Manifest's "seeAlso" property.
     *
     * The first configured full text file group ("fileGrpFulltext") that has a file for the
     * node is used. On success the converted text is also stored in $this->rawTextArray[$id].
     *
     * @access protected
     *
     * @param string $id: The "@ID" attribute of the physical structure node (METS) or the "@id" property
     * of the Manifest / Range (IIIF)
     *
     * @return string The OCR full text, or an empty string if it could not be determined
     */
    protected function getFullTextFromXml($id)
    {
        $fullText = '';
        // Initialized up front so both are defined even if no file group matches.
        $fileContent = '';
        $textFormat = '';
        // Load available text formats, ...
        $this->loadFormats();
        // ... physical structure ...
        $this->_getPhysicalStructure();
        // ... and extension configuration.
        $extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey);
        $fileGrpsFulltext = GeneralUtility::trimExplode(',', $extConf['fileGrpFulltext']);

        if (empty($this->physicalStructureInfo[$id])) {
            $this->logger->warning('Invalid structure node @ID "' . $id . '"');
            return $fullText;
        }

        // foreach instead of while/array_shift: an empty or "0" group name must not end the search.
        foreach ($fileGrpsFulltext as $fileGrpFulltext) {
            if (empty($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext])) {
                continue;
            }
            // Get full text file.
            $fileContent = GeneralUtility::getUrl($this->getFileLocation($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext]));
            if ($fileContent === false) {
                $this->logger->warning('Couldn\'t load full text file for structure node @ID "' . $id . '"');
                return $fullText;
            }
            $textFormat = $this->getTextFormat($fileContent);
            // Only the first file group with a file is considered.
            break;
        }

        // Is this text format supported?
        // This part actually differs from previous version of indexed OCR
        if (empty($fileContent) || empty($this->formats[$textFormat])) {
            $this->logger->warning('Unsupported text format "' . $textFormat . '" in physical node with @ID "' . $id . '"');
            return $fullText;
        }

        // Without a handler class for the format there is nothing to convert.
        if (empty($this->formats[$textFormat]['class'])) {
            return $fullText;
        }

        $class = $this->formats[$textFormat]['class'];
        // Get the raw text from class.
        if (
            class_exists($class)
            && ($obj = GeneralUtility::makeInstance($class)) instanceof FulltextInterface
        ) {
            // Load XML from file.
            $ocrTextXml = Helper::getXmlFileAsString($fileContent);
            if ($ocrTextXml !== false) {
                $fullText = $obj->getTextAsMiniOcr($ocrTextXml);
                $this->rawTextArray[$id] = $fullText;
            } else {
                // Never hand "false" to the converter, it expects a parsed XML object.
                $this->logger->warning('Couldn\'t parse full text file for structure node @ID "' . $id . '"');
            }
        } else {
            $this->logger->warning('Invalid class/method "' . $class . '->getTextAsMiniOcr()" for text format "' . $textFormat . '"');
        }
        return $fullText;
    }
650
651
    /**
     * Determines the format of the OCR full text from the XML root element
     *
     * @access private
     *
     * @param string $fileContent: content of the XML file
     *
     * @return string The upper-cased root element name, or an empty string if the content is not parsable XML
     */
    private function getTextFormat($fileContent)
    {
        $parsedXml = Helper::getXmlFileAsString($fileContent);

        // Unparsable content has no format.
        if ($parsedXml === false) {
            return '';
        }

        // The root element's name serves as text format.
        return strtoupper($parsedXml->getName());
    }
671
672
    /**
     * This determines a title for the given document
     *
     * With $recursive enabled, the chain of "partof" parents is followed until a
     * non-empty title is found. Every UID is visited at most once, so a cyclic
     * "partof" chain ends the search instead of recursing endlessly.
     *
     * @access public
     *
     * @static
     *
     * @param int $uid: The UID of the document
     * @param bool $recursive: Search superior documents for a title, too?
     *
     * @return string The title of the document itself or a parent document
     */
    public static function getTitle($uid, $recursive = false)
    {
        $title = '';
        // Sanitize input.
        $uid = max(intval($uid), 0);
        if (!$uid) {
            Helper::log('Invalid UID ' . $uid . ' for document', LOG_SEVERITY_ERROR);
            return $title;
        }

        // UIDs already looked up; protects against cycles in the "partof" chain.
        $visited = [];
        while (!isset($visited[$uid])) {
            $visited[$uid] = true;

            $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
                ->getQueryBuilderForTable('tx_dlf_documents');

            $result = $queryBuilder
                ->select(
                    'tx_dlf_documents.title',
                    'tx_dlf_documents.partof'
                )
                ->from('tx_dlf_documents')
                ->where(
                    $queryBuilder->expr()->eq('tx_dlf_documents.uid', $uid),
                    Helper::whereExpression('tx_dlf_documents')
                )
                ->setMaxResults(1)
                ->execute();

            $resArray = $result->fetch();
            if (!$resArray) {
                Helper::log('No document with UID ' . $uid . ' found or document not accessible', LOG_SEVERITY_WARNING);
                break;
            }

            // Get title information.
            $title = $resArray['title'];
            $partof = max(intval($resArray['partof']), 0);

            // Search parent documents for a title?
            if (!$recursive || !empty($title) || !$partof) {
                break;
            }
            $uid = $partof;
        }
        return $title;
    }
727
728
    /**
     * Extracts all the metadata for the toplevel logical structure node / resource
     *
     * For METS documents the metadata is enriched from the METS structural map.
     * If the result has a 'record_id' entry and this document's record identifier
     * is not yet part of it, the identifier is prepended.
     *
     * @access public
     *
     * @param int $cPid: The PID for the metadata definitions
     *
     * @return array The logical structure node's / resource's parsed metadata array
     */
    public function getTitledata($cPid = 0)
    {
        $titledata = $this->getMetadata($this->_getToplevelId(), $cPid);

        // Add information from METS structural map to titledata array.
        if ($this instanceof MetsDocument) {
            $this->addMetadataFromMets($titledata, $this->_getToplevelId());
        }

        // Set record identifier for METS file / IIIF manifest if not present.
        $hasRecordIdList = is_array($titledata) && array_key_exists('record_id', $titledata);
        if (
            $hasRecordIdList
            && !empty($this->recordId)
            && !in_array($this->recordId, $titledata['record_id'])
        ) {
            array_unshift($titledata['record_id'], $this->recordId);
        }

        return $titledata;
    }
758
759
    /**
     * Walks a logical (sub-) structure tree depth-first to find the structure
     * with the requested logical id and returns its depth.
     *
     * @access protected
     *
     * @param array $structure: logical structure array
     * @param int $depth: current tree depth
     * @param string $logId: ID of the logical structure whose depth is requested
     *
     * @return int|bool: false if structure with $logId is not a child of this substructure,
     * or the actual depth.
     */
    protected function getTreeDepth($structure, $depth, $logId)
    {
        foreach ($structure as $node) {
            // Found on the current level.
            if ($node['id'] == $logId) {
                return $depth;
            }
            // Leaves have nothing more to search.
            if (!array_key_exists('children', $node)) {
                continue;
            }
            // Descend one level; only a hit ends the search.
            $childDepth = $this->getTreeDepth($node['children'], $depth + 1, $logId);
            if ($childDepth !== false) {
                return $childDepth;
            }
        }
        return false;
    }
785
786
    /**
     * Returns the tree depth of a logical structure element within the table of contents
     *
     * @access public
     *
     * @param string $logId: The id of the logical structure element whose depth is requested
     *
     * @return int|bool tree depth as integer (top level counts as 1) or false if no
     * element with $logId exists within the TOC.
     */
    public function getStructureDepth($logId)
    {
        $tableOfContents = $this->_getTableOfContents();
        // Top level entries of the table of contents are at depth 1.
        return $this->getTreeDepth($tableOfContents, 1, $logId);
    }
798
799
    /**
     * Sets some basic class properties
     *
     * @access protected
     *
     * @abstract
     *
     * @param string $location: The location URL of the XML file to parse
     *
     * @return void
     */
    abstract protected function init($location);
811
812
    /**
     * Reuses any document object that might have been already loaded to determine
     * whether the document is METS or IIIF
     *
     * @access protected
     *
     * @abstract
     *
     * @param \SimpleXMLElement|IiifResourceInterface $preloadedDocument: any instance that has already been loaded
     *
     * @return bool true if $preloadedDocument can actually be reused, false if it has to be loaded again
     */
    abstract protected function setPreloadedDocument($preloadedDocument);
824
825
    /**
     * METS/IIIF specific part of loading a location
     *
     * @access protected
     *
     * @abstract
     *
     * @param string $location: The URL of the file to load
     *
     * @return bool true on success or false on failure
     */
    abstract protected function loadLocation($location);
837
838
    /**
     * Loads the XML file / IIIF resource from a URL
     *
     * Validates the URL and delegates the actual loading to the format
     * specific loadLocation(). An invalid URL is logged as error.
     *
     * @access protected
     *
     * @param string $location: The URL of the file to load
     *
     * @return bool true on success or false on failure
     */
    protected function load($location)
    {
        // Refuse anything that is not a valid URL.
        if (!GeneralUtility::isValidUrl($location)) {
            $this->logger->error('Invalid file location "' . $location . '" for document loading');
            return false;
        }
        // The actual loading of the XML / JSON-LD file is format specific.
        return $this->loadLocation($location);
    }
858
859
    /**
     * Analyzes the document if it contains any fulltext that needs to be indexed.
     *
     * @access protected
     *
     * @abstract
     */
    abstract protected function ensureHasFulltextIsSet();
867
868
    /**
     * Registers all available data formats
     *
     * Reads the format records with pid 0 from table "tx_dlf_formats" and adds them
     * to $this->formats, keyed by their type. The query runs only once per instance.
     *
     * @access protected
     *
     * @return void
     */
    protected function loadFormats()
    {
        // Nothing to do if the formats were loaded before.
        if ($this->formatsLoaded) {
            return;
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('tx_dlf_formats');

        // Get available data formats from database.
        $statement = $queryBuilder
            ->select(
                'tx_dlf_formats.type AS type',
                'tx_dlf_formats.root AS root',
                'tx_dlf_formats.namespace AS namespace',
                'tx_dlf_formats.class AS class'
            )
            ->from('tx_dlf_formats')
            ->where(
                $queryBuilder->expr()->eq('tx_dlf_formats.pid', 0)
            )
            ->execute();

        // Update format registry; database records override built-in entries of the same type.
        while ($format = $statement->fetch()) {
            $this->formats[$format['type']] = [
                'rootElement' => $format['root'],
                'namespaceURI' => $format['namespace'],
                'class' => $format['class']
            ];
        }

        $this->formatsLoaded = true;
    }
906
907
    /**
     * Register the namespaces of all known data formats on an XML object
     *
     * The lower-cased format type is used as the namespace prefix. Objects
     * of any other type are rejected with an error log entry.
     *
     * @access public
     *
     * @param \SimpleXMLElement|\DOMXPath &$obj: \SimpleXMLElement or \DOMXPath object
     *
     * @return void
     */
    public function registerNamespaces(&$obj)
    {
        // TODO Check usage. XML specific method does not seem to be used anywhere outside this class within the project, but it is public and may be used by extensions.
        $this->loadFormats();

        // Only \SimpleXMLElement and \DOMXPath objects are supported.
        $isSimpleXml = $obj instanceof \SimpleXMLElement;
        if (!$isSimpleXml && !($obj instanceof \DOMXPath)) {
            $this->logger->error('Given object is neither a SimpleXMLElement nor a DOMXPath instance');
            return;
        }

        // Both classes offer the same functionality under different method names.
        $register = $isSimpleXml ? 'registerXPathNamespace' : 'registerNamespace';

        // Register the metadata formats' namespaces.
        foreach ($this->formats as $encoding => $format) {
            $obj->$register(strtolower($encoding), $format['namespaceURI']);
        }
    }
934
935
    /**
     * Getter for $this->cPid, reached via __get()
     *
     * @access protected
     *
     * @return int The PID of the metadata definitions
     */
    protected function _getCPid()
    {
        return $this->cPid;
    }
946
947
    /**
     * Getter for $this->hasFulltext, reached via __get()
     *
     * The format specific fulltext analysis is triggered before the flag is read.
     *
     * @access protected
     *
     * @return bool Are there any fulltext files available?
     */
    protected function _getHasFulltext()
    {
        // Let the concrete document class determine the flag first.
        $this->ensureHasFulltextIsSet();

        return $this->hasFulltext;
    }
959
960
    /**
     * Getter for $this->location, reached via __get()
     *
     * @access protected
     *
     * @return string The location of the document
     */
    protected function _getLocation()
    {
        return $this->location;
    }
971
972
    /**
     * Format specific part of building the document's metadata array
     *
     * Invoked by _getMetadataArray() whenever the metadata array has to be
     * (re)built for a given metadata definitions' PID.
     *
     * @access protected
     *
     * @abstract
     *
     * @param int $cPid
     */
    abstract protected function prepareMetadataArray($cPid);
982
983
    /**
     * This builds an array of the document's metadata
     *
     * The array is rebuilt when it has not been loaded yet or when it was
     * built for another PID; the PID it was built for is kept at index 0.
     *
     * @access protected
     *
     * @return array Array of metadata with their corresponding logical structure node ID as key
     */
    protected function _getMetadataArray()
    {
        // Use the metadata definitions' PID and fall back to the document's PID.
        $cPid = $this->cPid ?: $this->pid;
        if (!$cPid) {
            $this->logger->error('Invalid PID ' . $cPid . ' for metadata definitions');
            return [];
        }

        // The cached array can only be reused if it was built for the same PID.
        $isUpToDate = $this->metadataArrayLoaded && $this->metadataArray[0] == $cPid;
        if (!$isUpToDate) {
            $this->prepareMetadataArray($cPid);
            $this->metadataArray[0] = $cPid;
            $this->metadataArrayLoaded = true;
        }

        return $this->metadataArray;
    }
1008
1009
    /**
     * Getter for $this->numPages, reached via __get()
     *
     * @access protected
     *
     * @return int The total number of pages and/or tracks
     */
    protected function _getNumPages()
    {
        // Build the physical structure before reading the page count
        // (presumably this is what populates $this->numPages).
        $this->_getPhysicalStructure();

        return $this->numPages;
    }
1021
1022
    /**
     * Getter for $this->parentId, reached via __get()
     *
     * @access protected
     *
     * @return int The UID of the parent document or zero if not applicable
     */
    protected function _getParentId()
    {
        return $this->parentId;
    }
1033
1034
    /**
     * This builds an array of the document's physical structure
     *
     * Format specific; implemented by the concrete document classes.
     *
     * @access protected
     *
     * @abstract
     *
     * @return array Array of physical elements' id, type, label and file representations ordered
     * by "@ORDER" attribute / IIIF Sequence's Canvases
     */
    abstract protected function _getPhysicalStructure();
1045
1046
    /**
     * This gives an array of the document's physical structure metadata
     *
     * The physical structure is built on demand if it has not been loaded yet.
     *
     * @access protected
     *
     * @return array Array of elements' type, label and file representations ordered by "@ID" attribute / Canvas order
     */
    protected function _getPhysicalStructureInfo()
    {
        // Build the physical structure first if that has not happened yet.
        if (!$this->physicalStructureLoaded) {
            $this->_getPhysicalStructure();
        }

        return $this->physicalStructureInfo;
    }
1062
1063
    /**
     * Getter for $this->pid, reached via __get()
     *
     * @access protected
     *
     * @return int The PID of the document or zero if not in database
     */
    protected function _getPid()
    {
        return $this->pid;
    }
1074
1075
    /**
     * Getter for $this->ready, reached via __get()
     *
     * @access protected
     *
     * @return bool Is the document instantiated successfully?
     */
    protected function _getReady()
    {
        return $this->ready;
    }
1086
1087
    /**
     * Getter for $this->recordId, reached via __get()
     *
     * @access protected
     *
     * @return mixed The METS file's / IIIF manifest's record identifier
     */
    protected function _getRecordId()
    {
        return $this->recordId;
    }
1098
1099
    /**
     * Getter for $this->rootId, reached via __get()
     *
     * On first access the root ID is taken over from the parent document,
     * provided that the document has a parent.
     *
     * @access protected
     *
     * @return int The UID of the root document or zero if not applicable
     */
    protected function _getRootId()
    {
        // The lookup is done only once.
        if ($this->rootIdLoaded) {
            return $this->rootId;
        }

        if ($this->parentId) {
            $parentDoc = self::getInstance($this->parentId, ['storagePid' => $this->pid]);
            // NOTE(review): inside this class the parent's property is presumably read
            // directly, i.e. without going through __get() / _getRootId() - confirm
            // that the parent's root ID is already resolved at this point.
            $this->rootId = $parentDoc->rootId;
        }
        $this->rootIdLoaded = true;

        return $this->rootId;
    }
1117
1118
    /**
     * This returns the smLinks between logical and physical structMap (METS) and models the
     * relation between IIIF Canvases and Manifests / Ranges in the same way
     *
     * Format specific; implemented by the concrete document classes.
     *
     * @access protected
     *
     * @abstract
     *
     * @return array The links between logical and physical nodes / Range, Manifest and Canvas
     */
    abstract protected function _getSmLinks();
1129
1130
    /**
     * This builds an array of the document's logical structure
     *
     * The logical structure is requested once, on first access.
     *
     * @access protected
     *
     * @return array Array of structure nodes' id, label, type and physical page indexes/mptr / Canvas link with original hierarchy preserved
     */
    protected function _getTableOfContents()
    {
        // Has the logical structure array been built already?
        if ($this->tableOfContentsLoaded) {
            return $this->tableOfContents;
        }

        // Get all logical structures.
        $this->getLogicalStructure('', true);
        $this->tableOfContentsLoaded = true;

        return $this->tableOfContents;
    }
1147
1148
    /**
     * This returns the document's thumbnail location
     *
     * Format specific; implemented by the concrete document classes.
     *
     * @access protected
     *
     * @abstract
     *
     * @param bool $forceReload: Force reloading the thumbnail instead of returning the cached value
     *
     * @return string The document's thumbnail location
     */
    abstract protected function _getThumbnail($forceReload = false);
1160
1161
    /**
     * This returns the ID of the toplevel logical structure node
     *
     * Format specific; implemented by the concrete document classes.
     *
     * @access protected
     *
     * @abstract
     *
     * @return string The logical structure node's ID
     */
    abstract protected function _getToplevelId();
1171
1172
    /**
     * Getter for $this->uid, reached via __get()
     *
     * @access protected
     *
     * @return mixed The UID or the URL of the document
     */
    protected function _getUid()
    {
        return $this->uid;
    }
1183
1184
    /**
     * Setter for $this->cPid, reached via __set()
     *
     * The value is converted to an integer; negative values are replaced by zero.
     *
     * @access protected
     *
     * @param int $value: The new PID for the metadata definitions
     *
     * @return void
     */
    protected function _setCPid($value)
    {
        $newPid = intval($value);
        $this->cPid = max($newPid, 0);
    }
1197
1198
    /**
     * This is a singleton class, thus the constructor should be private/protected
     * (Get an instance of this class by calling \Kitodo\Dlf\Common\Doc::getInstance())
     *
     * Stores the PID, hands the preloaded document over to the format specific
     * implementation, initializes the document from its location and finally
     * establishes the record identifier.
     *
     * @access protected
     *
     * @param string $location: The location URL of the XML file to parse
     * @param int $pid: If > 0, then only document with this PID gets loaded
     * @param \SimpleXMLElement|IiifResourceInterface $preloadedDocument: Either null or the \SimpleXMLElement
     * or IiifResourceInterface that has been loaded to determine the basic document format.
     *
     * @return void
     */
    protected function __construct($location, $pid, $preloadedDocument)
    {
        $this->pid = $pid;
        // The order of the following steps is kept as is on purpose.
        $this->setPreloadedDocument($preloadedDocument);
        $this->init($location);
        $this->establishRecordId($pid);
    }
1219
1220
    /**
     * This magic method is called each time an invisible property is referenced from the object
     *
     * The read access is forwarded to the getter "_get<Property>()". A warning is
     * logged and null is returned if the property or its getter does not exist.
     *
     * @access public
     *
     * @param string $var: Name of variable to get
     *
     * @return mixed Value of $this->$var
     */
    public function __get($var)
    {
        $getter = '_get' . ucfirst($var);

        // Both the property itself and its getter have to exist.
        if (property_exists($this, $var) && method_exists($this, $getter)) {
            return $this->$getter();
        }

        $this->logger->warning('There is no getter function for property "' . $var . '"');
        return null;
    }
1242
1243
    /**
     * This magic method is called each time an invisible property is checked for isset() or empty()
     *
     * The value is obtained through __get(), so the property's getter is executed.
     *
     * @access public
     *
     * @param string $var: Name of variable to check
     *
     * @return bool true if variable is set and not empty, false otherwise
     */
    public function __isset($var)
    {
        $value = $this->__get($var);

        return !empty($value);
    }
1256
1257
    /**
     * This magic method is called each time an invisible property is referenced from the object
     *
     * The write access is forwarded to the setter "_set<Property>()". A warning is
     * logged and nothing is changed if the property or its setter does not exist.
     *
     * @access public
     *
     * @param string $var: Name of variable to set
     * @param mixed $value: New value of variable
     *
     * @return void
     */
    public function __set($var, $value)
    {
        $setter = '_set' . ucfirst($var);

        // Both the property itself and its setter have to exist.
        if (property_exists($this, $var) && method_exists($this, $setter)) {
            $this->$setter($value);
            return;
        }

        $this->logger->warning('There is no setter function for property "' . $var . '"');
    }
1279
1280
    /**
     * Look up a document in the "tx_dlf_doc" cache
     *
     * The MD5 hash of the location serves as cache identifier.
     *
     * @param string $location
     * @return Doc|false
     */
    private static function getDocCache(string $location)
    {
        $cache = GeneralUtility::makeInstance(CacheManager::class)->getCache('tx_dlf_doc');

        return $cache->get(md5($location));
    }
1294
1295
    /**
     * Store a document in the "tx_dlf_doc" cache
     *
     * The MD5 hash of the location serves as cache identifier.
     *
     * @param string $location
     * @param Doc $doc
     * @return void
     */
    private static function setDocCache(string $location, Doc $doc)
    {
        $cache = GeneralUtility::makeInstance(CacheManager::class)->getCache('tx_dlf_doc');

        // Save the document under the identifier derived from its location.
        $cache->set(md5($location), $doc);
    }
1310
1311
    /**
     * Get IDs of logical structures that a page belongs to, indexed by depth.
     *
     * The result is sorted by depth in ascending order. It is empty if the page
     * is unknown or not linked to any logical structure.
     *
     * @param int $pageNo
     * @return array
     */
    public function getLogicalSectionsOnPage($pageNo)
    {
        // Make sure that both the structure links and the physical structure are available.
        $this->_getSmLinks();
        $this->_getPhysicalStructure();

        $sectionsByDepth = [];
        $physicalId = $this->physicalStructure[$pageNo] ?? null;
        if (!empty($physicalId) && !empty($this->smLinks['p2l'][$physicalId])) {
            // Group the linked logical IDs by their depth in the logical structure.
            foreach ($this->smLinks['p2l'][$physicalId] as $logicalId) {
                $sectionsByDepth[$this->getStructureDepth($logicalId)][] = $logicalId;
            }
        }
        ksort($sectionsByDepth);
        reset($sectionsByDepth);

        return $sectionsByDepth;
    }
1333
1334
    /**
     * Get URL of download file of specified page, or the empty string if there is no such link.
     *
     * The configured download file groups ("fileGrpDownload") are checked in
     * their configured order; the first group that provides a file for the
     * page wins.
     *
     * @param int $pageNumber
     * @return string
     */
    public function getPageLink($pageNumber)
    {
        // Build the physical structure first, just like the other page related
        // methods of this class do. Without this the lookup below silently
        // yields no link if nothing has triggered the build before.
        $this->_getPhysicalStructure();

        $physicalId = $this->physicalStructure[$pageNumber] ?? null;
        if (empty($physicalId)) {
            // Unknown page.
            return '';
        }

        $extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey);
        // A missing configuration value is treated like an empty one.
        $fileGrpsDownload = GeneralUtility::trimExplode(',', $extConf['fileGrpDownload'] ?? '');

        // Get download link.
        foreach ($fileGrpsDownload as $fileGrpDownload) {
            $fileId = $this->physicalStructureInfo[$physicalId]['files'][$fileGrpDownload] ?? null;
            if (!empty($fileId)) {
                return $this->getFileLocation($fileId);
            }
        }

        return '';
    }
1352
1353
    /**
     * Export the pages of the document as a plain array
     *
     * For every page in the requested range the IDs of the logical sections it
     * belongs to and its files (per file group) are collected. URLs of files in
     * one of the proxy file groups are rewritten to the internal PageViewProxy,
     * unless the file's MIME type is listed in self::$nonProxyMimeTypes.
     *
     * Recognized keys of $config:
     * - "proxyFileGroups": file groups to deliver via the proxy (default: none)
     * - "forceAbsoluteUrl": create absolute proxy URLs (default: false)
     * - "minPage": first page to export (default: 1)
     * - "maxPage": last page to export (default: total number of pages)
     *
     * @access public
     *
     * @param object $uriBuilder: URI builder used to create the proxy URLs
     * (presumably the Extbase UriBuilder - TODO confirm)
     * @param array $config: Export options, see above
     *
     * @return array Array with the keys "pages" (one entry per page, each with
     * "logSections" and "files") and "query" (holding the "minPage" used)
     */
    public function toArray($uriBuilder, array $config = [])
    {
        $this->_getSmLinks();
        $this->_getPhysicalStructure();

        $proxyFileGroups = $config['proxyFileGroups'] ?? [];
        $forceAbsoluteUrl = $config['forceAbsoluteUrl'] ?? false;
        $minPage = $config['minPage'] ?? 1;
        $maxPage = $config['maxPage'] ?? $this->numPages;

        $result = [
            'pages' => [],
            'query' => [
                'minPage' => $minPage
            ]
        ];

        // May be null, in which case location and MIME type are resolved per file.
        $allFiles = $this->getAllFiles();

        for ($page = $minPage; $page <= $maxPage; $page++) {
            $pageEntry = [
                // The leading empty array keeps array_merge() valid for pages
                // that do not belong to any logical section.
                'logSections' => array_merge([], ...$this->getLogicalSectionsOnPage($page)),
                'files' => [],
            ];

            // Pages without a physical structure entry simply have no files.
            $physicalId = $this->physicalStructure[$page] ?? null;
            $pageFiles = $physicalId === null
                ? []
                : ($this->physicalStructureInfo[$physicalId]['files'] ?? []);

            foreach ($pageFiles as $fileGrp => $fileId) {
                if ($allFiles === null) {
                    $file = [
                        'url' => $this->getFileLocation($fileId),
                        'mimetype' => $this->getFileMimeType($fileId),
                    ];
                } else {
                    // NOTE(review): entries are assumed to provide the keys
                    // "url" and "mimetype" as well - confirm against getAllFiles().
                    $file = $allFiles[$fileId] ?? null;
                    if ($file === null) {
                        continue;
                    }
                }

                // Only deliver static images via the internal PageViewProxy.
                // (For IIP and IIIF, the viewer needs to build and access a separate metadata URL.)
                $mimetype = (string) ($file['mimetype'] ?? '');
                if (in_array($fileGrp, $proxyFileGroups) && !in_array($mimetype, self::$nonProxyMimeTypes, true)) {
                    // Replace the file's URL by the URL of the proxy.
                    $file['url'] = $uriBuilder
                        ->reset()
                        ->setTargetPageUid($GLOBALS['TSFE']->id)
                        ->setCreateAbsoluteUri($forceAbsoluteUrl)
                        ->setArguments([
                            'eID' => 'tx_dlf_pageview_proxy',
                            'url' => $file['url'],
                            'uHash' => GeneralUtility::hmac($file['url'], 'PageViewProxy')
                        ])
                        ->build();
                }

                $pageEntry['files'][$fileGrp] = $file;
            }

            $result['pages'][] = $pageEntry;
        }

        return $result;
    }
1418
}
1419