Scrutinizer GitHub App not installed

We could not synchronize checks via GitHub's checks API since Scrutinizer's GitHub App is not installed for this repository.

Install GitHub App

GitHub Access Token became invalid

It seems like the GitHub access token used for retrieving details about this repository from GitHub became invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.
Passed
Pull Request — dev-extbase-fluid (#746)
by Alexander
02:49
created

Doc::getInstance()   B

Complexity

Conditions 10
Paths 39

Size

Total Lines 51
Code Lines 31

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 10
eloc 31
c 1
b 0
f 0
nc 39
nop 3
dl 0
loc 51
rs 7.6666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * (c) Kitodo. Key to digital objects e.V. <[email protected]>
5
 *
6
 * This file is part of the Kitodo and TYPO3 projects.
7
 *
8
 * @license GNU General Public License version 3 or later.
9
 * For the full copyright and license information, please read the
10
 * LICENSE.txt file that was distributed with this source code.
11
 */
12
13
namespace Kitodo\Dlf\Common;
14
15
use Kitodo\Dlf\Domain\Repository\DocumentRepository;
16
use TYPO3\CMS\Core\Configuration\ExtensionConfiguration;
17
use TYPO3\CMS\Core\Database\ConnectionPool;
18
use TYPO3\CMS\Core\Database\Query\Restriction\HiddenRestriction;
19
use TYPO3\CMS\Core\Log\LogManager;
20
use TYPO3\CMS\Core\Utility\GeneralUtility;
21
use TYPO3\CMS\Core\Utility\MathUtility;
22
use TYPO3\CMS\Extbase\Configuration\ConfigurationManager;
23
use Ubl\Iiif\Presentation\Common\Model\Resources\IiifResourceInterface;
24
use Ubl\Iiif\Tools\IiifHelper;
25
26
/**
27
 * Document class for the 'dlf' extension
28
 *
29
 * @author Sebastian Meyer <[email protected]>
30
 * @author Henrik Lochmann <[email protected]>
31
 * @package TYPO3
32
 * @subpackage dlf
33
 * @access public
34
 * @property int $cPid This holds the PID for the configuration
35
 * @property-read bool $hasFulltext Are there any fulltext files available?
36
 * @property-read string $location This holds the documents location
37
 * @property-read array $metadataArray This holds the documents' parsed metadata array
38
 * @property-read int $numPages This holds the total number of pages
39
 * @property-read int $parentId This holds the UID of the parent document or zero if not multi-volumed
40
 * @property-read array $physicalStructure This holds the physical structure
41
 * @property-read array $physicalStructureInfo This holds the physical structure metadata
42
 * @property-read int $pid This holds the PID of the document or zero if not in database
43
 * @property-read bool $ready Is the document instantiated successfully?
44
 * @property-read string $recordId The METS file's / IIIF manifest's record identifier
45
 * @property-read int $rootId This holds the UID of the root document or zero if not multi-volumed
46
 * @property-read array $smLinks This holds the smLinks between logical and physical structMap
47
 * @property-read array $tableOfContents This holds the logical structure
48
 * @property-read string $thumbnail This holds the document's thumbnail location
49
 * @property-read string $toplevelId This holds the toplevel structure's @ID (METS) or the manifest's @id (IIIF)
50
 * @property-read mixed $uid This holds the UID or the URL of the document
51
 * @abstract
52
 */
53
abstract class Doc
54
{
55
    /**
56
     * This holds the logger
57
     *
58
     * @var LogManager
59
     * @access protected
60
     */
61
    protected $logger;
62
63
    /**
64
     * This holds the PID for the configuration
65
     *
66
     * @var int
67
     * @access protected
68
     */
69
    protected $cPid = 0;
70
71
    /**
72
     * The extension key
73
     *
74
     * @var string
75
     * @access public
76
     */
77
    public static $extKey = 'dlf';
78
79
    /**
80
     * This holds the configuration for all supported metadata encodings
81
     * @see loadFormats()
82
     *
83
     * @var array
84
     * @access protected
85
     */
86
    protected $formats = [
87
        'OAI' => [
88
            'rootElement' => 'OAI-PMH',
89
            'namespaceURI' => 'http://www.openarchives.org/OAI/2.0/',
90
        ],
91
        'METS' => [
92
            'rootElement' => 'mets',
93
            'namespaceURI' => 'http://www.loc.gov/METS/',
94
        ],
95
        'XLINK' => [
96
            'rootElement' => 'xlink',
97
            'namespaceURI' => 'http://www.w3.org/1999/xlink',
98
        ]
99
    ];
100
101
    /**
102
     * Are the available metadata formats loaded?
103
     * @see $formats
104
     *
105
     * @var bool
106
     * @access protected
107
     */
108
    protected $formatsLoaded = false;
109
110
    /**
111
     * Are there any fulltext files available? This also includes IIIF text annotations
112
     * with motivation 'painting' if Kitodo.Presentation is configured to store text
113
     * annotations as fulltext.
114
     *
115
     * @var bool
116
     * @access protected
117
     */
118
    protected $hasFulltext = false;
119
120
    /**
121
     * Last searched logical and physical page
122
     *
123
     * @var array
124
     * @access protected
125
     */
126
    protected $lastSearchedPhysicalPage = ['logicalPage' => null, 'physicalPage' => null];
127
128
    /**
129
     * This holds the documents location
130
     *
131
     * @var string
132
     * @access protected
133
     */
134
    protected $location = '';
135
136
    /**
137
     * This holds the logical units
138
     *
139
     * @var array
140
     * @access protected
141
     */
142
    protected $logicalUnits = [];
143
144
    /**
145
     * This holds the documents' parsed metadata array with their corresponding
146
     * structMap//div's ID (METS) or Range / Manifest / Sequence ID (IIIF) as array key
147
     *
148
     * @var array
149
     * @access protected
150
     */
151
    protected $metadataArray = [];
152
153
    /**
154
     * Is the metadata array loaded?
155
     * @see $metadataArray
156
     *
157
     * @var bool
158
     * @access protected
159
     */
160
    protected $metadataArrayLoaded = false;
161
162
    /**
163
     * This holds the total number of pages
164
     *
165
     * @var int
166
     * @access protected
167
     */
168
    protected $numPages = 0;
169
170
    /**
171
     * This holds the UID of the parent document or zero if not multi-volumed
172
     *
173
     * @var int
174
     * @access protected
175
     */
176
    protected $parentId = 0;
177
178
    /**
179
     * This holds the physical structure
180
     *
181
     * @var array
182
     * @access protected
183
     */
184
    protected $physicalStructure = [];
185
186
    /**
187
     * This holds the physical structure metadata
188
     *
189
     * @var array
190
     * @access protected
191
     */
192
    protected $physicalStructureInfo = [];
193
194
    /**
195
     * Is the physical structure loaded?
196
     * @see $physicalStructure
197
     *
198
     * @var bool
199
     * @access protected
200
     */
201
    protected $physicalStructureLoaded = false;
202
203
    /**
204
     * This holds the PID of the document or zero if not in database
205
     *
206
     * @var int
207
     * @access protected
208
     */
209
    protected $pid = 0;
210
211
    /**
212
     * This holds the documents' raw text pages with their corresponding
213
     * structMap//div's ID (METS) or Range / Manifest / Sequence ID (IIIF) as array key
214
     *
215
     * @var array
216
     * @access protected
217
     */
218
    protected $rawTextArray = [];
219
220
    /**
221
     * Is the document instantiated successfully?
222
     *
223
     * @var bool
224
     * @access protected
225
     */
226
    protected $ready = false;
227
228
    /**
229
     * The METS file's / IIIF manifest's record identifier
230
     *
231
     * @var string
232
     * @access protected
233
     */
234
    protected $recordId;
235
236
    /**
237
     * This holds the singleton object of the document
238
     *
239
     * @var array (\Kitodo\Dlf\Common\Doc)
240
     * @static
241
     * @access protected
242
     */
243
    protected static $registry = [];
244
245
    /**
246
     * This holds the UID of the root document or zero if not multi-volumed
247
     *
248
     * @var int
249
     * @access protected
250
     */
251
    protected $rootId = 0;
252
253
    /**
254
     * Is the root id loaded?
255
     * @see $rootId
256
     *
257
     * @var bool
258
     * @access protected
259
     */
260
    protected $rootIdLoaded = false;
261
262
    /**
263
     * This holds the smLinks between logical and physical structMap
264
     *
265
     * @var array
266
     * @access protected
267
     */
268
    protected $smLinks = ['l2p' => [], 'p2l' => []];
269
270
    /**
271
     * Are the smLinks loaded?
272
     * @see $smLinks
273
     *
274
     * @var bool
275
     * @access protected
276
     */
277
    protected $smLinksLoaded = false;
278
279
    /**
280
     * This holds the logical structure
281
     *
282
     * @var array
283
     * @access protected
284
     */
285
    protected $tableOfContents = [];
286
287
    /**
288
     * Is the table of contents loaded?
289
     * @see $tableOfContents
290
     *
291
     * @var bool
292
     * @access protected
293
     */
294
    protected $tableOfContentsLoaded = false;
295
296
    /**
297
     * This holds the document's thumbnail location
298
     *
299
     * @var string
300
     * @access protected
301
     */
302
    protected $thumbnail = '';
303
304
    /**
305
     * Is the document's thumbnail location loaded?
306
     * @see $thumbnail
307
     *
308
     * @var bool
309
     * @access protected
310
     */
311
    protected $thumbnailLoaded = false;
312
313
    /**
314
     * This holds the toplevel structure's @ID (METS) or the manifest's @id (IIIF)
315
     *
316
     * @var string
317
     * @access protected
318
     */
319
    protected $toplevelId = '';
320
321
    /**
322
     * This holds the UID or the URL of the document
323
     *
324
     * @var mixed
325
     * @access protected
326
     */
327
    protected $uid = 0;
328
329
    /**
330
     * This holds the whole XML file as \SimpleXMLElement object
331
     *
332
     * @var \SimpleXMLElement
333
     * @access protected
334
     */
335
    protected $xml;
336
337
    /**
     * Empties the static document registry to prevent memory exhaustion
     *
     * @access public
     *
     * @static
     *
     * @return void
     */
    public static function clearRegistry()
    {
        // Replace the registry with a fresh, empty array.
        self::$registry = [];
    }
351
352
    /**
     * Makes sure the record identifier is read from the document, if it has one
     *
     * @access protected
     *
     * @abstract
     *
     * @param int $pid: ID of the configuration page with the recordId config
     *
     */
    abstract protected function establishRecordId($pid);
363
364
    /**
     * Returns the source document's PHP object which this Document instance represents
     *
     * @access protected
     *
     * @abstract
     *
     * @return \SimpleXMLElement|IiifResourceInterface A PHP object representation of
     * the current document: SimpleXMLElement for METS, IiifResourceInterface for IIIF
     */
    abstract protected function getDocument();
375
376
    /**
     * Returns the location of a downloadable file for a physical page or track
     *
     * @access public
     *
     * @abstract
     *
     * @param string $id: The @ID attribute of the file node (METS) or the @id property of the IIIF resource
     *
     * @return string The file's location as URL
     */
    abstract public function getDownloadLocation($id);
388
389
    /**
     * Returns the location of a file that represents a physical page or track
     *
     * @access public
     *
     * @abstract
     *
     * @param string $id: The @ID attribute of the file node (METS) or the @id property of the IIIF resource
     *
     * @return string The file's location as URL
     */
    abstract public function getFileLocation($id);
401
402
    /**
     * Returns the MIME type of a file that represents a physical page or track
     *
     * @access public
     *
     * @abstract
     *
     * @param string $id: The @ID attribute of the file node
     *
     * @return string The file's MIME type
     */
    abstract public function getFileMimeType($id);
414
415
    /**
     * Factory method: fetches the given location, detects whether it is a METS
     * file or an IIIF resource and creates the matching document object
     *
     * @access public
     *
     * @static
     *
     * @param string $location: The URL of XML file or the IRI of the IIIF resource
     * @param array $settings: Settings array; only the key 'storagePid' is read here
     *                         (a missing key is treated as 0)
     * @param bool $forceReload: Force reloading the document instead of returning the cached instance
     *                           (not evaluated within this method)
     *
     * @return \Kitodo\Dlf\Common\Doc|null Instance of MetsDocument or IiifManifest, or null if the
     * location is no valid URL, could not be fetched or is neither METS nor IIIF
     */
    public static function &getInstance($location, $settings = [], $forceReload = false)
    {
        // Create new instance depending on format (METS or IIIF) ...
        $instance = null;
        $documentFormat = null;
        $xml = null;
        $iiif = null;

        // Try to load a file from the url
        if (GeneralUtility::isValidUrl($location)) {
            // Load extension configuration (used for the user-agent and the IIIF thumbnail limits).
            $extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey);
            // Set user-agent to identify self when fetching XML data.
            if (!empty($extConf['useragent'])) {
                @ini_set('user_agent', $extConf['useragent']);
            }
            $content = GeneralUtility::getUrl($location);
            if ($content !== false) {
                $xml = Helper::getXmlFileAsString($content);
                if ($xml !== false) {
                    /* @var $xml \SimpleXMLElement */
                    // The content is XML: it only counts as METS if a mets:mets element exists.
                    $xml->registerXPathNamespace('mets', 'http://www.loc.gov/METS/');
                    $xpathResult = $xml->xpath('//mets:mets');
                    $documentFormat = !empty($xpathResult) ? 'METS' : null;
                } else {
                    // Try to load file as IIIF resource instead.
                    $contentAsJsonArray = json_decode($content, true);
                    if ($contentAsJsonArray !== null) {
                        IiifHelper::setUrlReader(IiifUrlReader::getInstance());
                        IiifHelper::setMaxThumbnailHeight($extConf['iiifThumbnailHeight']);
                        IiifHelper::setMaxThumbnailWidth($extConf['iiifThumbnailWidth']);
                        $iiif = IiifHelper::loadIiifResource($contentAsJsonArray);
                        if ($iiif instanceof IiifResourceInterface) {
                            $documentFormat = 'IIIF';
                        }
                    }
                }
            }
        }

        // Sanitize input. $settings defaults to an empty array, so the key may be absent.
        $storagePid = isset($settings['storagePid']) ? $settings['storagePid'] : 0;
        $pid = max(intval($storagePid), 0);
        if ($documentFormat === 'METS') {
            $instance = new MetsDocument($location, $pid, $xml);
        } elseif ($documentFormat === 'IIIF') {
            $instance = new IiifManifest($location, $pid, $iiif);
        }

        // Must be a variable because the method returns by reference.
        return $instance;
    }
480
481
    /**
     * Returns details about a logical structure element
     *
     * @access public
     *
     * @abstract
     *
     * @param string $id: The @ID attribute of the logical structure node (METS) or
     * the @id property of the Manifest / Range (IIIF)
     * @param bool $recursive: Whether to include the child elements / resources
     *
     * @return array Array of the element's id, label, type and physical page indexes/mptr link
     */
    abstract public function getLogicalStructure($id, $recursive = false);
495
496
    /**
     * Extracts all the metadata for a logical structure node
     *
     * @access public
     *
     * @abstract
     *
     * @param string $id: The @ID attribute of the logical structure node (METS) or the @id property
     * of the Manifest / Range (IIIF)
     * @param int $cPid: The PID for the metadata definitions
     *                       (defaults to $this->cPid or $this->pid)
     *
     * @return array The logical structure node's / the IIIF resource's parsed metadata array
     */
    abstract public function getMetadata($id, $cPid = 0);
511
512
    /**
     * Looks up the first physical page whose order label contains the given logical page label
     *
     * The most recent successful lookup is remembered and answered without
     * searching the physical structure again.
     *
     * @access public
     *
     * @param string $logicalPage: The label (or a part of the label) of the logical page
     *
     * @return int The physical page number, or 1 if no page matches
     */
    public function getPhysicalPage($logicalPage)
    {
        // Answer from the remembered lookup if the same label is requested again.
        $isRemembered = !empty($this->lastSearchedPhysicalPage['logicalPage'])
            && $this->lastSearchedPhysicalPage['logicalPage'] == $logicalPage;
        if ($isRemembered) {
            return $this->lastSearchedPhysicalPage['physicalPage'];
        }

        // Walk the physical structure; the position counter starts at zero.
        $position = 0;
        foreach ($this->physicalStructureInfo as $pageInfo) {
            if (strpos($pageInfo['orderlabel'], $logicalPage) === false) {
                $position++;
                continue;
            }
            // Remember the hit for the next call.
            $this->lastSearchedPhysicalPage['logicalPage'] = $logicalPage;
            $this->lastSearchedPhysicalPage['physicalPage'] = $position;
            return $position;
        }

        // Nothing matched.
        return 1;
    }
541
542
    /**
     * Extracts the OCR full text for a physical structure node / IIIF Manifest / Canvas. The text
     * might be given as ALTO for METS or as annotations or ALTO for IIIF resources.
     *
     * @access public
     *
     * @abstract
     *
     * @param string $id: The @ID attribute of the physical structure node (METS) or the @id property
     * of the Manifest / Range (IIIF)
     *
     * @return string The OCR full text
     */
    abstract public function getFullText($id);
556
557
    /**
     * This extracts the OCR full text for a physical structure node / IIIF Manifest / Canvas from an
     * XML full text representation (currently only ALTO). For IIIF manifests, ALTO documents have
     * to be given in the Canvas' / Manifest's "seeAlso" property.
     *
     * The file of the first configured full text file group that exists for the node is fetched.
     * If its text format is registered with a class implementing FulltextInterface, the text is
     * converted by that class and stored in $this->rawTextArray.
     *
     * @param string $id: The @ID attribute of the physical structure node (METS) or the @id property
     * of the Manifest / Range (IIIF)
     *
     * @return string The OCR full text, or an empty string if the node is unknown, the file could
     * not be loaded or the text format is not supported
     */
    protected function getFullTextFromXml($id)
    {
        $fullText = '';
        // Defined up front so the checks and log messages below never read an
        // undefined variable when no file group matches.
        $fileContent = '';
        $textFormat = '';
        // Load available text formats, ...
        $this->loadFormats();
        // ... physical structure ...
        $this->_getPhysicalStructure();
        // ... and extension configuration.
        $extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey);
        $fileGrpsFulltext = GeneralUtility::trimExplode(',', $extConf['fileGrpFulltext']);
        if (empty($this->physicalStructureInfo[$id])) {
            $this->logger->warning('Invalid structure node @ID "' . $id . '"');
            return $fullText;
        }
        // Use the first configured file group that has a file for this node.
        while ($fileGrpFulltext = array_shift($fileGrpsFulltext)) {
            if (!empty($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext])) {
                // Get full text file.
                $fileContent = GeneralUtility::getUrl($this->getFileLocation($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext]));
                if ($fileContent === false) {
                    $this->logger->warning('Couldn\'t load full text file for structure node @ID "' . $id . '"');
                    return $fullText;
                }
                $textFormat = $this->getTextFormat($fileContent);
                break;
            }
        }
        // Is this text format supported?
        // This part actually differs from previous version of indexed OCR
        if (!empty($fileContent) && !empty($this->formats[$textFormat])) {
            $textMiniOcr = '';
            if (!empty($this->formats[$textFormat]['class'])) {
                $class = $this->formats[$textFormat]['class'];
                // Get the raw text from class.
                if (
                    class_exists($class)
                    && ($obj = GeneralUtility::makeInstance($class)) instanceof FulltextInterface
                ) {
                    // Load XML from file.
                    $ocrTextXml = Helper::getXmlFileAsString($fileContent);
                    $textMiniOcr = $obj->getTextAsMiniOcr($ocrTextXml);
                    $this->rawTextArray[$id] = $textMiniOcr;
                } else {
                    // The message names the method that is actually called above.
                    $this->logger->warning('Invalid class/method "' . $class . '->getTextAsMiniOcr()" for text format "' . $textFormat . '"');
                }
            }
            $fullText = $textMiniOcr;
        } else {
            $this->logger->warning('Unsupported text format "' . $textFormat . '" in physical node with @ID "' . $id . '"');
        }
        return $fullText;
    }
620
621
    /**
     * Get format of the OCR full text
     *
     * @access private
     *
     * @param string $fileContent: content of the XML file
     *
     * @return string The upper-cased name of the XML root element, or an empty
     * string if the content could not be parsed as XML
     */
    private function getTextFormat($fileContent)
    {
        $xml = Helper::getXmlFileAsString($fileContent);
        // The helper does not always return an XML object (other code in this class
        // checks its result against false); avoid calling a method on a non-object.
        if (!($xml instanceof \SimpleXMLElement)) {
            return '';
        }
        // Get the root element's name as text format.
        return strtoupper($xml->getName());
    }
635
636
    /**
     * Determines a title for the document with the given UID
     *
     * The title is read from table "tx_dlf_documents". If it is empty and
     * $recursive is set, the document referenced in "partof" is asked instead.
     *
     * @access public
     *
     * @static
     *
     * @param int $uid: The UID of the document
     * @param bool $recursive: Search superior documents for a title, too?
     *
     * @return string The title of the document itself or a parent document
     */
    public static function getTitle($uid, $recursive = false)
    {
        // Sanitize input.
        $uid = max(intval($uid), 0);
        if (!$uid) {
            Helper::log('Invalid UID ' . $uid . ' for document', LOG_SEVERITY_ERROR);
            return '';
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('tx_dlf_documents');

        $statement = $queryBuilder
            ->select(
                'tx_dlf_documents.title',
                'tx_dlf_documents.partof'
            )
            ->from('tx_dlf_documents')
            ->where(
                $queryBuilder->expr()->eq('tx_dlf_documents.uid', $uid),
                Helper::whereExpression('tx_dlf_documents')
            )
            ->setMaxResults(1)
            ->execute();

        $row = $statement->fetch();
        if (!$row) {
            Helper::log('No document with UID ' . $uid . ' found or document not accessible', LOG_SEVERITY_WARNING);
            return '';
        }

        // Get title information.
        $title = $row['title'];
        $partof = $row['partof'];
        // Only ask the parent if requested, needed and the parent is not the document itself.
        $askParent = $recursive
            && empty($title)
            && intval($partof)
            && $partof != $uid;
        if ($askParent) {
            $title = self::getTitle($partof, true);
        }
        return $title;
    }
691
692
    /**
     * Extracts all the metadata for the toplevel logical structure node / resource
     *
     * If the metadata contain a 'record_id' entry and this document's record
     * identifier is not listed there yet, it is put in front of that list.
     *
     * @access public
     *
     * @param int $cPid: The PID for the metadata definitions
     *
     * @return array The logical structure node's / resource's parsed metadata array
     */
    public function getTitledata($cPid = 0)
    {
        $titledata = $this->getMetadata($this->_getToplevelId(), $cPid);
        // Add information from METS structural map to titledata array.
        if ($this instanceof MetsDocument) {
            $this->addMetadataFromMets($titledata, $this->_getToplevelId());
        }
        // Set record identifier for METS file / IIIF manifest if not present.
        $recordIdIsMissing = is_array($titledata)
            && array_key_exists('record_id', $titledata)
            && !empty($this->recordId)
            && !in_array($this->recordId, $titledata['record_id']);
        if ($recordIdIsMissing) {
            array_unshift($titledata['record_id'], $this->recordId);
        }
        return $titledata;
    }
722
723
    /**
     * Traverse a logical (sub-) structure tree to find the structure with the requested logical id and return its depth.
     *
     * @access protected
     *
     * @param array $structure: logical structure array
     * @param int $depth: current tree depth
     * @param string $logId: ID of the logical structure whose depth is requested
     *
     * @return int|bool: false if structure with $logId is not a child of this substructure,
     * or the actual depth.
     */
    protected function getTreeDepth($structure, $depth, $logId)
    {
        foreach ($structure as $node) {
            // Found on the current level.
            if ($node['id'] == $logId) {
                return $depth;
            }
            // Nodes without children cannot contain the requested structure.
            if (!array_key_exists('children', $node)) {
                continue;
            }
            // Descend one level; keep looking at the siblings if the subtree has no match.
            $depthInSubtree = $this->getTreeDepth($node['children'], $depth + 1, $logId);
            if ($depthInSubtree !== false) {
                return $depthInSubtree;
            }
        }
        return false;
    }
749
750
    /**
     * Get the tree depth of a logical structure element within the table of contents
     *
     * @access public
     *
     * @param string $logId: The id of the logical structure element whose depth is requested
     * @return int|bool tree depth as integer or false if no element with $logId exists within the TOC.
     */
    public function getStructureDepth($logId)
    {
        // Elements on the top level of the table of contents have depth 1.
        $tableOfContents = $this->_getTableOfContents();
        return $this->getTreeDepth($tableOfContents, 1, $logId);
    }
762
763
    /**
     * Sets some basic class properties
     *
     * @access protected
     *
     * @abstract
     *
     * @return void
     */
    abstract protected function init();
773
774
    /**
     * Reuse any document object that might have been already loaded to determine whether document is METS or IIIF
     *
     * @access protected
     *
     * @abstract
     *
     * @param \SimpleXMLElement|IiifResourceInterface $preloadedDocument: any instance that has already been loaded
     *
     * @return bool true if $preloadedDocument can actually be reused, false if it has to be loaded again
     */
    abstract protected function setPreloadedDocument($preloadedDocument);
786
787
    /**
     * METS/IIIF specific part of loading a location
     *
     * @access protected
     *
     * @abstract
     *
     * @param string $location: The URL of the file to load
     *
     * @return bool true on success or false on failure
     */
    abstract protected function loadLocation($location);
799
800
    /**
     * Load XML file / IIIF resource from URL
     *
     * Only the URL check and the user-agent setup happen here; the actual
     * loading is delegated to loadLocation().
     *
     * @access protected
     *
     * @param string $location: The URL of the file to load
     *
     * @return bool true on success or false on failure
     */
    protected function load($location)
    {
        // Refuse anything that is not a valid URL.
        if (!GeneralUtility::isValidUrl($location)) {
            $this->logger->error('Invalid file location "' . $location . '" for document loading');
            return false;
        }
        // Load extension configuration
        $extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey);
        // Set user-agent to identify self when fetching XML / JSON-LD data.
        if (!empty($extConf['useragent'])) {
            @ini_set('user_agent', $extConf['useragent']);
        }
        // the actual loading is format specific
        return $this->loadLocation($location);
    }
826
827
    /**
     * Analyze the document to find out if it contains any fulltext that needs to be indexed.
     *
     * @access protected
     *
     * @abstract
     */
    abstract protected function ensureHasFulltextIsSet();
835
836
    /**
     * Register all available data formats
     *
     * Reads the format records with pid 0 from table "tx_dlf_formats" once and
     * adds them to $this->formats, replacing entries with the same type.
     *
     * @access protected
     *
     * @return void
     */
    protected function loadFormats()
    {
        // The formats need to be read only once per instance.
        if ($this->formatsLoaded) {
            return;
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('tx_dlf_formats');

        // Get available data formats from database.
        $statement = $queryBuilder
            ->select(
                'tx_dlf_formats.type AS type',
                'tx_dlf_formats.root AS root',
                'tx_dlf_formats.namespace AS namespace',
                'tx_dlf_formats.class AS class'
            )
            ->from('tx_dlf_formats')
            ->where(
                $queryBuilder->expr()->eq('tx_dlf_formats.pid', 0)
            )
            ->execute();

        // Update format registry.
        while ($format = $statement->fetch()) {
            $this->formats[$format['type']] = [
                'rootElement' => $format['root'],
                'namespaceURI' => $format['namespace'],
                'class' => $format['class']
            ];
        }
        $this->formatsLoaded = true;
    }
874
875
    /**
876
     * Register all available namespaces for a \SimpleXMLElement or \DOMXPath object
877
     *
878
     * @access public
879
     *
880
     * @param \SimpleXMLElement|\DOMXPath &$obj: \SimpleXMLElement or \DOMXPath object
881
     *
882
     * @return void
883
     */
884
    public function registerNamespaces(&$obj)
    {
        // TODO Check usage. XML specific method does not seem to be used anywhere outside this class within the project, but it is public and may be used by extensions.
        // Make sure the format registry is available before inspecting the object.
        $this->loadFormats();

        // Each supported object type offers its own method for registering a namespace prefix.
        $registrars = [
            \SimpleXMLElement::class => 'registerXPathNamespace',
            \DOMXPath::class => 'registerNamespace',
        ];
        $registrar = null;
        foreach ($registrars as $supportedClass => $methodName) {
            if ($obj instanceof $supportedClass) {
                $registrar = $methodName;
                break;
            }
        }

        // Unsupported argument: report it and leave the object untouched.
        if ($registrar === null) {
            $this->logger->error('Given object is neither a SimpleXMLElement nor a DOMXPath instance');
            return;
        }

        // Register every known format's namespace, using the lower-cased format type as prefix.
        foreach ($this->formats as $formatType => $format) {
            $obj->$registrar(strtolower($formatType), $format['namespaceURI']);
        }
    }
902
903
    /**
904
     * Get the ID of the parent document if the current document has one. Also save a parent document
905
     * to the database and the Solr index if their $pid and the current $pid differ.
906
     * Currently only applies to METS documents.
907
     *
908
     * @access protected
909
     *
910
     * @abstract
911
     *
912
     * @return int The parent document's id.
913
     */
914
    abstract protected function getParentDocumentUidForSaving($pid, $core, $owner);
915
916
    /**
917
     * This returns $this->cPid via __get()
918
     *
919
     * @access protected
920
     *
921
     * @return int The PID of the metadata definitions
922
     */
923
    protected function _getCPid()
    {
        // Plain accessor backing the magic "cPid" property.
        return $this->cPid;
    }
927
928
    /**
929
     * This returns $this->hasFulltext via __get()
930
     *
931
     * @access protected
932
     *
933
     * @return bool Are there any fulltext files available?
934
     */
935
    protected function _getHasFulltext()
    {
        // Let the format-specific implementation run its check before the flag is read.
        $this->ensureHasFulltextIsSet();

        return $this->hasFulltext;
    }
940
941
    /**
942
     * This returns $this->location via __get()
943
     *
944
     * @access protected
945
     *
946
     * @return string The location of the document
947
     */
948
    protected function _getLocation()
    {
        // Plain accessor backing the magic "location" property.
        return $this->location;
    }
952
953
    /**
954
     * Format specific part of building the document's metadata array
955
     *
956
     * @access protected
957
     *
958
     * @abstract
959
     *
960
     * @param int $cPid
961
     */
962
    abstract protected function prepareMetadataArray($cPid);
963
964
    /**
965
     * This builds an array of the document's metadata
966
     *
967
     * @access protected
968
     *
969
     * @return array Array of metadata with their corresponding logical structure node ID as key
970
     */
971
    protected function _getMetadataArray()
    {
        // Use the metadata definitions' PID if set, otherwise fall back to the document's PID.
        $cPid = $this->cPid ?: $this->pid;
        if (!$cPid) {
            $this->logger->error('Invalid PID ' . $cPid . ' for metadata definitions');
            return [];
        }

        // Index 0 of the array remembers the PID it was built for; rebuild if it is missing or differs.
        $needsRebuild = !$this->metadataArrayLoaded || $this->metadataArray[0] != $cPid;
        if ($needsRebuild) {
            $this->prepareMetadataArray($cPid);
            $this->metadataArray[0] = $cPid;
            $this->metadataArrayLoaded = true;
        }

        return $this->metadataArray;
    }
989
990
    /**
991
     * This returns $this->numPages via __get()
992
     *
993
     * @access protected
994
     *
995
     * @return int The total number of pages and/or tracks
996
     */
997
    protected function _getNumPages()
    {
        // The return value is discarded on purpose; the call is made only for its effect on
        // the instance. NOTE(review): presumably implementations set $this->numPages here - confirm.
        $this->_getPhysicalStructure();

        return $this->numPages;
    }
1002
1003
    /**
1004
     * This returns $this->parentId via __get()
1005
     *
1006
     * @access protected
1007
     *
1008
     * @return int The UID of the parent document or zero if not applicable
1009
     */
1010
    protected function _getParentId()
    {
        // Plain accessor backing the magic "parentId" property.
        return $this->parentId;
    }
1014
1015
    /**
1016
     * This builds an array of the document's physical structure
1017
     *
1018
     * @access protected
1019
     *
1020
     * @abstract
1021
     *
1022
     * @return array Array of physical elements' id, type, label and file representations ordered
1023
     * by @ORDER attribute / IIIF Sequence's Canvases
1024
     */
1025
    abstract protected function _getPhysicalStructure();
1026
1027
    /**
1028
     * This gives an array of the document's physical structure metadata
1029
     *
1030
     * @access protected
1031
     *
1032
     * @return array Array of elements' type, label and file representations ordered by @ID attribute / Canvas order
1033
     */
1034
    protected function _getPhysicalStructureInfo()
    {
        // Trigger building of the physical structure if that has not happened yet.
        $alreadyBuilt = $this->physicalStructureLoaded;
        if (!$alreadyBuilt) {
            $this->_getPhysicalStructure();
        }

        return $this->physicalStructureInfo;
    }
1043
1044
    /**
1045
     * This returns $this->pid via __get()
1046
     *
1047
     * @access protected
1048
     *
1049
     * @return int The PID of the document or zero if not in database
1050
     */
1051
    protected function _getPid()
    {
        // Plain accessor backing the magic "pid" property.
        return $this->pid;
    }
1055
1056
    /**
1057
     * This returns $this->ready via __get()
1058
     *
1059
     * @access protected
1060
     *
1061
     * @return bool Is the document instantiated successfully?
1062
     */
1063
    protected function _getReady()
    {
        // Plain accessor backing the magic "ready" property.
        return $this->ready;
    }
1067
1068
    /**
1069
     * This returns $this->recordId via __get()
1070
     *
1071
     * @access protected
1072
     *
1073
     * @return mixed The METS file's / IIIF manifest's record identifier
1074
     */
1075
    protected function _getRecordId()
    {
        // Plain accessor backing the magic "recordId" property.
        return $this->recordId;
    }
1079
1080
    /**
1081
     * This returns $this->rootId via __get()
1082
     *
1083
     * @access protected
1084
     *
1085
     * @return int The UID of the root document or zero if not applicable
1086
     */
1087
    protected function _getRootId()
    {
        // The root ID is resolved at most once per instance.
        if ($this->rootIdLoaded) {
            return $this->rootId;
        }

        // With a parent document, take over the root ID stored on the parent instance;
        // without one, $this->rootId keeps whatever value it already holds.
        if ($this->parentId) {
            $parentDocument = self::getInstance($this->parentId, ['storagePid' => $this->pid]);
            $this->rootId = $parentDocument->rootId;
        }
        $this->rootIdLoaded = true;

        return $this->rootId;
    }
1098
1099
    /**
1100
     * This returns the smLinks between logical and physical structMap (METS) and models the
1101
     * relation between IIIF Canvases and Manifests / Ranges in the same way
1102
     *
1103
     * @access protected
1104
     *
1105
     * @abstract
1106
     *
1107
     * @return array The links between logical and physical nodes / Range, Manifest and Canvas
1108
     */
1109
    abstract protected function _getSmLinks();
1110
1111
    /**
1112
     * This builds an array of the document's logical structure
1113
     *
1114
     * @access protected
1115
     *
1116
     * @return array Array of structure nodes' id, label, type and physical page indexes/mptr / Canvas link with original hierarchy preserved
1117
     */
1118
    protected function _getTableOfContents()
    {
        // Return the stored array right away once it has been marked as loaded.
        if ($this->tableOfContentsLoaded) {
            return $this->tableOfContents;
        }

        // Request all logical structures (empty ID, second argument true), then mark the
        // table of contents as loaded.
        $this->getLogicalStructure('', true);
        $this->tableOfContentsLoaded = true;

        return $this->tableOfContents;
    }
1128
1129
    /**
1130
     * This returns the document's thumbnail location
1131
     *
1132
     * @access protected
1133
     *
1134
     * @abstract
1135
     *
1136
     * @param bool $forceReload: Force reloading the thumbnail instead of returning the cached value
1137
     *
1138
     * @return string The document's thumbnail location
1139
     */
1140
    abstract protected function _getThumbnail($forceReload = false);
1141
1142
    /**
1143
     * This returns the ID of the toplevel logical structure node
1144
     *
1145
     * @access protected
1146
     *
1147
     * @abstract
1148
     *
1149
     * @return string The logical structure node's ID
1150
     */
1151
    abstract protected function _getToplevelId();
1152
1153
    /**
1154
     * This returns $this->uid via __get()
1155
     *
1156
     * @access protected
1157
     *
1158
     * @return mixed The UID or the URL of the document
1159
     */
1160
    protected function _getUid()
    {
        // Plain accessor backing the magic "uid" property.
        return $this->uid;
    }
1164
1165
    /**
1166
     * This sets $this->cPid via __set()
1167
     *
1168
     * @access protected
1169
     *
1170
     * @param int $value: The new PID for the metadata definitions
1171
     *
1172
     * @return void
1173
     */
1174
    protected function _setCPid($value)
    {
        // Coerce to integer and clamp negative values to zero.
        $newPid = (int) $value;
        $this->cPid = $newPid > 0 ? $newPid : 0;
    }
1178
1179
    /**
1180
     * This is a singleton class, thus the constructor should be private/protected
1181
     * (Get an instance of this class by calling \Kitodo\Dlf\Common\Doc::getInstance())
1182
     *
1183
     * @access protected
1184
     *
1185
     * @param string $location: The location URL of the XML file to parse
1186
     * @param int $pid: If > 0, then only document with this PID gets loaded
1187
     * @param \SimpleXMLElement|IiifResourceInterface $preloadedDocument: Either null or the \SimpleXMLElement
1188
     * or IiifResourceInterface that has been loaded to determine the basic document format.
1189
     *
1190
     * @return void
1191
     */
1192
    protected function __construct($location, $pid, $preloadedDocument)
    {
        // $location is not used by this constructor; the parameter stays in the signature
        // so that existing callers and subclasses remain compatible.

        // Hand over the document that was already loaded while detecting the format.
        $this->setPreloadedDocument($preloadedDocument);
        // Format-specific initialisation.
        $this->init();
        // Determine the record identifier, taking the given PID into account.
        $this->establishRecordId($pid);
        // NOTE(review): the logger is assigned only after init() and establishRecordId() have
        // run. If either of them logs, it would find no logger yet - confirm in the subclasses.
        $this->logger = GeneralUtility::makeInstance(LogManager::class)->getLogger();

        // The former database lookup that followed an unconditional "return;" at this point
        // could never execute and has been removed.
    }
1293
1294
    /**
1295
     * This magic method is called each time an invisible property is referenced from the object
1296
     *
1297
     * @access public
1298
     *
1299
     * @param string $var: Name of variable to get
1300
     *
1301
     * @return mixed Value of $this->$var
1302
     */
1303
    public function __get($var)
    {
        // Getters follow the naming scheme "_get" + property name with its first letter upper-cased.
        $getter = '_get' . ucfirst($var);

        // Delegate only if both the property and its getter exist.
        if (property_exists($this, $var) && method_exists($this, $getter)) {
            return $this->$getter();
        }

        // Otherwise log a warning and return nothing (null).
        $this->logger->warning('There is no getter function for property "' . $var . '"');
        return;
    }
1316
1317
    /**
1318
     * This magic method is called each time an invisible property is checked for isset() or empty()
1319
     *
1320
     * @access public
1321
     *
1322
     * @param string $var: Name of variable to check
1323
     *
1324
     * @return bool true if variable is set and not empty, false otherwise
1325
     */
1326
    public function __isset($var)
    {
        // Resolve the value through __get(), which logs a warning when no getter exists.
        $value = $this->__get($var);

        return !empty($value);
    }
1330
1331
    /**
1332
     * This magic method is called each time an invisible property is referenced from the object
1333
     *
1334
     * @access public
1335
     *
1336
     * @param string $var: Name of variable to set
1337
     * @param mixed $value: New value of variable
1338
     *
1339
     * @return void
1340
     */
1341
    public function __set($var, $value)
    {
        // Setters follow the naming scheme "_set" + property name with its first letter upper-cased.
        $setter = '_set' . ucfirst($var);

        // Delegate only if both the property and its setter exist.
        if (property_exists($this, $var) && method_exists($this, $setter)) {
            $this->$setter($value);
            return;
        }

        // Otherwise the value is dropped and a warning is logged.
        $this->logger->warning('There is no setter function for property "' . $var . '"');
    }
1353
}
1354