Scrutinizer GitHub App not installed

We could not synchronize checks via GitHub's checks API since Scrutinizer's GitHub App is not installed for this repository.

Install GitHub App

GitHub Access Token became invalid

It seems like the GitHub access token used for retrieving details about this repository from GitHub became invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.
Completed
Push — master ( 85f5d2...3e7f1d )
by Alexander
24s queued 19s
created

Document::getXmlObject()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 14
Code Lines 6

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
eloc 6
c 1
b 0
f 0
dl 0
loc 14
rs 10
cc 1
nc 1
nop 1
1
<?php
2
3
/**
4
 * (c) Kitodo. Key to digital objects e.V. <[email protected]>
5
 *
6
 * This file is part of the Kitodo and TYPO3 projects.
7
 *
8
 * @license GNU General Public License version 3 or later.
9
 * For the full copyright and license information, please read the
10
 * LICENSE.txt file that was distributed with this source code.
11
 */
12
13
namespace Kitodo\Dlf\Common;
14
15
use TYPO3\CMS\Core\Configuration\ExtensionConfiguration;
16
use TYPO3\CMS\Core\Database\ConnectionPool;
17
use TYPO3\CMS\Core\Database\Query\Restriction\HiddenRestriction;
18
use TYPO3\CMS\Core\Log\LogManager;
19
use TYPO3\CMS\Core\Utility\GeneralUtility;
20
use TYPO3\CMS\Core\Utility\MathUtility;
21
use Ubl\Iiif\Presentation\Common\Model\Resources\IiifResourceInterface;
22
use Ubl\Iiif\Tools\IiifHelper;
23
24
/**
25
 * Document class for the 'dlf' extension
26
 *
27
 * @author Sebastian Meyer <[email protected]>
28
 * @author Henrik Lochmann <[email protected]>
29
 * @package TYPO3
30
 * @subpackage dlf
31
 * @access public
32
 * @property int $cPid This holds the PID for the configuration
33
 * @property-read bool $hasFulltext Are there any fulltext files available?
34
 * @property-read string $location This holds the documents location
35
 * @property-read array $metadataArray This holds the documents' parsed metadata array
36
 * @property-read int $numPages This holds the total number of pages
37
 * @property-read int $parentId This holds the UID of the parent document or zero if not multi-volumed
38
 * @property-read array $physicalStructure This holds the physical structure
39
 * @property-read array $physicalStructureInfo This holds the physical structure metadata
40
 * @property-read int $pid This holds the PID of the document or zero if not in database
41
 * @property-read bool $ready Is the document instantiated successfully?
42
 * @property-read string $recordId The METS file's / IIIF manifest's record identifier
43
 * @property-read int $rootId This holds the UID of the root document or zero if not multi-volumed
44
 * @property-read array $smLinks This holds the smLinks between logical and physical structMap
45
 * @property-read array $tableOfContents This holds the logical structure
46
 * @property-read string $thumbnail This holds the document's thumbnail location
47
 * @property-read string $toplevelId This holds the toplevel structure's @ID (METS) or the manifest's @id (IIIF)
48
 * @property-read mixed $uid This holds the UID or the URL of the document
49
 * @abstract
50
 */
51
abstract class Document
52
{
53
    /**
54
     * This holds the logger
55
     *
56
     * @var \TYPO3\CMS\Core\Log\Logger
57
     * @access protected
58
     */
59
    protected $logger;
60
61
    /**
62
     * This holds the PID for the configuration
63
     *
64
     * @var int
65
     * @access protected
66
     */
67
    protected $cPid = 0;
68
69
    /**
70
     * The extension key
71
     *
72
     * @var string
73
     * @access public
74
     */
75
    public static $extKey = 'dlf';
76
77
    /**
78
     * This holds the configuration for all supported metadata encodings
79
     * @see loadFormats()
80
     *
81
     * @var array
82
     * @access protected
83
     */
84
    protected $formats = [
85
        'OAI' => [
86
            'rootElement' => 'OAI-PMH',
87
            'namespaceURI' => 'http://www.openarchives.org/OAI/2.0/',
88
        ],
89
        'METS' => [
90
            'rootElement' => 'mets',
91
            'namespaceURI' => 'http://www.loc.gov/METS/',
92
        ],
93
        'XLINK' => [
94
            'rootElement' => 'xlink',
95
            'namespaceURI' => 'http://www.w3.org/1999/xlink',
96
        ]
97
    ];
98
99
    /**
100
     * Are the available metadata formats loaded?
101
     * @see $formats
102
     *
103
     * @var bool
104
     * @access protected
105
     */
106
    protected $formatsLoaded = false;
107
108
    /**
109
     * Are there any fulltext files available? This also includes IIIF text annotations
110
     * with motivation 'painting' if Kitodo.Presentation is configured to store text
111
     * annotations as fulltext.
112
     *
113
     * @var bool
114
     * @access protected
115
     */
116
    protected $hasFulltext = false;
117
118
    /**
119
     * Last searched logical and physical page
120
     *
121
     * @var array
122
     * @access protected
123
     */
124
    protected $lastSearchedPhysicalPage = ['logicalPage' => null, 'physicalPage' => null];
125
126
    /**
127
     * This holds the documents location
128
     *
129
     * @var string
130
     * @access protected
131
     */
132
    protected $location = '';
133
134
    /**
135
     * This holds the logical units
136
     *
137
     * @var array
138
     * @access protected
139
     */
140
    protected $logicalUnits = [];
141
142
    /**
143
     * This holds the documents' parsed metadata array with their corresponding
144
     * structMap//div's ID (METS) or Range / Manifest / Sequence ID (IIIF) as array key
145
     *
146
     * @var array
147
     * @access protected
148
     */
149
    protected $metadataArray = [];
150
151
    /**
152
     * Is the metadata array loaded?
153
     * @see $metadataArray
154
     *
155
     * @var bool
156
     * @access protected
157
     */
158
    protected $metadataArrayLoaded = false;
159
160
    /**
161
     * This holds the total number of pages
162
     *
163
     * @var int
164
     * @access protected
165
     */
166
    protected $numPages = 0;
167
168
    /**
169
     * This holds the UID of the parent document or zero if not multi-volumed
170
     *
171
     * @var int
172
     * @access protected
173
     */
174
    protected $parentId = 0;
175
176
    /**
177
     * This holds the physical structure
178
     *
179
     * @var array
180
     * @access protected
181
     */
182
    protected $physicalStructure = [];
183
184
    /**
185
     * This holds the physical structure metadata
186
     *
187
     * @var array
188
     * @access protected
189
     */
190
    protected $physicalStructureInfo = [];
191
192
    /**
193
     * Is the physical structure loaded?
194
     * @see $physicalStructure
195
     *
196
     * @var bool
197
     * @access protected
198
     */
199
    protected $physicalStructureLoaded = false;
200
201
    /**
202
     * This holds the PID of the document or zero if not in database
203
     *
204
     * @var int
205
     * @access protected
206
     */
207
    protected $pid = 0;
208
209
    /**
210
     * This holds the documents' raw text pages with their corresponding
211
     * structMap//div's ID (METS) or Range / Manifest / Sequence ID (IIIF) as array key
212
     *
213
     * @var array
214
     * @access protected
215
     */
216
    protected $rawTextArray = [];
217
218
    /**
219
     * Is the document instantiated successfully?
220
     *
221
     * @var bool
222
     * @access protected
223
     */
224
    protected $ready = false;
225
226
    /**
227
     * The METS file's / IIIF manifest's record identifier
228
     *
229
     * @var string
230
     * @access protected
231
     */
232
    protected $recordId;
233
234
    /**
235
     * This holds the singleton object of the document
236
     *
237
     * @var array (\Kitodo\Dlf\Common\Document)
238
     * @static
239
     * @access protected
240
     */
241
    protected static $registry = [];
242
243
    /**
244
     * This holds the UID of the root document or zero if not multi-volumed
245
     *
246
     * @var int
247
     * @access protected
248
     */
249
    protected $rootId = 0;
250
251
    /**
252
     * Is the root id loaded?
253
     * @see $rootId
254
     *
255
     * @var bool
256
     * @access protected
257
     */
258
    protected $rootIdLoaded = false;
259
260
    /**
261
     * This holds the smLinks between logical and physical structMap
262
     *
263
     * @var array
264
     * @access protected
265
     */
266
    protected $smLinks = ['l2p' => [], 'p2l' => []];
267
268
    /**
269
     * Are the smLinks loaded?
270
     * @see $smLinks
271
     *
272
     * @var bool
273
     * @access protected
274
     */
275
    protected $smLinksLoaded = false;
276
277
    /**
278
     * This holds the logical structure
279
     *
280
     * @var array
281
     * @access protected
282
     */
283
    protected $tableOfContents = [];
284
285
    /**
286
     * Is the table of contents loaded?
287
     * @see $tableOfContents
288
     *
289
     * @var bool
290
     * @access protected
291
     */
292
    protected $tableOfContentsLoaded = false;
293
294
    /**
295
     * This holds the document's thumbnail location
296
     *
297
     * @var string
298
     * @access protected
299
     */
300
    protected $thumbnail = '';
301
302
    /**
303
     * Is the document's thumbnail location loaded?
304
     * @see $thumbnail
305
     *
306
     * @var bool
307
     * @access protected
308
     */
309
    protected $thumbnailLoaded = false;
310
311
    /**
312
     * This holds the toplevel structure's @ID (METS) or the manifest's @id (IIIF)
313
     *
314
     * @var string
315
     * @access protected
316
     */
317
    protected $toplevelId = '';
318
319
    /**
320
     * This holds the UID or the URL of the document
321
     *
322
     * @var mixed
323
     * @access protected
324
     */
325
    protected $uid = 0;
326
327
    /**
328
     * This holds the whole XML file as \SimpleXMLElement object
329
     *
330
     * @var \SimpleXMLElement
331
     * @access protected
332
     */
333
    protected $xml;
334
335
    /**
336
     * This clears the static registry to prevent memory exhaustion
337
     *
338
     * @access public
339
     *
340
     * @static
341
     *
342
     * @return void
343
     */
344
    public static function clearRegistry()
345
    {
346
        // Reset registry array.
347
        self::$registry = [];
348
    }
349
350
    /**
     * Makes sure the recordId, if one exists, is retrieved from the document
     *
     * @access protected
     *
     * @abstract
     *
     * @param int $pid: ID of the configuration page holding the recordId config
     */
    abstract protected function establishRecordId($pid);
361
362
    /**
     * Returns the source document PHP object a Document instance stands for
     *
     * @access protected
     *
     * @abstract
     *
     * @return \SimpleXMLElement|IiifResourceInterface A PHP object representation of
     * the current document. SimpleXMLElement for METS, IiifResourceInterface for IIIF
     */
    abstract protected function getDocument();
373
374
    /**
     * Returns the location of a downloadable file for a physical page or track
     *
     * @access public
     *
     * @abstract
     *
     * @param string $id: The @ID attribute of the file node (METS) or the @id property of the IIIF resource
     *
     * @return string The file's location as URL
     */
    abstract public function getDownloadLocation($id);
386
387
    /**
     * Returns the location of a file which represents a physical page or track
     *
     * @access public
     *
     * @abstract
     *
     * @param string $id: The @ID attribute of the file node (METS) or the @id property of the IIIF resource
     *
     * @return string The file's location as URL
     */
    abstract public function getFileLocation($id);
399
400
    /**
     * Returns the MIME type of a file which represents a physical page or track
     *
     * @access public
     *
     * @abstract
     *
     * @param string $id: The @ID attribute of the file node
     *
     * @return string The file's MIME type
     */
    abstract public function getFileMimeType($id);
412
413
    /**
     * This is a singleton class, thus an instance must be created by this method
     *
     * Unless $forceReload is set, an instance already held in the static registry or,
     * if the registry has none, in the user's session is returned as long as its PID
     * is compatible with the requested one. Otherwise the document format is determined
     * (from the database for integer UIDs, from the fetched content for URLs) and a new
     * MetsDocument or IiifManifest is created.
     *
     * @access public
     *
     * @static
     *
     * @param mixed $uid: The unique identifier of the document to parse, the URL of XML file or the IRI of the IIIF resource
     * @param int $pid: If > 0, then only document with this PID gets loaded
     * @param bool $forceReload: Force reloading the document instead of returning the cached instance
     *
     * @return \Kitodo\Dlf\Common\Document|null Instance of this class, either MetsDocument or IiifManifest,
     * or null if the document format could not be determined
     */
    public static function &getInstance($uid, $pid = 0, $forceReload = false)
    {
        // Sanitize input.
        $pid = max(intval($pid), 0);
        if (!$forceReload) {
            $regObj = Helper::digest($uid);
            // Null coalescing keeps a cache miss from raising an "undefined index" error.
            $cached = self::$registry[$regObj] ?? null;
            if ($cached instanceof self) {
                // Check if instance has given PID.
                if (
                    !$pid
                    || !$cached->pid
                    || $pid == $cached->pid
                ) {
                    // Return singleton instance if available.
                    return self::$registry[$regObj];
                }
            } else {
                // Check the user's session...
                $sessionData = Helper::loadFromSession(get_called_class());
                // ...which may hold nothing at all or no entry for this document.
                $restored = $sessionData[$regObj] ?? null;
                if (
                    $restored instanceof self
                    && (
                        !$pid
                        || !$restored->pid
                        || $pid == $restored->pid
                    )
                ) {
                    // ...and restore registry.
                    self::$registry[$regObj] = $restored;
                    return self::$registry[$regObj];
                }
            }
        }
        // Create new instance depending on format (METS or IIIF) ...
        $instance = null;
        $documentFormat = null;
        $xml = null;
        $iiif = null;
        // Extension configuration is loaded lazily and at most once per call.
        $extConf = null;
        // Try to get document format from database
        if (MathUtility::canBeInterpretedAsInteger($uid)) {
            $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
                ->getQueryBuilderForTable('tx_dlf_documents');

            $queryBuilder
                ->select(
                    'tx_dlf_documents.location AS location',
                    'tx_dlf_documents.document_format AS document_format'
                )
                ->from('tx_dlf_documents');

            // Restrict to the given UID and, if requested, to the given PID.
            if ($pid) {
                $queryBuilder
                    ->where(
                        $queryBuilder->expr()->eq('tx_dlf_documents.uid', intval($uid)),
                        $queryBuilder->expr()->eq('tx_dlf_documents.pid', intval($pid)),
                        Helper::whereExpression('tx_dlf_documents')
                    );
            } else {
                $queryBuilder
                    ->where(
                        $queryBuilder->expr()->eq('tx_dlf_documents.uid', intval($uid)),
                        Helper::whereExpression('tx_dlf_documents')
                    );
            }

            $result = $queryBuilder
                ->setMaxResults(1)
                ->execute();

            if ($resArray = $result->fetch()) {
                $documentFormat = $resArray['document_format'];
            }
        } else {
            // Get document format from content of remote document
            // Cast to string for safety reasons.
            $location = (string) $uid;
            // Try to load a file from the url
            if (GeneralUtility::isValidUrl($location)) {
                // Load extension configuration
                $extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey);
                // Set user-agent to identify self when fetching XML data.
                if (!empty($extConf['useragent'])) {
                    @ini_set('user_agent', $extConf['useragent']);
                }
                $content = GeneralUtility::getUrl($location);
                if ($content !== false) {
                    $xml = Helper::getXmlFileAsString($content);
                    if ($xml !== false) {
                        /* @var $xml \SimpleXMLElement */
                        $xml->registerXPathNamespace('mets', 'http://www.loc.gov/METS/');
                        $xpathResult = $xml->xpath('//mets:mets');
                        $documentFormat = !empty($xpathResult) ? 'METS' : null;
                    } else {
                        // Try to load file as IIIF resource instead.
                        $contentAsJsonArray = json_decode($content, true);
                        if ($contentAsJsonArray !== null) {
                            IiifHelper::setUrlReader(IiifUrlReader::getInstance());
                            IiifHelper::setMaxThumbnailHeight($extConf['iiifThumbnailHeight']);
                            IiifHelper::setMaxThumbnailWidth($extConf['iiifThumbnailWidth']);
                            $iiif = IiifHelper::loadIiifResource($contentAsJsonArray);
                            if ($iiif instanceof IiifResourceInterface) {
                                $documentFormat = 'IIIF';
                            }
                        }
                    }
                }
            }
        }
        if ($documentFormat == 'METS') {
            $instance = new MetsDocument($uid, $pid, $xml);
        } elseif ($documentFormat == 'IIIF') {
            $instance = new IiifManifest($uid, $pid, $iiif);
        }
        // Save instance to registry.
        if (
            $instance instanceof self
            && $instance->ready
        ) {
            self::$registry[Helper::digest($instance->uid)] = $instance;
            if ($instance->uid != $instance->location) {
                self::$registry[Helper::digest($instance->location)] = $instance;
            }
            // Load extension configuration unless that already happened above.
            if ($extConf === null) {
                $extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey);
            }
            // Save registry to session if caching is enabled.
            if (!empty($extConf['caching'])) {
                Helper::saveToSession(self::$registry, get_class($instance));
            }
            $instance->logger = GeneralUtility::makeInstance(LogManager::class)->getLogger(get_class($instance));
        }
        // Return new instance (null if no supported format was detected).
        return $instance;
    }
569
570
    /**
     * Returns details about a logical structure element
     *
     * @access public
     *
     * @abstract
     *
     * @param string $id: The @ID attribute of the logical structure node (METS) or
     * the @id property of the Manifest / Range (IIIF)
     * @param bool $recursive: Whether to include the child elements / resources
     *
     * @return array Array of the element's id, label, type and physical page indexes/mptr link
     */
    abstract public function getLogicalStructure($id, $recursive = false);
584
585
    /**
     * Extracts all the metadata for a logical structure node
     *
     * @access public
     *
     * @abstract
     *
     * @param string $id: The @ID attribute of the logical structure node (METS) or the @id property
     * of the Manifest / Range (IIIF)
     * @param int $cPid: The PID for the metadata definitions
     *                       (defaults to $this->cPid or $this->pid)
     *
     * @return array The logical structure node's / the IIIF resource's parsed metadata array
     */
    abstract public function getMetadata($id, $cPid = 0);
600
601
    /**
     * Looks up the first physical page whose order label contains the given logical page label
     *
     * The result of the last successful lookup is remembered, so asking for the
     * same label again does not walk the physical structure a second time.
     *
     * @access public
     *
     * @param string $logicalPage: The label (or a part of the label) of the logical page
     *
     * @return int The position of the first matching entry within the physical structure
     * info (counted from zero), or 1 if no entry matches
     */
    public function getPhysicalPage($logicalPage)
    {
        // Serve repeated requests for the same label from the remembered result.
        $remembered = $this->lastSearchedPhysicalPage;
        if (
            !empty($remembered['logicalPage'])
            && $remembered['logicalPage'] == $logicalPage
        ) {
            return $remembered['physicalPage'];
        }
        $position = 0;
        foreach ($this->physicalStructureInfo as $entry) {
            if (strpos($entry['orderlabel'], $logicalPage) !== false) {
                // Remember this hit for the next call.
                $this->lastSearchedPhysicalPage['logicalPage'] = $logicalPage;
                $this->lastSearchedPhysicalPage['physicalPage'] = $position;
                return $position;
            }
            $position++;
        }
        // Nothing matched.
        return 1;
    }
630
631
    /**
     * Extracts the OCR full text for a physical structure node / IIIF Manifest / Canvas. The text
     * might be given as ALTO for METS or as annotations or ALTO for IIIF resources.
     *
     * @access public
     *
     * @abstract
     *
     * @param string $id: The @ID attribute of the physical structure node (METS) or the @id property
     * of the Manifest / Range (IIIF)
     *
     * @return string The OCR full text
     */
    abstract public function getFullText($id);
645
646
    /**
     * This extracts the OCR full text for a physical structure node / IIIF Manifest / Canvas from an
     * XML full text representation (currently only ALTO). For IIIF manifests, ALTO documents have
     * to be given in the Canvas' / Manifest's "seeAlso" property.
     *
     * The configured full text file groups are tried in order; the first one that
     * has a file for the node is fetched. Every failure is logged and results in an
     * empty string being returned.
     *
     * @param string $id: The @ID attribute of the physical structure node (METS) or the @id property
     * of the Manifest / Range (IIIF)
     *
     * @return string The OCR full text
     */
    protected function getFullTextFromXml($id)
    {
        $fullText = '';
        // Defined up front so that every path below can read them safely,
        // even when none of the configured file groups has a file for this node.
        $fileContent = '';
        $textFormat = '';
        // Load available text formats, ...
        $this->loadFormats();
        // ... physical structure ...
        $this->_getPhysicalStructure();
        // ... and extension configuration.
        $extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey);
        $fileGrpsFulltext = GeneralUtility::trimExplode(',', (string) ($extConf['fileGrpFulltext'] ?? ''));
        if (empty($this->physicalStructureInfo[$id])) {
            $this->logger->warning('Invalid structure node @ID "' . $id . '"');
            return $fullText;
        }
        while ($fileGrpFulltext = array_shift($fileGrpsFulltext)) {
            if (empty($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext])) {
                // No file in this group, try the next one.
                continue;
            }
            // Get full text file.
            $fileContent = GeneralUtility::getUrl($this->getFileLocation($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext]));
            if ($fileContent === false) {
                $this->logger->warning('Couldn\'t load full text file for structure node @ID "' . $id . '"');
                return $fullText;
            }
            $textFormat = $this->getTextFormat($fileContent);
            // Only the first file group with a file is used.
            break;
        }
        // Is this text format supported?
        // This part actually differs from previous version of indexed OCR
        if (!empty($fileContent) && !empty($this->formats[$textFormat])) {
            $textMiniOcr = '';
            if (!empty($this->formats[$textFormat]['class'])) {
                $class = $this->formats[$textFormat]['class'];
                // Get the raw text from class.
                if (
                    class_exists($class)
                    && ($obj = GeneralUtility::makeInstance($class)) instanceof FulltextInterface
                ) {
                    // Load XML from file.
                    $ocrTextXml = Helper::getXmlFileAsString($fileContent);
                    $textMiniOcr = $obj->getTextAsMiniOcr($ocrTextXml);
                    $this->rawTextArray[$id] = $textMiniOcr;
                } else {
                    $this->logger->warning('Invalid class/method "' . $class . '->getTextAsMiniOcr()" for text format "' . $textFormat . '"');
                }
            }
            $fullText = $textMiniOcr;
        } else {
            $this->logger->warning('Unsupported text format "' . $textFormat . '" in physical node with @ID "' . $id . '"');
        }
        return $fullText;
    }
709
710
    /**
     * Get format of the OCR full text
     *
     * @access private
     *
     * @param string $fileContent: content of the XML file
     *
     * @return string The upper-cased name of the XML root element, or an empty
     * string if the content could not be parsed as XML
     */
    private function getTextFormat($fileContent)
    {
        $xml = Helper::getXmlFileAsString($fileContent);
        // The helper does not yield an XML object for unparsable content;
        // calling getName() on that result would be a fatal error.
        if (!($xml instanceof \SimpleXMLElement)) {
            return '';
        }
        // Get the root element's name as text format.
        return strtoupper($xml->getName());
    }
724
725
    /**
     * This determines a title for the given document
     *
     * With $recursive set, the chain of superior documents ("partof") is followed
     * until a non-empty title is found. The chain is walked iteratively and every
     * visited UID is remembered, so a circular "partof" chain ends the search
     * instead of recursing without end.
     *
     * @access public
     *
     * @static
     *
     * @param int $uid: The UID of the document
     * @param bool $recursive: Search superior documents for a title, too?
     *
     * @return string The title of the document itself or a parent document
     */
    public static function getTitle($uid, $recursive = false)
    {
        $logger = GeneralUtility::makeInstance(LogManager::class)->getLogger(__CLASS__);

        $title = '';
        // UIDs already looked at, used to detect circular "partof" chains.
        $visited = [];
        // Sanitize input.
        $uid = max(intval($uid), 0);
        while (true) {
            if (!$uid) {
                $logger->error('Invalid UID ' . $uid . ' for document');
                break;
            }
            if (isset($visited[$uid])) {
                $logger->warning('Circular "partof" reference detected at document with UID ' . $uid);
                break;
            }
            $visited[$uid] = true;

            $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
                ->getQueryBuilderForTable('tx_dlf_documents');

            $result = $queryBuilder
                ->select(
                    'tx_dlf_documents.title',
                    'tx_dlf_documents.partof'
                )
                ->from('tx_dlf_documents')
                ->where(
                    $queryBuilder->expr()->eq('tx_dlf_documents.uid', $uid),
                    Helper::whereExpression('tx_dlf_documents')
                )
                ->setMaxResults(1)
                ->execute();

            $resArray = $result->fetch();
            if (!$resArray) {
                $logger->warning('No document with UID ' . $uid . ' found or document not accessible');
                break;
            }
            // Get title information.
            $title = $resArray['title'];
            $partof = $resArray['partof'];
            // Search parent documents for a title?
            if (
                !$recursive
                || !empty($title)
                || !intval($partof)
                || $partof == $uid
            ) {
                break;
            }
            // Continue with the superior document; its lookup decides the result.
            $title = '';
            $uid = max(intval($partof), 0);
        }
        return $title;
    }
782
783
    /**
     * Collects all the metadata of the toplevel logical structure node / resource
     *
     * @access public
     *
     * @param int $cPid: The PID for the metadata definitions
     *
     * @return array The logical structure node's / resource's parsed metadata array
     */
    public function getTitledata($cPid = 0)
    {
        $titledata = $this->getMetadata($this->_getToplevelId(), $cPid);
        // METS documents contribute extra information from their structural map.
        if ($this instanceof MetsDocument) {
            $this->addMetadataFromMets($titledata, $this->_getToplevelId());
        }
        // Put the record identifier of the METS file / IIIF manifest first
        // when the metadata has a 'record_id' list that does not contain it yet.
        $recordIdMissing = is_array($titledata)
            && array_key_exists('record_id', $titledata)
            && !empty($this->recordId)
            && !in_array($this->recordId, $titledata['record_id']);
        if ($recordIdMissing) {
            array_unshift($titledata['record_id'], $this->recordId);
        }
        return $titledata;
    }
813
814
    /**
     * Walks a logical (sub-) structure tree depth-first, looking for the structure
     * with the requested logical id, and returns its depth.
     *
     * @access protected
     *
     * @param array $structure: logical structure array
     * @param int $depth: current tree depth
     * @param string $logId: ID of the logical structure whose depth is requested
     *
     * @return int|bool: false if structure with $logId is not a child of this substructure,
     * or the actual depth.
     */
    protected function getTreeDepth($structure, $depth, $logId)
    {
        foreach ($structure as $node) {
            // Found on the current level.
            if ($node['id'] == $logId) {
                return $depth;
            }
            // A matching node is never descended into; nodes without children are skipped.
            if (!array_key_exists('children', $node)) {
                continue;
            }
            $depthInSubtree = $this->getTreeDepth($node['children'], $depth + 1, $logId);
            if ($depthInSubtree !== false) {
                return $depthInSubtree;
            }
        }
        return false;
    }
840
841
    /**
     * Determines how deep a logical structure element is nested within the table of contents
     *
     * Elements on the first level of the table of contents have depth 1.
     *
     * @access public
     *
     * @param string $logId: The id of the logical structure element whose depth is requested
     *
     * @return int|bool tree depth as integer or false if no element with $logId exists within the TOC.
     */
    public function getStructureDepth($logId)
    {
        $tableOfContents = $this->_getTableOfContents();
        return $this->getTreeDepth($tableOfContents, 1, $logId);
    }
853
854
    /**
     * This sets some basic class properties
     *
     * Implemented by the format specific subclasses.
     *
     * @access protected
     *
     * @abstract
     *
     * @return void
     */
    abstract protected function init();
864
865
    /**
     * Reuse any document object that might have been already loaded to determine whether document is METS or IIIF
     *
     * @access protected
     *
     * @abstract
     *
     * @param \SimpleXMLElement|IiifResourceInterface $preloadedDocument: any instance that has already been loaded
     *
     * @return bool true if $preloadedDocument can actually be reused, false if it has to be loaded again
     */
    abstract protected function setPreloadedDocument($preloadedDocument);
877
878
    /**
     * METS/IIIF specific part of loading a location
     *
     * Called by load() once the location has been validated as a URL.
     *
     * @access protected
     *
     * @abstract
     *
     * @param string $location: The URL of the file to load
     *
     * @return bool true on success or false on failure
     */
    abstract protected function loadLocation($location);
890
891
    /**
     * Load XML file / IIIF resource from URL
     *
     * Validates the location, applies the user-agent configured for the extension (if any)
     * and delegates the actual loading to the format specific loadLocation().
     *
     * @access protected
     *
     * @param string $location: The URL of the file to load
     *
     * @return bool true on success or false on failure
     */
    protected function load($location)
    {
        // Refuse anything that is not a valid URL.
        if (!GeneralUtility::isValidUrl($location)) {
            $this->logger->error('Invalid file location "' . $location . '" for document loading');
            return false;
        }
        // Load extension configuration
        $extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey);
        // Set user-agent to identify self when fetching XML / JSON-LD data.
        if (!empty($extConf['useragent'])) {
            @ini_set('user_agent', $extConf['useragent']);
        }
        // the actual loading is format specific
        return $this->loadLocation($location);
    }
917
918
    /**
     * Analyze the document if it contains any fulltext that needs to be indexed.
     *
     * Called by _getHasFulltext() before $this->hasFulltext is returned.
     *
     * @access protected
     *
     * @abstract
     */
    abstract protected function ensureHasFulltextIsSet();
926
927
    /**
     * Register all available data formats
     *
     * Reads the format definitions stored on PID 0 of table "tx_dlf_formats" into
     * $this->formats, keyed by format type. The query runs only once per instance.
     *
     * @access protected
     *
     * @return void
     */
    protected function loadFormats()
    {
        // Nothing to do if the registry has been filled already.
        if ($this->formatsLoaded) {
            return;
        }
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('tx_dlf_formats');

        // Get available data formats from database.
        $statement = $queryBuilder
            ->select(
                'tx_dlf_formats.type AS type',
                'tx_dlf_formats.root AS root',
                'tx_dlf_formats.namespace AS namespace',
                'tx_dlf_formats.class AS class'
            )
            ->from('tx_dlf_formats')
            ->where(
                $queryBuilder->expr()->eq('tx_dlf_formats.pid', 0)
            )
            ->execute();

        while ($row = $statement->fetch()) {
            // Update format registry.
            $this->formats[$row['type']] = [
                'rootElement' => $row['root'],
                'namespaceURI' => $row['namespace'],
                'class' => $row['class']
            ];
        }
        $this->formatsLoaded = true;
    }
965
966
    /**
     * Register all available namespaces for a \SimpleXMLElement or \DOMXPath object
     *
     * Every registered metadata format contributes one namespace; the lowercased
     * format type is used as the prefix.
     *
     * @access public
     *
     * @param \SimpleXMLElement|\DOMXPath &$obj: \SimpleXMLElement or \DOMXPath object
     *
     * @return void
     */
    public function registerNamespaces(&$obj)
    {
        // TODO Check usage. XML specific method does not seem to be used anywhere outside this class within the project, but it is public and may be used by extensions.
        $this->loadFormats();
        // The registration method is named differently on the two supported classes.
        $method = null;
        if ($obj instanceof \SimpleXMLElement) {
            $method = 'registerXPathNamespace';
        } elseif ($obj instanceof \DOMXPath) {
            $method = 'registerNamespace';
        }
        if ($method === null) {
            $this->logger->error('Given object is neither a SimpleXMLElement nor a DOMXPath instance');
            return;
        }
        // Register metadata format's namespaces.
        foreach ($this->formats as $type => $format) {
            $obj->$method(strtolower($type), $format['namespaceURI']);
        }
    }
993
994
    /**
     * This saves the document to the database and index
     *
     * Steps, in order: resolve the target PID, read the title metadata, translate the
     * structure type / collections / owner into database UIDs (creating missing collections
     * and a missing library on the fly), determine the parent document, write the record to
     * "tx_dlf_documents" and finally hand the document to the Solr indexer.
     *
     * Note that the database record is written even if no Solr core is given; in that case
     * a notice is logged and false is returned.
     *
     * @access public
     *
     * @param int $pid: The PID of the saved record
     * @param int $core: The UID of the Solr core for indexing
     * @param int|string $owner: UID or index_name of owner to set while indexing
     *
     * @return bool true on success or false on failure
     */
    public function save($pid = 0, $core = 0, $owner = null)
    {
        if (\TYPO3_MODE !== 'BE') {
            $this->logger->error('Saving a document is only allowed in the backend');
            return false;
        }
        // Make sure $pid is a non-negative integer.
        $pid = max(intval($pid), 0);
        // Make sure $core is a non-negative integer.
        $core = max(intval($core), 0);
        // If $pid is not given, try to get it elsewhere.
        if (
            !$pid
            && $this->pid
        ) {
            // Retain current PID.
            $pid = $this->pid;
        } elseif (!$pid) {
            $this->logger->error('Invalid PID ' . $pid . ' for document saving');
            return false;
        }
        // Set PID for metadata definitions.
        $this->cPid = $pid;
        // Set UID placeholder if not updating existing record.
        if ($pid != $this->pid) {
            $this->uid = uniqid('NEW');
        }
        // Get metadata array.
        $metadata = $this->getTitledata($pid);
        // Check for record identifier.
        if (empty($metadata['record_id'][0])) {
            $this->logger->error('No record identifier found to avoid duplication');
            return false;
        }
        // Load plugin configuration.
        $conf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey);

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('tx_dlf_structures');

        // Get UID for structure type.
        $result = $queryBuilder
            ->select('tx_dlf_structures.uid AS uid')
            ->from('tx_dlf_structures')
            ->where(
                $queryBuilder->expr()->eq('tx_dlf_structures.pid', intval($pid)),
                $queryBuilder->expr()->eq('tx_dlf_structures.index_name', $queryBuilder->expr()->literal($metadata['type'][0])),
                Helper::whereExpression('tx_dlf_structures')
            )
            ->setMaxResults(1)
            ->execute();

        if ($resArray = $result->fetch()) {
            $structure = $resArray['uid'];
        } else {
            $this->logger->error('Could not identify document/structure type "' . $queryBuilder->expr()->literal($metadata['type'][0]) . '"');
            return false;
        }
        // From here on the structure type is referenced by its UID instead of its index name.
        $metadata['type'][0] = $structure;

        // Remove appended "valueURI" from authors' names for storing in database.
        foreach ($metadata['author'] as $i => $author) {
            // Name and valueURI are separated by the ASCII unit separator (0x1F).
            $splitName = explode(chr(31), $author);
            $metadata['author'][$i] = $splitName[0];
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('tx_dlf_collections');
        // Get hidden records, too.
        $queryBuilder
            ->getRestrictions()
            ->removeByType(HiddenRestriction::class);

        // Get UIDs for collections.
        $result = $queryBuilder
            ->select(
                'tx_dlf_collections.index_name AS index_name',
                'tx_dlf_collections.uid AS uid'
            )
            ->from('tx_dlf_collections')
            ->where(
                $queryBuilder->expr()->eq('tx_dlf_collections.pid', intval($pid)),
                $queryBuilder->expr()->in('tx_dlf_collections.sys_language_uid', [-1, 0])
            )
            ->execute();

        $collUid = [];
        while ($resArray = $result->fetch()) {
            $collUid[$resArray['index_name']] = $resArray['uid'];
        }
        $collections = [];
        foreach ($metadata['collection'] as $collection) {
            if (!empty($collUid[$collection])) {
                // Add existing collection's UID.
                $collections[] = $collUid[$collection];
            } else {
                // Insert new collection.
                $collNewUid = uniqid('NEW');
                $collData['tx_dlf_collections'][$collNewUid] = [
                    'pid' => $pid,
                    'label' => $collection,
                    'index_name' => $collection,
                    'oai_name' => (!empty($conf['publishNewCollections']) ? Helper::getCleanString($collection) : ''),
                    'description' => '',
                    'documents' => 0,
                    'owner' => 0,
                    'status' => 0,
                ];
                $substUid = Helper::processDBasAdmin($collData);
                // Prevent double insertion.
                unset($collData);
                // Add new collection's UID.
                $collections[] = $substUid[$collNewUid];
                if (!(\TYPO3_REQUESTTYPE & \TYPO3_REQUESTTYPE_CLI)) {
                    Helper::addMessage(
                        htmlspecialchars(sprintf(Helper::getMessage('flash.newCollection'), $collection, $substUid[$collNewUid])),
                        Helper::getMessage('flash.attention', true),
                        \TYPO3\CMS\Core\Messaging\FlashMessage::INFO,
                        true
                    );
                }
            }
        }
        $metadata['collection'] = $collections;

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('tx_dlf_libraries');

        // Get UID for owner.
        if (empty($owner)) {
            // No owner passed in: prefer the owner named in the metadata and fall back
            // to 'default' only if the metadata does not provide one.
            $owner = !empty($metadata['owner'][0]) ? $metadata['owner'][0] : 'default';
        }
        // A non-numeric owner is an index_name which has to be resolved to a library UID.
        if (!MathUtility::canBeInterpretedAsInteger($owner)) {
            $result = $queryBuilder
                ->select('tx_dlf_libraries.uid AS uid')
                ->from('tx_dlf_libraries')
                ->where(
                    $queryBuilder->expr()->eq('tx_dlf_libraries.pid', intval($pid)),
                    $queryBuilder->expr()->eq('tx_dlf_libraries.index_name', $queryBuilder->expr()->literal($owner)),
                    Helper::whereExpression('tx_dlf_libraries')
                )
                ->setMaxResults(1)
                ->execute();

            if ($resArray = $result->fetch()) {
                $ownerUid = $resArray['uid'];
            } else {
                // Insert new library.
                $libNewUid = uniqid('NEW');
                $libData['tx_dlf_libraries'][$libNewUid] = [
                    'pid' => $pid,
                    'label' => $owner,
                    'index_name' => $owner,
                    'website' => '',
                    'contact' => '',
                    'image' => '',
                    'oai_label' => '',
                    'oai_base' => '',
                    'opac_label' => '',
                    'opac_base' => '',
                    'union_label' => '',
                    'union_base' => '',
                ];
                $substUid = Helper::processDBasAdmin($libData);
                // Add new library's UID.
                $ownerUid = $substUid[$libNewUid];
                if (!(\TYPO3_REQUESTTYPE & \TYPO3_REQUESTTYPE_CLI)) {
                    Helper::addMessage(
                        htmlspecialchars(sprintf(Helper::getMessage('flash.newLibrary'), $owner, $ownerUid)),
                        Helper::getMessage('flash.attention', true),
                        \TYPO3\CMS\Core\Messaging\FlashMessage::INFO,
                        true
                    );
                }
            }
            $owner = $ownerUid;
        }
        $metadata['owner'][0] = $owner;
        // Get UID of parent document.
        $partof = $this->getParentDocumentUidForSaving($pid, $core, $owner);
        // Use the date of publication or title as alternative sorting metric for parts of multi-part works.
        if (!empty($partof)) {
            if (
                empty($metadata['volume'][0])
                && !empty($metadata['year'][0])
            ) {
                $metadata['volume'] = $metadata['year'];
            }
            if (empty($metadata['volume_sorting'][0])) {
                // If METS @ORDER is given it is preferred over year_sorting and year.
                if (!empty($metadata['mets_order'][0])) {
                    $metadata['volume_sorting'][0] = $metadata['mets_order'][0];
                } elseif (!empty($metadata['year_sorting'][0])) {
                    $metadata['volume_sorting'][0] = $metadata['year_sorting'][0];
                } elseif (!empty($metadata['year'][0])) {
                    $metadata['volume_sorting'][0] = $metadata['year'][0];
                }
            }
            // If volume_sorting is still empty, try to use title_sorting or METS @ORDERLABEL finally (workaround for newspapers)
            if (empty($metadata['volume_sorting'][0])) {
                if (!empty($metadata['title_sorting'][0])) {
                    $metadata['volume_sorting'][0] = $metadata['title_sorting'][0];
                } elseif (!empty($metadata['mets_orderlabel'][0])) {
                    $metadata['volume_sorting'][0] = $metadata['mets_orderlabel'][0];
                }
            }
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('tx_dlf_metadata');

        // Get metadata for lists and sorting.
        $result = $queryBuilder
            ->select(
                'tx_dlf_metadata.index_name AS index_name',
                'tx_dlf_metadata.is_listed AS is_listed',
                'tx_dlf_metadata.is_sortable AS is_sortable'
            )
            ->from('tx_dlf_metadata')
            ->where(
                $queryBuilder->expr()->orX(
                    $queryBuilder->expr()->eq('tx_dlf_metadata.is_listed', 1),
                    $queryBuilder->expr()->eq('tx_dlf_metadata.is_sortable', 1)
                ),
                $queryBuilder->expr()->eq('tx_dlf_metadata.pid', intval($pid)),
                Helper::whereExpression('tx_dlf_metadata')
            )
            ->execute();

        $listed = [];
        $sortable = [];

        while ($resArray = $result->fetch()) {
            if (!empty($metadata[$resArray['index_name']])) {
                if ($resArray['is_listed']) {
                    // Listed metadata keeps all of its values ...
                    $listed[$resArray['index_name']] = $metadata[$resArray['index_name']];
                }
                if ($resArray['is_sortable']) {
                    // ... whereas only the first value is used for sorting.
                    $sortable[$resArray['index_name']] = $metadata[$resArray['index_name']][0];
                }
            }
        }
        // Fill data array.
        $data['tx_dlf_documents'][$this->uid] = [
            'pid' => $pid,
            $GLOBALS['TCA']['tx_dlf_documents']['ctrl']['enablecolumns']['starttime'] => 0,
            $GLOBALS['TCA']['tx_dlf_documents']['ctrl']['enablecolumns']['endtime'] => 0,
            'prod_id' => $metadata['prod_id'][0],
            'location' => $this->location,
            'record_id' => $metadata['record_id'][0],
            'opac_id' => $metadata['opac_id'][0],
            'union_id' => $metadata['union_id'][0],
            'urn' => $metadata['urn'][0],
            'purl' => $metadata['purl'][0],
            'title' => $metadata['title'][0],
            'title_sorting' => $metadata['title_sorting'][0],
            'author' => implode('; ', $metadata['author']),
            'year' => implode('; ', $metadata['year']),
            'place' => implode('; ', $metadata['place']),
            'thumbnail' => $this->_getThumbnail(true),
            'metadata' => serialize($listed),
            'metadata_sorting' => serialize($sortable),
            'structure' => $metadata['type'][0],
            'partof' => $partof,
            'volume' => $metadata['volume'][0],
            'volume_sorting' => $metadata['volume_sorting'][0],
            'license' => $metadata['license'][0],
            'terms' => $metadata['terms'][0],
            'restrictions' => $metadata['restrictions'][0],
            'out_of_print' => $metadata['out_of_print'][0],
            'rights_info' => $metadata['rights_info'][0],
            'collections' => $metadata['collection'],
            'mets_label' => $metadata['mets_label'][0],
            'mets_orderlabel' => $metadata['mets_orderlabel'][0],
            'mets_order' => $metadata['mets_order'][0],
            'owner' => $metadata['owner'][0],
            'solrcore' => $core,
            'status' => 0,
            'document_format' => $metadata['document_format'][0],
        ];
        // Unhide hidden documents.
        if (!empty($conf['unhideOnIndex'])) {
            $data['tx_dlf_documents'][$this->uid][$GLOBALS['TCA']['tx_dlf_documents']['ctrl']['enablecolumns']['disabled']] = 0;
        }
        // Process data.
        $newIds = Helper::processDBasAdmin($data);
        // Replace placeholder with actual UID.
        if (strpos($this->uid, 'NEW') === 0) {
            $this->uid = $newIds[$this->uid];
            $this->pid = $pid;
            $this->parentId = $partof;
        }
        if (!(\TYPO3_REQUESTTYPE & \TYPO3_REQUESTTYPE_CLI)) {
            Helper::addMessage(
                htmlspecialchars(sprintf(Helper::getMessage('flash.documentSaved'), $metadata['title'][0], $this->uid)),
                Helper::getMessage('flash.done', true),
                \TYPO3\CMS\Core\Messaging\FlashMessage::OK,
                true
            );
        }
        // Add document to index.
        if ($core) {
            return Indexer::add($this, $core);
        } else {
            $this->logger->notice('Invalid UID "' . $core . '" for Solr core');
            return false;
        }
    }
1313
1314
    /**
     * Get the ID of the parent document if the current document has one. Also save a parent document
     * to the database and the Solr index if their $pid and the current $pid differ.
     * Currently only applies to METS documents.
     *
     * save() passes on the values it is working with itself.
     *
     * @access protected
     *
     * @abstract
     *
     * @param int $pid: The PID of the saved record
     * @param int $core: The UID of the Solr core for indexing
     * @param int $owner: The UID of the owner resolved by save()
     *
     * @return int The parent document's id.
     */
    abstract protected function getParentDocumentUidForSaving($pid, $core, $owner);
1326
1327
    /**
     * Getter behind the magic property "cPid" (invoked via __get())
     *
     * @access protected
     *
     * @return int The PID of the metadata definitions
     */
    protected function _getCPid()
    {
        return $this->cPid;
    }
1338
1339
    /**
     * Getter behind the magic property "hasFulltext" (invoked via __get())
     *
     * @access protected
     *
     * @return bool Are there any fulltext files available?
     */
    protected function _getHasFulltext()
    {
        // The format specific check has to run before the flag can be trusted.
        $this->ensureHasFulltextIsSet();
        return $this->hasFulltext;
    }
1351
1352
    /**
     * Getter behind the magic property "location" (invoked via __get())
     *
     * @access protected
     *
     * @return string The location of the document
     */
    protected function _getLocation()
    {
        return $this->location;
    }
1363
1364
    /**
     * Format specific part of building the document's metadata array
     *
     * Called by _getMetadataArray() whenever the metadata array has to be (re-)built.
     *
     * @access protected
     *
     * @abstract
     *
     * @param int $cPid: The PID for the metadata definitions
     */
    abstract protected function prepareMetadataArray($cPid);
1374
1375
    /**
     * This builds an array of the document's metadata
     *
     * The array is rebuilt if it has not been loaded yet or if it was built for another
     * PID; the PID it belongs to is kept in element 0.
     *
     * @access protected
     *
     * @return array Array of metadata with their corresponding logical structure node ID as key
     */
    protected function _getMetadataArray()
    {
        // Set metadata definitions' PID, falling back to the document's own PID.
        $cPid = $this->cPid ?: $this->pid;
        if (!$cPid) {
            $this->logger->error('Invalid PID ' . $cPid . ' for metadata definitions');
            return [];
        }
        $needsRebuild = !$this->metadataArrayLoaded || $this->metadataArray[0] != $cPid;
        if ($needsRebuild) {
            $this->prepareMetadataArray($cPid);
            // Remember which PID the array was built for.
            $this->metadataArray[0] = $cPid;
            $this->metadataArrayLoaded = true;
        }
        return $this->metadataArray;
    }
1400
1401
    /**
     * Getter behind the magic property "numPages" (invoked via __get())
     *
     * @access protected
     *
     * @return int The total number of pages and/or tracks
     */
    protected function _getNumPages()
    {
        // Presumably the physical structure has to be built for numPages to be populated.
        $this->_getPhysicalStructure();
        return $this->numPages;
    }
1413
1414
    /**
     * Getter behind the magic property "parentId" (invoked via __get())
     *
     * @access protected
     *
     * @return int The UID of the parent document or zero if not applicable
     */
    protected function _getParentId()
    {
        return $this->parentId;
    }
1425
1426
    /**
     * This builds an array of the document's physical structure
     *
     * @access protected
     *
     * @abstract
     *
     * @return array Array of physical elements' id, type, label and file representations ordered
     * by @ORDER attribute / IIIF Sequence's Canvases
     */
    abstract protected function _getPhysicalStructure();
1437
1438
    /**
     * This gives an array of the document's physical structure metadata
     *
     * @access protected
     *
     * @return array Array of elements' type, label and file representations ordered by @ID attribute / Canvas order
     */
    protected function _getPhysicalStructureInfo()
    {
        // Build the physical structure on first access.
        if (!$this->physicalStructureLoaded) {
            $this->_getPhysicalStructure();
        }
        return $this->physicalStructureInfo;
    }
1454
1455
    /**
     * Getter behind the magic property "pid" (invoked via __get())
     *
     * @access protected
     *
     * @return int The PID of the document or zero if not in database
     */
    protected function _getPid()
    {
        return $this->pid;
    }
1466
1467
    /**
     * Getter behind the magic property "ready" (invoked via __get())
     *
     * @access protected
     *
     * @return bool Is the document instantiated successfully?
     */
    protected function _getReady()
    {
        return $this->ready;
    }
1478
1479
    /**
     * Getter behind the magic property "recordId" (invoked via __get())
     *
     * @access protected
     *
     * @return mixed The METS file's / IIIF manifest's record identifier
     */
    protected function _getRecordId()
    {
        return $this->recordId;
    }
1490
1491
    /**
     * Getter behind the magic property "rootId" (invoked via __get())
     *
     * The value is resolved once per instance by asking the parent document (if any).
     *
     * @access protected
     *
     * @return int The UID of the root document or zero if not applicable
     */
    protected function _getRootId()
    {
        // Resolved before: hand out the cached value.
        if ($this->rootIdLoaded) {
            return $this->rootId;
        }
        if ($this->parentId) {
            $parentDocument = self::getInstance($this->parentId, $this->pid);
            // NOTE(review): inside the class hierarchy this may read the parent's raw
            // property rather than going through _getRootId() - confirm this is intended.
            $this->rootId = $parentDocument->rootId;
        }
        $this->rootIdLoaded = true;
        return $this->rootId;
    }
1509
1510
    /**
     * This returns the smLinks between logical and physical structMap (METS) and models the
     * relation between IIIF Canvases and Manifests / Ranges in the same way
     *
     * @access protected
     *
     * @abstract
     *
     * @return array The links between logical and physical nodes / Range, Manifest and Canvas
     */
    abstract protected function _getSmLinks();
1521
1522
    /**
     * This builds an array of the document's logical structure
     *
     * The logical structure is read once per instance via getLogicalStructure().
     *
     * @access protected
     *
     * @return array Array of structure nodes' id, label, type and physical page indexes/mptr / Canvas link with original hierarchy preserved
     */
    protected function _getTableOfContents()
    {
        // Already built: nothing to do.
        if ($this->tableOfContentsLoaded) {
            return $this->tableOfContents;
        }
        // Get all logical structures.
        $this->getLogicalStructure('', true);
        $this->tableOfContentsLoaded = true;
        return $this->tableOfContents;
    }
1539
1540
    /**
     * This returns the document's thumbnail location
     *
     * @access protected
     *
     * @abstract
     *
     * @param bool $forceReload: Force reloading the thumbnail instead of returning the cached value
     *
     * @return string The document's thumbnail location
     */
    abstract protected function _getThumbnail($forceReload = false);
1552
1553
    /**
     * This returns the ID of the toplevel logical structure node
     *
     * @access protected
     *
     * @abstract
     *
     * @return string The logical structure node's ID
     */
    abstract protected function _getToplevelId();
1563
1564
    /**
     * Getter behind the magic property "uid" (invoked via __get())
     *
     * @access protected
     *
     * @return mixed The UID or the URL of the document
     */
    protected function _getUid()
    {
        return $this->uid;
    }
1575
1576
    /**
     * Setter behind the magic property "cPid" (invoked via __set())
     *
     * The value is cast to integer; negative values are stored as 0.
     *
     * @access protected
     *
     * @param int $value: The new PID for the metadata definitions
     *
     * @return void
     */
    protected function _setCPid($value)
    {
        $this->cPid = max((int) $value, 0);
    }
1589
1590
    /**
     * This magic method is invoked each time a clone is called on the object variable
     *
     * @access protected
     *
     * @return void
     */
    protected function __clone()
    {
        // Intentionally empty and protected: singleton objects should not be cloned.
    }
1601
1602
    /**
     * This is a singleton class, thus the constructor should be private/protected
     * (Get an instance of this class by calling \Kitodo\Dlf\Common\Document::getInstance())
     *
     * A numeric $uid is looked up in "tx_dlf_documents" directly. Any other
     * value is handled as the location of a METS file / IIIF manifest: the
     * document is loaded first (or taken from $preloadedDocument) and the
     * database is then searched for a record with the same location or record
     * identifier. If nothing can be loaded, the constructor returns early and
     * the "ready" flag is not set.
     *
     * @access protected
     *
     * @param int|string $uid: The UID of the document to parse or URL to XML file
     * @param int $pid: If > 0, then only document with this PID gets loaded
     * @param \SimpleXMLElement|IiifResourceInterface|null $preloadedDocument: Either null or the \SimpleXMLElement
     * or IiifResourceInterface that has been loaded to determine the basic document format.
     *
     * @return void
     */
    protected function __construct($uid, $pid, $preloadedDocument)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('tx_dlf_documents');
        // Stays empty unless the document is addressed by its location.
        $location = '';

        if (MathUtility::canBeInterpretedAsInteger($uid)) {
            // Numeric identifier: match the database record by its UID.
            $expr = $queryBuilder->expr();
            $whereClause = $expr->andX(
                $expr->eq('tx_dlf_documents.uid', intval($uid)),
                Helper::whereExpression('tx_dlf_documents')
            );
        } else {
            // Try to load METS file / IIIF manifest. A preloaded document
            // takes precedence over fetching the given URL.
            $isLoaded = $this->setPreloadedDocument($preloadedDocument)
                || (GeneralUtility::isValidUrl($uid) && $this->load($uid));
            if (!$isLoaded) {
                // Loading failed.
                return;
            }
            // Initialize core METS object.
            $this->init();
            if ($this->getDocument() === null) {
                // No METS / IIIF part found.
                return;
            }
            // Cast to string for safety reasons.
            $location = (string) $uid;
            $this->establishRecordId($pid);

            if (empty($location) || empty($this->recordId)) {
                // Can't persistently identify document, don't try to match at all.
                $whereClause = '1=-1';
            } else {
                // Try to match record identifier or location (both should be unique).
                $expr = $queryBuilder->expr();
                $whereClause = $expr->andX(
                    $expr->orX(
                        $expr->eq('tx_dlf_documents.location', $expr->literal($location)),
                        $expr->eq('tx_dlf_documents.record_id', $expr->literal($this->recordId))
                    ),
                    Helper::whereExpression('tx_dlf_documents')
                );
            }
        }

        // Check for PID if needed.
        if ($pid) {
            $whereClause = $queryBuilder->expr()->andX(
                $whereClause,
                $queryBuilder->expr()->eq('tx_dlf_documents.pid', intval($pid))
            );
        }

        // Get document PID and location from database.
        $queryBuilder->select(
            'tx_dlf_documents.uid AS uid',
            'tx_dlf_documents.pid AS pid',
            'tx_dlf_documents.record_id AS record_id',
            'tx_dlf_documents.partof AS partof',
            'tx_dlf_documents.thumbnail AS thumbnail',
            'tx_dlf_documents.location AS location'
        );
        $queryBuilder->from('tx_dlf_documents');
        $queryBuilder->where($whereClause);
        $queryBuilder->setMaxResults(1);
        $result = $queryBuilder->execute();

        $row = $result->fetch();
        if ($row) {
            // A database record exists: take over its values.
            $this->uid = $row['uid'];
            $this->pid = $row['pid'];
            $this->recordId = $row['record_id'];
            $this->parentId = $row['partof'];
            $this->thumbnail = $row['thumbnail'];
            $this->location = $row['location'];
            $this->thumbnailLoaded = true;
            // Load XML file if necessary and set some basic properties.
            if ($this->getDocument() === null && $this->load($this->location)) {
                $this->init();
            }
            // Do we have a METS / IIIF object now?
            if ($this->getDocument() !== null) {
                // The location the document was requested with wins over the stored one.
                if (!empty($location)) {
                    $this->location = $location;
                }
                // Document ready!
                $this->ready = true;
            }
            return;
        }

        if ($this->getDocument() !== null) {
            // Set location as UID for documents not in database.
            $this->uid = $location;
            $this->location = $location;
            // Document ready!
            $this->ready = true;
            return;
        }

        // NOTE(review): $this->logger is not assigned anywhere in this constructor;
        // it has to be set up elsewhere before this line is reached - confirm.
        $this->logger->error('No document with UID ' . $uid . ' found or document not accessible');
    }
1718
1719
    /**
     * Magic method which is invoked whenever an invisible property is read from outside
     *
     * The read is forwarded to the method "_get" followed by the capitalized
     * property name. If the property or that method does not exist, a warning
     * is logged and nothing is returned.
     *
     * @access public
     *
     * @param string $var: Name of variable to get
     *
     * @return mixed Value of $this->$var
     */
    public function __get($var)
    {
        $getter = '_get' . ucfirst($var);
        if (property_exists($this, $var) && method_exists($this, $getter)) {
            return $this->$getter();
        }
        $this->logger->warning('There is no getter function for property "' . $var . '"');
        return;
    }
1741
1742
    /**
     * This magic method is called each time an invisible property is checked for isset() or empty()
     *
     * A property without a matching "_get" method is reported as not set
     * without going through __get(), so that merely probing for a property
     * does not write a "no getter function" warning to the log.
     *
     * @access public
     *
     * @param string $var: Name of variable to check
     *
     * @return bool true if variable is set and not empty, false otherwise
     */
    public function __isset($var)
    {
        $method = '_get' . ucfirst($var);
        if (
            !property_exists($this, $var)
            || !method_exists($this, $method)
        ) {
            // Nothing to look up: report "not set" silently.
            return false;
        }
        return !empty($this->$method());
    }
1755
1756
    /**
     * Magic method which is invoked whenever an invisible property is written from outside
     *
     * The write is forwarded to the method "_set" followed by the capitalized
     * property name. If the property or that method does not exist, a warning
     * is logged and the value is discarded.
     *
     * @access public
     *
     * @param string $var: Name of variable to set
     * @param mixed $value: New value of variable
     *
     * @return void
     */
    public function __set($var, $value)
    {
        $setter = '_set' . ucfirst($var);
        if (property_exists($this, $var) && method_exists($this, $setter)) {
            $this->$setter($value);
            return;
        }
        $this->logger->warning('There is no setter function for property "' . $var . '"');
    }
1778
}
1779