Passed
Pull Request — master (#87)
by Alexander
02:55
created

Document::getXmlObject()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 14
Code Lines 6

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
eloc 6
c 1
b 0
f 0
dl 0
loc 14
rs 10
cc 1
nc 1
nop 1
1
<?php
2
3
/**
4
 * (c) Kitodo. Key to digital objects e.V. <[email protected]>
5
 *
6
 * This file is part of the Kitodo and TYPO3 projects.
7
 *
8
 * @license GNU General Public License version 3 or later.
9
 * For the full copyright and license information, please read the
10
 * LICENSE.txt file that was distributed with this source code.
11
 */
12
13
namespace Kitodo\Dlf\Common;
14
15
use TYPO3\CMS\Core\Configuration\ExtensionConfiguration;
16
use TYPO3\CMS\Core\Database\ConnectionPool;
17
use TYPO3\CMS\Core\Database\Query\Restriction\HiddenRestriction;
18
use TYPO3\CMS\Core\Log\LogManager;
19
use TYPO3\CMS\Core\Utility\GeneralUtility;
20
use TYPO3\CMS\Core\Utility\MathUtility;
21
use Ubl\Iiif\Presentation\Common\Model\Resources\IiifResourceInterface;
22
use Ubl\Iiif\Tools\IiifHelper;
23
24
/**
25
 * Document class for the 'dlf' extension
26
 *
27
 * @author Sebastian Meyer <[email protected]>
28
 * @author Henrik Lochmann <[email protected]>
29
 * @package TYPO3
30
 * @subpackage dlf
31
 * @access public
32
 * @property int $cPid This holds the PID for the configuration
33
 * @property-read bool $hasFulltext Are there any fulltext files available?
34
 * @property-read string $location This holds the documents location
35
 * @property-read array $metadataArray This holds the documents' parsed metadata array
36
 * @property-read int $numPages This holds the total number of pages
37
 * @property-read int $parentId This holds the UID of the parent document or zero if not multi-volumed
38
 * @property-read array $physicalStructure This holds the physical structure
39
 * @property-read array $physicalStructureInfo This holds the physical structure metadata
40
 * @property-read int $pid This holds the PID of the document or zero if not in database
41
 * @property-read bool $ready Is the document instantiated successfully?
42
 * @property-read string $recordId The METS file's / IIIF manifest's record identifier
43
 * @property-read int $rootId This holds the UID of the root document or zero if not multi-volumed
44
 * @property-read array $smLinks This holds the smLinks between logical and physical structMap
45
 * @property-read array $tableOfContents This holds the logical structure
46
 * @property-read string $thumbnail This holds the document's thumbnail location
47
 * @property-read string $toplevelId This holds the toplevel structure's @ID (METS) or the manifest's @id (IIIF)
48
 * @property-read mixed $uid This holds the UID or the URL of the document
49
 * @abstract
50
 */
51
abstract class Document
52
{
53
    /**
54
     * This holds the logger
55
     *
56
     * @var \Psr\Log\LoggerInterface
57
     * @access protected
58
     */
59
    protected $logger;
60
61
    /**
62
     * This holds the PID for the configuration
63
     *
64
     * @var int
65
     * @access protected
66
     */
67
    protected $cPid = 0;
68
69
    /**
70
     * The extension key
71
     *
72
     * @var string
73
     * @access public
74
     */
75
    public static $extKey = 'dlf';
76
77
    /**
78
     * This holds the configuration for all supported metadata encodings
79
     * @see loadFormats()
80
     *
81
     * @var array
82
     * @access protected
83
     */
84
    protected $formats = [
85
        'OAI' => [
86
            'rootElement' => 'OAI-PMH',
87
            'namespaceURI' => 'http://www.openarchives.org/OAI/2.0/',
88
        ],
89
        'METS' => [
90
            'rootElement' => 'mets',
91
            'namespaceURI' => 'http://www.loc.gov/METS/',
92
        ],
93
        'XLINK' => [
94
            'rootElement' => 'xlink',
95
            'namespaceURI' => 'http://www.w3.org/1999/xlink',
96
        ]
97
    ];
98
99
    /**
100
     * Are the available metadata formats loaded?
101
     * @see $formats
102
     *
103
     * @var bool
104
     * @access protected
105
     */
106
    protected $formatsLoaded = false;
107
108
    /**
109
     * Are there any fulltext files available? This also includes IIIF text annotations
110
     * with motivation 'painting' if Kitodo.Presentation is configured to store text
111
     * annotations as fulltext.
112
     *
113
     * @var bool
114
     * @access protected
115
     */
116
    protected $hasFulltext = false;
117
118
    /**
119
     * Last searched logical and physical page
120
     *
121
     * @var array
122
     * @access protected
123
     */
124
    protected $lastSearchedPhysicalPage = ['logicalPage' => null, 'physicalPage' => null];
125
126
    /**
127
     * This holds the documents location
128
     *
129
     * @var string
130
     * @access protected
131
     */
132
    protected $location = '';
133
134
    /**
135
     * This holds the logical units
136
     *
137
     * @var array
138
     * @access protected
139
     */
140
    protected $logicalUnits = [];
141
142
    /**
143
     * This holds the documents' parsed metadata array with their corresponding
144
     * structMap//div's ID (METS) or Range / Manifest / Sequence ID (IIIF) as array key
145
     *
146
     * @var array
147
     * @access protected
148
     */
149
    protected $metadataArray = [];
150
151
    /**
152
     * Is the metadata array loaded?
153
     * @see $metadataArray
154
     *
155
     * @var bool
156
     * @access protected
157
     */
158
    protected $metadataArrayLoaded = false;
159
160
    /**
161
     * This holds the total number of pages
162
     *
163
     * @var int
164
     * @access protected
165
     */
166
    protected $numPages = 0;
167
168
    /**
169
     * This holds the UID of the parent document or zero if not multi-volumed
170
     *
171
     * @var int
172
     * @access protected
173
     */
174
    protected $parentId = 0;
175
176
    /**
177
     * This holds the physical structure
178
     *
179
     * @var array
180
     * @access protected
181
     */
182
    protected $physicalStructure = [];
183
184
    /**
185
     * This holds the physical structure metadata
186
     *
187
     * @var array
188
     * @access protected
189
     */
190
    protected $physicalStructureInfo = [];
191
192
    /**
193
     * Is the physical structure loaded?
194
     * @see $physicalStructure
195
     *
196
     * @var bool
197
     * @access protected
198
     */
199
    protected $physicalStructureLoaded = false;
200
201
    /**
202
     * This holds the PID of the document or zero if not in database
203
     *
204
     * @var int
205
     * @access protected
206
     */
207
    protected $pid = 0;
208
209
    /**
210
     * This holds the documents' raw text pages with their corresponding
211
     * structMap//div's ID (METS) or Range / Manifest / Sequence ID (IIIF) as array key
212
     *
213
     * @var array
214
     * @access protected
215
     */
216
    protected $rawTextArray = [];
217
218
    /**
219
     * Is the document instantiated successfully?
220
     *
221
     * @var bool
222
     * @access protected
223
     */
224
    protected $ready = false;
225
226
    /**
227
     * The METS file's / IIIF manifest's record identifier
228
     *
229
     * @var string
230
     * @access protected
231
     */
232
    protected $recordId;
233
234
    /**
235
     * This holds the singleton object of the document
236
     *
237
     * @var array (\Kitodo\Dlf\Common\Document)
238
     * @static
239
     * @access protected
240
     */
241
    protected static $registry = [];
242
243
    /**
244
     * This holds the UID of the root document or zero if not multi-volumed
245
     *
246
     * @var int
247
     * @access protected
248
     */
249
    protected $rootId = 0;
250
251
    /**
252
     * Is the root id loaded?
253
     * @see $rootId
254
     *
255
     * @var bool
256
     * @access protected
257
     */
258
    protected $rootIdLoaded = false;
259
260
    /**
261
     * This holds the smLinks between logical and physical structMap
262
     *
263
     * @var array
264
     * @access protected
265
     */
266
    protected $smLinks = ['l2p' => [], 'p2l' => []];
267
268
    /**
269
     * Are the smLinks loaded?
270
     * @see $smLinks
271
     *
272
     * @var bool
273
     * @access protected
274
     */
275
    protected $smLinksLoaded = false;
276
277
    /**
278
     * This holds the logical structure
279
     *
280
     * @var array
281
     * @access protected
282
     */
283
    protected $tableOfContents = [];
284
285
    /**
286
     * Is the table of contents loaded?
287
     * @see $tableOfContents
288
     *
289
     * @var bool
290
     * @access protected
291
     */
292
    protected $tableOfContentsLoaded = false;
293
294
    /**
295
     * This holds the document's thumbnail location
296
     *
297
     * @var string
298
     * @access protected
299
     */
300
    protected $thumbnail = '';
301
302
    /**
303
     * Is the document's thumbnail location loaded?
304
     * @see $thumbnail
305
     *
306
     * @var bool
307
     * @access protected
308
     */
309
    protected $thumbnailLoaded = false;
310
311
    /**
312
     * This holds the toplevel structure's @ID (METS) or the manifest's @id (IIIF)
313
     *
314
     * @var string
315
     * @access protected
316
     */
317
    protected $toplevelId = '';
318
319
    /**
320
     * This holds the UID or the URL of the document
321
     *
322
     * @var mixed
323
     * @access protected
324
     */
325
    protected $uid = 0;
326
327
    /**
328
     * This holds the whole XML file as \SimpleXMLElement object
329
     *
330
     * @var \SimpleXMLElement
331
     * @access protected
332
     */
333
    protected $xml;
334
335
    /**
336
     * This clears the static registry to prevent memory exhaustion
337
     *
338
     * @access public
339
     *
340
     * @static
341
     *
342
     * @return void
343
     */
344
    public static function clearRegistry()
    {
        // Forget every cached document instance by replacing the registry
        // with a fresh, empty array.
        self::$registry = [];
    }
349
350
    /**
351
     * This ensures that the recordId, if existent, is retrieved from the document
352
     *
353
     * @access protected
354
     *
355
     * @abstract
356
     *
357
     * @param int $pid: ID of the configuration page with the recordId config
358
     *
359
     */
360
    abstract protected function establishRecordId($pid);
361
362
    /**
363
     * Source document PHP object which is represented by a Document instance
364
     *
365
     * @access protected
366
     *
367
     * @abstract
368
     *
369
     * @return \SimpleXMLElement|IiifResourceInterface A PHP object representation of
370
     * the current document. SimpleXMLElement for METS, IiifResourceInterface for IIIF
371
     */
372
    abstract protected function getDocument();
373
374
    /**
375
     * This gets the location of a downloadable file for a physical page or track
376
     *
377
     * @access public
378
     *
379
     * @abstract
380
     *
381
     * @param string $id: The @ID attribute of the file node (METS) or the @id property of the IIIF resource
382
     *
383
     * @return string    The file's location as URL
384
     */
385
    abstract public function getDownloadLocation($id);
386
387
    /**
388
     * This gets the location of a file representing a physical page or track
389
     *
390
     * @access public
391
     *
392
     * @abstract
393
     *
394
     * @param string $id: The @ID attribute of the file node (METS) or the @id property of the IIIF resource
395
     *
396
     * @return string The file's location as URL
397
     */
398
    abstract public function getFileLocation($id);
399
400
    /**
401
     * This gets the MIME type of a file representing a physical page or track
402
     *
403
     * @access public
404
     *
405
     * @abstract
406
     *
407
     * @param string $id: The @ID attribute of the file node
408
     *
409
     * @return string The file's MIME type
410
     */
411
    abstract public function getFileMimeType($id);
412
413
    /**
414
     * This is a singleton class, thus an instance must be created by this method
415
     *
416
     * @access public
417
     *
418
     * @static
419
     *
420
     * @param mixed $uid: The unique identifier of the document to parse, the URL of XML file or the IRI of the IIIF resource
421
     * @param int $pid: If > 0, then only document with this PID gets loaded
422
     * @param bool $forceReload: Force reloading the document instead of returning the cached instance
423
     *
424
     * @return \Kitodo\Dlf\Common\Document Instance of this class, either MetsDocument or IiifManifest
425
     */
426
    public static function &getInstance($uid, $pid = 0, $forceReload = false)
    {
        // Sanitize input. $pid is not modified again below, so one pass suffices.
        $pid = max(intval($pid), 0);
        if (!$forceReload) {
            $regObj = Helper::digest($uid);
            // isset() avoids "undefined index" notices for documents which are not
            // cached yet; instanceof already rejects anything that is not an object.
            if (
                isset(self::$registry[$regObj])
                && self::$registry[$regObj] instanceof self
            ) {
                // Check if instance has given PID.
                if (
                    !$pid
                    || !self::$registry[$regObj]->pid
                    || $pid == self::$registry[$regObj]->pid
                ) {
                    // Return singleton instance if available.
                    return self::$registry[$regObj];
                }
            } else {
                // Check the user's session...
                // NOTE(review): the session is read with get_called_class() but written
                // with get_class($instance) further down - confirm that both keys match.
                $sessionData = Helper::loadFromSession(get_called_class());
                if (
                    isset($sessionData[$regObj])
                    && $sessionData[$regObj] instanceof self
                ) {
                    // Check if instance has given PID.
                    if (
                        !$pid
                        || !$sessionData[$regObj]->pid
                        || $pid == $sessionData[$regObj]->pid
                    ) {
                        // ...and restore registry.
                        self::$registry[$regObj] = $sessionData[$regObj];
                        return self::$registry[$regObj];
                    }
                }
            }
        }
        // Create new instance depending on format (METS or IIIF) ...
        $instance = null;
        $documentFormat = null;
        $xml = null;
        $iiif = null;
        if (MathUtility::canBeInterpretedAsInteger($uid)) {
            // Numeric identifier: look up the document format in the database.
            $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
                ->getQueryBuilderForTable('tx_dlf_documents');

            $queryBuilder
                ->select(
                    'tx_dlf_documents.location AS location',
                    'tx_dlf_documents.document_format AS document_format'
                )
                ->from('tx_dlf_documents');

            // Always match the UID, restrict to the PID only if one was given.
            $conditions = [
                $queryBuilder->expr()->eq('tx_dlf_documents.uid', intval($uid)),
            ];
            if ($pid) {
                $conditions[] = $queryBuilder->expr()->eq('tx_dlf_documents.pid', $pid);
            }
            $conditions[] = Helper::whereExpression('tx_dlf_documents');

            $result = $queryBuilder
                ->where(...$conditions)
                ->setMaxResults(1)
                ->execute();

            if ($resArray = $result->fetch()) {
                $documentFormat = $resArray['document_format'];
            }
        } else {
            // Get document format from content of remote document
            // Cast to string for safety reasons.
            $location = (string) $uid;
            // Try to load a file from the url
            if (GeneralUtility::isValidUrl($location)) {
                // Load extension configuration
                $extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey);
                // Set user-agent to identify self when fetching XML data.
                if (!empty($extConf['useragent'])) {
                    // Best effort only: a failing ini_set() must not abort loading.
                    @ini_set('user_agent', $extConf['useragent']);
                }
                $content = GeneralUtility::getUrl($location);
                if ($content !== false) {
                    $xml = Helper::getXmlFileAsString($content);
                    if ($xml !== false) {
                        /* @var $xml \SimpleXMLElement */
                        $xml->registerXPathNamespace('mets', 'http://www.loc.gov/METS/');
                        $xpathResult = $xml->xpath('//mets:mets');
                        $documentFormat = !empty($xpathResult) ? 'METS' : null;
                    } else {
                        // Try to load file as IIIF resource instead.
                        $contentAsJsonArray = json_decode($content, true);
                        if ($contentAsJsonArray !== null) {
                            // The extension configuration loaded above is reused here.
                            IiifHelper::setUrlReader(IiifUrlReader::getInstance());
                            IiifHelper::setMaxThumbnailHeight($extConf['iiifThumbnailHeight']);
                            IiifHelper::setMaxThumbnailWidth($extConf['iiifThumbnailWidth']);
                            $iiif = IiifHelper::loadIiifResource($contentAsJsonArray);
                            if ($iiif instanceof IiifResourceInterface) {
                                $documentFormat = 'IIIF';
                            }
                        }
                    }
                }
            }
        }
        // Strict comparison: only the exact format strings select a document class.
        if ($documentFormat === 'METS') {
            $instance = new MetsDocument($uid, $pid, $xml);
        } elseif ($documentFormat === 'IIIF') {
            $instance = new IiifManifest($uid, $pid, $iiif);
        }
        // Save instance to registry.
        if (
            $instance instanceof self
            && $instance->ready
        ) {
            self::$registry[Helper::digest($instance->uid)] = $instance;
            // Register the instance under its location as well, if that differs from the UID.
            if ($instance->uid != $instance->location) {
                self::$registry[Helper::digest($instance->location)] = $instance;
            }
            // Load extension configuration
            $extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey);
            // Save registry to session if caching is enabled.
            if (!empty($extConf['caching'])) {
                Helper::saveToSession(self::$registry, get_class($instance));
            }
            $instance->logger = GeneralUtility::makeInstance(LogManager::class)->getLogger(get_class($instance));
        }
        // Return new instance (null if the document format could not be determined).
        return $instance;
    }
569
570
    /**
571
     * This gets details about a logical structure element
572
     *
573
     * @access public
574
     *
575
     * @abstract
576
     *
577
     * @param string $id: The @ID attribute of the logical structure node (METS) or
578
     * the @id property of the Manifest / Range (IIIF)
579
     * @param bool $recursive: Whether to include the child elements / resources
580
     *
581
     * @return array Array of the element's id, label, type and physical page indexes/mptr link
582
     */
583
    abstract public function getLogicalStructure($id, $recursive = false);
584
585
    /**
586
     * This extracts all the metadata for a logical structure node
587
     *
588
     * @access public
589
     *
590
     * @abstract
591
     *
592
     * @param string $id: The @ID attribute of the logical structure node (METS) or the @id property
593
     * of the Manifest / Range (IIIF)
594
     * @param int $cPid: The PID for the metadata definitions
595
     *                       (defaults to $this->cPid or $this->pid)
596
     *
597
     * @return array The logical structure node's / the IIIF resource's parsed metadata array
598
     */
599
    abstract public function getMetadata($id, $cPid = 0);
600
601
    /**
602
     * This returns the first corresponding physical page number of a given logical page label
603
     *
604
     * @access public
605
     *
606
     * @param string $logicalPage: The label (or a part of the label) of the logical page
607
     *
608
     * @return int The physical page number
609
     */
610
    public function getPhysicalPage($logicalPage)
    {
        // Answer from the one-entry cache if the same label was searched last time.
        if (
            !empty($this->lastSearchedPhysicalPage['logicalPage'])
            && $this->lastSearchedPhysicalPage['logicalPage'] == $logicalPage
        ) {
            return $this->lastSearchedPhysicalPage['physicalPage'];
        }
        // Walk the physical structure by position (starting at 0, array keys are
        // ignored) and stop at the first order label containing the search string.
        foreach (array_values($this->physicalStructureInfo) as $position => $pageInfo) {
            if (strpos($pageInfo['orderlabel'], $logicalPage) === false) {
                continue;
            }
            // Remember the hit for the next call.
            $this->lastSearchedPhysicalPage['logicalPage'] = $logicalPage;
            $this->lastSearchedPhysicalPage['physicalPage'] = $position;
            return $position;
        }
        // No order label matched: fall back to 1.
        return 1;
    }
630
631
    /**
632
     * This extracts the OCR full text for a physical structure node / IIIF Manifest / Canvas. Text might be
633
     * given as ALTO for METS or as annotations or ALTO for IIIF resources.
634
     *
635
     * @access public
636
     *
637
     * @abstract
638
     *
639
     * @param string $id: The @ID attribute of the physical structure node (METS) or the @id property
640
     * of the Manifest / Range (IIIF)
641
     *
642
     * @return string The OCR full text
643
     */
644
    abstract public function getFullText($id);
645
646
    /**
647
     * This extracts the OCR full text for a physical structure node / IIIF Manifest / Canvas from an
648
     * XML full text representation (currently only ALTO). For IIIF manifests, ALTO documents have
649
     * to be given in the Canvas' / Manifest's "seeAlso" property.
650
     *
651
     * @param string $id: The @ID attribute of the physical structure node (METS) or the @id property
652
     * of the Manifest / Range (IIIF)
653
     *
654
     * @return string The OCR full text
655
     */
656
    protected function getFullTextFromXml($id)
    {
        $fullText = '';
        // Both variables are read after the file group loop; initialise them so they
        // are defined even if none of the configured file groups has a file for $id.
        $fileContent = '';
        $textFormat = '';
        // Load available text formats, ...
        $this->loadFormats();
        // ... physical structure ...
        $this->_getPhysicalStructure();
        // ... and extension configuration.
        $extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey);
        $fileGrpsFulltext = GeneralUtility::trimExplode(',', $extConf['fileGrpFulltext']);
        if (empty($this->physicalStructureInfo[$id])) {
            $this->logger->warning('Invalid structure node @ID "' . $id . '"');
            return $fullText;
        }
        // Use the first configured file group which has a file for this node.
        // The loop ends at the first empty entry of the configured list.
        while ($fileGrpFulltext = array_shift($fileGrpsFulltext)) {
            if (empty($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext])) {
                continue;
            }
            // Get full text file.
            $fileContent = GeneralUtility::getUrl($this->getFileLocation($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext]));
            if ($fileContent === false) {
                $this->logger->warning('Couldn\'t load full text file for structure node @ID "' . $id . '"');
                return $fullText;
            }
            $textFormat = $this->getTextFormat($fileContent);
            break;
        }
        // Is this text format supported?
        // This part actually differs from previous version of indexed OCR
        if (!empty($fileContent) && !empty($this->formats[$textFormat])) {
            $textMiniOcr = '';
            if (!empty($this->formats[$textFormat]['class'])) {
                $class = $this->formats[$textFormat]['class'];
                // Get the raw text from class.
                if (
                    class_exists($class)
                    && ($obj = GeneralUtility::makeInstance($class)) instanceof FulltextInterface
                ) {
                    // Load XML from file.
                    $ocrTextXml = Helper::getXmlFileAsString($fileContent);
                    $textMiniOcr = $obj->getTextAsMiniOcr($ocrTextXml);
                    $this->rawTextArray[$id] = $textMiniOcr;
                } else {
                    // NOTE(review): the message names getRawText() although
                    // getTextAsMiniOcr() is the method called above.
                    $this->logger->warning('Invalid class/method "' . $class . '->getRawText()" for text format "' . $textFormat . '"');
                }
            }
            $fullText = $textMiniOcr;
        } else {
            $this->logger->warning('Unsupported text format "' . $textFormat . '" in physical node with @ID "' . $id . '"');
        }
        return $fullText;
    }
709
710
    /**
711
     * Get format of the OCR full text
712
     *
713
     * @access private
714
     *
715
     * @param string $fileContent: content of the XML file
716
     *
717
     * @return string The format of the OCR full text
718
     */
719
    private function getTextFormat($fileContent)
    {
        $xml = Helper::getXmlFileAsString($fileContent);
        // The helper yields false for content which can't be parsed as XML; report
        // "no format" then instead of calling a method on a non-object.
        if (!is_object($xml)) {
            return '';
        }
        // Get the root element's name as text format.
        return strtoupper($xml->getName());
    }
724
725
    /**
726
     * This determines a title for the given document
727
     *
728
     * @access public
729
     *
730
     * @static
731
     *
732
     * @param int $uid: The UID of the document
733
     * @param bool $recursive: Search superior documents for a title, too?
734
     *
735
     * @return string The title of the document itself or a parent document
736
     */
737
    public static function getTitle($uid, $recursive = false)
    {
        $logger = GeneralUtility::makeInstance(LogManager::class)->getLogger(__CLASS__);

        // Sanitize input: anything which is not a positive integer becomes 0.
        $uid = max(intval($uid), 0);
        if (!$uid) {
            $logger->error('Invalid UID ' . $uid . ' for document');
            return '';
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('tx_dlf_documents');

        // Fetch title and parent reference of the requested document.
        $row = $queryBuilder
            ->select(
                'tx_dlf_documents.title',
                'tx_dlf_documents.partof'
            )
            ->from('tx_dlf_documents')
            ->where(
                $queryBuilder->expr()->eq('tx_dlf_documents.uid', $uid),
                Helper::whereExpression('tx_dlf_documents')
            )
            ->setMaxResults(1)
            ->execute()
            ->fetch();

        if (!$row) {
            $logger->warning('No document with UID ' . $uid . ' found or document not accessible');
            return '';
        }

        $title = $row['title'];
        $partof = $row['partof'];
        // Ask the parent document only if requested, if there is no own title and
        // if the record references a parent other than itself.
        $askParent = $recursive
            && empty($title)
            && intval($partof)
            && $partof != $uid;

        return $askParent ? self::getTitle($partof, true) : $title;
    }
782
783
    /**
784
     * This extracts all the metadata for the toplevel logical structure node / resource
785
     *
786
     * @access public
787
     *
788
     * @param int $cPid: The PID for the metadata definitions
789
     *
790
     * @return array The logical structure node's / resource's parsed metadata array
791
     */
792
    public function getTitledata($cPid = 0)
    {
        // Parse the metadata of the toplevel structure node / resource.
        $titledata = $this->getMetadata($this->_getToplevelId(), $cPid);
        // Add information from METS structural map to titledata array.
        if ($this instanceof MetsDocument) {
            $this->addMetadataFromMets($titledata, $this->_getToplevelId());
        }
        // The record identifier gets prepended only if the parsed metadata has a
        // 'record_id' entry which doesn't contain it yet.
        $prependRecordId = is_array($titledata)
            && array_key_exists('record_id', $titledata)
            && !empty($this->recordId)
            && !in_array($this->recordId, $titledata['record_id']);
        if ($prependRecordId) {
            array_unshift($titledata['record_id'], $this->recordId);
        }
        return $titledata;
    }
813
814
    /**
815
     * Traverse a logical (sub-) structure tree to find the structure with the requested logical id and return its depth.
816
     *
817
     * @access protected
818
     *
819
     * @param array $structure: logical structure array
820
     * @param int $depth: current tree depth
821
     * @param string $logId: ID of the logical structure whose depth is requested
822
     *
823
     * @return int|bool: false if structure with $logId is not a child of this substructure,
824
     * or the actual depth.
825
     */
826
    protected function getTreeDepth($structure, $depth, $logId)
    {
        foreach ($structure as $node) {
            // Found on the current level.
            if ($node['id'] == $logId) {
                return $depth;
            }
            // Nodes without a 'children' entry can't contain the requested structure.
            if (!array_key_exists('children', $node)) {
                continue;
            }
            // Descend one level; the first hit in any subtree ends the search.
            $depthInSubtree = $this->getTreeDepth($node['children'], $depth + 1, $logId);
            if ($depthInSubtree !== false) {
                return $depthInSubtree;
            }
        }
        // Not part of this (sub-) structure.
        return false;
    }
840
841
    /**
842
     * Get the tree depth of a logical structure element within the table of content
843
     *
844
     * @access public
845
     *
846
     * @param string $logId: The id of the logical structure element whose depth is requested
847
     * @return int|bool tree depth as integer or false if no element with $logId exists within the TOC.
848
     */
849
    public function getStructureDepth($logId)
    {
        // The entries of the table of contents themselves count as depth 1.
        $tableOfContents = $this->_getTableOfContents();
        return $this->getTreeDepth($tableOfContents, 1, $logId);
    }
853
854
    /**
855
     * This sets some basic class properties
856
     *
857
     * @access protected
858
     *
859
     * @abstract
860
     *
861
     * @return void
862
     */
863
    abstract protected function init();
864
865
    /**
866
     * Reuse any document object that might have been already loaded to determine whether document is METS or IIIF
867
     *
868
     * @access protected
869
     *
870
     * @abstract
871
     *
872
     * @param \SimpleXMLElement|IiifResourceInterface $preloadedDocument: any instance that has already been loaded
873
     *
874
     * @return bool true if $preloadedDocument can actually be reused, false if it has to be loaded again
875
     */
876
    abstract protected function setPreloadedDocument($preloadedDocument);
877
878
    /**
879
     * METS/IIIF specific part of loading a location
880
     *
881
     * @access protected
882
     *
883
     * @abstract
884
     *
885
     * @param string $location: The URL of the file to load
886
     *
887
     * @return bool true on success or false on failure
888
     */
889
    abstract protected function loadLocation($location);
890
891
    /**
892
     * Load XML file / IIIF resource from URL
893
     *
894
     * @access protected
895
     *
896
     * @param string $location: The URL of the file to load
897
     *
898
     * @return bool true on success or false on failure
899
     */
900
    protected function load($location)
    {
        // Only valid URLs can be loaded as XML / JSON-LD file.
        if (!GeneralUtility::isValidUrl($location)) {
            $this->logger->error('Invalid file location "' . $location . '" for document loading');
            return false;
        }
        // Load extension configuration
        $extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey);
        // Set user-agent to identify self when fetching XML / JSON-LD data.
        if (!empty($extConf['useragent'])) {
            // Errors are suppressed on purpose and the result is not checked.
            @ini_set('user_agent', $extConf['useragent']);
        }
        // The actual loading is format specific.
        return $this->loadLocation($location);
    }
917
918
    /**
     * Analyze the document to find out whether it contains any fulltext that needs to be indexed.
     *
     * Called by _getHasFulltext() before $this->hasFulltext is returned.
     *
     * @access protected
     *
     * @abstract
     */
    abstract protected function ensureHasFulltextIsSet();
926
927
    /**
     * Register all available data formats
     *
     * Reads the format records stored on PID 0 of table "tx_dlf_formats" and
     * fills $this->formats with them, keyed by format type. The query is only
     * run once per instance; later calls return immediately.
     *
     * @access protected
     *
     * @return void
     */
    protected function loadFormats()
    {
        // The registry has already been filled, nothing left to do.
        if ($this->formatsLoaded) {
            return;
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('tx_dlf_formats');

        // Fetch the available data formats from the database.
        $statement = $queryBuilder
            ->select(
                'tx_dlf_formats.type AS type',
                'tx_dlf_formats.root AS root',
                'tx_dlf_formats.namespace AS namespace',
                'tx_dlf_formats.class AS class'
            )
            ->from('tx_dlf_formats')
            ->where(
                $queryBuilder->expr()->eq('tx_dlf_formats.pid', 0)
            )
            ->execute();

        // Add one registry entry per format record.
        while ($format = $statement->fetch()) {
            $this->formats[$format['type']] = [
                'rootElement' => $format['root'],
                'namespaceURI' => $format['namespace'],
                'class' => $format['class']
            ];
        }

        $this->formatsLoaded = true;
    }
965
966
    /**
     * Register all available namespaces for a \SimpleXMLElement or \DOMXPath object
     *
     * Every registered format's lower-cased type is used as the namespace prefix.
     * Objects of any other class are rejected with an error log entry.
     *
     * @access public
     *
     * @param \SimpleXMLElement|\DOMXPath &$obj: \SimpleXMLElement or \DOMXPath object
     *
     * @return void
     */
    public function registerNamespaces(&$obj)
    {
        // TODO Check usage. XML specific method does not seem to be used anywhere outside this class within the project, but it is public and may be used by extensions.
        $this->loadFormats();

        // Only \SimpleXMLElement and \DOMXPath objects are supported.
        $isSimpleXml = $obj instanceof \SimpleXMLElement;
        if (!$isSimpleXml && !($obj instanceof \DOMXPath)) {
            $this->logger->error('Given object is neither a SimpleXMLElement nor a DOMXPath instance');
            return;
        }

        // Both classes offer the same functionality under different method names.
        $registrar = $isSimpleXml ? 'registerXPathNamespace' : 'registerNamespace';

        // Register metadata format's namespaces.
        foreach ($this->formats as $prefix => $format) {
            $obj->{$registrar}(strtolower($prefix), $format['namespaceURI']);
        }
    }
993
994
    /**
     * This saves the document to the database and index
     *
     * The method resolves the structure type, the collections and the owning
     * library to their UIDs (creating missing collections / libraries on the
     * fly), writes the document record via Helper::processDBasAdmin() and
     * finally hands the document over to the Solr indexer.
     *
     * Note that false is also returned if no Solr core is given, although the
     * database record has already been processed at that point.
     *
     * @access public
     *
     * @param int $pid: The PID of the saved record
     * @param int $core: The UID of the Solr core for indexing
     * @param int|string $owner: UID or index_name of owner to set while indexing
     *
     * @return bool true on success or false on failure
     */
    public function save($pid = 0, $core = 0, $owner = null)
    {
        if (\TYPO3_MODE !== 'BE') {
            $this->logger->error('Saving a document is only allowed in the backend');
            return false;
        }
        // Make sure $pid is a non-negative integer.
        $pid = max(intval($pid), 0);
        // Make sure $core is a non-negative integer.
        $core = max(intval($core), 0);
        // If $pid is not given, try to get it elsewhere.
        if (
            !$pid
            && $this->pid
        ) {
            // Retain current PID.
            $pid = $this->pid;
        } elseif (!$pid) {
            $this->logger->error('Invalid PID ' . $pid . ' for document saving');
            return false;
        }
        // Set PID for metadata definitions.
        $this->cPid = $pid;
        // Set UID placeholder if not updating existing record.
        if ($pid != $this->pid) {
            $this->uid = uniqid('NEW');
        }
        // Get metadata array.
        $metadata = $this->getTitledata($pid);
        // Check for record identifier.
        if (empty($metadata['record_id'][0])) {
            $this->logger->error('No record identifier found to avoid duplication');
            return false;
        }
        // Load plugin configuration.
        $conf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey);

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('tx_dlf_structures');

        // Get UID for structure type.
        $result = $queryBuilder
            ->select('tx_dlf_structures.uid AS uid')
            ->from('tx_dlf_structures')
            ->where(
                $queryBuilder->expr()->eq('tx_dlf_structures.pid', intval($pid)),
                $queryBuilder->expr()->eq('tx_dlf_structures.index_name', $queryBuilder->expr()->literal($metadata['type'][0])),
                Helper::whereExpression('tx_dlf_structures')
            )
            ->setMaxResults(1)
            ->execute();

        if ($resArray = $result->fetch()) {
            $structure = $resArray['uid'];
        } else {
            $this->logger->error('Could not identify document/structure type "' . $queryBuilder->expr()->literal($metadata['type'][0]) . '"');
            return false;
        }
        $metadata['type'][0] = $structure;

        // Remove appended "valueURI" from authors' names for storing in database.
        // (Name and URI are separated by the ASCII unit separator, chr(31).)
        foreach ($metadata['author'] as $i => $author) {
            $splitName = explode(chr(31), $author);
            $metadata['author'][$i] = $splitName[0];
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('tx_dlf_collections');
        // Get hidden records, too.
        $queryBuilder
            ->getRestrictions()
            ->removeByType(HiddenRestriction::class);

        // Get UIDs for collections.
        $result = $queryBuilder
            ->select(
                'tx_dlf_collections.index_name AS index_name',
                'tx_dlf_collections.uid AS uid'
            )
            ->from('tx_dlf_collections')
            ->where(
                $queryBuilder->expr()->eq('tx_dlf_collections.pid', intval($pid)),
                $queryBuilder->expr()->in('tx_dlf_collections.sys_language_uid', [-1, 0])
            )
            ->execute();

        $collUid = [];
        while ($resArray = $result->fetch()) {
            $collUid[$resArray['index_name']] = $resArray['uid'];
        }
        $collections = [];
        foreach ($metadata['collection'] as $collection) {
            if (!empty($collUid[$collection])) {
                // Add existing collection's UID.
                $collections[] = $collUid[$collection];
            } else {
                // Insert new collection.
                $collNewUid = uniqid('NEW');
                // The data array is built from scratch for every new collection,
                // so a previously inserted collection can never be inserted twice.
                $collData = [
                    'tx_dlf_collections' => [
                        $collNewUid => [
                            'pid' => $pid,
                            'label' => $collection,
                            'index_name' => $collection,
                            'oai_name' => (!empty($conf['publishNewCollections']) ? Helper::getCleanString($collection) : ''),
                            'description' => '',
                            'documents' => 0,
                            'owner' => 0,
                            'status' => 0,
                        ],
                    ],
                ];
                $substUid = Helper::processDBasAdmin($collData);
                // Add new collection's UID.
                $collections[] = $substUid[$collNewUid];
                if (!(\TYPO3_REQUESTTYPE & \TYPO3_REQUESTTYPE_CLI)) {
                    Helper::addMessage(
                        htmlspecialchars(sprintf(Helper::getMessage('flash.newCollection'), $collection, $substUid[$collNewUid])),
                        Helper::getMessage('flash.attention', true),
                        \TYPO3\CMS\Core\Messaging\FlashMessage::INFO,
                        true
                    );
                }
            }
        }
        $metadata['collection'] = $collections;

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('tx_dlf_libraries');

        // Get UID for owner.
        if (empty($owner)) {
            // Fall back to the owner found in the metadata and only use
            // "default" if the metadata does not name an owner either.
            $owner = !empty($metadata['owner'][0]) ? $metadata['owner'][0] : 'default';
        }
        if (!MathUtility::canBeInterpretedAsInteger($owner)) {
            $result = $queryBuilder
                ->select('tx_dlf_libraries.uid AS uid')
                ->from('tx_dlf_libraries')
                ->where(
                    $queryBuilder->expr()->eq('tx_dlf_libraries.pid', intval($pid)),
                    $queryBuilder->expr()->eq('tx_dlf_libraries.index_name', $queryBuilder->expr()->literal($owner)),
                    Helper::whereExpression('tx_dlf_libraries')
                )
                ->setMaxResults(1)
                ->execute();

            if ($resArray = $result->fetch()) {
                $ownerUid = $resArray['uid'];
            } else {
                // Insert new library.
                $libNewUid = uniqid('NEW');
                $libData = [
                    'tx_dlf_libraries' => [
                        $libNewUid => [
                            'pid' => $pid,
                            'label' => $owner,
                            'index_name' => $owner,
                            'website' => '',
                            'contact' => '',
                            'image' => '',
                            'oai_label' => '',
                            'oai_base' => '',
                            'opac_label' => '',
                            'opac_base' => '',
                            'union_label' => '',
                            'union_base' => '',
                        ],
                    ],
                ];
                $substUid = Helper::processDBasAdmin($libData);
                // Add new library's UID.
                $ownerUid = $substUid[$libNewUid];
                if (!(\TYPO3_REQUESTTYPE & \TYPO3_REQUESTTYPE_CLI)) {
                    Helper::addMessage(
                        htmlspecialchars(sprintf(Helper::getMessage('flash.newLibrary'), $owner, $ownerUid)),
                        Helper::getMessage('flash.attention', true),
                        \TYPO3\CMS\Core\Messaging\FlashMessage::INFO,
                        true
                    );
                }
            }
            $owner = $ownerUid;
        }
        $metadata['owner'][0] = $owner;
        // Get UID of parent document.
        $partof = $this->getParentDocumentUidForSaving($pid, $core, $owner);
        // Use the date of publication or title as alternative sorting metric for parts of multi-part works.
        if (!empty($partof)) {
            if (
                empty($metadata['volume'][0])
                && !empty($metadata['year'][0])
            ) {
                $metadata['volume'] = $metadata['year'];
            }
            if (empty($metadata['volume_sorting'][0])) {
                // If METS @ORDER is given it is preferred over year_sorting and year.
                if (!empty($metadata['mets_order'][0])) {
                    $metadata['volume_sorting'][0] = $metadata['mets_order'][0];
                } elseif (!empty($metadata['year_sorting'][0])) {
                    $metadata['volume_sorting'][0] = $metadata['year_sorting'][0];
                } elseif (!empty($metadata['year'][0])) {
                    $metadata['volume_sorting'][0] = $metadata['year'][0];
                }
            }
            // If volume_sorting is still empty, try to use title_sorting or METS @ORDERLABEL finally (workaround for newspapers)
            if (empty($metadata['volume_sorting'][0])) {
                if (!empty($metadata['title_sorting'][0])) {
                    $metadata['volume_sorting'][0] = $metadata['title_sorting'][0];
                } elseif (!empty($metadata['mets_orderlabel'][0])) {
                    $metadata['volume_sorting'][0] = $metadata['mets_orderlabel'][0];
                }
            }
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('tx_dlf_metadata');

        // Get metadata for lists and sorting.
        $result = $queryBuilder
            ->select(
                'tx_dlf_metadata.index_name AS index_name',
                'tx_dlf_metadata.is_listed AS is_listed',
                'tx_dlf_metadata.is_sortable AS is_sortable'
            )
            ->from('tx_dlf_metadata')
            ->where(
                $queryBuilder->expr()->orX(
                    $queryBuilder->expr()->eq('tx_dlf_metadata.is_listed', 1),
                    $queryBuilder->expr()->eq('tx_dlf_metadata.is_sortable', 1)
                ),
                $queryBuilder->expr()->eq('tx_dlf_metadata.pid', intval($pid)),
                Helper::whereExpression('tx_dlf_metadata')
            )
            ->execute();

        $listed = [];
        $sortable = [];

        while ($resArray = $result->fetch()) {
            if (!empty($metadata[$resArray['index_name']])) {
                if ($resArray['is_listed']) {
                    $listed[$resArray['index_name']] = $metadata[$resArray['index_name']];
                }
                if ($resArray['is_sortable']) {
                    // Only the first value is used for sorting.
                    $sortable[$resArray['index_name']] = $metadata[$resArray['index_name']][0];
                }
            }
        }
        // Fill data array.
        $data = [
            'tx_dlf_documents' => [
                $this->uid => [
                    'pid' => $pid,
                    $GLOBALS['TCA']['tx_dlf_documents']['ctrl']['enablecolumns']['starttime'] => 0,
                    $GLOBALS['TCA']['tx_dlf_documents']['ctrl']['enablecolumns']['endtime'] => 0,
                    'prod_id' => $metadata['prod_id'][0],
                    'location' => $this->location,
                    'record_id' => $metadata['record_id'][0],
                    'opac_id' => $metadata['opac_id'][0],
                    'union_id' => $metadata['union_id'][0],
                    'urn' => $metadata['urn'][0],
                    'purl' => $metadata['purl'][0],
                    'title' => $metadata['title'][0],
                    'title_sorting' => $metadata['title_sorting'][0],
                    'author' => implode('; ', $metadata['author']),
                    'year' => implode('; ', $metadata['year']),
                    'place' => implode('; ', $metadata['place']),
                    'thumbnail' => $this->_getThumbnail(true),
                    'metadata' => serialize($listed),
                    'metadata_sorting' => serialize($sortable),
                    'structure' => $metadata['type'][0],
                    'partof' => $partof,
                    'volume' => $metadata['volume'][0],
                    'volume_sorting' => $metadata['volume_sorting'][0],
                    'license' => $metadata['license'][0],
                    'terms' => $metadata['terms'][0],
                    'restrictions' => $metadata['restrictions'][0],
                    'out_of_print' => $metadata['out_of_print'][0],
                    'rights_info' => $metadata['rights_info'][0],
                    'collections' => $metadata['collection'],
                    'mets_label' => $metadata['mets_label'][0],
                    'mets_orderlabel' => $metadata['mets_orderlabel'][0],
                    'mets_order' => $metadata['mets_order'][0],
                    'owner' => $metadata['owner'][0],
                    'solrcore' => $core,
                    'status' => 0,
                    'document_format' => $metadata['document_format'][0],
                ],
            ],
        ];
        // Unhide hidden documents.
        if (!empty($conf['unhideOnIndex'])) {
            $data['tx_dlf_documents'][$this->uid][$GLOBALS['TCA']['tx_dlf_documents']['ctrl']['enablecolumns']['disabled']] = 0;
        }
        // Process data.
        $newIds = Helper::processDBasAdmin($data);
        // Replace placeholder with actual UID.
        if (strpos($this->uid, 'NEW') === 0) {
            $this->uid = $newIds[$this->uid];
            $this->pid = $pid;
            $this->parentId = $partof;
        }
        if (!(\TYPO3_REQUESTTYPE & \TYPO3_REQUESTTYPE_CLI)) {
            Helper::addMessage(
                htmlspecialchars(sprintf(Helper::getMessage('flash.documentSaved'), $metadata['title'][0], $this->uid)),
                Helper::getMessage('flash.done', true),
                \TYPO3\CMS\Core\Messaging\FlashMessage::OK,
                true
            );
        }
        // Add document to index.
        if ($core) {
            return Indexer::add($this, $core);
        } else {
            $this->logger->notice('Invalid UID "' . $core . '" for Solr core');
            return false;
        }
    }
1313
1314
    /**
     * Get the ID of the parent document if the current document has one. Also save a parent document
     * to the database and the Solr index if their $pid and the current $pid differ.
     * Currently only applies to METS documents.
     *
     * save() passes on its own (already resolved) values for all three parameters.
     *
     * @access protected
     *
     * @abstract
     *
     * @param int $pid: The PID the current document gets saved to
     * @param int $core: The UID of the Solr core for indexing
     * @param int|string $owner: The owner as determined by save()
     *
     * @return int The parent document's id.
     */
    abstract protected function getParentDocumentUidForSaving($pid, $core, $owner);
1326
1327
    /**
     * Getter for $this->cPid, invoked via __get()
     *
     * @access protected
     *
     * @return int The PID of the metadata definitions
     */
    protected function _getCPid()
    {
        return $this->cPid;
    }
1338
1339
    /**
     * Getter for $this->hasFulltext, invoked via __get()
     *
     * ensureHasFulltextIsSet() is called first, so the format specific
     * implementation can determine the value before it is returned.
     *
     * @access protected
     *
     * @return bool Are there any fulltext files available?
     */
    protected function _getHasFulltext()
    {
        $this->ensureHasFulltextIsSet();
        return $this->hasFulltext;
    }
1351
1352
    /**
     * Getter for $this->location, invoked via __get()
     *
     * @access protected
     *
     * @return string The location of the document
     */
    protected function _getLocation()
    {
        return $this->location;
    }
1363
1364
    /**
     * Format specific part of building the document's metadata array
     *
     * Called by _getMetadataArray() whenever the metadata array has to be (re)built.
     *
     * @access protected
     *
     * @abstract
     *
     * @param int $cPid: The PID of the metadata definitions to use
     */
    abstract protected function prepareMetadataArray($cPid);
1374
1375
    /**
     * This builds an array of the document's metadata
     *
     * The array is rebuilt if it has not been loaded yet or if it was built for
     * a different PID; the PID it was built for is kept in element 0.
     *
     * @access protected
     *
     * @return array Array of metadata with their corresponding logical structure node ID as key
     * (empty if no valid PID for the metadata definitions is available)
     */
    protected function _getMetadataArray()
    {
        // Prefer the configured metadata definitions' PID, fall back to the document's PID.
        $cPid = $this->cPid ?: $this->pid;
        if (!$cPid) {
            $this->logger->error('Invalid PID ' . $cPid . ' for metadata definitions');
            return [];
        }
        // Reuse the array only if it was built for the very same PID.
        $isUpToDate = $this->metadataArrayLoaded && $this->metadataArray[0] == $cPid;
        if (!$isUpToDate) {
            $this->prepareMetadataArray($cPid);
            $this->metadataArray[0] = $cPid;
            $this->metadataArrayLoaded = true;
        }
        return $this->metadataArray;
    }
1400
1401
    /**
     * Getter for $this->numPages, invoked via __get()
     *
     * The physical structure is requested first; presumably building it is
     * what sets $this->numPages (to be confirmed in the implementing classes).
     *
     * @access protected
     *
     * @return int The total number of pages and/or tracks
     */
    protected function _getNumPages()
    {
        $this->_getPhysicalStructure();
        return $this->numPages;
    }
1413
1414
    /**
     * Getter for $this->parentId, invoked via __get()
     *
     * @access protected
     *
     * @return int The UID of the parent document or zero if not applicable
     */
    protected function _getParentId()
    {
        return $this->parentId;
    }
1425
1426
    /**
     * Format specific building of the array of the document's physical structure
     *
     * @access protected
     *
     * @abstract
     *
     * @return array Array of physical elements' id, type, label and file representations ordered
     * by @ORDER attribute / IIIF Sequence's Canvases
     */
    abstract protected function _getPhysicalStructure();
1437
1438
    /**
     * This gives an array of the document's physical structure metadata
     *
     * If the physical structure has not been loaded yet, it is built first
     * by calling _getPhysicalStructure().
     *
     * @access protected
     *
     * @return array Array of elements' type, label and file representations ordered by @ID attribute / Canvas order
     */
    protected function _getPhysicalStructureInfo()
    {
        // Build the physical structure array on first access.
        if ($this->physicalStructureLoaded === false || !$this->physicalStructureLoaded) {
            $this->_getPhysicalStructure();
        }
        return $this->physicalStructureInfo;
    }
1454
1455
    /**
     * Getter for $this->pid, invoked via __get()
     *
     * @access protected
     *
     * @return int The PID of the document or zero if not in database
     */
    protected function _getPid()
    {
        return $this->pid;
    }
1466
1467
    /**
     * Getter for $this->ready, invoked via __get()
     *
     * @access protected
     *
     * @return bool Is the document instantiated successfully?
     */
    protected function _getReady()
    {
        return $this->ready;
    }
1478
1479
    /**
     * Getter for $this->recordId, invoked via __get()
     *
     * @access protected
     *
     * @return mixed The METS file's / IIIF manifest's record identifier
     */
    protected function _getRecordId()
    {
        return $this->recordId;
    }
1490
1491
    /**
     * Getter for $this->rootId, invoked via __get()
     *
     * On first access the value is taken over from the parent document's
     * rootId, provided the document has a parent at all. Afterwards the
     * stored value is returned without any further lookup.
     *
     * @access protected
     *
     * @return int The UID of the root document or zero if not applicable
     */
    protected function _getRootId()
    {
        // The lookup has been done before, return the stored value.
        if ($this->rootIdLoaded) {
            return $this->rootId;
        }
        if ($this->parentId) {
            // Take over the root ID from the parent document.
            $this->rootId = self::getInstance($this->parentId, $this->pid)->rootId;
        }
        $this->rootIdLoaded = true;
        return $this->rootId;
    }
1509
1510
    /**
     * This returns the smLinks between logical and physical structMap (METS) and models the
     * relation between IIIF Canvases and Manifests / Ranges in the same way
     *
     * @access protected
     *
     * @abstract
     *
     * @return array The links between logical and physical nodes / Range, Manifest and Canvas
     */
    abstract protected function _getSmLinks();
1521
1522
    /**
     * This builds an array of the document's logical structure
     *
     * On first access getLogicalStructure('', true) is called to collect all
     * logical structures; later calls only return $this->tableOfContents.
     *
     * @access protected
     *
     * @return array Array of structure nodes' id, label, type and physical page indexes/mptr / Canvas link with original hierarchy preserved
     */
    protected function _getTableOfContents()
    {
        // The logical structure array has been requested before.
        if ($this->tableOfContentsLoaded) {
            return $this->tableOfContents;
        }
        // Get all logical structures.
        $this->getLogicalStructure('', true);
        $this->tableOfContentsLoaded = true;
        return $this->tableOfContents;
    }
1539
1540
    /**
     * Format specific retrieval of the document's thumbnail location
     *
     * @access protected
     *
     * @abstract
     *
     * @param bool $forceReload: Force reloading the thumbnail instead of returning the cached value
     *
     * @return string The document's thumbnail location
     */
    abstract protected function _getThumbnail($forceReload = false);
1552
1553
    /**
     * Format specific retrieval of the ID of the toplevel logical structure node
     *
     * @access protected
     *
     * @abstract
     *
     * @return string The logical structure node's ID
     */
    abstract protected function _getToplevelId();
1563
1564
    /**
     * Getter for $this->uid, invoked via __get()
     *
     * @access protected
     *
     * @return mixed The UID or the URL of the document
     */
    protected function _getUid()
    {
        return $this->uid;
    }
1575
1576
    /**
     * Setter for $this->cPid, invoked via __set()
     *
     * The given value is cast to an integer; negative values are replaced by zero.
     *
     * @access protected
     *
     * @param int $value: The new PID for the metadata definitions
     *
     * @return void
     */
    protected function _setCPid($value)
    {
        $newPid = (int) $value;
        $this->cPid = max($newPid, 0);
    }
1589
1590
    /**
     * Magic method that is invoked whenever the object is cloned
     *
     * @access protected
     *
     * @return void
     */
    protected function __clone()
    {
        // Intentionally empty: the method only exists to be protected, because singleton objects should not be cloned.
    }
1601
1602
    /**
     * This is a singleton class, thus the constructor should be private/protected
     * (Get an instance of this class by calling \Kitodo\Dlf\Common\Document::getInstance())
     *
     * Resolves the requested document either by its database UID or by loading it
     * from a URL / preloaded object, then looks up the matching database record
     * and marks the instance as ready once a document object is available.
     *
     * @access protected
     *
     * @param int|string $uid: The UID of the document to parse or URL to XML file
     * @param int $pid: If > 0, then only document with this PID gets loaded
     * @param \SimpleXMLElement|IiifResourceInterface|null $preloadedDocument: Either null or the
     * \SimpleXMLElement or IiifResourceInterface that has been loaded to determine the basic document format.
     *
     * @return void
     */
    protected function __construct($uid, $pid, $preloadedDocument)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('tx_dlf_documents');
        $expr = $queryBuilder->expr();
        // Stays empty when the document is addressed by its database UID.
        $location = '';

        if (MathUtility::canBeInterpretedAsInteger($uid)) {
            // Numeric identifier: look the record up by UID.
            $whereClause = $expr->andX(
                $expr->eq('tx_dlf_documents.uid', intval($uid)),
                Helper::whereExpression('tx_dlf_documents')
            );
        } else {
            // Otherwise use the preloaded METS file / IIIF manifest or fetch it from the given URL.
            $isLoaded = $this->setPreloadedDocument($preloadedDocument)
                || (GeneralUtility::isValidUrl($uid) && $this->load($uid));
            if (!$isLoaded) {
                // Loading failed.
                return;
            }
            // Initialize core document object.
            $this->init();
            if ($this->getDocument() === null) {
                // No METS / IIIF part found.
                return;
            }
            // Cast to string for safety reasons.
            $location = (string) $uid;
            $this->establishRecordId($pid);

            if (empty($location) || empty($this->recordId)) {
                // Without location and record identifier there is nothing to match against.
                $whereClause = '1=-1';
            } else {
                // Match on record identifier or location (both should be unique).
                $whereClause = $expr->andX(
                    $expr->orX(
                        $expr->eq('tx_dlf_documents.location', $expr->literal($location)),
                        $expr->eq('tx_dlf_documents.record_id', $expr->literal($this->recordId))
                    ),
                    Helper::whereExpression('tx_dlf_documents')
                );
            }
        }

        // Restrict the lookup to the given PID if one was requested.
        if ($pid) {
            $whereClause = $expr->andX(
                $whereClause,
                $expr->eq('tx_dlf_documents.pid', intval($pid))
            );
        }

        // Fetch the basic properties of the (first) matching record.
        $statement = $queryBuilder
            ->select(
                'tx_dlf_documents.uid AS uid',
                'tx_dlf_documents.pid AS pid',
                'tx_dlf_documents.record_id AS record_id',
                'tx_dlf_documents.partof AS partof',
                'tx_dlf_documents.thumbnail AS thumbnail',
                'tx_dlf_documents.location AS location'
            )
            ->from('tx_dlf_documents')
            ->where($whereClause)
            ->setMaxResults(1)
            ->execute();
        $row = $statement->fetch();

        if ($row) {
            // Take over the stored values.
            [
                'uid' => $this->uid,
                'pid' => $this->pid,
                'record_id' => $this->recordId,
                'partof' => $this->parentId,
                'thumbnail' => $this->thumbnail,
                'location' => $this->location,
            ] = $row;
            $this->thumbnailLoaded = true;

            // Nothing loaded so far: load from the stored location and set some basic properties.
            if ($this->getDocument() === null && $this->load($this->location)) {
                $this->init();
            }

            // Only flag the instance as ready if a METS / IIIF object is available now.
            if ($this->getDocument() !== null) {
                if (!empty($location)) {
                    // The location the document was actually requested from takes precedence.
                    $this->location = $location;
                }
                $this->ready = true;
            }
            return;
        }

        if ($this->getDocument() === null) {
            $this->logger->error('No document with UID ' . $uid . ' found or document not accessible');
            return;
        }

        // Document is not in the database: use its location as UID.
        $this->uid = $location;
        $this->location = $location;
        $this->ready = true;
    }
1718
1719
    /**
     * This magic method is called each time an invisible property is referenced from the object
     *
     * The value is obtained from the matching "_get<Property>" method. If the property or
     * that method does not exist, a warning is logged and null is returned.
     *
     * @access public
     *
     * @param string $var: Name of variable to get
     *
     * @return mixed Value of $this->$var
     */
    public function __get($var)
    {
        $getter = '_get' . ucfirst($var);
        if (property_exists($this, $var) && method_exists($this, $getter)) {
            return $this->$getter();
        }
        $this->logger->warning('There is no getter function for property "' . $var . '"');
        return null;
    }
1741
1742
    /**
     * This magic method is called each time an invisible property is checked for isset() or empty()
     *
     * The check is delegated to __get(), so the property's getter is executed.
     *
     * @access public
     *
     * @param string $var: Name of variable to check
     *
     * @return bool true if variable is set and not empty, false otherwise
     */
    public function __isset($var)
    {
        $value = $this->__get($var);
        return !empty($value);
    }
1755
1756
    /**
     * This magic method is called each time an invisible property is assigned from outside the object
     *
     * The value is passed to the matching "_set<Property>" method. If the property or
     * that method does not exist, a warning is logged and nothing is changed.
     *
     * @access public
     *
     * @param string $var: Name of variable to set
     * @param mixed $value: New value of variable
     *
     * @return void
     */
    public function __set($var, $value)
    {
        $setter = '_set' . ucfirst($var);
        if (property_exists($this, $var) && method_exists($this, $setter)) {
            $this->$setter($value);
            return;
        }
        $this->logger->warning('There is no setter function for property "' . $var . '"');
    }
1778
}
1779