Scrutinizer GitHub App not installed

We could not synchronize checks via GitHub's checks API since Scrutinizer's GitHub App is not installed for this repository.

Install GitHub App

GitHub Access Token became invalid

It seems like the GitHub access token used for retrieving details about this repository from GitHub became invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.
Passed
Pull Request — master (#673)
by Alexander
03:20 queued 12s
created

Document::loadFormats()   A

Complexity

Conditions 3
Paths 3

Size

Total Lines 29
Code Lines 19

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 19
c 0
b 0
f 0
dl 0
loc 29
rs 9.6333
cc 3
nc 3
nop 0
1
<?php
2
3
/**
4
 * (c) Kitodo. Key to digital objects e.V. <[email protected]>
5
 *
6
 * This file is part of the Kitodo and TYPO3 projects.
7
 *
8
 * @license GNU General Public License version 3 or later.
9
 * For the full copyright and license information, please read the
10
 * LICENSE.txt file that was distributed with this source code.
11
 */
12
13
namespace Kitodo\Dlf\Common;
14
15
use TYPO3\CMS\Core\Configuration\ExtensionConfiguration;
16
use TYPO3\CMS\Core\Database\ConnectionPool;
17
use TYPO3\CMS\Core\Database\Query\Restriction\HiddenRestriction;
18
use TYPO3\CMS\Core\Log\LogManager;
19
use TYPO3\CMS\Core\Utility\GeneralUtility;
20
use TYPO3\CMS\Core\Utility\MathUtility;
21
use Ubl\Iiif\Presentation\Common\Model\Resources\IiifResourceInterface;
22
use Ubl\Iiif\Tools\IiifHelper;
23
24
/**
25
 * Document class for the 'dlf' extension
26
 *
27
 * @author Sebastian Meyer <[email protected]>
28
 * @author Henrik Lochmann <[email protected]>
29
 * @package TYPO3
30
 * @subpackage dlf
31
 * @access public
32
 * @property int $cPid This holds the PID for the configuration
33
 * @property-read bool $hasFulltext Are there any fulltext files available?
34
 * @property-read string $location This holds the documents location
35
 * @property-read array $metadataArray This holds the documents' parsed metadata array
36
 * @property-read int $numPages This holds the total number of pages
37
 * @property-read int $parentId This holds the UID of the parent document or zero if not multi-volumed
38
 * @property-read array $physicalStructure This holds the physical structure
39
 * @property-read array $physicalStructureInfo This holds the physical structure metadata
40
 * @property-read int $pid This holds the PID of the document or zero if not in database
41
 * @property-read bool $ready Is the document instantiated successfully?
42
 * @property-read string $recordId The METS file's / IIIF manifest's record identifier
43
 * @property-read int $rootId This holds the UID of the root document or zero if not multi-volumed
44
 * @property-read array $smLinks This holds the smLinks between logical and physical structMap
45
 * @property-read array $tableOfContents This holds the logical structure
46
 * @property-read string $thumbnail This holds the document's thumbnail location
47
 * @property-read string $toplevelId This holds the toplevel structure's @ID (METS) or the manifest's @id (IIIF)
48
 * @property-read mixed $uid This holds the UID or the URL of the document
49
 * @abstract
50
 */
51
abstract class Document
52
{
53
    /**
54
     * This holds the logger
55
     *
56
     * @var \Psr\Log\LoggerInterface
57
     * @access protected
58
     */
59
    protected $logger;
60
61
    /**
62
     * This holds the PID for the configuration
63
     *
64
     * @var int
65
     * @access protected
66
     */
67
    protected $cPid = 0;
68
69
    /**
70
     * The extension key
71
     *
72
     * @var string
73
     * @access public
74
     */
75
    public static $extKey = 'dlf';
76
77
    /**
78
     * This holds the configuration for all supported metadata encodings
79
     * @see loadFormats()
80
     *
81
     * @var array
82
     * @access protected
83
     */
84
    protected $formats = [
85
        'OAI' => [
86
            'rootElement' => 'OAI-PMH',
87
            'namespaceURI' => 'http://www.openarchives.org/OAI/2.0/',
88
        ],
89
        'METS' => [
90
            'rootElement' => 'mets',
91
            'namespaceURI' => 'http://www.loc.gov/METS/',
92
        ],
93
        'XLINK' => [
94
            'rootElement' => 'xlink',
95
            'namespaceURI' => 'http://www.w3.org/1999/xlink',
96
        ]
97
    ];
98
99
    /**
100
     * Are the available metadata formats loaded?
101
     * @see $formats
102
     *
103
     * @var bool
104
     * @access protected
105
     */
106
    protected $formatsLoaded = false;
107
108
    /**
109
     * Are there any fulltext files available? This also includes IIIF text annotations
110
     * with motivation 'painting' if Kitodo.Presentation is configured to store text
111
     * annotations as fulltext.
112
     *
113
     * @var bool
114
     * @access protected
115
     */
116
    protected $hasFulltext = false;
117
118
    /**
119
     * Last searched logical and physical page
120
     *
121
     * @var array
122
     * @access protected
123
     */
124
    protected $lastSearchedPhysicalPage = ['logicalPage' => null, 'physicalPage' => null];
125
126
    /**
127
     * This holds the documents location
128
     *
129
     * @var string
130
     * @access protected
131
     */
132
    protected $location = '';
133
134
    /**
135
     * This holds the logical units
136
     *
137
     * @var array
138
     * @access protected
139
     */
140
    protected $logicalUnits = [];
141
142
    /**
143
     * This holds the documents' parsed metadata array with their corresponding
144
     * structMap//div's ID (METS) or Range / Manifest / Sequence ID (IIIF) as array key
145
     *
146
     * @var array
147
     * @access protected
148
     */
149
    protected $metadataArray = [];
150
151
    /**
152
     * Is the metadata array loaded?
153
     * @see $metadataArray
154
     *
155
     * @var bool
156
     * @access protected
157
     */
158
    protected $metadataArrayLoaded = false;
159
160
    /**
161
     * This holds the total number of pages
162
     *
163
     * @var int
164
     * @access protected
165
     */
166
    protected $numPages = 0;
167
168
    /**
169
     * This holds the UID of the parent document or zero if not multi-volumed
170
     *
171
     * @var int
172
     * @access protected
173
     */
174
    protected $parentId = 0;
175
176
    /**
177
     * This holds the physical structure
178
     *
179
     * @var array
180
     * @access protected
181
     */
182
    protected $physicalStructure = [];
183
184
    /**
185
     * This holds the physical structure metadata
186
     *
187
     * @var array
188
     * @access protected
189
     */
190
    protected $physicalStructureInfo = [];
191
192
    /**
193
     * Is the physical structure loaded?
194
     * @see $physicalStructure
195
     *
196
     * @var bool
197
     * @access protected
198
     */
199
    protected $physicalStructureLoaded = false;
200
201
    /**
202
     * This holds the PID of the document or zero if not in database
203
     *
204
     * @var int
205
     * @access protected
206
     */
207
    protected $pid = 0;
208
209
    /**
210
     * This holds the documents' raw text pages with their corresponding
211
     * structMap//div's ID (METS) or Range / Manifest / Sequence ID (IIIF) as array key
212
     *
213
     * @var array
214
     * @access protected
215
     */
216
    protected $rawTextArray = [];
217
218
    /**
219
     * Is the document instantiated successfully?
220
     *
221
     * @var bool
222
     * @access protected
223
     */
224
    protected $ready = false;
225
226
    /**
227
     * The METS file's / IIIF manifest's record identifier
228
     *
229
     * @var string
230
     * @access protected
231
     */
232
    protected $recordId;
233
234
    /**
235
     * This holds the singleton object of the document
236
     *
237
     * @var array (\Kitodo\Dlf\Common\Document)
238
     * @static
239
     * @access protected
240
     */
241
    protected static $registry = [];
242
243
    /**
244
     * This holds the UID of the root document or zero if not multi-volumed
245
     *
246
     * @var int
247
     * @access protected
248
     */
249
    protected $rootId = 0;
250
251
    /**
252
     * Is the root id loaded?
253
     * @see $rootId
254
     *
255
     * @var bool
256
     * @access protected
257
     */
258
    protected $rootIdLoaded = false;
259
260
    /**
261
     * This holds the smLinks between logical and physical structMap
262
     *
263
     * @var array
264
     * @access protected
265
     */
266
    protected $smLinks = ['l2p' => [], 'p2l' => []];
267
268
    /**
269
     * Are the smLinks loaded?
270
     * @see $smLinks
271
     *
272
     * @var bool
273
     * @access protected
274
     */
275
    protected $smLinksLoaded = false;
276
277
    /**
278
     * This holds the logical structure
279
     *
280
     * @var array
281
     * @access protected
282
     */
283
    protected $tableOfContents = [];
284
285
    /**
286
     * Is the table of contents loaded?
287
     * @see $tableOfContents
288
     *
289
     * @var bool
290
     * @access protected
291
     */
292
    protected $tableOfContentsLoaded = false;
293
294
    /**
295
     * This holds the document's thumbnail location
296
     *
297
     * @var string
298
     * @access protected
299
     */
300
    protected $thumbnail = '';
301
302
    /**
303
     * Is the document's thumbnail location loaded?
304
     * @see $thumbnail
305
     *
306
     * @var bool
307
     * @access protected
308
     */
309
    protected $thumbnailLoaded = false;
310
311
    /**
312
     * This holds the toplevel structure's @ID (METS) or the manifest's @id (IIIF)
313
     *
314
     * @var string
315
     * @access protected
316
     */
317
    protected $toplevelId = '';
318
319
    /**
320
     * This holds the UID or the URL of the document
321
     *
322
     * @var mixed
323
     * @access protected
324
     */
325
    protected $uid = 0;
326
327
    /**
328
     * This holds the whole XML file as \SimpleXMLElement object
329
     *
330
     * @var \SimpleXMLElement
331
     * @access protected
332
     */
333
    protected $xml;
334
335
    /**
     * Empties the static instance registry
     *
     * Dropping all cached document instances keeps long-running processes
     * from exhausting memory.
     *
     * @access public
     *
     * @static
     *
     * @return void
     */
    public static function clearRegistry()
    {
        // Forget every cached document instance.
        self::$registry = [];
    }
349
350
    /**
     * Makes sure the record identifier is read from the document, if it has one
     *
     * @access protected
     *
     * @abstract
     *
     * @param int $pid: ID of the configuration page holding the recordId configuration
     *
     * @return void
     */
    abstract protected function establishRecordId($pid);
361
362
    /**
     * Returns the source document object this Document instance represents
     *
     * @access protected
     *
     * @abstract
     *
     * @return \SimpleXMLElement|IiifResourceInterface PHP object representation of the
     * current document: SimpleXMLElement for METS, IiifResourceInterface for IIIF
     */
    abstract protected function getDocument();
373
374
    /**
     * Returns the location of a downloadable file for a physical page or track
     *
     * @access public
     *
     * @abstract
     *
     * @param string $id: The @ID attribute of the file node (METS) or the @id property of the IIIF resource
     *
     * @return string The file's location as URL
     */
    abstract public function getDownloadLocation($id);
386
387
    /**
     * Returns the location of a file representing a physical page or track
     *
     * @access public
     *
     * @abstract
     *
     * @param string $id: The @ID attribute of the file node (METS) or the @id property of the IIIF resource
     *
     * @return string The file's location as URL
     */
    abstract public function getFileLocation($id);
399
400
    /**
     * Returns the MIME type of a file representing a physical page or track
     *
     * @access public
     *
     * @abstract
     *
     * @param string $id: The @ID attribute of the file node
     *
     * @return string The file's MIME type
     */
    abstract public function getFileMimeType($id);
412
413
    /**
     * Returns the document instance for the given identifier, creating it if necessary
     *
     * Instances are cached in the static registry and, if present there, restored
     * from the user's session, so documents have to be obtained through this method.
     * The document format is read from the database for numeric identifiers and
     * detected from the fetched content (METS XML or IIIF JSON) for URLs.
     *
     * @access public
     *
     * @static
     *
     * @param mixed $uid: The unique identifier of the document to parse, the URL of XML file or the IRI of the IIIF resource
     * @param int $pid: If > 0, then only document with this PID gets loaded
     * @param bool $forceReload: Force reloading the document instead of returning the cached instance
     *
     * @return \Kitodo\Dlf\Common\Document|null Instance of this class, either MetsDocument or IiifManifest,
     * or null if the document format could not be determined. A freshly created instance is returned
     * even if it is not ready, but only ready instances are stored in the registry.
     */
    public static function &getInstance($uid, $pid = 0, $forceReload = false)
    {
        // Sanitize input.
        $pid = max(intval($pid), 0);
        if (!$forceReload) {
            $regObj = Helper::digest($uid);
            // isset() instead of is_object() avoids "undefined index" notices on cache misses.
            if (
                isset(self::$registry[$regObj])
                && self::$registry[$regObj] instanceof self
            ) {
                // Check if instance has given PID.
                if (
                    !$pid
                    || !self::$registry[$regObj]->pid
                    || $pid == self::$registry[$regObj]->pid
                ) {
                    // Return singleton instance if available.
                    return self::$registry[$regObj];
                }
            } else {
                // Check the user's session...
                $sessionData = Helper::loadFromSession(get_called_class());
                if (
                    isset($sessionData[$regObj])
                    && $sessionData[$regObj] instanceof self
                ) {
                    // Check if instance has given PID.
                    if (
                        !$pid
                        || !$sessionData[$regObj]->pid
                        || $pid == $sessionData[$regObj]->pid
                    ) {
                        // ...and restore registry.
                        self::$registry[$regObj] = $sessionData[$regObj];
                        return self::$registry[$regObj];
                    }
                }
            }
        }
        // Create new instance depending on format (METS or IIIF) ...
        $instance = null;
        $documentFormat = null;
        $xml = null;
        $iiif = null;
        if (MathUtility::canBeInterpretedAsInteger($uid)) {
            // Try to get document format from database.
            $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
                ->getQueryBuilderForTable('tx_dlf_documents');

            // Restrict to the requested document and, if given, to its PID.
            $constraints = [
                $queryBuilder->expr()->eq('tx_dlf_documents.uid', intval($uid)),
            ];
            if ($pid) {
                $constraints[] = $queryBuilder->expr()->eq('tx_dlf_documents.pid', $pid);
            }
            $constraints[] = Helper::whereExpression('tx_dlf_documents');

            $result = $queryBuilder
                ->select(
                    'tx_dlf_documents.location AS location',
                    'tx_dlf_documents.document_format AS document_format'
                )
                ->from('tx_dlf_documents')
                ->where(...$constraints)
                ->setMaxResults(1)
                ->execute();

            if ($resArray = $result->fetch()) {
                $documentFormat = $resArray['document_format'];
            }
        } else {
            // Get document format from content of remote document.
            // Cast to string for safety reasons.
            $location = (string) $uid;
            // Try to load a file from the url.
            if (GeneralUtility::isValidUrl($location)) {
                // Load extension configuration.
                $extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey);
                // Set user-agent to identify self when fetching XML data.
                if (!empty($extConf['useragent'])) {
                    @ini_set('user_agent', $extConf['useragent']);
                }
                $content = GeneralUtility::getUrl($location);
                if ($content !== false) {
                    // TODO use single place to load xml
                    // Turn off libxml's error logging.
                    $libxmlErrors = libxml_use_internal_errors(true);
                    // External entities must not be loaded while parsing. PHP 8 disables them by default
                    // and deprecates libxml_disable_entity_loader(), so it is only called on older versions.
                    if (\PHP_VERSION_ID < 80000) {
                        $previousValueOfEntityLoader = libxml_disable_entity_loader(true);
                    }
                    // Try to load XML from file.
                    $xml = simplexml_load_string($content);
                    // Reset entity loader setting.
                    if (\PHP_VERSION_ID < 80000) {
                        libxml_disable_entity_loader($previousValueOfEntityLoader);
                    }
                    // Reset libxml's error logging.
                    libxml_use_internal_errors($libxmlErrors);
                    if ($xml !== false) {
                        /* @var $xml \SimpleXMLElement */
                        $xml->registerXPathNamespace('mets', 'http://www.loc.gov/METS/');
                        $xpathResult = $xml->xpath('//mets:mets');
                        $documentFormat = !empty($xpathResult) ? 'METS' : null;
                    } else {
                        // Try to load file as IIIF resource instead.
                        $contentAsJsonArray = json_decode($content, true);
                        if ($contentAsJsonArray !== null) {
                            // The extension configuration loaded above also holds the IIIF settings.
                            IiifHelper::setUrlReader(IiifUrlReader::getInstance());
                            IiifHelper::setMaxThumbnailHeight($extConf['iiifThumbnailHeight']);
                            IiifHelper::setMaxThumbnailWidth($extConf['iiifThumbnailWidth']);
                            $iiif = IiifHelper::loadIiifResource($contentAsJsonArray);
                            if ($iiif instanceof IiifResourceInterface) {
                                $documentFormat = 'IIIF';
                            }
                        }
                    }
                }
            }
        }
        if ($documentFormat == 'METS') {
            $instance = new MetsDocument($uid, $pid, $xml);
        } elseif ($documentFormat == 'IIIF') {
            $instance = new IiifManifest($uid, $pid, $iiif);
        }
        // Save instance to registry.
        if (
            $instance instanceof self
            && $instance->ready
        ) {
            self::$registry[Helper::digest($instance->uid)] = $instance;
            // Register the instance under its location as well, so lookups by URL hit the cache.
            if ($instance->uid != $instance->location) {
                self::$registry[Helper::digest($instance->location)] = $instance;
            }
            // Load extension configuration.
            $extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey);
            // Save registry to session if caching is enabled.
            if (!empty($extConf['caching'])) {
                Helper::saveToSession(self::$registry, get_class($instance));
            }
            $instance->logger = GeneralUtility::makeInstance(LogManager::class)->getLogger(get_class($instance));
        }
        // Return new instance.
        return $instance;
    }
579
580
    /**
     * Returns details about a logical structure element
     *
     * @access public
     *
     * @abstract
     *
     * @param string $id: The @ID attribute of the logical structure node (METS) or
     * the @id property of the Manifest / Range (IIIF)
     * @param bool $recursive: Whether to include the child elements / resources
     *
     * @return array Array of the element's id, label, type and physical page indexes/mptr link
     */
    abstract public function getLogicalStructure($id, $recursive = false);
594
595
    /**
     * Extracts all the metadata for a logical structure node
     *
     * @access public
     *
     * @abstract
     *
     * @param string $id: The @ID attribute of the logical structure node (METS) or the @id property
     * of the Manifest / Range (IIIF)
     * @param int $cPid: The PID for the metadata definitions
     *                       (defaults to $this->cPid or $this->pid)
     *
     * @return array The logical structure node's / the IIIF resource's parsed metadata array
     */
    abstract public function getMetadata($id, $cPid = 0);
610
611
    /**
     * Finds the first physical page whose order label contains the given logical page label
     *
     * The most recent successful lookup is remembered, so asking for the same
     * label again is answered without searching.
     *
     * @access public
     *
     * @param string $logicalPage: The label (or a part of the label) of the logical page
     *
     * @return int The position of the first matching entry within the physical structure
     * info (counted from zero), or 1 if no entry matches
     */
    public function getPhysicalPage($logicalPage)
    {
        // Answer repeated requests for the same label from the remembered result.
        if (
            !empty($this->lastSearchedPhysicalPage['logicalPage'])
            && $this->lastSearchedPhysicalPage['logicalPage'] == $logicalPage
        ) {
            return $this->lastSearchedPhysicalPage['physicalPage'];
        }
        // array_values() yields the zero-based position regardless of the array's own keys.
        foreach (array_values($this->physicalStructureInfo) as $position => $pageInfo) {
            if (strpos($pageInfo['orderlabel'], $logicalPage) === false) {
                continue;
            }
            $this->lastSearchedPhysicalPage['logicalPage'] = $logicalPage;
            $this->lastSearchedPhysicalPage['physicalPage'] = $position;
            return $position;
        }
        // No order label matched.
        return 1;
    }
640
641
    /**
     * Extracts the OCR full text for a physical structure node / IIIF Manifest / Canvas
     *
     * Text might be given as ALTO for METS or as annotations or ALTO for IIIF resources.
     *
     * @access public
     *
     * @abstract
     *
     * @param string $id: The @ID attribute of the physical structure node (METS) or the @id property
     * of the Manifest / Range (IIIF)
     *
     * @return string The OCR full text
     */
    abstract public function getFullText($id);
655
656
    /**
     * This extracts the OCR full text for a physical structure node / IIIF Manifest / Canvas from an
     * XML full text representation (currently only ALTO). For IIIF manifests, ALTO documents have
     * to be given in the Canvas' / Manifest's "seeAlso" property.
     *
     * The full text file is taken from the first configured full text file group
     * ("fileGrpFulltext") that provides a file for the node. Its text is converted by the
     * class registered for the detected text format and cached in $this->rawTextArray.
     *
     * @access protected
     *
     * @param string $id: The @ID attribute of the physical structure node (METS) or the @id property
     * of the Manifest / Range (IIIF)
     *
     * @return string The full text as returned by the format class's getTextAsMiniOcr(), or an
     * empty string if the node is invalid, the file cannot be loaded or the format is unsupported
     */
    protected function getFullTextFromXml($id)
    {
        $fullText = '';
        // Load available text formats, ...
        $this->loadFormats();
        // ... physical structure ...
        $this->_getPhysicalStructure();
        // ... and extension configuration.
        $extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey);
        // Empty entries are dropped so they cannot end the search for a file group early.
        $fileGrpsFulltext = GeneralUtility::trimExplode(',', $extConf['fileGrpFulltext'], true);
        if (empty($this->physicalStructureInfo[$id])) {
            $this->logger->warning('Invalid structure node @ID "' . $id . '"');
            return $fullText;
        }
        // Defaults for the case that none of the configured file groups has a file for this node.
        $fileContent = false;
        $textFormat = '';
        foreach ($fileGrpsFulltext as $fileGrpFulltext) {
            if (empty($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext])) {
                continue;
            }
            // Get full text file.
            $fileContent = GeneralUtility::getUrl($this->getFileLocation($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext]));
            if ($fileContent === false) {
                $this->logger->warning('Couldn\'t load full text file for structure node @ID "' . $id . '"');
                return $fullText;
            }
            $textFormat = $this->getTextFormat($fileContent);
            // Only the first file group providing a file is used.
            break;
        }
        // Is this text format supported?
        // This part actually differs from previous version of indexed OCR
        if (!empty($fileContent) && !empty($this->formats[$textFormat])) {
            if (!empty($this->formats[$textFormat]['class'])) {
                $class = $this->formats[$textFormat]['class'];
                // Get the raw text from class.
                if (
                    class_exists($class)
                    && ($obj = GeneralUtility::makeInstance($class)) instanceof FulltextInterface
                ) {
                    // Load XML from file.
                    $ocrTextXml = $this->getXmlObject($fileContent);
                    $fullText = $obj->getTextAsMiniOcr($ocrTextXml);
                    $this->rawTextArray[$id] = $fullText;
                } else {
                    $this->logger->warning('Invalid class/method "' . $class . '->getRawText()" for text format "' . $textFormat . '"');
                }
            }
        } else {
            $this->logger->warning('Unsupported text format "' . $textFormat . '" in physical node with @ID "' . $id . '"');
        }
        return $fullText;
    }
719
720
    /**
     * Get format of the OCR full text
     *
     * The format is the upper-cased name of the XML root element.
     *
     * @access private
     *
     * @param string $fileContent: content of the XML file
     *
     * @return string The format of the OCR full text, or an empty string if the
     * content could not be parsed as XML
     */
    private function getTextFormat($fileContent)
    {
        $xml = $this->getXmlObject($fileContent);
        // Parsing fails for malformed content; calling getName() on the failure value would be fatal.
        if (!($xml instanceof \SimpleXMLElement)) {
            return '';
        }
        // Get the root element's name as text format.
        return strtoupper($xml->getName());
    }
734
735
    /**
     * Get the OCR full text as object
     *
     * libxml errors are collected internally while parsing and the previous
     * error handling setting is restored afterwards.
     *
     * @access private
     *
     * @param string $fileContent: content of the XML file
     *
     * @return \SimpleXMLElement|false The OCR full text as object, or false if the
     * content could not be parsed
     */
    private function getXmlObject($fileContent)
    {
        // Turn off libxml's error logging.
        $libxmlErrors = libxml_use_internal_errors(true);
        // External entities must not be loaded while parsing. PHP 8 disables them by default
        // and deprecates libxml_disable_entity_loader(), so it is only called on older versions.
        if (\PHP_VERSION_ID < 80000) {
            $previousValueOfEntityLoader = libxml_disable_entity_loader(true);
        }
        // Load XML from file.
        $ocrTextXml = simplexml_load_string($fileContent);
        // Reset entity loader setting.
        if (\PHP_VERSION_ID < 80000) {
            libxml_disable_entity_loader($previousValueOfEntityLoader);
        }
        // Reset libxml's error logging.
        libxml_use_internal_errors($libxmlErrors);
        // Get the root element.
        return $ocrTextXml;
    }
759
760
    /**
     * Looks up the title of a document in the database
     *
     * If the document has no title of its own and $recursive is set, the title
     * of the document it is part of is looked up instead.
     *
     * @access public
     *
     * @static
     *
     * @param int $uid: The UID of the document
     * @param bool $recursive: Search superior documents for a title, too?
     *
     * @return string The title of the document itself or a parent document
     */
    public static function getTitle($uid, $recursive = false)
    {
        $logger = GeneralUtility::makeInstance(LogManager::class)->getLogger(__CLASS__);

        // Sanitize input.
        $uid = max(intval($uid), 0);
        if (!$uid) {
            $logger->error('Invalid UID ' . $uid . ' for document');
            return '';
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('tx_dlf_documents');
        $queryBuilder
            ->select(
                'tx_dlf_documents.title',
                'tx_dlf_documents.partof'
            )
            ->from('tx_dlf_documents')
            ->where(
                $queryBuilder->expr()->eq('tx_dlf_documents.uid', $uid),
                Helper::whereExpression('tx_dlf_documents')
            )
            ->setMaxResults(1);

        $row = $queryBuilder->execute()->fetch();
        if (!$row) {
            $logger->warning('No document with UID ' . $uid . ' found or document not accessible');
            return '';
        }

        // Get title information.
        $ownTitle = $row['title'];
        $parentUid = $row['partof'];
        // Search parent documents recursively for a title?
        $askParent = $recursive
            && empty($ownTitle)
            && intval($parentUid)
            && $parentUid != $uid;

        return $askParent ? self::getTitle($parentUid, true) : $ownTitle;
    }
817
818
    /**
     * Collects the metadata of the toplevel logical structure node / resource
     *
     * For METS documents the metadata is additionally enriched via addMetadataFromMets().
     * If the result contains a 'record_id' entry and the document's own record
     * identifier is not yet listed there, it is prepended to that list.
     *
     * @access public
     *
     * @param int $cPid: The PID for the metadata definitions
     *
     * @return array The logical structure node's / resource's parsed metadata array
     */
    public function getTitledata($cPid = 0)
    {
        $titledata = $this->getMetadata($this->_getToplevelId(), $cPid);

        // METS documents contribute additional information from the structural map.
        if ($this instanceof MetsDocument) {
            $this->addMetadataFromMets($titledata, $this->_getToplevelId());
        }

        // Make sure the document's own record identifier comes first in the list.
        $needsRecordId = is_array($titledata)
            && array_key_exists('record_id', $titledata)
            && !empty($this->recordId)
            && !in_array($this->recordId, $titledata['record_id']);
        if ($needsRecordId) {
            array_unshift($titledata['record_id'], $this->recordId);
        }

        return $titledata;
    }
848
849
    /**
     * Walks a logical (sub-) structure tree depth-first and reports the depth of
     * the structure element carrying the requested logical id.
     *
     * @access protected
     *
     * @param array $structure: logical structure array
     * @param int $depth: current tree depth
     * @param string $logId: ID of the logical structure whose depth is requested
     *
     * @return int|bool false if the structure with $logId is not part of this
     * substructure, or the actual depth.
     */
    protected function getTreeDepth($structure, $depth, $logId)
    {
        foreach ($structure as $node) {
            // Direct hit on the current level.
            if ($node['id'] == $logId) {
                return $depth;
            }
            // Leaf nodes have nothing more to search.
            if (!array_key_exists('children', $node)) {
                continue;
            }
            // Descend one level; the first match in a subtree wins.
            $depthInSubtree = $this->getTreeDepth($node['children'], $depth + 1, $logId);
            if ($depthInSubtree !== false) {
                return $depthInSubtree;
            }
        }
        return false;
    }
875
876
    /**
     * Determines how deep a logical structure element sits within the table of contents
     *
     * The search starts at the toplevel entries of the table of contents, which count as depth 1.
     *
     * @access public
     *
     * @param string $logId: The id of the logical structure element whose depth is requested
     *
     * @return int|bool tree depth as integer or false if no element with $logId exists within the TOC.
     */
    public function getStructureDepth($logId)
    {
        $tableOfContents = $this->_getTableOfContents();

        return $this->getTreeDepth($tableOfContents, 1, $logId);
    }
888
889
    /**
     * Initializes some basic class properties
     *
     * Must be implemented by the concrete document class.
     *
     * @access protected
     *
     * @abstract
     *
     * @return void
     */
    abstract protected function init();
899
900
    /**
     * Tries to reuse a document object that has already been loaded (e.g. while
     * determining whether the document is METS or IIIF)
     *
     * @access protected
     *
     * @abstract
     *
     * @param \SimpleXMLElement|IiifResourceInterface $preloadedDocument: any instance that has already been loaded
     *
     * @return bool true if $preloadedDocument can actually be reused, false if it has to be loaded again
     */
    abstract protected function setPreloadedDocument($preloadedDocument);
912
913
    /**
     * Format specific (METS / IIIF) part of loading a location
     *
     * Invoked by load() once the location has been validated as URL.
     *
     * @access protected
     *
     * @abstract
     *
     * @param string $location: The URL of the file to load
     *
     * @return bool true on success or false on failure
     */
    abstract protected function loadLocation($location);
925
926
    /**
     * Loads an XML file / IIIF resource from a URL
     *
     * Validates the location, optionally sets the configured user agent and
     * then delegates the actual loading to the format specific loadLocation().
     *
     * @access protected
     *
     * @param string $location: The URL of the file to load
     *
     * @return bool true on success or false on failure
     */
    protected function load($location)
    {
        // Refuse anything that is not a valid URL.
        if (!GeneralUtility::isValidUrl($location)) {
            $this->logger->error('Invalid file location "' . $location . '" for document loading');
            return false;
        }

        // Identify ourselves with the configured user agent when fetching XML / JSON-LD data.
        $extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey);
        if (!empty($extConf['useragent'])) {
            @ini_set('user_agent', $extConf['useragent']);
        }

        // The actual loading is format specific.
        return $this->loadLocation($location);
    }
952
953
    /**
     * Analyzes whether the document contains any fulltext that needs to be indexed
     *
     * Called by _getHasFulltext() before the $hasFulltext property is returned.
     *
     * @access protected
     *
     * @abstract
     */
    abstract protected function ensureHasFulltextIsSet();
961
962
    /**
     * Registers all available data formats
     *
     * Reads the format records stored on PID 0 from table "tx_dlf_formats" into
     * the format registry. The database is queried only on the first call.
     *
     * @access protected
     *
     * @return void
     */
    protected function loadFormats()
    {
        // Nothing to do if the registry has been filled before.
        if ($this->formatsLoaded) {
            return;
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('tx_dlf_formats');

        // Fetch the globally defined data formats from the database.
        $statement = $queryBuilder
            ->select(
                'tx_dlf_formats.type AS type',
                'tx_dlf_formats.root AS root',
                'tx_dlf_formats.namespace AS namespace',
                'tx_dlf_formats.class AS class'
            )
            ->from('tx_dlf_formats')
            ->where(
                $queryBuilder->expr()->eq('tx_dlf_formats.pid', 0)
            )
            ->execute();

        // Add every format to the registry, keyed by its type.
        while ($format = $statement->fetch()) {
            $this->formats[$format['type']] = [
                'rootElement' => $format['root'],
                'namespaceURI' => $format['namespace'],
                'class' => $format['class']
            ];
        }

        $this->formatsLoaded = true;
    }
1000
1001
    /**
     * Registers the namespaces of all known data formats on an XML object
     *
     * The lowercased format type is used as namespace prefix.
     *
     * @access public
     *
     * @param \SimpleXMLElement|\DOMXPath &$obj: \SimpleXMLElement or \DOMXPath object
     *
     * @return void
     */
    public function registerNamespaces(&$obj)
    {
        // TODO Check usage. XML specific method does not seem to be used anywhere outside this class within the project, but it is public and may be used by extensions.
        $this->loadFormats();

        // Only \SimpleXMLElement and \DOMXPath objects are supported.
        $isSimpleXml = $obj instanceof \SimpleXMLElement;
        if (!$isSimpleXml && !($obj instanceof \DOMXPath)) {
            $this->logger->error('Given object is neither a SimpleXMLElement nor a DOMXPath instance');
            return;
        }

        // Both classes offer the same functionality under different method names.
        $method = $isSimpleXml ? 'registerXPathNamespace' : 'registerNamespace';

        // Register each metadata format's namespace.
        foreach ($this->formats as $type => $format) {
            $obj->$method(strtolower($type), $format['namespaceURI']);
        }
    }
1028
1029
    /**
     * This saves the document to the database and index
     *
     * Resolves structure type, collections and owner to their database UIDs
     * (creating missing collection / library records on the fly), writes the
     * document record and finally hands the document over to the Solr indexer.
     *
     * @access public
     *
     * @param int $pid: The PID of the saved record
     * @param int $core: The UID of the Solr core for indexing
     * @param int|string $owner: UID or index_name of owner to set while indexing
     *
     * @return bool true on success or false on failure
     */
    public function save($pid = 0, $core = 0, $owner = null)
    {
        if (\TYPO3_MODE !== 'BE') {
            $this->logger->error('Saving a document is only allowed in the backend');
            return false;
        }
        // Make sure $pid is a non-negative integer.
        $pid = max(intval($pid), 0);
        // Make sure $core is a non-negative integer.
        $core = max(intval($core), 0);
        // If $pid is not given, try to get it elsewhere.
        if (
            !$pid
            && $this->pid
        ) {
            // Retain current PID.
            $pid = $this->pid;
        } elseif (!$pid) {
            $this->logger->error('Invalid PID ' . $pid . ' for document saving');
            return false;
        }
        // Set PID for metadata definitions.
        $this->cPid = $pid;
        // Set UID placeholder if not updating existing record.
        if ($pid != $this->pid) {
            $this->uid = uniqid('NEW');
        }
        // Get metadata array.
        $metadata = $this->getTitledata($pid);
        // Check for record identifier.
        if (empty($metadata['record_id'][0])) {
            $this->logger->error('No record identifier found to avoid duplication');
            return false;
        }
        // Load plugin configuration.
        $conf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey);

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('tx_dlf_structures');

        // Get UID for structure type.
        $result = $queryBuilder
            ->select('tx_dlf_structures.uid AS uid')
            ->from('tx_dlf_structures')
            ->where(
                $queryBuilder->expr()->eq('tx_dlf_structures.pid', intval($pid)),
                $queryBuilder->expr()->eq('tx_dlf_structures.index_name', $queryBuilder->expr()->literal($metadata['type'][0])),
                Helper::whereExpression('tx_dlf_structures')
            )
            ->setMaxResults(1)
            ->execute();

        if ($resArray = $result->fetch()) {
            $structure = $resArray['uid'];
        } else {
            $this->logger->error('Could not identify document/structure type "' . $queryBuilder->expr()->literal($metadata['type'][0]) . '"');
            return false;
        }
        $metadata['type'][0] = $structure;

        // Remove appended "valueURI" from authors' names for storing in database.
        // (The URI is separated from the name by the ASCII unit separator, chr(31).)
        foreach ($metadata['author'] as $i => $author) {
            $splitName = explode(chr(31), $author);
            $metadata['author'][$i] = $splitName[0];
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('tx_dlf_collections');
        // Get hidden records, too.
        $queryBuilder
            ->getRestrictions()
            ->removeByType(HiddenRestriction::class);

        // Get UIDs for collections.
        $result = $queryBuilder
            ->select(
                'tx_dlf_collections.index_name AS index_name',
                'tx_dlf_collections.uid AS uid'
            )
            ->from('tx_dlf_collections')
            ->where(
                $queryBuilder->expr()->eq('tx_dlf_collections.pid', intval($pid)),
                $queryBuilder->expr()->in('tx_dlf_collections.sys_language_uid', [-1, 0])
            )
            ->execute();

        $collUid = [];
        while ($resArray = $result->fetch()) {
            $collUid[$resArray['index_name']] = $resArray['uid'];
        }
        $collections = [];
        foreach ($metadata['collection'] as $collection) {
            if (!empty($collUid[$collection])) {
                // Add existing collection's UID.
                $collections[] = $collUid[$collection];
            } else {
                // Insert new collection.
                $collNewUid = uniqid('NEW');
                $collData['tx_dlf_collections'][$collNewUid] = [
                    'pid' => $pid,
                    'label' => $collection,
                    'index_name' => $collection,
                    'oai_name' => (!empty($conf['publishNewCollections']) ? Helper::getCleanString($collection) : ''),
                    'description' => '',
                    'documents' => 0,
                    'owner' => 0,
                    'status' => 0,
                ];
                $substUid = Helper::processDBasAdmin($collData);
                // Prevent double insertion.
                unset($collData);
                // Add new collection's UID.
                $collections[] = $substUid[$collNewUid];
                if (!(\TYPO3_REQUESTTYPE & \TYPO3_REQUESTTYPE_CLI)) {
                    Helper::addMessage(
                        htmlspecialchars(sprintf(Helper::getMessage('flash.newCollection'), $collection, $substUid[$collNewUid])),
                        Helper::getMessage('flash.attention', true),
                        \TYPO3\CMS\Core\Messaging\FlashMessage::INFO,
                        true
                    );
                }
            }
        }
        $metadata['collection'] = $collections;

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('tx_dlf_libraries');

        // Get UID for owner.
        if (empty($owner)) {
            // Prefer the owner given in the metadata and only fall back to
            // 'default' if the metadata does not name one.
            $owner = !empty($metadata['owner'][0]) ? $metadata['owner'][0] : 'default';
        }
        if (!MathUtility::canBeInterpretedAsInteger($owner)) {
            // $owner is an index_name, so resolve it to a library UID.
            $result = $queryBuilder
                ->select('tx_dlf_libraries.uid AS uid')
                ->from('tx_dlf_libraries')
                ->where(
                    $queryBuilder->expr()->eq('tx_dlf_libraries.pid', intval($pid)),
                    $queryBuilder->expr()->eq('tx_dlf_libraries.index_name', $queryBuilder->expr()->literal($owner)),
                    Helper::whereExpression('tx_dlf_libraries')
                )
                ->setMaxResults(1)
                ->execute();

            if ($resArray = $result->fetch()) {
                $ownerUid = $resArray['uid'];
            } else {
                // Insert new library.
                $libNewUid = uniqid('NEW');
                $libData['tx_dlf_libraries'][$libNewUid] = [
                    'pid' => $pid,
                    'label' => $owner,
                    'index_name' => $owner,
                    'website' => '',
                    'contact' => '',
                    'image' => '',
                    'oai_label' => '',
                    'oai_base' => '',
                    'opac_label' => '',
                    'opac_base' => '',
                    'union_label' => '',
                    'union_base' => '',
                ];
                $substUid = Helper::processDBasAdmin($libData);
                // Add new library's UID.
                $ownerUid = $substUid[$libNewUid];
                if (!(\TYPO3_REQUESTTYPE & \TYPO3_REQUESTTYPE_CLI)) {
                    Helper::addMessage(
                        htmlspecialchars(sprintf(Helper::getMessage('flash.newLibrary'), $owner, $ownerUid)),
                        Helper::getMessage('flash.attention', true),
                        \TYPO3\CMS\Core\Messaging\FlashMessage::INFO,
                        true
                    );
                }
            }
            $owner = $ownerUid;
        }
        $metadata['owner'][0] = $owner;
        // Get UID of parent document.
        $partof = $this->getParentDocumentUidForSaving($pid, $core, $owner);
        // Use the date of publication or title as alternative sorting metric for parts of multi-part works.
        if (!empty($partof)) {
            if (
                empty($metadata['volume'][0])
                && !empty($metadata['year'][0])
            ) {
                $metadata['volume'] = $metadata['year'];
            }
            if (empty($metadata['volume_sorting'][0])) {
                // If METS @ORDER is given it is preferred over year_sorting and year.
                if (!empty($metadata['mets_order'][0])) {
                    $metadata['volume_sorting'][0] = $metadata['mets_order'][0];
                } elseif (!empty($metadata['year_sorting'][0])) {
                    $metadata['volume_sorting'][0] = $metadata['year_sorting'][0];
                } elseif (!empty($metadata['year'][0])) {
                    $metadata['volume_sorting'][0] = $metadata['year'][0];
                }
            }
            // If volume_sorting is still empty, try to use title_sorting or METS @ORDERLABEL finally (workaround for newspapers)
            if (empty($metadata['volume_sorting'][0])) {
                if (!empty($metadata['title_sorting'][0])) {
                    $metadata['volume_sorting'][0] = $metadata['title_sorting'][0];
                } elseif (!empty($metadata['mets_orderlabel'][0])) {
                    $metadata['volume_sorting'][0] = $metadata['mets_orderlabel'][0];
                }
            }
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('tx_dlf_metadata');

        // Get metadata for lists and sorting.
        $result = $queryBuilder
            ->select(
                'tx_dlf_metadata.index_name AS index_name',
                'tx_dlf_metadata.is_listed AS is_listed',
                'tx_dlf_metadata.is_sortable AS is_sortable'
            )
            ->from('tx_dlf_metadata')
            ->where(
                $queryBuilder->expr()->orX(
                    $queryBuilder->expr()->eq('tx_dlf_metadata.is_listed', 1),
                    $queryBuilder->expr()->eq('tx_dlf_metadata.is_sortable', 1)
                ),
                $queryBuilder->expr()->eq('tx_dlf_metadata.pid', intval($pid)),
                Helper::whereExpression('tx_dlf_metadata')
            )
            ->execute();

        $listed = [];
        $sortable = [];

        while ($resArray = $result->fetch()) {
            if (!empty($metadata[$resArray['index_name']])) {
                if ($resArray['is_listed']) {
                    $listed[$resArray['index_name']] = $metadata[$resArray['index_name']];
                }
                if ($resArray['is_sortable']) {
                    // Only the first value of a field is used for sorting.
                    $sortable[$resArray['index_name']] = $metadata[$resArray['index_name']][0];
                }
            }
        }
        // Fill data array.
        $data['tx_dlf_documents'][$this->uid] = [
            'pid' => $pid,
            $GLOBALS['TCA']['tx_dlf_documents']['ctrl']['enablecolumns']['starttime'] => 0,
            $GLOBALS['TCA']['tx_dlf_documents']['ctrl']['enablecolumns']['endtime'] => 0,
            'prod_id' => $metadata['prod_id'][0],
            'location' => $this->location,
            'record_id' => $metadata['record_id'][0],
            'opac_id' => $metadata['opac_id'][0],
            'union_id' => $metadata['union_id'][0],
            'urn' => $metadata['urn'][0],
            'purl' => $metadata['purl'][0],
            'title' => $metadata['title'][0],
            'title_sorting' => $metadata['title_sorting'][0],
            'author' => implode('; ', $metadata['author']),
            'year' => implode('; ', $metadata['year']),
            'place' => implode('; ', $metadata['place']),
            'thumbnail' => $this->_getThumbnail(true),
            'metadata' => serialize($listed),
            'metadata_sorting' => serialize($sortable),
            'structure' => $metadata['type'][0],
            'partof' => $partof,
            'volume' => $metadata['volume'][0],
            'volume_sorting' => $metadata['volume_sorting'][0],
            'license' => $metadata['license'][0],
            'terms' => $metadata['terms'][0],
            'restrictions' => $metadata['restrictions'][0],
            'out_of_print' => $metadata['out_of_print'][0],
            'rights_info' => $metadata['rights_info'][0],
            'collections' => $metadata['collection'],
            'mets_label' => $metadata['mets_label'][0],
            'mets_orderlabel' => $metadata['mets_orderlabel'][0],
            'mets_order' => $metadata['mets_order'][0],
            'owner' => $metadata['owner'][0],
            'solrcore' => $core,
            'status' => 0,
            'document_format' => $metadata['document_format'][0],
        ];
        // Unhide hidden documents.
        if (!empty($conf['unhideOnIndex'])) {
            $data['tx_dlf_documents'][$this->uid][$GLOBALS['TCA']['tx_dlf_documents']['ctrl']['enablecolumns']['disabled']] = 0;
        }
        // Process data.
        $newIds = Helper::processDBasAdmin($data);
        // Replace placeholder with actual UID.
        if (strpos($this->uid, 'NEW') === 0) {
            $this->uid = $newIds[$this->uid];
            $this->pid = $pid;
            $this->parentId = $partof;
        }
        if (!(\TYPO3_REQUESTTYPE & \TYPO3_REQUESTTYPE_CLI)) {
            Helper::addMessage(
                htmlspecialchars(sprintf(Helper::getMessage('flash.documentSaved'), $metadata['title'][0], $this->uid)),
                Helper::getMessage('flash.done', true),
                \TYPO3\CMS\Core\Messaging\FlashMessage::OK,
                true
            );
        }
        // Add document to index.
        if ($core) {
            return Indexer::add($this, $core);
        } else {
            $this->logger->notice('Invalid UID "' . $core . '" for Solr core');
            return false;
        }
    }
1348
1349
    /**
     * Determines the ID of the parent document if the current document has one.
     * A parent document is also saved to the database and the Solr index if its
     * $pid and the current $pid differ.
     * Currently only applies to METS documents.
     *
     * @access protected
     *
     * @abstract
     *
     * @param int $pid: The PID passed on by save()
     * @param int $core: The UID of the Solr core passed on by save()
     * @param int|string $owner: The owner passed on by save()
     *
     * @return int The parent document's id.
     */
    abstract protected function getParentDocumentUidForSaving($pid, $core, $owner);
1361
1362
    /**
     * Getter for $this->cPid, used by __get()
     *
     * @access protected
     *
     * @return int The PID of the metadata definitions
     */
    protected function _getCPid()
    {
        return $this->cPid;
    }
1373
1374
    /**
     * Getter for $this->hasFulltext, used by __get()
     *
     * The flag is determined on demand via ensureHasFulltextIsSet() before it is returned.
     *
     * @access protected
     *
     * @return bool Are there any fulltext files available?
     */
    protected function _getHasFulltext()
    {
        // Let the concrete document class work out the flag first.
        $this->ensureHasFulltextIsSet();

        return $this->hasFulltext;
    }
1386
1387
    /**
     * Getter for $this->location, used by __get()
     *
     * @access protected
     *
     * @return string The location of the document
     */
    protected function _getLocation()
    {
        return $this->location;
    }
1398
1399
    /**
     * Format specific part of building the document's metadata array
     *
     * Invoked by _getMetadataArray() whenever the metadata array has to be (re-)built.
     *
     * @access protected
     *
     * @abstract
     *
     * @param int $cPid: The PID for the metadata definitions
     */
    abstract protected function prepareMetadataArray($cPid);
1409
1410
    /**
     * Builds (or returns the already built) array of the document's metadata
     *
     * Index 0 of the array remembers the PID the metadata was built for, so the
     * array is rebuilt whenever the PID of the metadata definitions changes.
     *
     * @access protected
     *
     * @return array Array of metadata with their corresponding logical structure node ID as key
     */
    protected function _getMetadataArray()
    {
        // Prefer the PID of the metadata definitions, fall back to the document's PID.
        $cPid = $this->cPid ?: $this->pid;
        if (!$cPid) {
            $this->logger->error('Invalid PID ' . $cPid . ' for metadata definitions');
            return [];
        }

        // Rebuild if nothing is loaded yet or the loaded data belongs to another PID.
        $needsRebuild = !$this->metadataArrayLoaded
            || $this->metadataArray[0] != $cPid;
        if ($needsRebuild) {
            $this->prepareMetadataArray($cPid);
            $this->metadataArray[0] = $cPid;
            $this->metadataArrayLoaded = true;
        }

        return $this->metadataArray;
    }
1435
1436
    /**
     * Getter for $this->numPages, used by __get()
     *
     * The physical structure is requested first, before the page count is returned.
     *
     * @access protected
     *
     * @return int The total number of pages and/or tracks
     */
    protected function _getNumPages()
    {
        // Return value is not needed here, only the call itself.
        $this->_getPhysicalStructure();

        return $this->numPages;
    }
1448
1449
    /**
     * Getter for $this->parentId, used by __get()
     *
     * @access protected
     *
     * @return int The UID of the parent document or zero if not applicable
     */
    protected function _getParentId()
    {
        return $this->parentId;
    }
1460
1461
    /**
     * Builds an array of the document's physical structure
     *
     * Must be implemented by the concrete document class.
     *
     * @access protected
     *
     * @abstract
     *
     * @return array Array of physical elements' id, type, label and file representations ordered
     * by @ORDER attribute / IIIF Sequence's Canvases
     */
    abstract protected function _getPhysicalStructure();
1472
1473
    /**
     * Provides an array of the document's physical structure metadata
     *
     * Triggers building the physical structure first if that has not happened yet.
     *
     * @access protected
     *
     * @return array Array of elements' type, label and file representations ordered by @ID attribute / Canvas order
     */
    protected function _getPhysicalStructureInfo()
    {
        if ($this->physicalStructureLoaded) {
            // Already built, nothing more to do.
            return $this->physicalStructureInfo;
        }

        // Build the physical structure array before handing out its metadata.
        $this->_getPhysicalStructure();

        return $this->physicalStructureInfo;
    }
1489
1490
    /**
     * Getter for $this->pid, used by __get()
     *
     * @access protected
     *
     * @return int The PID of the document or zero if not in database
     */
    protected function _getPid()
    {
        return $this->pid;
    }
1501
1502
    /**
     * Getter for $this->ready, used by __get()
     *
     * @access protected
     *
     * @return bool Is the document instantiated successfully?
     */
    protected function _getReady()
    {
        return $this->ready;
    }
1513
1514
    /**
     * Getter for $this->recordId, used by __get()
     *
     * @access protected
     *
     * @return mixed The METS file's / IIIF manifest's record identifier
     */
    protected function _getRecordId()
    {
        return $this->recordId;
    }
1525
1526
    /**
     * Getter for $this->rootId, used by __get()
     *
     * On first access the root ID is taken over from the parent document, if
     * there is one. The result is cached for subsequent calls.
     *
     * @access protected
     *
     * @return int The UID of the root document or zero if not applicable
     */
    protected function _getRootId()
    {
        if ($this->rootIdLoaded) {
            return $this->rootId;
        }

        // NOTE(review): without a parent, rootId keeps whatever value it already holds — confirm this is intended.
        if ($this->parentId) {
            $parentDocument = self::getInstance($this->parentId, $this->pid);
            $this->rootId = $parentDocument->rootId;
        }
        $this->rootIdLoaded = true;

        return $this->rootId;
    }
1544
1545
    /**
     * Provides the smLinks between logical and physical structMap (METS); the
     * relation between IIIF Canvases and Manifests / Ranges is modelled in the same way
     *
     * @access protected
     *
     * @abstract
     *
     * @return array The links between logical and physical nodes / Range, Manifest and Canvas
     */
    abstract protected function _getSmLinks();
1556
1557
    /**
     * Builds (or returns the already built) array of the document's logical structure
     *
     * @access protected
     *
     * @return array Array of structure nodes' id, label, type and physical page indexes/mptr / Canvas link with original hierarchy preserved
     */
    protected function _getTableOfContents()
    {
        if ($this->tableOfContentsLoaded) {
            // Logical structure has been read before.
            return $this->tableOfContents;
        }

        // Read all logical structures recursively.
        $this->getLogicalStructure('', true);
        $this->tableOfContentsLoaded = true;

        return $this->tableOfContents;
    }
1574
1575
    /**
     * Provides the document's thumbnail location
     *
     * @access protected
     *
     * @abstract
     *
     * @param bool $forceReload: Force reloading the thumbnail instead of returning the cached value
     *
     * @return string The document's thumbnail location
     */
    abstract protected function _getThumbnail($forceReload = false);
1587
1588
    /**
     * Getter for the ID of the toplevel logical structure node, to be provided by the
     * format-specific subclass.
     *
     * @access protected
     *
     * @abstract
     *
     * @return string The logical structure node's ID
     */
    abstract protected function _getToplevelId();
1598
1599
    /**
     * Getter for $this->uid, reached through __get()
     *
     * @access protected
     *
     * @return mixed The UID or the URL of the document
     */
    protected function _getUid()
    {
        // Plain accessor: the constructor stores either the database UID or the location here.
        $uid = $this->uid;
        return $uid;
    }
1610
1611
    /**
     * Setter for $this->cPid, reached through __set()
     *
     * The value is converted to an integer; negative values are stored as zero.
     *
     * @access protected
     *
     * @param int $value: The new PID for the metadata definitions
     *
     * @return void
     */
    protected function _setCPid($value)
    {
        $pid = (int) $value;
        // Clamp at zero, a PID is never negative.
        $this->cPid = $pid > 0 ? $pid : 0;
    }
1624
1625
    /**
     * Magic method invoked whenever "clone" is applied to an object of this class
     *
     * @access protected
     *
     * @return void
     */
    protected function __clone()
    {
        // Intentionally empty. The method only exists to be protected: instances are
        // meant to be singletons, so code outside the class hierarchy must not clone them.
    }
1636
1637
    /**
     * This is a singleton class, thus the constructor should be private/protected
     * (Get an instance of this class by calling \Kitodo\Dlf\Common\Document::getInstance())
     *
     * A numeric $uid is looked up in "tx_dlf_documents" directly. Any other value is
     * treated as the location of a METS file / IIIF manifest which is loaded first and
     * then matched against the table by location or record identifier. On success
     * $this->ready is set to true; on failure the object is left not ready.
     *
     * @access protected
     *
     * @param int|string $uid: The UID of the document to parse or URL to XML file
     * @param int $pid: If > 0, then only document with this PID gets loaded
     * @param \SimpleXMLElement|IiifResourceInterface $preloadedDocument: Either null or the \SimpleXMLElement
     * or IiifResourceInterface that has been loaded to determine the basic document format.
     *
     * @return void
     */
    protected function __construct($uid, $pid, $preloadedDocument)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('tx_dlf_documents');
        $expr = $queryBuilder->expr();
        $location = '';

        if (MathUtility::canBeInterpretedAsInteger($uid)) {
            // Numeric identifier: match the database record by its UID.
            $whereClause = $expr->andX(
                $expr->eq('tx_dlf_documents.uid', intval($uid)),
                Helper::whereExpression('tx_dlf_documents')
            );
        } else {
            // Use the preloaded document if there is one, otherwise load the METS file / IIIF manifest.
            $isLoaded = $this->setPreloadedDocument($preloadedDocument)
                || (GeneralUtility::isValidUrl($uid) && $this->load($uid));
            if (!$isLoaded) {
                // Loading failed.
                return;
            }
            // Initialize core METS / IIIF object.
            $this->init();
            if ($this->getDocument() === null) {
                // No METS / IIIF part found.
                return;
            }
            // Cast to string for safety reasons.
            $location = (string) $uid;
            $this->establishRecordId($pid);

            if (empty($location) || empty($this->recordId)) {
                // Can't persistently identify document, don't try to match at all.
                $whereClause = '1=-1';
            } else {
                // Try to match record identifier or location (both should be unique).
                $whereClause = $expr->andX(
                    $expr->orX(
                        $expr->eq('tx_dlf_documents.location', $expr->literal($location)),
                        $expr->eq('tx_dlf_documents.record_id', $expr->literal($this->recordId))
                    ),
                    Helper::whereExpression('tx_dlf_documents')
                );
            }
        }

        // Restrict the lookup to the given PID if one was passed.
        if ($pid) {
            $whereClause = $expr->andX(
                $whereClause,
                $expr->eq('tx_dlf_documents.pid', intval($pid))
            );
        }

        // Fetch the (single) matching document record.
        $result = $queryBuilder
            ->select(
                'tx_dlf_documents.uid AS uid',
                'tx_dlf_documents.pid AS pid',
                'tx_dlf_documents.record_id AS record_id',
                'tx_dlf_documents.partof AS partof',
                'tx_dlf_documents.thumbnail AS thumbnail',
                'tx_dlf_documents.location AS location'
            )
            ->from('tx_dlf_documents')
            ->where($whereClause)
            ->setMaxResults(1)
            ->execute();
        $resArray = $result->fetch();

        if ($resArray) {
            // Take over the basic properties from the database record.
            $this->uid = $resArray['uid'];
            $this->pid = $resArray['pid'];
            $this->recordId = $resArray['record_id'];
            $this->parentId = $resArray['partof'];
            $this->thumbnail = $resArray['thumbnail'];
            $this->location = $resArray['location'];
            $this->thumbnailLoaded = true;
            // Nothing loaded so far (numeric UID case)? Then load from the stored location.
            if ($this->getDocument() === null && $this->load($this->location)) {
                $this->init();
            }
            // Do we have a METS / IIIF object now?
            if ($this->getDocument() !== null) {
                // The location the document was actually loaded from takes precedence.
                if (!empty($location)) {
                    $this->location = $location;
                }
                // Document ready!
                $this->ready = true;
            }
            return;
        }

        if ($this->getDocument() !== null) {
            // Document is not in the database: its location doubles as UID.
            $this->uid = $location;
            $this->location = $location;
            // Document ready!
            $this->ready = true;
            return;
        }

        $this->logger->error('No document with UID ' . $uid . ' found or document not accessible');
    }
1753
1754
    /**
     * Magic getter, called each time an invisible property is read from the object
     *
     * Delegates to the method "_get" + ucfirst($var). If the property or that method
     * does not exist, a warning is logged and null is returned.
     *
     * @access public
     *
     * @param string $var: Name of variable to get
     *
     * @return mixed Value of $this->$var
     */
    public function __get($var)
    {
        $getter = '_get' . ucfirst($var);
        if (property_exists($this, $var) && method_exists($this, $getter)) {
            return $this->$getter();
        }
        $this->logger->warning('There is no getter function for property "' . $var . '"');
        return;
    }
1776
1777
    /**
     * Magic method, called each time isset() or empty() is applied to an invisible property
     *
     * The value is fetched through __get(), so the property's getter runs.
     *
     * @access public
     *
     * @param string $var: Name of variable to check
     *
     * @return bool true if variable is set and not empty, false otherwise
     */
    public function __isset($var)
    {
        $value = $this->__get($var);
        return !empty($value);
    }
1790
1791
    /**
     * Magic setter, called each time an invisible property is written on the object
     *
     * Delegates to the method "_set" + ucfirst($var). If the property or that method
     * does not exist, a warning is logged and the value is discarded.
     *
     * @access public
     *
     * @param string $var: Name of variable to set
     * @param mixed $value: New value of variable
     *
     * @return void
     */
    public function __set($var, $value)
    {
        $setter = '_set' . ucfirst($var);
        if (property_exists($this, $var) && method_exists($this, $setter)) {
            $this->$setter($value);
            return;
        }
        $this->logger->warning('There is no setter function for property "' . $var . '"');
    }
1813
}
1814