Completed
Push — master ( 4904bb...48f77b )
by Timo
11s
created

Typo3PageIndexer::getPageSolrDocument()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1.125

Importance

Changes 0
Metric Value
dl 0
loc 4
ccs 2
cts 4
cp 0.5
rs 10
c 0
b 0
f 0
cc 1
eloc 2
nc 1
nop 0
crap 1.125
1
<?php
2
namespace ApacheSolrForTypo3\Solr;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2009-2015 Ingo Renner <[email protected]>
8
 *  All rights reserved
9
 *
10
 *  This script is part of the TYPO3 project. The TYPO3 project is
11
 *  free software; you can redistribute it and/or modify
12
 *  it under the terms of the GNU General Public License as published by
13
 *  the Free Software Foundation; either version 2 of the License, or
14
 *  (at your option) any later version.
15
 *
16
 *  The GNU General Public License can be found at
17
 *  http://www.gnu.org/copyleft/gpl.html.
18
 *
19
 *  This script is distributed in the hope that it will be useful,
20
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
21
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
22
 *  GNU General Public License for more details.
23
 *
24
 *  This copyright notice MUST APPEAR in all copies of the script!
25
 ***************************************************************/
26
27
use Apache_Solr_Document;
28
use ApacheSolrForTypo3\Solr\Access\Rootline;
29
use ApacheSolrForTypo3\Solr\Domain\Variants\IdBuilder;
30
use ApacheSolrForTypo3\Solr\FieldProcessor\Service;
31
use ApacheSolrForTypo3\Solr\IndexQueue\FrontendHelper\PageFieldMappingIndexer;
32
use ApacheSolrForTypo3\Solr\IndexQueue\Item;
33
use ApacheSolrForTypo3\Solr\System\Configuration\TypoScriptConfiguration;
34
use TYPO3\CMS\Core\Utility\GeneralUtility;
35
use TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController;
36
37
/**
38
 * Page Indexer to index TYPO3 pages used by the Index Queue.
39
 *
40
 * @author Ingo Renner <[email protected]>
41
 * @author Daniel Poetzinger <[email protected]>
42
 * @author Timo Schmidt <[email protected]>
43
 */
44
class Typo3PageIndexer
45
{
46
47
    /**
     * ID of the Solr document built for the current page.
     *
     * Static, so it is shared by all indexer instances; getPageDocument()
     * overwrites it whenever a page document is built.
     *
     * @var string
     */
    protected static $pageSolrDocumentId = '';

    /**
     * The Solr document generated for the current page.
     *
     * Static, so it is shared by all indexer instances; indexPage() stores
     * the final page document here. NULL until indexPage() has stored one.
     *
     * @var \Apache_Solr_Document|null
     */
    protected static $pageSolrDocument = null;

    /**
     * The mount point parameter used in the Frontend controller.
     *
     * Defaults to an empty string (meaning "no mount point") so that the
     * `!== ''` comparison in getRootLineFieldValue() does not mistake an
     * unset parameter for a mount point and append a dangling comma.
     *
     * @var string
     */
    protected $mountPointParameter = '';

    /**
     * Solr server connection. Stays NULL when no connection could be
     * established; indexPage() then returns early.
     *
     * @var SolrService|null
     */
    protected $solrConnection = null;

    /**
     * Frontend page object (TSFE).
     *
     * @var TypoScriptFrontendController
     */
    protected $page = null;

    /**
     * Content extractor to extract content from TYPO3 pages
     *
     * @var Typo3PageContentExtractor
     */
    protected $contentExtractor = null;

    /**
     * URL to be indexed as the page's URL
     *
     * @var string
     */
    protected $pageUrl = '';

    /**
     * The page's access rootline
     *
     * @var Rootline
     */
    protected $pageAccessRootline = null;

    /**
     * Documents that have been handed over to Solr by the last indexPage() call
     *
     * @var array
     */
    protected $documentsSentToSolr = [];

    /**
     * Solr TypoScript configuration, loaded in the constructor.
     *
     * @var TypoScriptConfiguration
     */
    protected $configuration;

    /**
     * Index Queue item being indexed; optional, see setIndexQueueItem().
     *
     * @var Item
     */
    protected $indexQueueItem;

    /**
     * Builder for the "variantId" field of the page document.
     *
     * @var IdBuilder
     */
    protected $variantIdBuilder;
116
117
    /**
     * Sets up the indexer for the given frontend page.
     *
     * Reads the request URL and the Solr configuration, tries to establish
     * a Solr connection and prepares the content extractor, an empty access
     * rootline and the variant ID builder. A failing connection attempt is
     * only logged; the connection property then keeps its initial value.
     *
     * @param TypoScriptFrontendController $page The page to index
     * @param IdBuilder $variantIdBuilder Optional builder for variant IDs; a default instance is created when omitted
     */
    public function __construct(TypoScriptFrontendController $page, IdBuilder $variantIdBuilder = null)
    {
        $this->page = $page;
        $this->pageUrl = GeneralUtility::getIndpEnv('TYPO3_REQUEST_URL');
        $this->configuration = Util::getSolrConfiguration();

        try {
            $this->initializeSolrConnection();
        } catch (\Exception $exception) {
            $logMessage = $exception->getMessage() . ' Error code: ' . $exception->getCode();
            $this->log($logMessage, 3);

            // TODO extract to a class "ExceptionLogger"
            if ($this->configuration->getLoggingExceptions()) {
                GeneralUtility::devLog(
                    'Exception while trying to index a page',
                    'solr',
                    3,
                    [(string)$exception]
                );
            }
        }

        $this->contentExtractor = GeneralUtility::makeInstance(
            Typo3PageContentExtractor::class,
            $this->page->content
        );
        $this->pageAccessRootline = GeneralUtility::makeInstance(Rootline::class, '');

        if ($variantIdBuilder === null) {
            $variantIdBuilder = GeneralUtility::makeInstance(IdBuilder::class);
        }
        $this->variantIdBuilder = $variantIdBuilder;
    }
144
145
    /**
     * Registers the Index Queue item this indexer is working on.
     *
     * The item is consulted by getIndexConfigurationNameForCurrentPage().
     *
     * @param Item $indexQueueItem The queue item being indexed
     * @return void
     */
    public function setIndexQueueItem($indexQueueItem)
    {
        $this->indexQueueItem = $indexQueueItem;
    }
152
153
154
    /**
     * Looks up the Solr connection for the current page and language and
     * keeps it if the server answers a ping.
     *
     * @throws \Exception (code 1234790825) when the Solr server does not respond to the ping.
     * @return void
     */
    protected function initializeSolrConnection()
    {
        $connectionManager = GeneralUtility::makeInstance(ConnectionManager::class);
        $connection = $connectionManager->getConnectionByPageId(
            $this->page->id,
            $this->page->sys_language_uid
        );

        if ($connection->ping()) {
            $this->solrConnection = $connection;
            return;
        }

        // do not continue if no server is available
        throw new \Exception(
            'No Solr instance available while trying to index a page.',
            1234790825
        );
    }
173
174
    /**
     * Writes a message to the TS log (admin panel) and, if indexing logging
     * is enabled in the configuration, to the devlog as well.
     *
     * @param string $message Message to log
     * @param int $errorNum Severity / error number passed on to both logs
     * @param array $data Additional data; each entry is cast to an array before being sent to the devlog
     * @return void
     */
    protected function log($message, $errorNum = 0, array $data = [])
    {
        if (is_object($GLOBALS['TT'])) {
            $GLOBALS['TT']->setTSlogMessage('tx_solr: ' . $message, $errorNum);
        }

        if (!$this->configuration->getLoggingIndexing()) {
            return;
        }

        // an empty $data simply leaves $devLogData empty
        $devLogData = [];
        foreach ($data as $entry) {
            $devLogData[] = (array)$entry;
        }

        GeneralUtility::devLog($message, 'solr', $errorNum, $devLogData);
    }
199
200
    /**
     * Returns the ID of the Solr document most recently built for a page.
     *
     * The value is held in a static property that starts out as an empty
     * string and is assigned when getPageDocument() builds a document.
     *
     * @return string The page's Solr document ID, initially an empty string.
     */
    public static function getPageSolrDocumentId()
    {
        return self::$pageSolrDocumentId;
    }
209
210
    /**
     * Returns the Solr document most recently stored by indexPage().
     *
     * The document is held in a static property that is NULL until
     * indexPage() has stored a page document.
     *
     * @return \Apache_Solr_Document|NULL The page's Solr document or NULL if none has been stored yet.
     */
    public static function getPageSolrDocument()
    {
        return self::$pageSolrDocument;
    }
219
220
    /**
     * Replaces the Solr connection set up by the constructor.
     *
     * The new connection is only accepted if the server answers a ping.
     *
     * @param SolrService $solrConnection Solr connection to use from now on
     * @throws \Exception (code 1323946472) if the Solr server cannot be reached
     * @return void
     */
    public function setSolrConnection(SolrService $solrConnection)
    {
        $serverIsReachable = $solrConnection->ping();
        if ($serverIsReachable) {
            $this->solrConnection = $solrConnection;
            return;
        }

        throw new \Exception(
            'Could not connect to Solr server.',
            1323946472
        );
    }
238
239
    /**
     * Indexes the current page.
     *
     * Builds the page document, lets substitute indexers and post processors
     * work on it, collects additional documents, runs field processing and
     * finally hands everything over to Solr.
     *
     * @return bool TRUE after successfully indexing the page, FALSE on error or when no Solr connection is available
     * @throws \UnexpectedValueException if a registered hook class does not implement the interface required for its hook
     */
    public function indexPage()
    {
        if ($this->solrConnection === null) {
            // intended early return as it doesn't make sense to continue
            // and waste processing time if the solr server isn't available
            // anyways
            // FIXME use an exception
            return false;
        }

        $pageDocument = $this->substitutePageDocument($this->getPageDocument());
        $this->applyIndexPagePostProcessors($pageDocument);
        self::$pageSolrDocument = $pageDocument;

        // the list starts with the page document itself; hooks may append more
        $documents = $this->getAdditionalDocuments($pageDocument, [$pageDocument]);
        $this->processDocuments($documents);

        $wasIndexed = $this->addDocumentsToSolrIndex($documents);
        // recorded regardless of whether adding the documents succeeded
        $this->documentsSentToSolr = $documents;

        return $wasIndexed;
    }
273
274
    /**
     * Applies the configured post processors (indexPagePostProcessPageDocument)
     * to the page document.
     *
     * Each class registered in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['solr']['Indexer']['indexPagePostProcessPageDocument']
     * is instantiated and asked to post process the document. Nothing
     * happens if no post processors are registered.
     *
     * @param \Apache_Solr_Document $pageDocument The page document to post process
     * @throws \UnexpectedValueException if a post processor does not implement PageDocumentPostProcessor
     * @return void
     */
    protected function applyIndexPagePostProcessors($pageDocument)
    {
        if (!is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['solr']['Indexer']['indexPagePostProcessPageDocument'])) {
            return;
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['solr']['Indexer']['indexPagePostProcessPageDocument'] as $classReference) {
            $postProcessor = GeneralUtility::getUserObj($classReference);
            if (!$postProcessor instanceof PageDocumentPostProcessor) {
                // name the offending post processor class, not the document class
                $message = get_class($postProcessor) . ' must implement interface ' . PageDocumentPostProcessor::class;
                throw new \UnexpectedValueException($message, 1397739154);
            }

            $postProcessor->postProcessPageDocument($pageDocument, $this->page);
        }
    }
294
295
    /**
     * Assembles the Solr document that represents the current page.
     *
     * Besides filling the document this also remembers the generated
     * document ID in the static $pageSolrDocumentId property.
     *
     * @return \Apache_Solr_Document A document representing the page
     */
    protected function getPageDocument()
    {
        /* @var $pageDocument \Apache_Solr_Document */
        $pageDocument = GeneralUtility::makeInstance(Apache_Solr_Document::class);
        $site = Site::getSiteByPageId($this->page->id);
        $record = $this->page->page;

        $documentId = Util::getPageDocumentId(
            $this->page->id,
            $this->page->type,
            $this->page->sys_language_uid,
            $this->getDocumentIdGroups(),
            $this->getMountPointParameter()
        );
        self::$pageSolrDocumentId = $documentId;

        // identification
        $pageDocument->setField('id', $documentId);
        $pageDocument->setField('site', $site->getDomain());
        $pageDocument->setField('siteHash', $site->getSiteHash());
        $pageDocument->setField('appKey', 'EXT:solr');
        $pageDocument->setField('type', 'pages');

        // system fields
        $pageDocument->setField('uid', $this->page->id);
        $pageDocument->setField('pid', $record['pid']);

        // variantId
        $pageDocument->setField(
            'variantId',
            $this->variantIdBuilder->buildFromTypeAndUid('pages', $this->page->id)
        );

        $pageDocument->setField('typeNum', $this->page->type);
        $pageDocument->setField('created', $record['crdate']);
        $pageDocument->setField('changed', $record['SYS_LASTCHANGED']);

        $pageDocument->setField('rootline', $this->getRootLineFieldValue());

        // access
        $this->addAccessField($pageDocument);
        $this->addEndtimeField($pageDocument, $record);

        // content
        $pageDocument->setField('title', $this->contentExtractor->getPageTitle());
        $pageDocument->setField('subTitle', $record['subtitle']);
        $pageDocument->setField('navTitle', $record['nav_title']);
        $pageDocument->setField('author', $record['author']);
        $pageDocument->setField('description', $record['description']);
        $pageDocument->setField('abstract', $record['abstract']);
        $pageDocument->setField('content', $this->contentExtractor->getIndexableContent());
        $pageDocument->setField('url', $this->pageUrl);

        $this->addKeywordsField($pageDocument, $record);
        $this->addTagContentFields($pageDocument);

        return $pageDocument;
    }
354
355
    /**
     * Sets the "access" field from the page's access rootline, unless the
     * rootline's string representation is blank.
     *
     * @param \Apache_Solr_Document $document The document to add the field to
     * @return void
     */
    protected function addAccessField(\Apache_Solr_Document $document)
    {
        $accessRootline = (string)$this->pageAccessRootline;
        if (trim($accessRootline) === '') {
            return;
        }

        $document->setField('access', $accessRootline);
    }
367
368
    /**
     * Sets the "endtime" field if the page record has a non-empty end time.
     *
     * Both the check and the value use the passed page record, so the
     * decision and the indexed value always come from the same source; a
     * record without an "endtime" key is treated as having no end time.
     *
     * @param \Apache_Solr_Document $document The document to add the field to
     * @param array $pageRecord The page record the document is built from
     * @return void
     */
    protected function addEndtimeField(\Apache_Solr_Document  $document, $pageRecord)
    {
        if (!empty($pageRecord['endtime'])) {
            $document->setField('endtime', $pageRecord['endtime']);
        }
    }
378
379
    /**
     * Adds the page's keywords as a multi valued "keywords" field.
     *
     * The comma separated "keywords" value of the page record is split,
     * empty entries are dropped and duplicates are removed.
     *
     * @param \Apache_Solr_Document $document The document to add the keywords to
     * @param array $pageRecord The page record providing the "keywords" value
     * @return void
     */
    protected function addKeywordsField(\Apache_Solr_Document $document, $pageRecord)
    {
        $allKeywords = GeneralUtility::trimExplode(',', $pageRecord['keywords'], true);
        $distinctKeywords = array_unique($allKeywords);

        foreach ($distinctKeywords as $singleKeyword) {
            $document->addField('keywords', $singleKeyword);
        }
    }
392
393
    /**
     * Copies the tag content delivered by the content extractor into the
     * document, one field per entry (field name => field value).
     *
     * @param \Apache_Solr_Document $document The document to add the fields to
     * @return void
     */
    protected function addTagContentFields(\Apache_Solr_Document  $document)
    {
        $fieldsFromTags = $this->contentExtractor->getTagContent();

        foreach ($fieldsFromTags as $name => $value) {
            $document->setField($name, $value);
        }
    }
405
406
    /**
     * Builds the content for the rootline field.
     *
     * The value is the page ID, followed by a comma and the mount point
     * parameter if one is set.
     *
     * @return string
     */
    protected function getRootLineFieldValue()
    {
        $rootline = $this->page->id;

        // Cast before comparing: an unset mount point parameter may be NULL,
        // which would pass a strict `!== ''` check and leave a dangling
        // comma behind the page ID.
        $mountPointParameter = (string)$this->getMountPointParameter();
        if ($mountPointParameter !== '') {
            $rootline .= ',' . $mountPointParameter;
        }

        return $rootline;
    }
420
421
    /**
     * Builds the comma separated list of frontend user groups that goes
     * into the document ID.
     *
     * The groups come from the page's access rootline and are passed
     * through Rootline::cleanGroupArray(); if nothing is left, group 0 is
     * used.
     *
     * @return string A comma separated list of frontend user groups.
     */
    protected function getDocumentIdGroups()
    {
        $groups = Rootline::cleanGroupArray(
            $this->pageAccessRootline->getGroups()
        );

        if (empty($groups)) {
            $groups[] = 0;
        }

        return implode(',', $groups);
    }
440
441
    // Logging
442
    // TODO replace by a central logger
443
444
    /**
     * Returns the mount point parameter that is used in the Frontend
     * controller, exactly as it is currently stored on this indexer.
     *
     * @return string
     */
    public function getMountPointParameter()
    {
        return $this->mountPointParameter;
    }
453
454
    // Misc
455
456
    /**
     * Stores the mount point parameter that is used in the Frontend
     * controller. The given value is cast to a string.
     *
     * @param string $mountPointParameter
     * @return void
     */
    public function setMountPointParameter($mountPointParameter)
    {
        $this->mountPointParameter = (string)$mountPointParameter;
    }
465
466
    /**
     * Gives third party extensions the chance to replace or modify the page
     * document created by this indexer.
     *
     * Every class registered in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['solr']['Indexer']['indexPageSubstitutePageDocument']
     * is called in turn; each one receives the document returned by its
     * predecessor.
     *
     * @param \Apache_Solr_Document $pageDocument The page document created by this indexer.
     * @return \Apache_Solr_Document An Apache Solr document representing the currently indexed page
     * @throws \UnexpectedValueException if a registered class does not implement SubstitutePageIndexer or does not return an Apache_Solr_Document
     */
    protected function substitutePageDocument(\Apache_Solr_Document $pageDocument)
    {
        if (!is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['solr']['Indexer']['indexPageSubstitutePageDocument'])) {
            return $pageDocument;
        }

        $configurationName = $this->getIndexConfigurationNameForCurrentPage();
        $substituteClasses = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['solr']['Indexer']['indexPageSubstitutePageDocument'];

        foreach ($substituteClasses as $classReference) {
            $indexer = GeneralUtility::getUserObj($classReference);

            if (!$indexer instanceof SubstitutePageIndexer) {
                throw new \UnexpectedValueException(
                    get_class($indexer) . ' must implement interface ' . SubstitutePageIndexer::class,
                    1310491001
                );
            }

            // field mapping indexers need to know which indexing configuration applies
            if ($indexer instanceof PageFieldMappingIndexer) {
                $indexer->setPageIndexingConfigurationName($configurationName);
            }

            $replacement = $indexer->getPageDocument($pageDocument);
            if (!$replacement instanceof Apache_Solr_Document) {
                throw new \UnexpectedValueException(
                    'The document returned by ' . get_class($indexer) . ' is not a valid Apache_Solr_Document document.',
                    1310490952
                );
            }

            $pageDocument = $replacement;
        }

        return $pageDocument;
    }
502
503
    /**
     * Determines the indexing configuration name for the current page.
     *
     * Uses the name provided by the Index Queue item if one has been set,
     * otherwise falls back to "pages".
     *
     * @return string
     */
    protected function getIndexConfigurationNameForCurrentPage()
    {
        if (isset($this->indexQueueItem)) {
            return $this->indexQueueItem->getIndexingConfigurationName();
        }

        return 'pages';
    }
512
513
    /**
     * Gives third party extensions the chance to contribute further
     * documents that should be indexed along with the current page.
     *
     * Every class registered in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['solr']['Indexer']['indexPageAddDocuments']
     * is called with the page document and the documents collected so far;
     * array results are appended, anything else is ignored.
     *
     * @param \Apache_Solr_Document $pageDocument The main document representing this page.
     * @param \Apache_Solr_Document[] $existingDocuments An array of documents already created for this page.
     * @return array The existing documents followed by all additional \Apache_Solr_Document objects to index
     * @throws \UnexpectedValueException if a registered class does not implement AdditionalPageIndexer
     */
    protected function getAdditionalDocuments(\Apache_Solr_Document $pageDocument, array $existingDocuments)
    {
        $collected = $existingDocuments;

        if (!is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['solr']['Indexer']['indexPageAddDocuments'])) {
            return $collected;
        }

        $indexerClasses = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['solr']['Indexer']['indexPageAddDocuments'];
        foreach ($indexerClasses as $classReference) {
            $indexer = GeneralUtility::getUserObj($classReference);

            if (!$indexer instanceof AdditionalPageIndexer) {
                throw new \UnexpectedValueException(
                    get_class($indexer) . ' must implement interface ' . AdditionalPageIndexer::class,
                    1310491024
                );
            }

            $extraDocuments = $indexer->getAdditionalPageDocuments($pageDocument, $collected);
            if (!is_array($extraDocuments)) {
                continue;
            }

            $collected = array_merge($collected, $extraDocuments);
        }

        return $collected;
    }
545
546
    /**
     * Passes the documents to the field processing service, which
     * manipulates fields according to the configured processing
     * instructions. Does nothing when no instructions are configured.
     *
     * @param array $documents An array of documents to manipulate
     * @return void
     */
    protected function processDocuments(array $documents)
    {
        $instructions = $this->configuration->getIndexFieldProcessingInstructionsConfiguration();
        if (count($instructions) <= 0) {
            return;
        }

        $fieldProcessor = GeneralUtility::makeInstance(Service::class);
        $fieldProcessor->processDocuments($documents, $instructions);
    }
560
561
    /**
     * Sends the collected documents to the Solr index in batches of 20.
     *
     * Any exception raised while adding documents (including a non-200
     * response from Solr) is logged and turned into a FALSE return value.
     *
     * @param array $documents An array of \Apache_Solr_Document objects.
     * @return bool TRUE if documents were added successfully, FALSE otherwise (also when no documents were given)
     */
    protected function addDocumentsToSolrIndex(array $documents)
    {
        $documentCount = count($documents);
        if ($documentCount === 0) {
            return false;
        }

        try {
            $this->log('Adding ' . $documentCount . ' documents.', 0, $documents);

            // chunk adds by 20
            foreach (array_chunk($documents, 20) as $batch) {
                $response = $this->solrConnection->addDocuments($batch);
                if ($response->getHttpStatus() == 200) {
                    continue;
                }

                $transportException = new \Apache_Solr_HttpTransportException($response);
                throw new \RuntimeException('Solr Request failed.', 1331834983, $transportException);
            }
        } catch (\Exception $exception) {
            $this->log($exception->getMessage() . ' Error code: ' . $exception->getCode(), 2);

            if ($this->configuration->getLoggingExceptions()) {
                GeneralUtility::devLog('Exception while adding documents', 'solr', 3, [(string)$exception]);
            }

            return false;
        }

        return true;
    }
600
601
    /**
     * Returns the URL that is indexed as the page's URL.
     *
     * @return string URL of the current page.
     */
    public function getPageUrl()
    {
        return $this->pageUrl;
    }
610
611
    /**
     * Overrides the URL that is written to the page document's "url" field.
     *
     * @param string $url The page's URL.
     * @return void
     */
    public function setPageUrl($url)
    {
        $this->pageUrl = $url;
    }
620
621
    /**
     * Returns the access rootline currently assigned to the page.
     *
     * @return Rootline The page's access rootline
     */
    public function getPageAccessRootline()
    {
        return $this->pageAccessRootline;
    }
630
631
    /**
     * Assigns the page's access rootline, which feeds the "access" field
     * and the groups part of the document ID.
     *
     * @param Rootline $accessRootline The page's access rootline
     * @return void
     */
    public function setPageAccessRootline(Rootline $accessRootline)
    {
        $this->pageAccessRootline = $accessRootline;
    }
640
641
    /**
     * Returns the documents that the last indexPage() call handed over to
     * Solr. The list is recorded whether or not adding them succeeded.
     *
     * @return array An array of \Apache_Solr_Document objects
     */
    public function getDocumentsSentToSolr()
    {
        return $this->documentsSentToSolr;
    }
650
}
651