| 1 | <?php |
||
| 2 | namespace EWW\Dpf\Services\ElasticSearch; |
||
| 3 | |||
| 4 | /* |
||
| 5 | * This file is part of the TYPO3 CMS project. |
||
| 6 | * |
||
| 7 | * It is free software; you can redistribute it and/or modify it under |
||
| 8 | * the terms of the GNU General Public License, either version 2 |
||
| 9 | * of the License, or any later version. |
||
| 10 | * |
||
| 11 | * For the full copyright and license information, please read the |
||
| 12 | * LICENSE.txt file that was distributed with this source code. |
||
| 13 | * |
||
| 14 | * The TYPO3 project - inspiring people to share! |
||
| 15 | */ |
||
| 16 | |||
| 17 | use Elasticsearch\ClientBuilder; |
||
| 18 | use Elasticsearch\Common\Exceptions\Curl\CouldNotConnectToHost; |
||
| 19 | use Elasticsearch\Common\Exceptions\Curl\CouldNotResolveHostException; |
||
| 20 | use EWW\Dpf\Domain\Repository\FrontendUserRepository; |
||
| 21 | use EWW\Dpf\Domain\Workflow\DocumentWorkflow; |
||
| 22 | use EWW\Dpf\Exceptions\ElasticSearchConnectionErrorException; |
||
| 23 | use EWW\Dpf\Exceptions\ElasticSearchMissingIndexNameException; |
||
| 24 | use TYPO3\CMS\Extbase\Object\ObjectManager; |
||
| 25 | use EWW\Dpf\Configuration\ClientConfigurationManager; |
||
| 26 | use EWW\Dpf\Domain\Model\Document; |
||
| 27 | use TYPO3\CMS\Core\Utility\GeneralUtility; |
||
| 28 | use TYPO3\CMS\Core\Log\LogManager; |
||
| 29 | use TYPO3\CMS\Extbase\Utility\LocalizationUtility; |
||
| 30 | |||
| 31 | class ElasticSearch |
||
| 32 | { |
||
/**
 * Source of the Elasticsearch settings (host, port, index name) used by this service.
 *
 * @var \EWW\Dpf\Configuration\ClientConfigurationManager
 */
protected $clientConfigurationManager;

/**
 * Elasticsearch client, built in the constructor.
 *
 * @var \Elasticsearch\Client
 */
protected $client;

/**
 * Elasticsearch host. The value below is only a default; the constructor
 * replaces it with the host from the client configuration.
 */
protected $server = 'host.docker.internal';

/**
 * Elasticsearch port. The value below is only a default; the constructor
 * replaces it with the port from the client configuration.
 */
protected $port = '9200';

/**
 * Name of the index. The value below is only a default; the constructor
 * replaces it with the index name from the client configuration.
 */
protected $indexName = 'kitodo_publication';

/**
 * Response of the most recent search() call.
 */
protected $results;

/**
 * Mapper that delivers the JSON representation of a document for the index.
 *
 * @var \EWW\Dpf\Services\ElasticSearch\ElasticsearchMapper
 */
protected $elasticsearchMapper;

/**
 * Client page id passed to the constructor; stays 0 if none was given.
 *
 * @var int
 */
protected $clientPid = 0;
||
| 58 | |||
/**
 * Builds the Elasticsearch client and makes sure the configured index exists.
 *
 * Host, port and index name are read from the client configuration. If a
 * client page id is given, the configuration is switched to that page first.
 *
 * NOTE: if the index cannot be checked or created, the request is terminated
 * with the translated "elasticsearch.notRunning" message. The underlying
 * error is written to the log before that, so the cause is not lost.
 *
 * @param int|null $clientPid page id of the client configuration; a falsy value keeps the current configuration
 * @throws ElasticSearchMissingIndexNameException if no index name is configured
 */
public function __construct($clientPid = null)
{
    $objectManager = GeneralUtility::makeInstance(ObjectManager::class);

    $this->elasticsearchMapper = $objectManager->get(ElasticsearchMapper::class);
    $this->clientConfigurationManager = $objectManager->get(ClientConfigurationManager::class);

    if ($clientPid) {
        $this->clientConfigurationManager->setConfigurationPid($clientPid);
        $this->clientPid = $clientPid;
    }

    $this->server = $this->clientConfigurationManager->getElasticSearchHost();
    $this->port = $this->clientConfigurationManager->getElasticSearchPort();
    $this->indexName = $this->clientConfigurationManager->getElasticSearchIndexName();

    if (empty($this->indexName)) {
        throw new ElasticSearchMissingIndexNameException('Missing search index name.');
    }

    $hosts = [
        $this->server . ':' . $this->port,
    ];

    $clientBuilder = ClientBuilder::create();
    $clientBuilder->setHosts($hosts);
    $this->client = $clientBuilder->build();

    try {
        $this->initializeIndex($this->indexName);
    } catch (\Throwable $e) {
        // Record the real cause before the request is terminated; the
        // message shown to the user does not contain it.
        /** @var $logger \TYPO3\CMS\Core\Log\Logger */
        $logger = GeneralUtility::makeInstance(LogManager::class)->getLogger(__CLASS__);
        $logger->error(
            'Elasticsearch index could not be initialized.',
            [
                'Index name' => $this->indexName,
                'Exception' => get_class($e) . ': ' . $e->getMessage()
            ]
        );

        $message = LocalizationUtility::translate(
            'elasticsearch.notRunning', 'dpf'
        );
        die($message);
    }
}
||
| 102 | |||
/**
 * Returns the name of the index this service works on.
 *
 * @return string|null
 */
protected function getIndexName()
{
    return $this->indexName;
}
||
| 110 | |||
/**
 * Creates the index named $indexName unless it already exists.
 *
 * If the index is already there, nothing else is sent to the server.
 * Otherwise it is created with the analysis settings and the field
 * mappings assembled below.
 *
 * @param $indexName name of the index to check and, if missing, create
 */
protected function initializeIndex($indexName)
{
    if ($this->client->indices()->exists(['index' => $indexName])) {
        return;
    }

    // Field definitions that occur more than once in the mapping.
    $keywordField = ['type' => 'keyword'];
    $textWithSortKeyword = [
        'type' => 'text',
        'fields' => [
            'keyword' => [
                'type' => 'keyword',
                'normalizer' => 'lowercase_normalizer',
            ],
        ],
    ];
    $dayField = [
        'type' => 'date',
        'format' => 'yyyy-MM-dd',
    ];
    $disabledField = ['enabled' => false];

    $analysis = [
        'filter' => [
            'ngram' => [
                'type' => 'ngram',
                'min_gram' => 3,
                'max_gram' => 3,
                'token_chars' => ['letter', 'digit'],
            ],
        ],
        'analyzer' => [
            'keyword_lowercase' => [
                'tokenizer' => 'keyword',
                'filter' => ['lowercase'],
            ],
        ],
        'normalizer' => [
            'lowercase_normalizer' => [
                'type' => 'custom',
                'char_filter' => [],
                'filter' => ['lowercase', 'asciifolding'],
            ],
        ],
    ];

    $properties = [
        'title' => $textWithSortKeyword,
        'state' => $keywordField,
        'aliasState' => $keywordField,
        'year' => ['type' => 'integer'],
        'persons' => $keywordField,
        'personsSort' => $textWithSortKeyword,
        'doctype' => $keywordField,
        'collections' => $keywordField,
        'hasFiles' => $keywordField,
        'creator' => $keywordField,
        'creatorRole' => $keywordField,
        'source' => ['type' => 'text'],
        'fobIdentifiers' => $keywordField,
        'personData' => [
            'properties' => [
                'name' => $keywordField,
                'fobId' => $disabledField,
                'index' => $disabledField,
            ],
        ],
        'affiliation' => $keywordField,
        'process_number' => $keywordField,
        'creationDate' => $dayField,
        'embargoDate' => $dayField,
    ];

    $this->client->indices()->create([
        'index' => $indexName,
        'body' => [
            'settings' => [
                'analysis' => $analysis,
            ],
            'mappings' => [
                '_source' => [
                    'enabled' => true,
                ],
                'properties' => $properties,
            ],
        ],
    ]);
}
||
| 249 | |||
/**
 * Adds a document to the index.
 *
 * The entry starts from the JSON delivered by the Elasticsearch mapper. If
 * the mapper fails or does not deliver a JSON object, a minimal entry with
 * title and document type is used instead. State, identifiers, creator,
 * dates, notes, person and collection data of the document are then added
 * and the entry is sent to the index under the document identifier.
 *
 * @param Document $document
 */
public function index($document)
{
    // Stays null if the mapper throws, so the fallback below applies.
    $data = null;

    try {
        $data = json_decode($this->elasticsearchMapper->getElasticsearchJson($document));
    } catch (\Throwable $throwable) {
        // Fixme: The solution via json_decode and the XSLT file needs to be replaced.
    }

    if (!($data instanceof \stdClass)) {
        // Properties cannot be assigned on null, false, scalars or arrays
        // (an Error on PHP 8), so start from a fresh object.
        $data = new \stdClass();
        $data->title = [$document->getTitle()];
        $data->doctype = $document->getDocumentType()->getName();
    }

    $state = $document->getState();
    $data->state = $state;
    // "?? null" keeps the previous result (null) for a state without mapping, minus the notice.
    $data->aliasState = DocumentWorkflow::STATE_TO_ALIASSTATE_MAPPING[$state] ?? null;

    $objectIdentifier = $document->getObjectIdentifier();
    $data->objectIdentifier = $objectIdentifier;

    // empty() also covers a JSON result without an "identifier" property.
    if (empty($data->identifier) || !is_array($data->identifier)) {
        $data->identifier = [];
    }
    $data->identifier[] = $objectIdentifier;
    $data->identifier[] = $document->getProcessNumber();

    $creator = $document->getCreator();
    $data->creator = $creator ?: null;

    $data->creatorRole = '';
    if ($creator) {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $frontendUserRepository = $objectManager->get(FrontendUserRepository::class);

        /** @var \EWW\Dpf\Domain\Model\FrontendUser $creatorFeUser */
        $creatorFeUser = $frontendUserRepository->findByUid($creator);
        if ($creatorFeUser) {
            $data->creatorRole = $creatorFeUser->getUserRole();
        }
    }

    $creationDate = new \DateTime($document->getCreationDate());
    $data->creationDate = $creationDate->format('Y-m-d');

    $data->year = $document->getPublicationYear();

    $notes = $document->getNotes();
    $data->notes = is_array($notes) ? $notes : [];

    $data->hasFiles = (bool) $document->hasFiles();

    $internalFormat = new \EWW\Dpf\Helper\InternalFormat($document->getXmlData(), $this->clientPid);

    $persons = $internalFormat->getPersons();

    $personData = [];
    foreach ($persons as $person) {
        $personData[] = $person;
        $data->persons[] = $person['fobId'];

        // Affiliation names and affiliation identifiers share one field.
        foreach ($person['affiliations'] as $affiliation) {
            $data->affiliation[] = $affiliation;
        }

        foreach ($person['affiliationIdentifiers'] as $affiliationIdentifier) {
            $data->affiliation[] = $affiliationIdentifier;
        }
    }

    $data->personData = $personData;

    // Only the first person is used for sorting.
    if (count($persons) > 0 && array_key_exists('family', $persons[0])) {
        $data->personsSort = $persons[0]['family'];
    }

    $data->source = $document->getSourceDetails();

    $data->universityCollection = false;
    if (!empty($data->collections) && is_array($data->collections)) {
        $universityCollection = $this->clientConfigurationManager->getUniversityCollection();
        foreach ($data->collections as $collection) {
            if ($collection == $universityCollection) {
                $data->universityCollection = true;
                break;
            }
        }
    }

    $embargoDate = $document->getEmbargoDate();
    if ($embargoDate instanceof \DateTime) {
        $data->embargoDate = $embargoDate->format("Y-m-d");
    } else {
        $data->embargoDate = null;
    }

    $data->originalSourceTitle = $internalFormat->getOriginalSourceTitle();

    // The list collected from the persons' "fobId" values used to be assigned
    // here first and was always overwritten by this value; only this one is kept.
    $data->fobIdentifiers = $internalFormat->getPersonFisIdentifiers();

    $this->client->index([
        'refresh' => 'wait_for',
        'index' => $this->getIndexName(),
        'id' => $document->getDocumentIdentifier(),
        'body' => $data
    ]);
}
||
| 387 | |||
| 388 | |||
/**
 * Removes the entry with the given id from the index.
 *
 * A failing delete request does not interrupt the caller: any \Exception
 * is caught and reported as a warning in the log instead.
 *
 * @param string $identifier id of the index entry to remove
 */
public function delete($identifier)
{
    try {
        $this->client->delete([
            'refresh' => 'wait_for',
            'index' => $this->getIndexName(),
            'id' => $identifier,
        ]);
    } catch (\Exception $e) {
        $context = ['Document identifier' => $identifier];

        /** @var $logger \TYPO3\CMS\Core\Log\Logger */
        $logger = GeneralUtility::makeInstance(LogManager::class)->getLogger(__CLASS__);
        $logger->warning('Document could not be deleted from the index.', $context);
    }
}
||
| 416 | |||
| 417 | |||
/**
 * Requests a single entry from the index.
 *
 * @param $identifier id of the index entry, passed on as 'id'
 * @return mixed whatever the client's get request returns
 */
public function getDocument($identifier)
{
    return $this->client->get([
        'index' => $this->getIndexName(),
        'id' => $identifier,
    ]);
}
||
| 430 | |||
| 431 | |||
/**
 * Performs a search request and keeps its response.
 *
 * If the query names no index, the configured index is used. The response
 * is stored as well, so getResults() can return it later.
 *
 * @param array $query search query
 * @param mixed $type currently not used
 * @return array result list
 * @throws ElasticSearchConnectionErrorException if the host cannot be reached or resolved
 */
public function search($query, $type = null)
{
    // Fall back to the configured index.
    if (empty($query['index'])) {
        $query['index'] = $this->getIndexName();
    }

    try {
        $this->results = $this->client->search($query);
    } catch (CouldNotConnectToHost $exception) {
        throw new ElasticSearchConnectionErrorException("Could not connect to repository server.");
    } catch (CouldNotResolveHostException $exception) {
        throw new ElasticSearchConnectionErrorException("Could not connect to repository server.");
    }

    return $this->results;
}
||
| 465 | |||
/**
 * Returns the response of the last search() call.
 *
 * @return mixed
 */
public function getResults()
{
    return $this->results;
}
||
| 475 | } |
||
| 476 |
In PHP, under loose comparison (like `==`, or `!=`, or `switch` conditions), values of different types might be equal.

For `integer` values, zero is a special case, in particular the following results might be unexpected: