|
1
|
|
|
<?php |
|
2
|
|
|
|
|
3
|
|
|
/** |
|
4
|
|
|
* (c) Kitodo. Key to digital objects e.V. <[email protected]> |
|
5
|
|
|
* |
|
6
|
|
|
* This file is part of the Kitodo and TYPO3 projects. |
|
7
|
|
|
* |
|
8
|
|
|
* @license GNU General Public License version 3 or later. |
|
9
|
|
|
* For the full copyright and license information, please read the |
|
10
|
|
|
* LICENSE.txt file that was distributed with this source code. |
|
11
|
|
|
*/ |
|
12
|
|
|
|
|
13
|
|
|
namespace Kitodo\Dlf\Command; |
|
14
|
|
|
|
|
15
|
|
|
use Symfony\Component\Console\Input\InputInterface; |
|
16
|
|
|
use Symfony\Component\Console\Input\InputOption; |
|
17
|
|
|
use Symfony\Component\Console\Output\OutputInterface; |
|
18
|
|
|
use Symfony\Component\Console\Style\SymfonyStyle; |
|
19
|
|
|
use TYPO3\CMS\Core\Core\Bootstrap; |
|
20
|
|
|
use TYPO3\CMS\Core\Utility\GeneralUtility; |
|
21
|
|
|
use TYPO3\CMS\Core\Utility\MathUtility; |
|
22
|
|
|
use TYPO3\CMS\Core\Database\ConnectionPool; |
|
23
|
|
|
use TYPO3\CMS\Core\Database\Connection; |
|
24
|
|
|
use Kitodo\Dlf\Command\BaseCommand; |
|
25
|
|
|
use Kitodo\Dlf\Common\Document; |
|
26
|
|
|
use Phpoaipmh\Endpoint; |
|
27
|
|
|
use Phpoaipmh\Exception\BaseoaipmhException; |
|
28
|
|
|
|
|
29
|
|
|
/** |
|
30
|
|
|
* CLI Command for harvesting OAI-PMH interfaces into database and Solr. |
|
31
|
|
|
* |
|
32
|
|
|
* @author Sebastian Meyer <[email protected]> |
|
33
|
|
|
* @package TYPO3 |
|
34
|
|
|
* @subpackage dlf |
|
35
|
|
|
* @access public |
|
36
|
|
|
*/ |
|
37
|
|
|
class HarvestCommand extends BaseCommand
{
    /**
     * Configure the command by defining its description, help text and options.
     *
     * Registers the options read by execute(): "dry-run", "lib" (-l),
     * "pid" (-p), "solr" (-s), "from", "until" and "set".
     *
     * @return void
     */
    public function configure()
    {
        $this
            ->setDescription('Harvest OAI-PMH contents into database and Solr.')
            ->setHelp('')
            ->addOption(
                'dry-run',
                null,
                InputOption::VALUE_NONE,
                'If this option is set, the files will not actually be processed but the location URIs are shown.'
            )
            ->addOption(
                'lib',
                'l',
                InputOption::VALUE_REQUIRED,
                'UID of the library to harvest.'
            )
            ->addOption(
                'pid',
                'p',
                InputOption::VALUE_REQUIRED,
                'UID of the page the documents should be added to.'
            )
            ->addOption(
                'solr',
                's',
                InputOption::VALUE_REQUIRED,
                '[UID|index_name] of the Solr core the document should be added to.'
            )
            ->addOption(
                'from',
                null,
                InputOption::VALUE_OPTIONAL,
                'Datestamp (YYYY-MM-DD) to begin harvesting from.'
            )
            ->addOption(
                'until',
                null,
                InputOption::VALUE_OPTIONAL,
                'Datestamp (YYYY-MM-DD) to end harvesting on.'
            )
            ->addOption(
                'set',
                null,
                InputOption::VALUE_OPTIONAL,
                'Name of the set to limit harvesting to.'
            );
    }

    /**
     * Executes the command to harvest an OAI-PMH interface into db and solr.
     *
     * Validates the "pid", "solr" and "lib" options, looks up the OAI base URL
     * of the library in "tx_dlf_libraries", optionally restricts harvesting by
     * "from", "until" and "set", and then loads every listed identifier as a
     * document via its GetRecord URL. With "dry-run" the documents are only
     * reported; otherwise each one is saved.
     *
     * @param InputInterface $input The input parameters
     * @param OutputInterface $output The Symfony interface for outputs on console
     *
     * @return int 0 when harvesting ran to completion, 1 when an option is invalid
     *             or the OAI interface reported an error
     */
    protected function execute(InputInterface $input, OutputInterface $output)
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::getInstance()->initializeBackendAuthentication();

        $dryRun = (bool) $input->getOption('dry-run');

        $io = new SymfonyStyle($input, $output);
        $io->title($this->getDescription());

        // Resolve the target page; anything that is not a positive integer is rejected.
        $startingPoint = 0;
        if (MathUtility::canBeInterpretedAsInteger($input->getOption('pid'))) {
            $startingPoint = MathUtility::forceIntegerInRange((int) $input->getOption('pid'), 0);
        }
        if ($startingPoint == 0) {
            $io->error('ERROR: No valid PID (' . $startingPoint . ') given.');
            return 1;
        }

        // Resolve the Solr core, given either as UID or as index name.
        $solrOption = $input->getOption('solr');
        if (empty($solrOption) || is_array($solrOption)) {
            $io->error('ERROR: Required parameter --solr|-s is missing or array.');
            return 1;
        }
        $allSolrCores = $this->getSolrCores($startingPoint);
        if (MathUtility::canBeInterpretedAsInteger($solrOption)) {
            $solrCoreUid = MathUtility::forceIntegerInRange((int) $solrOption, 0);
        } else {
            // An unknown index name yields 0 and is rejected below instead of raising a notice.
            $solrCoreUid = $allSolrCores[$solrOption] ?? 0;
        }
        // Abort if solrCoreUid is empty or not in the array of allowed solr cores.
        if (empty($solrCoreUid) || !in_array($solrCoreUid, $allSolrCores)) {
            $validCores = [];
            foreach ($allSolrCores as $indexName => $uid) {
                $validCores[] = $uid . ' : ' . $indexName;
            }
            if (empty($validCores)) {
                $io->error('ERROR: No valid Solr core ("' . $input->getOption('solr') . '") given. No valid cores found on PID ' . $startingPoint . ".\n");
            } else {
                $io->error('ERROR: No valid Solr core ("' . $input->getOption('solr') . '") given. ' . "Valid cores are (<uid>:<index_name>):\n" . implode("\n", $validCores) . "\n");
            }
            return 1;
        }

        // Look up the OAI base URL of the requested library on the target page.
        $libOption = $input->getOption('lib');
        if (!MathUtility::canBeInterpretedAsInteger($libOption)) {
            $io->error('ERROR: Required parameter --lib|-l is not a valid UID.');
            return 1;
        }
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('tx_dlf_libraries');

        $result = $queryBuilder
            ->select('oai_base')
            ->from('tx_dlf_libraries')
            ->where(
                $queryBuilder->expr()->eq(
                    'uid',
                    $queryBuilder->createNamedParameter((int) $libOption, Connection::PARAM_INT)
                ),
                $queryBuilder->expr()->eq(
                    'pid',
                    $queryBuilder->createNamedParameter((int) $startingPoint, Connection::PARAM_INT)
                )
            )
            ->setMaxResults(1)
            ->execute();

        // No matching row leaves the base URL empty, which fails the URL check below.
        $record = $result->fetch();
        $baseUrl = is_array($record) ? $record['oai_base'] : '';
        if (!GeneralUtility::isValidUrl($baseUrl)) {
            $io->error('ERROR: No valid OAI Base URL set for library with given UID ("' . $libOption . '").');
            return 1;
        }

        try {
            $oai = Endpoint::build($baseUrl);
        } catch (BaseoaipmhException $e) {
            $this->handleOaiError($e, $io);
            // Without an endpoint nothing below can work.
            return 1;
        }

        // Values that are not plain YYYY-MM-DD datestamps are ignored (no limit applied).
        $from = $this->parseDatestampOption($input->getOption('from'));
        $until = $this->parseDatestampOption($input->getOption('until'));

        // Only accept a set the OAI interface actually advertises.
        $set = null;
        $setOption = $input->getOption('set');
        if (!is_array($setOption) && !empty($setOption)) {
            try {
                // NOTE(review): the list appears to be fetched while iterating - confirm;
                // the loop is kept inside the try block so such errors are reported too.
                foreach ($oai->listSets() as $setAvailable) {
                    if ((string) $setAvailable->setSpec === $setOption) {
                        $set = $setOption;
                        break;
                    }
                }
            } catch (BaseoaipmhException $exception) {
                $this->handleOaiError($exception, $io);
                return 1;
            }
            if (empty($set)) {
                $io->error('ERROR: OAI interface does not provide a set with given setSpec ("' . $setOption . '").');
                return 1;
            }
        }

        // Append further query parameters with "&" if the base URL already has a query string.
        $baseLocation = $baseUrl . (parse_url($baseUrl, PHP_URL_QUERY) ? '&' : '?');

        try {
            // Get OAI record identifiers to process.
            $identifiers = $oai->listIdentifiers('mets', $from, $until, $set);

            // Process all identifiers.
            foreach ($identifiers as $identifier) {
                // Build OAI GetRecord URL...
                $params = [
                    'verb' => 'GetRecord',
                    'metadataPrefix' => 'mets',
                    'identifier' => (string) $identifier->identifier
                ];
                $docLocation = $baseLocation . http_build_query($params);
                // ...index the document...
                $doc = Document::getInstance($docLocation, $startingPoint, true);
                if ($doc->ready) {
                    if ($dryRun) {
                        $io->writeln('DRY RUN: Would index ' . $doc->uid . ' ("' . $doc->location . '") on PID ' . $startingPoint . ' and Solr core ' . $solrCoreUid . '.');
                    } else {
                        if ($io->isVerbose()) {
                            $io->writeln(date('Y-m-d H:i:s') . ' Indexing ' . $doc->uid . ' ("' . $doc->location . '") on PID ' . $startingPoint . ' and Solr core ' . $solrCoreUid . '.');
                        }
                        // ...and save it to the database...
                        if (!$doc->save($startingPoint, $solrCoreUid, (int) $libOption)) {
                            $io->error('ERROR: Document "' . $doc->location . '" not saved and indexed.');
                        }
                    }
                } else {
                    $io->error('ERROR: Document "' . $docLocation . '" could not be loaded.');
                }
                // Clear document registry to prevent memory exhaustion.
                Document::clearRegistry();
            }
        } catch (BaseoaipmhException $exception) {
            $this->handleOaiError($exception, $io);
            return 1;
        }

        $io->success('All done!');

        return 0;
    }

    /**
     * Turns a datestamp option value into a \DateTime instance.
     *
     * @param mixed $value The raw option value as returned by the input
     *
     * @return \DateTime|null The date, or null if the value is an array or
     *                        does not have the form YYYY-MM-DD
     */
    private function parseDatestampOption($value)
    {
        // Cast first: an omitted option is null, which preg_match() should not receive.
        if (is_array($value) || !preg_match('/^[0-9]{4}-[0-9]{2}-[0-9]{2}$/', (string) $value)) {
            return null;
        }

        return new \DateTime($value);
    }

    /**
     * Handles OAI errors
     *
     * Only prints the exception message as an error block; it does not abort
     * the command, so callers decide how to continue.
     *
     * @param BaseoaipmhException $exception Instance of exception thrown
     * @param SymfonyStyle $io The console style used to print the error
     *
     * @return void
     */
    protected function handleOaiError(BaseoaipmhException $exception, SymfonyStyle $io)
    {
        $io->error('ERROR: Trying to retrieve data from OAI interface resulted in error:' . "\n    " . $exception->getMessage());
    }
}
|
276
|
|
|
|
This function has been deprecated. The supplier of the function has supplied an explanatory message.
The explanatory message should give you some clue as to whether and when the function will be removed and what other function to use instead.