DumpEntities::makeIdStream()   A
last analyzed

Complexity

Conditions 2
Paths 2

Size

Total Lines 11

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 11
rs 9.9
c 0
b 0
f 0
cc 2
nc 2
nop 2
1
<?php
2
3
namespace Wikibase\Repo\Maintenance;
4
5
use ExtensionRegistry;
6
use Maintenance;
7
use MWException;
8
use Onoi\MessageReporter\ObservableMessageReporter;
9
use Wikibase\DataModel\Services\EntityId\EntityIdPager;
10
use Wikibase\DataModel\Services\Lookup\EntityLookupException;
11
use Wikibase\Lib\Reporting\ExceptionHandler;
12
use Wikibase\Lib\Reporting\ReportingExceptionHandler;
13
use Wikibase\Lib\WikibaseSettings;
14
use Wikibase\Repo\Dumpers\DumpGenerator;
15
use Wikibase\Repo\IO\EntityIdReader;
16
use Wikibase\Repo\IO\LineReader;
17
use Wikibase\Repo\Store\Sql\SqlEntityIdPager;
18
use Wikibase\Repo\Store\Sql\SqlEntityIdPagerFactory;
19
use Wikibase\Repo\Store\Store;
20
use Wikibase\Repo\WikibaseRepo;
21
use Wikimedia\AtEase\AtEase;
22
23
// Locate the MediaWiki installation: use MW_INSTALL_PATH when it is set in the
// environment, otherwise fall back to the directory four levels above this file.
$basePath = getenv( 'MW_INSTALL_PATH' );
if ( $basePath === false ) {
	$basePath = __DIR__ . '/../../../..';
}

require_once $basePath . '/maintenance/Maintenance.php';
26
27
/**
28
 * Maintenance script for generating a dump of entities in the repository.
29
 *
30
 * @license GPL-2.0-or-later
31
 * @author Daniel Kinzler
32
 */
33
abstract class DumpEntities extends Maintenance {
34
35
	/**
36
	 * @var SqlEntityIdPagerFactory
37
	 */
38
	private $sqlEntityIdPagerFactory;
39
40
	/**
41
	 * @var bool|resource
42
	 */
43
	private $logFileHandle = false;
44
45
	private $existingEntityTypes = [];
46
47
	private $entityTypesToExcludeFromOutput = [];
48
49
	/**
	 * Registers the script description and all command line options.
	 */
	public function __construct() {
		parent::__construct();

		$this->addDescription( 'Generate a JSON dump from entities in the repository.' );

		// Readable names for the positional addOption() arguments used below.
		$optional = false;
		$takesValue = true;
		$isFlag = false;
		$noShortName = false;
		$multiOccurrence = true;

		$this->addOption(
			'list-file',
			"A file containing one entity ID per line.",
			$optional,
			$takesValue
		);
		$this->addOption(
			'entity-type',
			"Only dump this kind of entity, e.g. `item` or `property`. Can be given multiple times.",
			$optional,
			$takesValue,
			$noShortName,
			$multiOccurrence
		);
		$this->addOption(
			'sharding-factor',
			"The number of shards (must be >= 1)",
			$optional,
			$takesValue
		);
		$this->addOption(
			'shard',
			"The shard to output (must be less than the sharding-factor)",
			$optional,
			$takesValue
		);
		$this->addOption(
			'batch-size',
			"The number of entities per processing batch",
			$optional,
			$takesValue
		);
		$this->addOption(
			'output',
			"Output file (default is stdout). Will be overwritten.",
			$optional,
			$takesValue
		);
		$this->addOption(
			'log',
			"Log file (default is stderr). Will be appended.",
			$optional,
			$takesValue
		);
		$this->addOption(
			'quiet',
			"Disable progress reporting",
			$optional,
			$isFlag
		);
		$this->addOption(
			'limit',
			"Limit how many entities are dumped.",
			$optional,
			$takesValue
		);
		$this->addOption(
			'no-cache',
			"If this is set, don't try to read from an EntityRevisionCache.",
			$optional,
			$isFlag
		);
		$this->addOption(
			'first-page-id',
			'First page id to dump, use 1 to start with the first page. Use the reported last SqlEntityIdPager position + 1 ' .
				'to continue a previous run. Not compatible with --list-file.',
			$optional,
			$takesValue
		);
		$this->addOption(
			'last-page-id',
			'Page id of the last page to possibly include in the dump. Not compatible with --list-file.',
			$optional,
			$takesValue
		);
		$this->addOption(
			'ignore-missing',
			'Ignore missing IDs, do not report errors on them',
			$optional,
			$isFlag
		);
	}
91
92
	/**
	 * Injects the pager factory and the entity type lists this script works with.
	 *
	 * @param SqlEntityIdPagerFactory $sqlEntityIdPagerFactory used by makeIdQueryStream() to create the ID pager
	 * @param array $existingEntityTypes entity types dumped when no --entity-type option is given
	 * @param array $entityTypesToExcludeFromOutput entity types always removed from the types to dump
	 */
	public function setDumpEntitiesServices(
		SqlEntityIdPagerFactory $sqlEntityIdPagerFactory,
		array $existingEntityTypes,
		array $entityTypesToExcludeFromOutput
	) {
		$this->entityTypesToExcludeFromOutput = $entityTypesToExcludeFromOutput;
		$this->existingEntityTypes = $existingEntityTypes;
		$this->sqlEntityIdPagerFactory = $sqlEntityIdPagerFactory;
	}
101
102
	/**
	 * Builds the concrete dumper that execute() configures and runs.
	 *
	 * @param resource $output stream handle the dump is written to, as opened by execute()
	 *
	 * @return DumpGenerator
	 */
	abstract protected function createDumper( $output );
108
109
	/**
	 * Writes a message, followed by a newline, to the log file opened with
	 * openLogFile(), or via the output() method when no log file is open.
	 *
	 * @see MessageReporter::logMessage()
	 *
	 * @param string $message
	 */
	public function logMessage( $message ) {
		$line = "$message\n";

		if ( !$this->logFileHandle ) {
			$this->output( $line );
			return;
		}

		fwrite( $this->logFileHandle, $line );
		// Flush right away so the message reaches the log without waiting for buffering.
		fflush( $this->logFileHandle );
	}
124
125
	/**
	 * Opens the given file in append mode as the target of logMessage(),
	 * closing any previously opened log file first.
	 *
	 * @param string $file path of the log file; "-" is a shortcut for "php://stdout"
	 *
	 * @throws MWException if the file cannot be opened
	 */
	private function openLogFile( $file ) {
		$this->closeLogFile();

		$target = ( $file === '-' ) ? 'php://stdout' : $file;

		// wouldn't streams be nice...
		$handle = fopen( $target, 'a' );
		$this->logFileHandle = $handle;

		if ( !$handle ) {
			throw new MWException( 'Failed to open log file: ' . $target );
		}
	}
146
147
	/**
	 * Closes the log file opened with openLogFile(), if any, and resets the
	 * handle so that logMessage() falls back to the output() method.
	 *
	 * The STDERR and STDOUT handles are never closed.
	 */
	private function closeLogFile() {
		$handle = $this->logFileHandle;

		// Only compare against the standard streams when a handle is actually set.
		$closable = $handle && !( $handle === STDERR || $handle === STDOUT );

		if ( $closable ) {
			fclose( $handle );
		}

		$this->logFileHandle = false;
	}
160
161
	/**
	 * Runs the dump: opens the log and output streams, configures the dumper
	 * created by createDumper() from the command line options, and feeds it the
	 * stream of entity IDs to dump.
	 *
	 * Returns early, after logging a message, when there are no entity types to dump.
	 *
	 * @throws MWException if the log file, the output file or the ID list file cannot be opened
	 */
	public function execute() {
		//TODO: more validation for options
		$shardingFactor = (int)$this->getOption( 'sharding-factor', 1 );
		$shard = (int)$this->getOption( 'shard', 0 );
		$batchSize = (int)$this->getOption( 'batch-size', 100 );
		$limit = (int)$this->getOption( 'limit', 0 );

		//TODO: Allow injection of an OutputStream for logging
		$this->openLogFile( $this->getOption( 'log', 'php://stderr' ) );

		$outFile = $this->getOption( 'output', 'php://stdout' );

		if ( $outFile === '-' ) {
			$outFile = 'php://stdout';
		}

		$output = fopen( $outFile, 'w' ); //TODO: Allow injection of an OutputStream

		if ( !$output ) {
			throw new MWException( 'Failed to open ' . $outFile . '!' );
		}

		if ( $this->hasOption( 'list-file' ) ) {
			$this->logMessage( "Dumping entities listed in " . $this->getOption( 'list-file' ) );
		}

		$entityTypes = $this->getEntityTypes();
		if ( empty( $entityTypes ) ) {
			$this->logMessage( "No entity types to dump" );
			$this->closeLogFile();
			return;
		}

		$this->logMessage( 'Dumping entities of type ' . implode( ', ', $entityTypes ) );

		if ( $shardingFactor ) {
			$this->logMessage( "Dumping shard $shard/$shardingFactor" );
		}

		$dumper = $this->createDumper( $output );
		$dumper->setLimit( $limit );

		// Progress messages from the dumper are routed through logMessage().
		$progressReporter = new ObservableMessageReporter();
		$progressReporter->registerReporterCallback( [ $this, 'logMessage' ] );
		$dumper->setProgressReporter( $progressReporter );

		// With --ignore-missing, entity lookup failures are not reported.
		$ignored = $this->hasOption( 'ignore-missing' ) ?
			[ EntityLookupException::class ] :
			[];
		$exceptionReporter = new ReportingExceptionHandler( $progressReporter, $ignored );
		$dumper->setExceptionHandler( $exceptionReporter );

		//NOTE: we filter for $entityType twice: filtering in the DB is efficient,
		//      but filtering in the dumper is needed when working from a list file.
		$dumper->setShardingFilter( $shardingFactor, $shard );
		$dumper->setEntityTypesFilter( $entityTypes );
		$dumper->setBatchSize( $batchSize );

		$idStream = $this->makeIdStream( $entityTypes, $exceptionReporter );

		AtEase::suppressWarnings();
		try {
			$dumper->generateDump( $idStream );
		} finally {
			// Always undo the warning suppression and release resources, even
			// when generating the dump fails with an exception.
			AtEase::restoreWarnings();

			if ( $idStream instanceof EntityIdReader ) {
				// close stream / free resources
				$idStream->dispose();
			}

			$this->closeLogFile();
		}
	}
234
235
	/**
	 * Registers a MediaWikiServices hook handler that sets $wgDBDefaultGroup
	 * to the repo setting "dumpDBDefaultGroup", unless a group was given on the
	 * command line via the "dbgroupdefault" option.
	 *
	 * @inheritDoc
	 */
	public function finalSetup() {
		global $wgHooks;

		parent::finalSetup();

		// A group set via cli takes precedence, so only install the default otherwise.
		if ( !$this->hasOption( 'dbgroupdefault' ) ) {
			$wgHooks['MediaWikiServices'][] = function() {
				global $wgDBDefaultGroup;

				$repoLoaded = ExtensionRegistry::getInstance()->isLoaded( 'WikibaseRepository' );
				if ( !$repoLoaded ) {
					// Something instantiates the MediaWikiServices before Wikibase
					// is loaded, nothing we can do here.
					wfWarn( self::class . ': Can not change default DB group.' );
					return;
				}

				// Don't use WikibaseRepo here as this is run very early on, thus
				// the bootstrapping code is not ready yet (T202452).
				$groupForDumps = WikibaseSettings::getRepoSettings()->getSetting( 'dumpDBDefaultGroup' );

				if ( $groupForDumps !== null ) {
					$wgDBDefaultGroup = $groupForDumps;
				}
			};
		}
	}
267
268
	/**
	 * Determines the entity types to dump: the types given via --entity-type
	 * (or all existing entity types when the option is absent), minus the
	 * entity types excluded from output.
	 *
	 * @return string[]
	 */
	private function getEntityTypes() {
		$requestedTypes = $this->getOption( 'entity-type', $this->existingEntityTypes );

		return array_diff( $requestedTypes, $this->entityTypesToExcludeFromOutput );
	}
274
275
	/**
	 * Creates the source of entity IDs to dump: the file given via --list-file
	 * if that option is set, a database query otherwise.
	 *
	 * @param string[] $entityTypes only used when the IDs come from the database query
	 * @param ExceptionHandler|null $exceptionReporter only used when reading IDs from a list file
	 *
	 * @return EntityIdReader|SqlEntityIdPager a stream of EntityId objects
	 */
	private function makeIdStream( array $entityTypes, ExceptionHandler $exceptionReporter = null ) {
		$listFile = $this->getOption( 'list-file' );

		return $listFile === null
			? $this->makeIdQueryStream( $entityTypes )
			: $this->makeIdFileStream( $listFile, $exceptionReporter );
	}
292
293
	/**
	 * Redirect mode handed to the SqlEntityIdPagerFactory by makeIdQueryStream().
	 * This implementation returns EntityIdPager::NO_REDIRECTS.
	 *
	 * @return mixed a EntityIdPager::XXX_REDIRECTS constant
	 */
	protected function getRedirectMode() {
		$redirectMode = EntityIdPager::NO_REDIRECTS;

		return $redirectMode;
	}
301
302
	/**
	 * Cache flag for use in Store::getEntityRevisionLookup: caching is disabled
	 * when --no-cache is given, otherwise the retrieve-only mode is used.
	 *
	 * @return string One of Store::LOOKUP_CACHING_RETRIEVE_ONLY and Store::LOOKUP_CACHING_DISABLED
	 */
	protected function getEntityRevisionLookupCacheMode() {
		return $this->getOption( 'no-cache', false )
			? Store::LOOKUP_CACHING_DISABLED
			: Store::LOOKUP_CACHING_RETRIEVE_ONLY;
	}
314
315
	/**
	 * Creates a pager over the entity IDs of the given types, restricted to the
	 * page id range given via --first-page-id and --last-page-id, if any.
	 *
	 * @param string[] $entityTypes
	 *
	 * @return SqlEntityIdPager
	 */
	private function makeIdQueryStream( array $entityTypes ) {
		$pager = $this->sqlEntityIdPagerFactory->newSqlEntityIdPager(
			$entityTypes,
			$this->getRedirectMode()
		);

		$firstPageId = $this->getOption( 'first-page-id', null );
		if ( $firstPageId ) {
			// The pager position is set to the page id just before the first one to dump.
			$pager->setPosition( (int)$firstPageId - 1 );
		}

		$lastPageId = $this->getOption( 'last-page-id', null );
		if ( $lastPageId ) {
			$pager->setCutoffPosition( (int)$lastPageId );
		}

		return $pager;
	}
334
335
	/**
	 * Opens the given file and wraps it in a reader that yields the entity IDs
	 * listed in it.
	 *
	 * @param string $listFile path of a file containing one entity ID per line
	 * @param ExceptionHandler|null $exceptionReporter handler installed on the reader;
	 *        when null, the reader's exception handler is left untouched
	 *
	 * @throws MWException if the file cannot be opened
	 * @return EntityIdReader
	 */
	private function makeIdFileStream( $listFile, ExceptionHandler $exceptionReporter = null ) {
		$input = fopen( $listFile, 'r' );

		if ( !$input ) {
			throw new MWException( "Failed to open ID file: $listFile" );
		}

		$stream = new EntityIdReader( new LineReader( $input ), WikibaseRepo::getDefaultInstance()->getEntityIdParser() );

		// EntityIdReader::setExceptionHandler() does not accept null, so only
		// install a handler when one was actually provided.
		if ( $exceptionReporter !== null ) {
			$stream->setExceptionHandler( $exceptionReporter );
		}

		return $stream;
	}
354
355
}
356