AddUnitConversions   A
last analyzed

Complexity

Total Complexity 30

Size/Duplication

Total Lines 346
Duplicated Lines 0 %

Coupling/Cohesion

Components 1
Dependencies 12

Importance

Changes 0
Metric Value
wmc 30
lcom 1
cbo 12
dl 0
loc 346
rs 10
c 0
b 0
f 0

11 Methods

Rating   Name   Duplication   Size   Complexity  
A __construct() 0 12 1
B execute() 0 51 8
A initializeWriter() 0 14 2
A initializeBuilder() 0 5 1
B processUnit() 0 41 5
A getNormalized() 0 24 2
A processStatements() 0 39 4
A startDocument() 0 7 2
A writeOut() 0 8 3
A createRdfVocabulary() 0 15 1
A createRdfWriter() 0 4 1
1
<?php
2
3
namespace Wikibase\Repo\Maintenance;
4
5
use DataValues\DecimalValue;
6
use DataValues\QuantityValue;
7
use Maintenance;
8
use MediaWiki\MediaWikiServices;
9
use MediaWiki\Sparql\SparqlClient;
10
use Title;
11
use Wikibase\DataAccess\EntitySourceDefinitions;
12
use Wikibase\Lib\EntityTypeDefinitions;
13
use Wikibase\Lib\Units\JsonUnitStorage;
14
use Wikibase\Lib\Units\UnitConverter;
15
use Wikibase\Repo\Rdf\RdfVocabulary;
16
use Wikibase\Repo\Rdf\Values\ComplexValueRdfHelper;
17
use Wikibase\Repo\Rdf\Values\QuantityRdfBuilder;
18
use Wikibase\Repo\WikibaseRepo;
19
use Wikimedia\Purtle\RdfWriter;
20
use Wikimedia\Purtle\RdfWriterFactory;
21
22
// Locate the MediaWiki installation: use MW_INSTALL_PATH when it is set in the
// environment, otherwise fall back to the directory four levels above this file.
$basePath = getenv( 'MW_INSTALL_PATH' );
if ( $basePath === false ) {
	$basePath = __DIR__ . '/../../../..';
}
require_once $basePath . '/maintenance/Maintenance.php';
25
26
/**
27
 * Generate dump-like RDF for newly added units without running full dump.
28
 *
29
 * @license GPL-2.0-or-later
30
 * @author Stas Malyshev
31
 */
32
class AddUnitConversions extends Maintenance {
33
34
	/**
35
	 * Maximum number of values sent to the SPARQL endpoint in a single query.
36
	 */
37
	const MAX_QUERY_CHUNK = 100;
38
39
	/**
40
	 * @var RdfVocabulary
41
	 */
42
	private $rdfVocabulary;
43
44
	/**
45
	 * @var RdfWriter
46
	 */
47
	protected $rdfWriter;
48
49
	/**
50
	 * @var UnitConverter
51
	 */
52
	protected $unitConverter;
53
54
	/**
55
	 * @var SparqlClient
56
	 */
57
	protected $client;
58
59
	/**
60
	 * @var resource
61
	 */
62
	private $out;
63
64
	/**
65
	 * Map from the full URI of a value-predicate namespace to the name of the
	 * matching normalized-predicate namespace.
66
	 * @var string[]
67
	 */
68
	private $normMap;
69
70
	/**
71
	 * Value URI prefix
72
	 * @var string
73
	 */
74
	private $valueURI;
75
76
	/**
77
	 * Set of normalized-predicate namespace URIs (URI => true).
78
	 * @var bool[]
79
	 */
80
	private $normalizedNames;
81
82
	/**
83
	 * @var QuantityRdfBuilder
84
	 */
85
	protected $builder;
86
87
	/**
88
	 * @var bool
89
	 */
90
	private $dryRun;
91
92
	public function __construct() {
		parent::__construct();
		$this->addDescription( "Produce RDF for new units." );

		// One row per command-line option, in the positional order taken by
		// Maintenance::addOption(): name, help text, required?, takes an argument?
		$optionSpecs = [
			[ 'config', 'Current units config.', true, true ],
			[ 'old-config', 'Previous units config.', false, true ],
			[ 'output', 'File to output the data to.', true, true ],
			[ 'format', "Set the dump format.", false, true ],
			[ 'base-uri', 'Base URI for the data.', false, true ],
			[ 'sparql', 'SPARQL endpoint URL.', false, true ],
			[ 'dry-run', 'Do not generate output, only count values.', false, false ],
		];
		foreach ( $optionSpecs as $optionSpec ) {
			$this->addOption( ...$optionSpec );
		}
	}
104
105
	/**
	 * Generate RDF for every unit that is present in the current units config
	 * but absent from the previous one.
	 *
	 * Reads the 'config' and (optional) 'old-config' JSON files, diffs their
	 * top-level keys, and for each new unit queries the SPARQL endpoint and
	 * writes the resulting triples to the 'output' file. With 'dry-run' no
	 * output file is opened and the values are only counted.
	 */
	public function execute() {
		$newJsonName = $this->getOption( 'config' );
		$newJson = $this->loadUnitsConfig( $newJsonName );
		if ( !$newJson ) {
			$this->fatalError( "Cannot load new config" );
		}

		$oldJsonName = $this->getOption( 'old-config' );
		if ( !$oldJsonName ) {
			// No previous config given: every unit in the new config counts as new.
			$oldJson = [];
		} else {
			$oldJson = $this->loadUnitsConfig( $oldJsonName );
			if ( !$oldJson ) {
				$this->fatalError( "Cannot load old config" );
			}
		}

		$diffUnits = array_diff( array_keys( $newJson ), array_keys( $oldJson ) );
		if ( empty( $diffUnits ) ) {
			$this->error( "No new units." );
			return;
		}
		$this->output( 'Detected ' . count( $diffUnits ) . " new units\n" );
		$this->dryRun = $this->getOption( 'dry-run' );

		if ( !$this->dryRun ) {
			$outputName = $this->getOption( 'output' );
			$this->out = fopen( $outputName, 'w' );
			if ( $this->out === false ) {
				// Without this check writeOut() would silently drop all data.
				$this->fatalError( "Cannot open output file $outputName for writing" );
			}
		}

		$wikibaseRepo = WikibaseRepo::getDefaultInstance();
		$endPoint = $this->getOption( 'sparql',
				$wikibaseRepo->getSettings()->getSetting( 'sparqlEndpoint' ) );
		if ( !$endPoint ) {
			$this->fatalError( 'SPARQL endpoint should be supplied in config or parameters' );
		}

		$baseUri = $this->getOption( 'base-uri',
				$wikibaseRepo->getSettings()->getSetting( 'conceptBaseUri' ) );

		$this->client = new SparqlClient( $endPoint, MediaWikiServices::getInstance()->getHttpRequestFactory() );
		$this->client->appendUserAgent( __CLASS__ );
		$format = $this->getOption( 'format', 'ttl' );
		$this->initializeWriter( $baseUri, $format );
		$this->unitConverter = new UnitConverter( new JsonUnitStorage( $newJsonName ), $baseUri );
		$this->initializeBuilder();

		foreach ( $diffUnits as $unit ) {
			$this->processUnit( $unit );
			$this->writeOut();
		}

		if ( $this->out ) {
			// Release the handle and forget it so a later writeOut() cannot
			// attempt to write to a closed stream.
			fclose( $this->out );
			$this->out = null;
		}
	}

	/**
	 * Read a units config file and decode its JSON content into an array.
	 *
	 * @param string $fileName Path of the JSON file
	 *
	 * @return array|null Decoded config, or null when the file cannot be read
	 *  or its content does not decode to an array
	 */
	private function loadUnitsConfig( $fileName ) {
		$contents = file_get_contents( $fileName );
		if ( $contents === false ) {
			return null;
		}

		$decoded = json_decode( $contents, true );
		// A scalar JSON document would break the array_keys() call in execute().
		return is_array( $decoded ) ? $decoded : null;
	}
159
160
	/**
	 * Set up the RDF vocabulary and writer, build the predicate lookup tables
	 * and emit the document prologue.
	 *
	 * After this call $normMap is keyed by the URI of each value-predicate
	 * namespace and $normalizedNames by the URI of each normalized-predicate
	 * namespace.
	 *
	 * @param string $baseUri
	 * @param string $format File extension or MIME type of the output format.
	 */
	public function initializeWriter( $baseUri, $format ) {
		$typeUris = WikibaseRepo::getDefaultInstance()->getDataTypeDefinitions()->getRdfTypeUris();
		$this->rdfVocabulary = $this->createRdfVocabulary( $baseUri, $typeUris );
		$this->rdfWriter = $this->createRdfWriter( $format );

		$namespaces = $this->rdfVocabulary->getNamespaces();
		$this->valueURI = $namespaces[RdfVocabulary::NS_VALUE];
		foreach ( $this->rdfVocabulary->claimToValueNormalized as $claimNs => $normalizedNs ) {
			$valueNs = $this->rdfVocabulary->claimToValue[$claimNs];
			$this->normMap[$namespaces[$valueNs]] = $normalizedNs;
			$this->normalizedNames[$namespaces[$normalizedNs]] = true;
		}
		$this->startDocument();
	}
180
181
	/**
	 * Create the quantity RDF builder from the vocabulary, writer and unit
	 * converter currently stored on this object.
	 */
	public function initializeBuilder() {
		$valueHelper = new ComplexValueRdfHelper( $this->rdfVocabulary, $this->rdfWriter );
		$this->builder = new QuantityRdfBuilder( $valueHelper, $this->unitConverter );
	}
189
190
	/**
	 * Emit all triples needed for one unit.
	 *
	 * Fetches every value node that uses the unit, writes a quantityNormalized
	 * link from each value to its normalized counterpart, then processes the
	 * statements referring to those values in chunks of MAX_QUERY_CHUNK.
	 * In dry-run mode only the number of values found is reported.
	 *
	 * @param string $unit Unit Q-id
	 */
	public function processUnit( $unit ) {
		$this->output( "Processing $unit...\n" );
		$query = <<<QUERY
SELECT * WHERE {
{
    SELECT DISTINCT ?v  WHERE {
        ?v wikibase:quantityUnit wd:$unit .
        FILTER EXISTS { ?s ?p ?v }
    }
}
  ?v wikibase:quantityAmount ?amount .
  ?v wikibase:quantityUpperBound ?upper .
  ?v wikibase:quantityLowerBound ?lower .
}
QUERY;
		$rows = $this->client->query( $query );
		'@phan-var array[] $rows';
		$this->output( "Got " . count( $rows ) . " ids\n" );
		if ( $this->dryRun ) {
			return;
		}

		// Original value hash => hash of its normalized value node
		$normalizedById = [];
		foreach ( $rows as $row ) {
			$valueUri = $row['v'];
			// Only URIs that start with the value namespace are accepted.
			if ( substr_compare( $valueUri, $this->valueURI, 0, strlen( $this->valueURI ) ) !== 0 ) {
				$this->error( "Invalid value: {$valueUri}!" );
				continue;
			}
			$id = str_replace( $this->valueURI, '', $valueUri );
			$normalizedId = $this->getNormalized( $id, $unit, $row );
			$normalizedById[$id] = $normalizedId;
			$this->rdfWriter->about( RdfVocabulary::NS_VALUE, $id )
				->say( RdfVocabulary::NS_ONTOLOGY, 'quantityNormalized' )
				->is( RdfVocabulary::NS_VALUE, $normalizedId );
		}
		$this->writeOut();

		$idChunks = array_chunk( array_keys( $normalizedById ), self::MAX_QUERY_CHUNK );
		foreach ( $idChunks as $idChunk ) {
			$this->processStatements( $idChunk, $normalizedById );
			$this->writeOut();
		}
		$this->output( "Done.\n" );
	}
236
237
	/**
	 * Normalize a value to standard units and return the hash of the normalized node.
	 *
	 * When the converter hands back the very same object, the value is treated
	 * as not converted and its own ID is returned without writing anything.
	 * Otherwise the normalized value node is written out, including a
	 * quantityNormalized link from the normalized node to itself.
	 *
	 * If the converter cannot convert the value at all (it returns null), the
	 * script stops with a fatal error naming the value and the unit.
	 *
	 * @param string   $id Original value ID (hash)
	 * @param string   $unit Short ID of the unit
	 * @param string[] $value Value data array with 'amount', 'upper' and 'lower' entries
	 *
	 * @return string Hash of the normalized node
	 */
	private function getNormalized( $id, $unit, array $value ) {
		$quantity = new QuantityValue(
			new DecimalValue( $value['amount'] ),
			$unit,
			new DecimalValue( $value['upper'] ),
			new DecimalValue( $value['lower'] )
		);
		$normalized = $this->unitConverter->toStandardUnits( $quantity );

		if ( $normalized === null ) {
			// Fail with a clear message instead of calling a method on null below.
			$this->fatalError( "Cannot convert value $id with unit $unit to standard units" );
		}

		if ( $normalized === $quantity ) {
			// didn't actually convert, so return original one
			return $id;
		}

		$normLName = $normalized->getHash();

		$this->rdfWriter->about( RdfVocabulary::NS_VALUE, $normLName )
			->a( RdfVocabulary::NS_ONTOLOGY, $this->rdfVocabulary->getValueTypeName( $normalized ) );

		$this->builder->writeQuantityValue( $normalized );

		// The normalized node is its own normalized form.
		$this->rdfWriter->about( RdfVocabulary::NS_VALUE, $normLName )
			->say( RdfVocabulary::NS_ONTOLOGY, 'quantityNormalized' )
			->is( RdfVocabulary::NS_VALUE, $normLName );

		return $normLName;
	}
270
271
	/**
	 * Process statements for a particular set of values.
	 *
	 * Looks up every triple that uses one of the given values and adds the
	 * matching normalized triple pointing at the normalized value.
	 * E.g. <s123> psv:P345 wdv:xyz -> <s123> psn:P345 wdv:xyznorm
	 *
	 * @param string[] $values Value hashes
	 * @param string[] $map Map old id -> normalized id
	 */
	private function processStatements( $values, $map ) {
		$prefixedValues = [];
		foreach ( $values as $hash ) {
			$prefixedValues[] = 'wdv:' . $hash;
		}
		$valuesStr = implode( ' ', $prefixedValues );
		$query = <<<QUERY
SELECT ?s ?p ?v WHERE {
	VALUES ?v { $valuesStr }
	?s ?p ?v
	FILTER (?p != wikibase:quantityNormalized)
} ORDER BY ?s
QUERY;
		$rows = $this->client->query( $query );
		'@phan-var array[] $rows';
		foreach ( $rows as $row ) {
			$predicate = $row['p'];
			// Split the predicate URI after its last slash: everything up to and
			// including the slash is $prefix, the rest is $name (the P123 part).
			$nameStart = strrpos( $predicate, '/' ) + 1;
			$prefix = substr( $predicate, 0, $nameStart );
			$name = substr( $predicate, $nameStart );

			if ( isset( $this->normalizedNames[$prefix] ) ) {
				// Already a normalized predicate. This can happen when the new
				// config is deployed and somebody edits data using that unit:
				// the update already carries the normalized value, so skip it.
				continue;
			}

			if ( !isset( $this->normMap[$prefix] ) ) {
				// Not expected: the value is used with a predicate that is
				// not known to RdfVocabulary.
				$this->error( "Unknown predicate {$predicate}" );
				continue;
			}

			$valueId = str_replace( $this->valueURI, '', $row['v'] );
			$this->rdfWriter->about( $row['s'] )
				->say( $this->normMap[$prefix], $name )
				->is( RdfVocabulary::NS_VALUE, $map[$valueId] );
		}
		$this->output( '.' );
	}
319
320
	/**
	 * Begin the document: declare every vocabulary namespace as a prefix on
	 * the writer, then flush what the writer has produced so far.
	 */
	public function startDocument() {
		$namespaces = $this->rdfVocabulary->getNamespaces();
		foreach ( $namespaces as $prefixName => $namespaceUri ) {
			$this->rdfWriter->prefix( $prefixName, $namespaceUri );
		}

		$this->writeOut();
	}
330
331
	/**
	 * Drain the RDF writer and, when an output handle is set, write the
	 * drained data to it. A failed write aborts the script.
	 */
	protected function writeOut() {
		// drain() is called whether or not there is an output handle.
		$chunk = $this->rdfWriter->drain();
		if ( !$this->out ) {
			return;
		}

		if ( fwrite( $this->out, $chunk ) === false ) {
			$this->fatalError( "Failed to write to the output, exiting." );
		}
	}
342
343
	/**
	 * Build the RDF vocabulary used by this script.
	 *
	 * Every per-source map passed to the vocabulary uses the empty string as
	 * its only key, and the entity source definitions are created empty.
	 *
	 * @param string   $baseUri
	 * @param string[] $typeUris
	 *
	 * @return RdfVocabulary
	 */
	private function createRdfVocabulary( $baseUri, $typeUris ) {
		// Canonical URL of Special:EntityData with a trailing slash
		$entityDataUrl = Title::makeTitle( NS_SPECIAL, 'EntityData' )->getCanonicalURL() . '/';
		$sourceDefinitions = new EntitySourceDefinitions( [], new EntityTypeDefinitions( [] ) );

		return new RdfVocabulary(
			[ '' => $baseUri ],
			[ '' => $entityDataUrl ],
			$sourceDefinitions,
			'',
			[ '' => 'wd' ],
			[ '' => '' ],
			[],
			$typeUris,
			[]
		);
	}
366
367
	/**
	 * Create an RDF writer for the requested output format.
	 *
	 * Stops the script with a fatal error when the format is not recognized by
	 * the writer factory, rather than handing an invalid format name to
	 * getWriter().
	 *
	 * @param string $format File extension or MIME type of the output format.
	 *
	 * @return RdfWriter
	 */
	private function createRdfWriter( $format ) {
		$factory = new RdfWriterFactory();
		$formatName = $factory->getFormatName( $format );
		if ( $formatName === false ) {
			// getFormatName() reports an unknown format by returning false.
			$this->fatalError( "Unsupported RDF format: $format" );
		}

		return $factory->getWriter( $formatName );
	}
376
377
}
378
379
// Name of the maintenance class defined in this file.
// NOTE(review): presumably read by the RUN_MAINTENANCE_IF_MAIN bootstrap below — confirm.
$maintClass = AddUnitConversions::class;
380
// NOTE(review): RUN_MAINTENANCE_IF_MAIN is not defined in this file; presumably it is
// provided by the Maintenance.php required near the top — confirm.
require_once RUN_MAINTENANCE_IF_MAIN;
381