1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace WikibaseQuality\ConstraintReport\ConstraintCheck\Helper; |
4
|
|
|
|
5
|
|
|
use Config; |
6
|
|
|
use DataValues\DataValue; |
7
|
|
|
use DataValues\MonolingualTextValue; |
8
|
|
|
use IBufferingStatsdDataFactory; |
9
|
|
|
use InvalidArgumentException; |
10
|
|
|
use MapCacheLRU; |
11
|
|
|
use MWException; |
12
|
|
|
use MWHttpRequest; |
13
|
|
|
use WANObjectCache; |
14
|
|
|
use Wikibase\DataModel\Entity\EntityId; |
15
|
|
|
use Wikibase\DataModel\Entity\EntityIdParser; |
16
|
|
|
use Wikibase\DataModel\Entity\EntityIdParsingException; |
17
|
|
|
use Wikibase\DataModel\Entity\EntityIdValue; |
18
|
|
|
use Wikibase\DataModel\Services\Lookup\PropertyDataTypeLookup; |
19
|
|
|
use Wikibase\DataModel\Snak\PropertyValueSnak; |
20
|
|
|
use Wikibase\DataModel\Statement\Statement; |
21
|
|
|
use Wikibase\Rdf\RdfVocabulary; |
22
|
|
|
use WikibaseQuality\ConstraintReport\ConstraintCheck\Cache\CachedBool; |
23
|
|
|
use WikibaseQuality\ConstraintReport\ConstraintCheck\Cache\CachedEntityIds; |
24
|
|
|
use WikibaseQuality\ConstraintReport\ConstraintCheck\Cache\CachedQueryResults; |
25
|
|
|
use WikibaseQuality\ConstraintReport\ConstraintCheck\Cache\CachingMetadata; |
26
|
|
|
use WikibaseQuality\ConstraintReport\ConstraintCheck\Cache\Metadata; |
27
|
|
|
use WikibaseQuality\ConstraintReport\ConstraintCheck\Context\Context; |
28
|
|
|
use WikibaseQuality\ConstraintReport\ConstraintCheck\Message\ViolationMessage; |
29
|
|
|
use WikibaseQuality\ConstraintReport\ConstraintCheck\Message\ViolationMessageDeserializer; |
30
|
|
|
use WikibaseQuality\ConstraintReport\ConstraintCheck\Message\ViolationMessageSerializer; |
31
|
|
|
use WikibaseQuality\ConstraintReport\Role; |
32
|
|
|
|
33
|
|
|
/** |
34
|
|
|
* Class for running a SPARQL query on some endpoint and getting the results. |
35
|
|
|
* |
36
|
|
|
* @author Lucas Werkmeister |
37
|
|
|
* @license GPL-2.0-or-later |
38
|
|
|
*/ |
39
|
|
|
class SparqlHelper { |
40
|
|
|
|
41
|
|
|
/**
 * Configuration source. Read in this class for the instance-of and subclass-of
 * property IDs, the regex cache map size, the timeout exception class names,
 * the SPARQL endpoint URL and the maximum query time.
 *
 * @var Config
 */
private $config;

/**
 * Vocabulary used to resolve namespace URIs for the query prefixes
 * and to build media file, geo shape and tabular data URIs.
 *
 * @var RdfVocabulary
 */
private $rdfVocabulary;

/**
 * Namespace URI of entities (RdfVocabulary::NS_ENTITY), set in the constructor.
 * Stripped from result IRIs to recover entity ID serializations.
 *
 * @var string
 */
private $entityPrefix;

/**
 * Block of SPARQL PREFIX declarations, built in the constructor
 * and prepended to every query in runQuery().
 *
 * @var string
 */
private $prefixes;

/**
 * Parses the entity ID serializations extracted from query results.
 *
 * @var EntityIdParser
 */
private $entityIdParser;

/**
 * Looks up the data type of a snak's property
 * so that its value can be turned into an RDF literal or IRI.
 *
 * @var PropertyDataTypeLookup
 */
private $propertyDataTypeLookup;

/**
 * Cache holding the per-regex maps of match results (see matchesRegularExpression()).
 *
 * @var WANObjectCache
 */
private $cache;

/**
 * Serializes the violation message of a ConstraintParameterException for caching.
 *
 * @var ViolationMessageSerializer
 */
private $violationMessageSerializer;

/**
 * Restores a violation message from its cached serialization.
 *
 * @var ViolationMessageDeserializer
 */
private $violationMessageDeserializer;

/**
 * Receives the counters and timings reported by this class.
 *
 * @var IBufferingStatsdDataFactory
 */
private $dataFactory;
90
|
|
|
|
91
|
|
|
/**
 * Store the injected services and precompute the entity IRI prefix
 * as well as the SPARQL PREFIX declarations shared by all queries.
 *
 * @param Config $config
 * @param RdfVocabulary $rdfVocabulary
 * @param EntityIdParser $entityIdParser
 * @param PropertyDataTypeLookup $propertyDataTypeLookup
 * @param WANObjectCache $cache
 * @param ViolationMessageSerializer $violationMessageSerializer
 * @param ViolationMessageDeserializer $violationMessageDeserializer
 * @param IBufferingStatsdDataFactory $dataFactory
 */
public function __construct(
	Config $config,
	RdfVocabulary $rdfVocabulary,
	EntityIdParser $entityIdParser,
	PropertyDataTypeLookup $propertyDataTypeLookup,
	WANObjectCache $cache,
	ViolationMessageSerializer $violationMessageSerializer,
	ViolationMessageDeserializer $violationMessageDeserializer,
	IBufferingStatsdDataFactory $dataFactory
) {
	$this->config = $config;
	$this->rdfVocabulary = $rdfVocabulary;
	$this->entityIdParser = $entityIdParser;
	$this->propertyDataTypeLookup = $propertyDataTypeLookup;
	$this->cache = $cache;
	$this->violationMessageSerializer = $violationMessageSerializer;
	$this->violationMessageDeserializer = $violationMessageDeserializer;
	$this->dataFactory = $dataFactory;

	$this->entityPrefix = $rdfVocabulary->getNamespaceURI( RdfVocabulary::NS_ENTITY );

	// SPARQL prefix => vocabulary namespace name, in the order they are declared
	$vocabularyNamespaces = [
		'wd' => RdfVocabulary::NS_ENTITY,
		'wds' => RdfVocabulary::NS_STATEMENT,
		'wdt' => RdfVocabulary::NSP_DIRECT_CLAIM,
		'wdv' => RdfVocabulary::NS_VALUE,
		'p' => RdfVocabulary::NSP_CLAIM,
		'ps' => RdfVocabulary::NSP_CLAIM_STATEMENT,
		'pq' => RdfVocabulary::NSP_QUALIFIER,
		'pqv' => RdfVocabulary::NSP_QUALIFIER_VALUE,
		'pr' => RdfVocabulary::NSP_REFERENCE,
		'prv' => RdfVocabulary::NSP_REFERENCE_VALUE,
	];

	$declarations = [];
	foreach ( $vocabularyNamespaces as $sparqlPrefix => $namespaceName ) {
		$declarations[] = 'PREFIX ' . $sparqlPrefix . ': <' .
			$rdfVocabulary->getNamespaceURI( $namespaceName ) . '>';
	}
	// TODO get wikibase: prefix from vocabulary once -beta is dropped (T112127)
	$declarations[] = 'PREFIX wikibase: <http://wikiba.se/ontology#>';
	$declarations[] = 'PREFIX wikibase-beta: <http://wikiba.se/ontology-beta#>';

	// one declaration per line, no trailing line break
	$this->prefixes = implode( "\n", $declarations );
}
127
|
|
|
|
128
|
|
|
/**
 * Ask the query service whether an entity is an instance or subclass
 * of at least one of the given classes, following the subclass-of hierarchy.
 * The classes are checked in chunks of 20, one ASK query per chunk;
 * the first positive answer ends the search.
 *
 * @param string $id entity ID serialization of the entity to check
 * @param string[] $classes entity ID serializations of the expected types
 * @param bool $withInstance true for “instance” relation, false for “subclass” relation
 *
 * @return CachedBool the answer, with the merged metadata of all queries that were run
 * @throws SparqlHelperException if the query times out or some other error occurs
 */
public function hasType( $id, array $classes, $withInstance ) {
	$instanceOfId = $this->config->get( 'WBQualityConstraintsInstanceOfId' );
	$subclassOfId = $this->config->get( 'WBQualityConstraintsSubclassOfId' );

	// zero or more subclass-of steps, optionally preceded by one instance-of step
	$path = "wdt:$subclassOfId*";
	if ( $withInstance ) {
		$path = "wdt:$instanceOfId/" . $path;
	}

	$collectedMetadata = [];

	foreach ( array_chunk( $classes, 20 ) as $chunk ) {
		$classIris = [];
		foreach ( $chunk as $classId ) {
			$classIris[] = 'wd:' . $classId;
		}
		$classesValues = implode( ' ', $classIris );

		// TODO hint:gearing is a workaround for T168973 and can hopefully be removed eventually
		$query = <<<EOF
ASK {
BIND(wd:$id AS ?item)
VALUES ?class { $classesValues }
?item $path ?class. hint:Prior hint:gearing "forward".
}
EOF;

		$response = $this->runQuery( $query );
		$collectedMetadata[] = $response->getMetadata();

		if ( $response->getArray()['boolean'] ) {
			return new CachedBool( true, Metadata::merge( $collectedMetadata ) );
		}
	}

	return new CachedBool( false, Metadata::merge( $collectedMetadata ) );
}
176
|
|
|
|
177
|
|
|
/**
 * Find other entities that have a statement with the same property and the same
 * main value as the given statement. At most 10 entities are returned (LIMIT 10).
 *
 * @param Statement $statement
 * @param bool $ignoreDeprecatedStatements Whether to ignore deprecated statements or not.
 *
 * @return CachedEntityIds
 * @throws SparqlHelperException if the query times out or some other error occurs
 */
public function findEntitiesWithSameStatement(
	Statement $statement,
	$ignoreDeprecatedStatements
) {
	// Use getSerialization(), like findEntitiesWithSameQualifierOrReference() does:
	// serialize() is the PHP Serializable hook and is not guaranteed
	// to return the plain property ID that the query needs.
	$pid = $statement->getPropertyId()->getSerialization();
	// statement nodes are named after the GUID with “$” replaced by “-”
	$guid = str_replace( '$', '-', $statement->getGuid() );

	$deprecatedFilter = '';
	if ( $ignoreDeprecatedStatements ) {
		$deprecatedFilter .= 'MINUS { ?otherStatement wikibase:rank wikibase:DeprecatedRank. }';
		$deprecatedFilter .= 'MINUS { ?otherStatement wikibase-beta:rank wikibase-beta:DeprecatedRank. }';
	}

	$query = <<<EOF
SELECT DISTINCT ?otherEntity WHERE {
BIND(wds:$guid AS ?statement)
BIND(p:$pid AS ?p)
BIND(ps:$pid AS ?ps)
?entity ?p ?statement.
?statement ?ps ?value.
?otherStatement ?ps ?value.
?otherEntity ?p ?otherStatement.
FILTER(?otherEntity != ?entity)
$deprecatedFilter
}
LIMIT 10
EOF;

	$result = $this->runQuery( $query );

	return $this->getOtherEntities( $result );
}
216
|
|
|
|
217
|
|
|
/**
 * Find other entities that have a statement carrying the same snak
 * as a qualifier or in a reference. At most 10 entities are returned (LIMIT 10).
 *
 * @param EntityId $entityId The entity ID on the containing entity
 * @param PropertyValueSnak $snak
 * @param string $type Context::TYPE_QUALIFIER or Context::TYPE_REFERENCE
 * @param bool $ignoreDeprecatedStatements Whether to ignore deprecated statements or not.
 *
 * @return CachedEntityIds
 * @throws SparqlHelperException if the query times out or some other error occurs
 */
public function findEntitiesWithSameQualifierOrReference(
	EntityId $entityId,
	PropertyValueSnak $snak,
	$type,
	$ignoreDeprecatedStatements
) {
	$isQualifier = $type === Context::TYPE_QUALIFIER;
	$propertyId = $snak->getPropertyId();

	$eid = $entityId->getSerialization();
	$pid = $propertyId->getSerialization();

	$dataType = $this->propertyDataTypeLookup->getDataTypeIdForProperty( $propertyId );
	list( $value, $isFullValue ) = $this->getRdfLiteral( $dataType, $snak->getDataValue() );

	// pq:/pr: match simple values, pqv:/prv: match full value nodes
	$prefix = $isQualifier ? 'pq' : 'pr';
	if ( $isFullValue ) {
		$prefix .= 'v';
	}

	$path = "$prefix:$pid";
	if ( !$isQualifier ) {
		// references hang off the statement via prov:wasDerivedFrom
		$path = 'prov:wasDerivedFrom/' . $path;
	}

	$deprecatedFilter = '';
	if ( $ignoreDeprecatedStatements ) {
		$deprecatedFilter =
			'MINUS { ?otherStatement wikibase:rank wikibase:DeprecatedRank. }' . "\n" .
			'MINUS { ?otherStatement wikibase-beta:rank wikibase-beta:DeprecatedRank. }';
	}

	$query = <<<EOF
SELECT DISTINCT ?otherEntity WHERE {
BIND(wd:$eid AS ?entity)
BIND($value AS ?value)
?entity ?p ?statement.
?statement $path ?value.
?otherStatement $path ?value.
?otherEntity ?otherP ?otherStatement.
FILTER(?otherEntity != ?entity)
$deprecatedFilter
}
LIMIT 10
EOF;

	return $this->getOtherEntities( $this->runQuery( $query ) );
}
273
|
|
|
|
274
|
|
|
/**
 * Return SPARQL code for a string literal with $text as content.
 *
 * Backslashes and double quotes are escaped, and so are line feeds and
 * carriage returns: a double-quoted SPARQL literal must not contain them raw,
 * so leaving them unescaped would produce a query that cannot be parsed.
 *
 * @param string $text
 *
 * @return string the literal, including the surrounding double quotes
 */
private function stringLiteral( $text ) {
	// strtr() with an array replaces each character once and never rescans
	// its own output, so the order of the pairs does not matter
	$escapes = [
		'\\' => '\\\\',
		'"' => '\\"',
		"\n" => '\\n',
		"\r" => '\\r',
	];

	return '"' . strtr( $text, $escapes ) . '"';
}
284
|
|
|
|
285
|
|
|
/**
 * Extract and parse entity IDs from the ?otherEntity column of a SPARQL query result.
 *
 * Each result row yields one element, under the same key as the row:
 * the parsed entity ID, or null if the IRI does not start with the entity prefix
 * or the remainder cannot be parsed as an entity ID.
 *
 * @param CachedQueryResults $results
 *
 * @return CachedEntityIds the IDs (or nulls), with the metadata of $results
 */
private function getOtherEntities( CachedQueryResults $results ) {
	$prefix = $this->entityPrefix;
	$prefixLength = strlen( $prefix );

	$entityIds = [];
	foreach ( $results->getArray()['results']['bindings'] as $rowKey => $row ) {
		$iri = $row['otherEntity']['value'];
		$entityId = null;

		if ( strncmp( $iri, $prefix, $prefixLength ) === 0 ) {
			try {
				$entityId = $this->entityIdParser->parse( substr( $iri, $prefixLength ) );
			} catch ( EntityIdParsingException $e ) {
				// not an entity ID after all; keep null for this row
			}
		}

		$entityIds[$rowKey] = $entityId;
	}

	return new CachedEntityIds( $entityIds, $results->getMetadata() );
}
312
|
|
|
|
313
|
|
|
// @codingStandardsIgnoreStart cyclomatic complexity of this function is too high
/**
 * Turn a data value into the SPARQL term (literal or IRI) that matches it in a query.
 *
 * @param string $dataType data type ID of the property the value belongs to
 * @param DataValue $dataValue
 *
 * @return array two elements: the literal or IRI as a string in SPARQL syntax,
 * and a boolean that is true if the term refers to a full value node
 * @throws InvalidArgumentException for an unknown data type,
 * or for a URL or language tag that cannot be written in SPARQL syntax
 */
private function getRdfLiteral( $dataType, DataValue $dataValue ) {
	switch ( $dataType ) {
		case 'string':
		case 'external-id':
			$literal = $this->stringLiteral( $dataValue->getValue() );
			return [ $literal, false ];

		case 'commonsMedia':
			$iri = $this->rdfVocabulary->getMediaFileURI( $dataValue->getValue() );
			return [ "<$iri>", false ];

		case 'geo-shape':
			$iri = $this->rdfVocabulary->getGeoShapeURI( $dataValue->getValue() );
			return [ "<$iri>", false ];

		case 'tabular-data':
			$iri = $this->rdfVocabulary->getTabularDataURI( $dataValue->getValue() );
			return [ "<$iri>", false ];

		case 'url':
			$iri = $dataValue->getValue();
			if ( preg_match( '/^[^<>"{}\\\\|^`\\x00-\\x20]*$/D', $iri ) ) {
				return [ "<$iri>", false ];
			}
			// not a valid URL for SPARQL (see SPARQL spec, production 139 IRIREF)
			// such an URL should never reach us, so just throw
			throw new InvalidArgumentException( 'invalid URL: ' . $iri );

		case 'wikibase-item':
		case 'wikibase-property':
			/** @var EntityIdValue $dataValue */
			$entityIdSerialization = $dataValue->getEntityId()->getSerialization();
			return [ 'wd:' . $entityIdSerialization, false ];

		case 'monolingualtext':
			/** @var MonolingualTextValue $dataValue */
			$languageTag = $dataValue->getLanguageCode();
			if ( preg_match( '/^[a-zA-Z]+(-[a-zA-Z0-9]+)*$/D', $languageTag ) ) {
				$literal = $this->stringLiteral( $dataValue->getText() );
				return [ $literal . '@' . $languageTag, false ];
			}
			// not a valid language tag for SPARQL (see SPARQL spec, production 145 LANGTAG)
			// such a language tag should never reach us, so just throw
			throw new InvalidArgumentException( 'invalid language tag: ' . $languageTag );

		case 'globe-coordinate':
		case 'quantity':
		case 'time':
			// these are matched via their full value node, identified by the value hash
			return [ 'wdv:' . $dataValue->getHash(), true ];

		default:
			throw new InvalidArgumentException( 'unknown data type: ' . $dataType );
	}
}
// @codingStandardsIgnoreEnd
367
|
|
|
|
368
|
|
|
/**
 * Check whether $text matches $regex, using cached results where available.
 *
 * This is a caching wrapper around matchesRegularExpressionWithSparql().
 * For each regex, the cache holds one map from the SHA-256 hash of a text
 * to the result for that text: either a boolean, or a serialized
 * ConstraintParameterException if the regex turned out to be invalid.
 * If the map has no entry for $text, the query is run directly
 * and its result is returned without being stored by this call.
 *
 * @param string $text
 * @param string $regex
 *
 * @return bool
 * @throws SparqlHelperException if the query times out or some other error occurs
 * @throws ConstraintParameterException if the $regex is invalid
 * @throws MWException if the cached entry for $text is neither a boolean
 * nor a serialized ConstraintParameterException
 */
public function matchesRegularExpression( $text, $regex ) {
	// caching wrapper around matchesRegularExpressionWithSparql

	// the map is keyed by a hash of the text, not the text itself
	$textHash = hash( 'sha256', $text );
	// one cache entry (holding a whole map) per regex
	$cacheKey = $this->cache->makeKey(
		'WikibaseQualityConstraints', // extension
		'regex', // action
		'WDQS-Java', // regex flavor
		hash( 'sha256', $regex )
	);
	$cacheMapSize = $this->config->get( 'WBQualityConstraintsFormatCacheMapSize' );

	$cacheMapArray = $this->cache->getWithSetCallback(
		$cacheKey,
		WANObjectCache::TTL_DAY,
		// Callback producing the new map; it is given the previous map,
		// or false (as checked below) if there is none yet.
		function( $cacheMapArray ) use ( $text, $regex, $textHash, $cacheMapSize ) {
			// Initialize the cache map if not set
			// (the map starts out empty: the current text is not evaluated here)
			if ( $cacheMapArray === false ) {
				$key = 'wikibase.quality.constraints.regex.cache.refresh.init';
				$this->dataFactory->increment( $key );
				return [];
			}

			$key = 'wikibase.quality.constraints.regex.cache.refresh';
			$this->dataFactory->increment( $key );
			$cacheMap = MapCacheLRU::newFromArray( $cacheMapArray, $cacheMapSize );
			if ( $cacheMap->has( $textHash ) ) {
				$key = 'wikibase.quality.constraints.regex.cache.refresh.hit';
				$this->dataFactory->increment( $key );
				$cacheMap->get( $textHash ); // ping cache
			} else {
				$key = 'wikibase.quality.constraints.regex.cache.refresh.miss';
				$this->dataFactory->increment( $key );
				try {
					$matches = $this->matchesRegularExpressionWithSparql( $text, $regex );
				} catch ( ConstraintParameterException $e ) {
					// an invalid regex is a cacheable result too: store it in serialized form
					$matches = $this->serializeConstraintParameterException( $e );
				} catch ( SparqlHelperException $e ) {
					// don’t cache this
					return $cacheMap->toArray();
				}
				// NOTE(review): the third argument is presumably the rank at which
				// the entry is inserted into the LRU order – confirm against MapCacheLRU::set()
				$cacheMap->set(
					$textHash,
					$matches,
					3 / 8
				);
			}

			return $cacheMap->toArray();
		},
		[
			// Once map is > 1 sec old, consider refreshing
			'ageNew' => 1,
			// Update 5 seconds after "ageNew" given a 1 query/sec cache check rate
			'hotTTR' => 5,
			// avoid querying cache servers multiple times in a request
			// (e. g. when checking format of a reference URL used multiple times on an entity)
			'pcTTL' => WANObjectCache::TTL_PROC_LONG,
		]
	);

	// isset() is true for a cached false as well; only a missing entry takes the else branch
	if ( isset( $cacheMapArray[$textHash] ) ) {
		$key = 'wikibase.quality.constraints.regex.cache.hit';
		$this->dataFactory->increment( $key );
		$matches = $cacheMapArray[$textHash];
		if ( is_bool( $matches ) ) {
			return $matches;
		} elseif ( is_array( $matches ) &&
			$matches['type'] == ConstraintParameterException::class ) {
			// the regex was found to be invalid earlier: rethrow the cached exception
			throw $this->deserializeConstraintParameterException( $matches );
		} else {
			throw new MWException(
				'Value of unknown type in object cache (' .
				'cache key: ' . $cacheKey . ', ' .
				'cache map key: ' . $textHash . ', ' .
				'value type: ' . gettype( $matches ) . ')'
			);
		}
	} else {
		// no entry for this text in the map: run the query directly, uncached
		$key = 'wikibase.quality.constraints.regex.cache.miss';
		$this->dataFactory->increment( $key );
		return $this->matchesRegularExpressionWithSparql( $text, $regex );
	}
}
460
|
|
|
|
461
|
|
|
/**
 * Convert a ConstraintParameterException into an array that can be stored in the cache:
 * the exception class name under 'type' and the serialized violation message
 * under 'violationMessage'.
 *
 * @param ConstraintParameterException $cpe
 *
 * @return array
 */
private function serializeConstraintParameterException( ConstraintParameterException $cpe ) {
	$serialization = [ 'type' => ConstraintParameterException::class ];

	$violationMessage = $cpe->getViolationMessage();
	$serialization['violationMessage'] =
		$this->violationMessageSerializer->serialize( $violationMessage );

	return $serialization;
}
467
|
|
|
|
468
|
|
|
/**
 * Rebuild a ConstraintParameterException from the array
 * produced by serializeConstraintParameterException().
 * Only the 'violationMessage' element is read.
 *
 * @param array $serialization
 *
 * @return ConstraintParameterException
 */
private function deserializeConstraintParameterException( array $serialization ) {
	return new ConstraintParameterException(
		$this->violationMessageDeserializer->deserialize( $serialization['violationMessage'] )
	);
}
474
|
|
|
|
475
|
|
|
/**
 * Check whether $text matches $regex by evaluating REGEX() on the query service.
 * The regex is anchored with ^ and $, so the whole text has to match.
 *
 * This function is only public for testing purposes;
 * use matchesRegularExpression, which is equivalent but caches results.
 *
 * @param string $text
 * @param string $regex
 *
 * @return bool
 * @throws SparqlHelperException if the query times out or some other error occurs
 * @throws ConstraintParameterException if the $regex is invalid
 */
public function matchesRegularExpressionWithSparql( $text, $regex ) {
	$textLiteral = $this->stringLiteral( $text );
	$regexLiteral = $this->stringLiteral( '^' . $regex . '$' );

	$response = $this->runQuery(
		"SELECT (REGEX($textLiteral, $regexLiteral) AS ?matches) {}"
	);

	$firstRow = $response->getArray()['results']['bindings'][0];

	if ( !array_key_exists( 'matches', $firstRow ) ) {
		// empty result: regex broken
		throw new ConstraintParameterException(
			( new ViolationMessage( 'wbqc-violation-message-parameter-regex' ) )
				->withInlineCode( $regex, Role::CONSTRAINT_PARAMETER_VALUE )
		);
	}

	// true or false ⇒ regex okay, text matches or not
	return $firstRow['matches']['value'] === 'true';
}
508
|
|
|
|
509
|
|
|
/**
 * Check whether the text content of an error response indicates a query timeout.
 *
 * The response counts as a timeout if it contains any of the class names
 * configured in WBQualityConstraintsSparqlTimeoutExceptionClasses.
 * If no class names are configured, nothing is considered a timeout.
 *
 * @param string $responseContent
 *
 * @return bool
 */
public function isTimeout( $responseContent ) {
	$timeoutClasses = $this->config->get( 'WBQualityConstraintsSparqlTimeoutExceptionClasses' );

	if ( !$timeoutClasses ) {
		// An empty alternation would yield the pattern “//”,
		// which matches every response – so bail out instead.
		return false;
	}

	// match any of the class names literally
	$timeoutRegex = implode( '|', array_map(
		function ( $fqn ) {
			return preg_quote( $fqn, '/' );
		},
		$timeoutClasses
	) );

	return (bool)preg_match( '/' . $timeoutRegex . '/', $responseContent );
}
525
|
|
|
|
526
|
|
|
/**
 * Determine from the response headers whether a response came from a cache,
 * and if so, its max-age.
 *
 * A response counts as cached if its first x-cache-status header value
 * is “hit” or starts with “hit-”.
 *
 * @param array $responseHeaders see MWHttpRequest::getResponseHeaders()
 *
 * @return int|bool the max-age (in seconds) of a cached response,
 * true if the response was cached but no max-age can be determined,
 * false if the response was not cached
 */
public function getCacheMaxAge( $responseHeaders ) {
	$isCacheHit = array_key_exists( 'x-cache-status', $responseHeaders ) &&
		preg_match( '/^hit(?:-.*)?$/', $responseHeaders['x-cache-status'][0] );
	if ( !$isCacheHit ) {
		return false;
	}

	if ( !array_key_exists( 'cache-control', $responseHeaders ) ) {
		return true;
	}

	$maxAgeMatch = [];
	$hasMaxAge = preg_match(
		'/\bmax-age=(\d+)\b/',
		$responseHeaders['cache-control'][0],
		$maxAgeMatch
	);

	return $hasMaxAge ? intval( $maxAgeMatch[1] ) : true;
}
553
|
|
|
|
554
|
|
|
/**
 * Runs a query against the configured endpoint and returns the results.
 *
 * The query is sent as a GET request, prefixed with a “#wbqc” comment line
 * and the PREFIX declarations built in the constructor.
 * Request duration, cache hits and errors are reported to the stats data factory.
 *
 * @param string $query The query, unencoded (plain string).
 *
 * @return CachedQueryResults the decoded JSON response, with caching metadata
 * derived from the response headers
 *
 * @throws SparqlHelperException if the query times out or some other error occurs,
 * including a successful response whose body is not a JSON object or array
 */
public function runQuery( $query ) {
	$endpoint = $this->config->get( 'WBQualityConstraintsSparqlEndpoint' );
	$maxQueryTimeMillis = $this->config->get( 'WBQualityConstraintsSparqlMaxMillis' );
	$url = $endpoint . '?' . http_build_query(
		[
			'query' => "#wbqc\n" . $this->prefixes . $query,
			'format' => 'json',
			'maxQueryTimeMillis' => $maxQueryTimeMillis,
		],
		null, ini_get( 'arg_separator.output' ),
		// encode spaces with %20, not +
		PHP_QUERY_RFC3986
	);

	$options = [
		'method' => 'GET',
		// give the server one second more than the query itself may take
		'timeout' => (int)round( ( $maxQueryTimeMillis + 1000 ) / 1000 ),
		'connectTimeout' => 'default',
	];
	$request = MWHttpRequest::factory( $url, $options );
	$startTime = microtime( true );
	$status = $request->execute();
	$endTime = microtime( true );
	$this->dataFactory->timing(
		'wikibase.quality.constraints.sparql.timing',
		( $endTime - $startTime ) * 1000
	);

	$maxAge = $this->getCacheMaxAge( $request->getResponseHeaders() );
	if ( $maxAge ) {
		$this->dataFactory->increment( 'wikibase.quality.constraints.sparql.cached' );
	}

	if ( !$status->isOK() ) {
		$this->dataFactory->increment( 'wikibase.quality.constraints.sparql.error' );

		$this->dataFactory->increment(
			"wikibase.quality.constraints.sparql.error.http.{$request->getStatus()}"
		);

		if ( $this->isTimeout( $request->getContent() ) ) {
			$this->dataFactory->increment(
				'wikibase.quality.constraints.sparql.error.timeout'
			);
		}

		throw new SparqlHelperException();
	}

	$json = $request->getContent();
	$arr = json_decode( $json, true );
	if ( !is_array( $arr ) ) {
		// json_decode() returns null for malformed JSON (and a scalar for a bare value);
		// callers index into the result, so report this as a failed query
		// instead of handing them something that is not an array.
		throw new SparqlHelperException();
	}

	// NOTE(review): getCacheMaxAge() returns true (not an integer) for a cached response
	// without max-age, and that value is passed to ofMaximumAgeInSeconds() unchanged.
	// Confirm that CachingMetadata accepts it, or decide on an integer fallback.
	return new CachedQueryResults(
		$arr,
		Metadata::ofCachingMetadata(
			$maxAge ?
				CachingMetadata::ofMaximumAgeInSeconds( $maxAge ) :
				CachingMetadata::fresh()
		)
	);
}
624
|
|
|
|
625
|
|
|
} |
626
|
|
|
|
This check looks at variables that are passed out again to other methods.
If the outgoing method call has stricter type requirements than the method itself, an issue is raised.
An additional type check may prevent trouble.