Complex classes like SparqlHelper often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes. You can also have a look at the cohesion graph to spot any un-connected, or weakly-connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use SparqlHelper, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
44 | class SparqlHelper { |
||
45 | |||
46 | /** |
||
47 | * @var Config |
||
48 | */ |
||
49 | private $config; |
||
50 | |||
51 | /** |
||
52 | * @var RdfVocabulary |
||
53 | */ |
||
54 | private $rdfVocabulary; |
||
55 | |||
56 | /** |
||
57 | * @var string[] |
||
58 | */ |
||
59 | private $entityPrefixes; |
||
60 | |||
61 | /** |
||
62 | * @var string |
||
63 | */ |
||
64 | private $prefixes; |
||
65 | |||
66 | /** |
||
67 | * @var EntityIdParser |
||
68 | */ |
||
69 | private $entityIdParser; |
||
70 | |||
71 | /** |
||
72 | * @var PropertyDataTypeLookup |
||
73 | */ |
||
74 | private $propertyDataTypeLookup; |
||
75 | |||
76 | /** |
||
77 | * @var WANObjectCache |
||
78 | */ |
||
79 | private $cache; |
||
80 | |||
81 | /** |
||
82 | * @var ViolationMessageSerializer |
||
83 | */ |
||
84 | private $violationMessageSerializer; |
||
85 | |||
86 | /** |
||
87 | * @var ViolationMessageDeserializer |
||
88 | */ |
||
89 | private $violationMessageDeserializer; |
||
90 | |||
91 | /** |
||
92 | * @var IBufferingStatsdDataFactory |
||
93 | */ |
||
94 | private $dataFactory; |
||
95 | |||
96 | /** |
||
97 | * @var LoggingHelper |
||
98 | */ |
||
99 | private $loggingHelper; |
||
100 | |||
101 | /** |
||
102 | * @var string |
||
103 | */ |
||
104 | private $defaultUserAgent; |
||
105 | |||
106 | /** |
||
107 | * @var ExpiryLock |
||
108 | */ |
||
109 | private $throttlingLock; |
||
110 | |||
111 | /** |
||
112 | * @var int stands for: No Retry-After header-field was sent back |
||
113 | */ |
||
114 | const NO_RETRY_AFTER = -1; |
||
115 | /** |
||
116 | * @var int stands for: Empty Retry-After header-field was sent back |
||
117 | */ |
||
118 | const EMPTY_RETRY_AFTER = -2; |
||
119 | /** |
||
120 | * @var int stands for: Invalid Retry-After header-field was sent back |
||
121 | * link a string |
||
122 | */ |
||
123 | const INVALID_RETRY_AFTER = -3; |
||
124 | /** |
||
125 | * @var string ID on which the lock is applied on |
||
126 | */ |
||
127 | const EXPIRY_LOCK_ID = 'SparqlHelper.runQuery'; |
||
128 | |||
129 | /** |
||
130 | * @var int HTTP response code for too many requests |
||
131 | */ |
||
132 | const HTTP_TOO_MANY_REQUESTS = 429; |
||
133 | |||
134 | /** |
||
135 | * @var HttpRequestFactory |
||
136 | */ |
||
137 | private $requestFactory; |
||
138 | |||
139 | public function __construct( |
||
172 | |||
173 | private function getQueryPrefixes( RdfVocabulary $rdfVocabulary ) { |
||
221 | |||
222 | /** |
||
223 | * @param string $id entity ID serialization of the entity to check |
||
224 | * @param string[] $classes entity ID serializations of the expected types |
||
225 | * |
||
226 | * @return CachedBool |
||
227 | * @throws SparqlHelperException if the query times out or some other error occurs |
||
228 | */ |
||
229 | public function hasType( $id, array $classes ) { |
||
269 | |||
270 | /** |
||
271 | * @param Statement $statement |
||
272 | * @param boolean $ignoreDeprecatedStatements Whether to ignore deprecated statements or not. |
||
273 | * |
||
274 | * @return CachedEntityIds |
||
275 | * @throws SparqlHelperException if the query times out or some other error occurs |
||
276 | */ |
||
277 | public function findEntitiesWithSameStatement( |
||
308 | |||
309 | /** |
||
310 | * @param EntityId $entityId The entity ID on the containing entity |
||
311 | * @param PropertyValueSnak $snak |
||
312 | * @param string $type Context::TYPE_QUALIFIER or Context::TYPE_REFERENCE |
||
313 | * @param boolean $ignoreDeprecatedStatements Whether to ignore deprecated statements or not. |
||
314 | * |
||
315 | * @return CachedEntityIds |
||
316 | * @throws SparqlHelperException if the query times out or some other error occurs |
||
317 | */ |
||
318 | public function findEntitiesWithSameQualifierOrReference( |
||
364 | |||
365 | /** |
||
366 | * Return SPARQL code for a string literal with $text as content. |
||
367 | * |
||
368 | * @param string $text |
||
369 | * |
||
370 | * @return string |
||
371 | */ |
||
372 | private function stringLiteral( $text ) { |
||
375 | |||
376 | /** |
||
377 | * Extract and parse entity IDs from the ?otherEntity column of a SPARQL query result. |
||
378 | * |
||
379 | * @param CachedQueryResults $results |
||
380 | * |
||
381 | * @return CachedEntityIds |
||
382 | */ |
||
383 | private function getOtherEntities( CachedQueryResults $results ) { |
||
407 | |||
408 | // @codingStandardsIgnoreStart cyclomatic complexity of this function is too high |
||
/**
 * Build the SPARQL term (literal or IRI) with which the given data value
 * can be matched in a query.
 *
 * @param string $dataType property data type ID, e.g. 'string' or 'wikibase-item'
 * @param DataValue $dataValue the value to express in SPARQL syntax
 *
 * @return array two elements: the literal or IRI as a string in SPARQL syntax,
 * and a boolean which is true if the term refers to a full value node
 * @throws InvalidArgumentException if the data type is unknown, or if a URL or
 * language code cannot be written in SPARQL syntax
 */
private function getRdfLiteral( $dataType, DataValue $dataValue ) {
	switch ( $dataType ) {
		// plain string-like values become SPARQL string literals
		case 'string':
		case 'external-id':
			$literal = $this->stringLiteral( $dataValue->getValue() );
			return [ $literal, false ];

		// values that the RDF vocabulary maps to a URI become IRIs
		case 'commonsMedia':
			$iri = $this->rdfVocabulary->getMediaFileURI( $dataValue->getValue() );
			return [ "<{$iri}>", false ];
		case 'geo-shape':
			$iri = $this->rdfVocabulary->getGeoShapeURI( $dataValue->getValue() );
			return [ "<{$iri}>", false ];
		case 'tabular-data':
			$iri = $this->rdfVocabulary->getTabularDataURI( $dataValue->getValue() );
			return [ "<{$iri}>", false ];

		case 'url':
			$iri = $dataValue->getValue();
			// only characters permitted by the SPARQL IRIREF production (139) may appear
			$isWritableAsIri = preg_match( '/^[^<>"{}\\\\|^`\\x00-\\x20]*$/D', $iri );
			if ( !$isWritableAsIri ) {
				// such a URL is not expected to reach this point, so fail loudly
				throw new InvalidArgumentException( 'invalid URL: ' . $iri );
			}
			return [ "<{$iri}>", false ];

		case 'wikibase-item':
		case 'wikibase-property':
			/** @var EntityIdValue $dataValue */
			'@phan-var EntityIdValue $dataValue';
			$entityId = $dataValue->getEntityId();
			return [ 'wd:' . $entityId->getSerialization(), false ];

		case 'monolingualtext':
			/** @var MonolingualTextValue $dataValue */
			'@phan-var MonolingualTextValue $dataValue';
			$languageCode = $dataValue->getLanguageCode();
			// the code has to fit the SPARQL LANGTAG production (145)
			$isWritableAsLangTag = preg_match( '/^[a-zA-Z]+(-[a-zA-Z0-9]+)*$/D', $languageCode );
			if ( !$isWritableAsLangTag ) {
				// such a language tag is not expected to reach this point, so fail loudly
				throw new InvalidArgumentException( 'invalid language tag: ' . $languageCode );
			}
			$literal = $this->stringLiteral( $dataValue->getText() );
			return [ $literal . '@' . $languageCode, false ];

		// complex values are addressed through their value node, identified by hash
		case 'globe-coordinate':
		case 'quantity':
		case 'time':
			$valueNode = 'wdv:' . $dataValue->getHash();
			return [ $valueNode, true ];

		default:
			throw new InvalidArgumentException( 'unknown data type: ' . $dataType );
	}
}
||
463 | // @codingStandardsIgnoreEnd |
||
464 | |||
/**
 * Check whether $text matches $regex, caching the outcome.
 *
 * This is a caching wrapper around matchesRegularExpressionWithSparql().
 * There is one cache entry per regex (keyed by the SHA-256 hash of the regex),
 * and each entry holds an LRU map from the SHA-256 hash of a text to either
 * a boolean result or a serialized ConstraintParameterException.
 *
 * @param string $text
 * @param string $regex
 *
 * @return boolean
 * @throws SparqlHelperException if the query times out or some other error occurs
 * @throws ConstraintParameterException if the $regex is invalid
 * @throws MWException if the cached value for $text is of an unexpected type
 */
public function matchesRegularExpression( $text, $regex ) {
	$textHash = hash( 'sha256', $text );
	$cacheKey = $this->cache->makeKey(
		'WikibaseQualityConstraints', // extension
		'regex', // action
		'WDQS-Java', // regex flavor
		hash( 'sha256', $regex )
	);
	$cacheMapSize = $this->config->get( 'WBQualityConstraintsFormatCacheMapSize' );

	$cacheMapArray = $this->cache->getWithSetCallback(
		$cacheKey,
		WANObjectCache::TTL_DAY,
		function( $cacheMapArray ) use ( $text, $regex, $textHash, $cacheMapSize ) {
			// Initialize the cache map if not set
			if ( $cacheMapArray === false ) {
				$key = 'wikibase.quality.constraints.regex.cache.refresh.init';
				$this->dataFactory->increment( $key );
				return [];
			}

			$key = 'wikibase.quality.constraints.regex.cache.refresh';
			$this->dataFactory->increment( $key );
			$cacheMap = MapCacheLRU::newFromArray( $cacheMapArray, $cacheMapSize );
			if ( $cacheMap->has( $textHash ) ) {
				$key = 'wikibase.quality.constraints.regex.cache.refresh.hit';
				$this->dataFactory->increment( $key );
				$cacheMap->get( $textHash ); // ping cache
			} else {
				$key = 'wikibase.quality.constraints.regex.cache.refresh.miss';
				$this->dataFactory->increment( $key );
				try {
					$matches = $this->matchesRegularExpressionWithSparql( $text, $regex );
				} catch ( ConstraintParameterException $e ) {
					// an invalid regex is a stable outcome, so it is cached in serialized form
					$matches = $this->serializeConstraintParameterException( $e );
				} catch ( SparqlHelperException $e ) {
					// don’t cache this
					return $cacheMap->toArray();
				}
				// NOTE(review): the third argument is presumably the LRU rank
				// of the new entry – confirm against MapCacheLRU::set()
				$cacheMap->set(
					$textHash,
					$matches,
					3 / 8
				);
			}

			return $cacheMap->toArray();
		},
		[
			// Once map is > 1 sec old, consider refreshing
			'ageNew' => 1,
			// Update 5 seconds after "ageNew" given a 1 query/sec cache check rate
			'hotTTR' => 5,
			// avoid querying cache servers multiple times in a request
			// (e. g. when checking format of a reference URL used multiple times on an entity)
			'pcTTL' => WANObjectCache::TTL_PROC_LONG,
		]
	);

	if ( isset( $cacheMapArray[$textHash] ) ) {
		$key = 'wikibase.quality.constraints.regex.cache.hit';
		$this->dataFactory->increment( $key );
		$matches = $cacheMapArray[$textHash];
		if ( is_bool( $matches ) ) {
			return $matches;
		} elseif ( is_array( $matches ) &&
			// strict comparison, and tolerate an array without a 'type' key:
			// such a value falls through to the MWException below without a notice
			( $matches['type'] ?? null ) === ConstraintParameterException::class ) {
			throw $this->deserializeConstraintParameterException( $matches );
		} else {
			throw new MWException(
				'Value of unknown type in object cache (' .
				'cache key: ' . $cacheKey . ', ' .
				'cache map key: ' . $textHash . ', ' .
				'value type: ' . gettype( $matches ) . ')'
			);
		}
	} else {
		// the map holds nothing for this text (e. g. it was only just initialized,
		// or the query inside the callback failed), so ask the query service directly
		$key = 'wikibase.quality.constraints.regex.cache.miss';
		$this->dataFactory->increment( $key );
		return $this->matchesRegularExpressionWithSparql( $text, $regex );
	}
}
||
557 | |||
/**
 * Turn a ConstraintParameterException into an array
 * holding its class name and its serialized violation message.
 *
 * @param ConstraintParameterException $cpe
 *
 * @return array with the keys 'type' and 'violationMessage'
 */
private function serializeConstraintParameterException( ConstraintParameterException $cpe ) {
	$violationMessage = $cpe->getViolationMessage();
	$serializedMessage = $this->violationMessageSerializer->serialize( $violationMessage );

	return [
		'type' => ConstraintParameterException::class,
		'violationMessage' => $serializedMessage,
	];
}
||
564 | |||
/**
 * Rebuild a ConstraintParameterException from an array
 * whose 'violationMessage' entry holds a serialized violation message.
 *
 * @param array $serialization
 *
 * @return ConstraintParameterException
 */
private function deserializeConstraintParameterException( array $serialization ) {
	$serializedMessage = $serialization['violationMessage'];

	return new ConstraintParameterException(
		$this->violationMessageDeserializer->deserialize( $serializedMessage )
	);
}
||
571 | |||
/**
 * Evaluate a regular expression against a text by running a SPARQL query.
 *
 * This function is only public for testing purposes;
 * use matchesRegularExpression, which is equivalent but caches results.
 *
 * @param string $text
 * @param string $regex
 *
 * @return boolean
 * @throws SparqlHelperException if the query times out or some other error occurs
 * @throws ConstraintParameterException if the $regex is invalid
 */
public function matchesRegularExpressionWithSparql( $text, $regex ) {
	$quotedText = $this->stringLiteral( $text );
	// wrap the regex in anchors so that it has to match the whole text
	$quotedRegex = $this->stringLiteral( '^(?:' . $regex . ')$' );

	$sparql = 'SELECT (REGEX(' . $quotedText . ', ' . $quotedRegex . ') AS ?matches) {}';
	$response = $this->runQuery( $sparql, false );

	$firstRow = $response->getArray()['results']['bindings'][0];
	if ( !array_key_exists( 'matches', $firstRow ) ) {
		// no ?matches binding: regex broken
		$message = new ViolationMessage( 'wbqc-violation-message-parameter-regex' );
		throw new ConstraintParameterException(
			$message->withInlineCode( $regex, Role::CONSTRAINT_PARAMETER_VALUE )
		);
	}

	// true or false ⇒ regex okay, text matches or not
	return $firstRow['matches']['value'] === 'true';
}
||
605 | |||
/**
 * Check whether the text content of an error response indicates a query timeout.
 *
 * The response counts as a timeout if it contains any of the exception class names
 * configured in WBQualityConstraintsSparqlTimeoutExceptionClasses.
 * If no class names are configured, no response is considered a timeout.
 *
 * @param string $responseContent
 *
 * @return boolean
 */
public function isTimeout( $responseContent ) {
	$timeoutClasses = $this->config->get( 'WBQualityConstraintsSparqlTimeoutExceptionClasses' );
	if ( !$timeoutClasses ) {
		// Without this guard the alternation below would be empty,
		// and the resulting pattern "//" matches every response.
		return false;
	}

	$timeoutRegex = implode( '|', array_map(
		static function ( $fqn ) {
			// class names contain backslashes, which must be matched literally
			return preg_quote( $fqn, '/' );
		},
		$timeoutClasses
	) );
	return (bool)preg_match( '/' . $timeoutRegex . '/', $responseContent );
}
||
622 | |||
/**
 * Determine the max-age of a cached response,
 * or at least whether the response was served from a cache at all.
 *
 * @param array $responseHeaders see MWHttpRequest::getResponseHeaders()
 *
 * @return int|boolean the max-age (in seconds) of a cached response;
 * true if the response was cached but no max-age can be determined;
 * false if the response was not cached
 */
public function getCacheMaxAge( $responseHeaders ) {
	$servedFromCache = array_key_exists( 'x-cache-status', $responseHeaders )
		&& preg_match( '/^hit(?:-.*)?$/', $responseHeaders['x-cache-status'][0] );
	if ( !$servedFromCache ) {
		return false;
	}

	if ( !array_key_exists( 'cache-control', $responseHeaders ) ) {
		return true;
	}

	$captures = [];
	$hasMaxAge = preg_match(
		'/\bmax-age=(\d+)\b/',
		$responseHeaders['cache-control'][0],
		$captures
	);
	if ( $hasMaxAge ) {
		return intval( $captures[1] );
	}

	return true;
}
||
650 | |||
651 | /** |
||
652 | * Get the delay date of a 429 headered response, which is caused by |
||
653 | * throttling of too many SPARQL-Requests. The header-format is defined |
||
654 | * in RFC7231 see: https://tools.ietf.org/html/rfc7231#section-7.1.3 |
||
655 | * |
||
656 | * @param MWHttpRequest $request |
||
657 | * |
||
658 | * @return int|ConvertibleTimestamp |
||
659 | * or SparqlHelper::NO_RETRY_AFTER if there is no Retry-After header |
||
660 | * or SparqlHelper::EMPTY_RETRY_AFTER if there is an empty Retry-After |
||
661 | * or SparqlHelper::INVALID_RETRY_AFTER if there is something wrong with the format |
||
662 | */ |
||
663 | public function getThrottling( MWHttpRequest $request ) { |
||
687 | |||
688 | private function getTimestampInFuture( DateInterval $delta ) { |
||
692 | |||
693 | /** |
||
694 | * Runs a query against the configured endpoint and returns the results. |
||
695 | * TODO: See if Sparql Client in core can be used instead of rolling our own |
||
696 | * |
||
697 | * @param string $query The query, unencoded (plain string). |
||
698 | * @param bool $needsPrefixes Whether the query requires prefixes or they can be omitted. |
||
699 | * |
||
700 | * @return CachedQueryResults |
||
701 | * |
||
702 | * @throws SparqlHelperException if the query times out or some other error occurs |
||
703 | */ |
||
704 | public function runQuery( $query, $needsPrefixes = true ) { |
||
792 | |||
793 | /** |
||
794 | * Handle a potential “too many requests” error. |
||
795 | * |
||
796 | * @param MWHttpRequest $request |
||
797 | * @throws TooManySparqlRequestsException |
||
798 | */ |
||
799 | private function guardAgainstTooManyRequestsError( MWHttpRequest $request ): void { |
||
825 | |||
826 | } |
||
827 |
If a method or function can return multiple different values, and you are not sure that only a single value can occur in this context, we recommend adding an additional type check:
If this is a common case that PHP Analyzer should handle natively, please let us know by opening an issue.