Complex classes like SparqlHelper often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes or suffixes. You can also have a look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use SparqlHelper, and based on these observations, apply Extract Interface, too.
| 1 | <?php |
||
| 44 | class SparqlHelper { |
||
| 45 | |||
| 46 | /** |
||
| 47 | * @var Config |
||
| 48 | */ |
||
| 49 | private $config; |
||
| 50 | |||
| 51 | /** |
||
| 52 | * @var RdfVocabulary |
||
| 53 | */ |
||
| 54 | private $rdfVocabulary; |
||
| 55 | |||
| 56 | /** |
||
| 57 | * @var string[] |
||
| 58 | */ |
||
| 59 | private $entityPrefixes; |
||
| 60 | |||
| 61 | /** |
||
| 62 | * @var string |
||
| 63 | */ |
||
| 64 | private $prefixes; |
||
| 65 | |||
| 66 | /** |
||
| 67 | * @var EntityIdParser |
||
| 68 | */ |
||
| 69 | private $entityIdParser; |
||
| 70 | |||
| 71 | /** |
||
| 72 | * @var PropertyDataTypeLookup |
||
| 73 | */ |
||
| 74 | private $propertyDataTypeLookup; |
||
| 75 | |||
| 76 | /** |
||
| 77 | * @var WANObjectCache |
||
| 78 | */ |
||
| 79 | private $cache; |
||
| 80 | |||
| 81 | /** |
||
| 82 | * @var ViolationMessageSerializer |
||
| 83 | */ |
||
| 84 | private $violationMessageSerializer; |
||
| 85 | |||
| 86 | /** |
||
| 87 | * @var ViolationMessageDeserializer |
||
| 88 | */ |
||
| 89 | private $violationMessageDeserializer; |
||
| 90 | |||
| 91 | /** |
||
| 92 | * @var IBufferingStatsdDataFactory |
||
| 93 | */ |
||
| 94 | private $dataFactory; |
||
| 95 | |||
| 96 | /** |
||
| 97 | * @var LoggingHelper |
||
| 98 | */ |
||
| 99 | private $loggingHelper; |
||
| 100 | |||
| 101 | /** |
||
| 102 | * @var string |
||
| 103 | */ |
||
| 104 | private $defaultUserAgent; |
||
| 105 | |||
| 106 | /** |
||
| 107 | * @var ExpiryLock |
||
| 108 | */ |
||
| 109 | private $throttlingLock; |
||
| 110 | |||
| 111 | /** |
||
| 112 | * @var int stands for: No Retry-After header-field was sent back |
||
| 113 | */ |
||
| 114 | const NO_RETRY_AFTER = -1; |
||
| 115 | /** |
||
| 116 | * @var int stands for: Empty Retry-After header-field was sent back |
||
| 117 | */ |
||
| 118 | const EMPTY_RETRY_AFTER = -2; |
||
| 119 | /** |
||
| 120 | * @var int stands for: Invalid Retry-After header-field was sent back |
||
| 121 | * like a string |
||
| 122 | */ |
||
| 123 | const INVALID_RETRY_AFTER = -3; |
||
| 124 | /** |
||
| 125 | * @var string ID on which the lock is applied on |
||
| 126 | */ |
||
| 127 | const EXPIRY_LOCK_ID = 'SparqlHelper.runQuery'; |
||
| 128 | |||
| 129 | /** |
||
| 130 | * @var int HTTP response code for too many requests |
||
| 131 | */ |
||
| 132 | const HTTP_TOO_MANY_REQUESTS = 429; |
||
| 133 | |||
| 134 | /** |
||
| 135 | * @var HttpRequestFactory |
||
| 136 | */ |
||
| 137 | private $requestFactory; |
||
| 138 | |||
| 139 | public function __construct( |
||
| 172 | |||
| 173 | private function getQueryPrefixes( RdfVocabulary $rdfVocabulary ) { |
||
| 221 | |||
| 222 | /** |
||
| 223 | * @param string $id entity ID serialization of the entity to check |
||
| 224 | * @param string[] $classes entity ID serializations of the expected types |
||
| 225 | * |
||
| 226 | * @return CachedBool |
||
| 227 | * @throws SparqlHelperException if the query times out or some other error occurs |
||
| 228 | */ |
||
| 229 | public function hasType( $id, array $classes ) { |
||
| 269 | |||
| 270 | /** |
||
| 271 | * @param Statement $statement |
||
| 272 | * @param boolean $ignoreDeprecatedStatements Whether to ignore deprecated statements or not. |
||
| 273 | * |
||
| 274 | * @return CachedEntityIds |
||
| 275 | * @throws SparqlHelperException if the query times out or some other error occurs |
||
| 276 | */ |
||
| 277 | public function findEntitiesWithSameStatement( |
||
| 308 | |||
| 309 | /** |
||
| 310 | * @param EntityId $entityId The entity ID on the containing entity |
||
| 311 | * @param PropertyValueSnak $snak |
||
| 312 | * @param string $type Context::TYPE_QUALIFIER or Context::TYPE_REFERENCE |
||
| 313 | * @param boolean $ignoreDeprecatedStatements Whether to ignore deprecated statements or not. |
||
| 314 | * |
||
| 315 | * @return CachedEntityIds |
||
| 316 | * @throws SparqlHelperException if the query times out or some other error occurs |
||
| 317 | */ |
||
| 318 | public function findEntitiesWithSameQualifierOrReference( |
||
| 364 | |||
| 365 | /** |
||
| 366 | * Return SPARQL code for a string literal with $text as content. |
||
| 367 | * |
||
| 368 | * @param string $text |
||
| 369 | * |
||
| 370 | * @return string |
||
| 371 | */ |
||
| 372 | private function stringLiteral( $text ) { |
||
| 375 | |||
| 376 | /** |
||
| 377 | * Extract and parse entity IDs from the ?otherEntity column of a SPARQL query result. |
||
| 378 | * |
||
| 379 | * @param CachedQueryResults $results |
||
| 380 | * |
||
| 381 | * @return CachedEntityIds |
||
| 382 | */ |
||
| 383 | private function getOtherEntities( CachedQueryResults $results ) { |
||
| 407 | |||
| 408 | // @codingStandardsIgnoreStart cyclomatic complexity of this function is too high |
||
| 409 | /** |
||
| 410 | * Get an RDF literal or IRI with which the given data value can be matched in a query. |
||
| 411 | * Throws InvalidArgumentException for an unknown data type, or for a URL or language tag that cannot be written in SPARQL. |
||
| 412 | * @param string $dataType property data type ID, e.g. 'string', 'url', 'wikibase-item', 'monolingualtext', 'time' |
||
| 413 | * @param DataValue $dataValue the value to convert (NOTE(review): presumably of the value class belonging to $dataType – confirm against callers) |
||
| 414 | * |
||
| 415 | * @return array two elements: the literal or IRI as a string in SPARQL syntax, |
||
| 416 | * and a boolean indicating whether it refers to a full value node (true only for globe-coordinate, quantity and time) |
||
| 417 | */ |
||
| 418 | private function getRdfLiteral( $dataType, DataValue $dataValue ) { |
||
| 419 | switch ( $dataType ) { |
||
| 420 | case 'string': |
||
| 421 | case 'external-id': |
||
| 422 | return [ $this->stringLiteral( $dataValue->getValue() ), false ]; |
||
| 423 | case 'commonsMedia': |
||
| 424 | $url = $this->rdfVocabulary->getMediaFileURI( $dataValue->getValue() ); |
||
| 425 | return [ '<' . $url . '>', false ]; |
||
| 426 | case 'geo-shape': |
||
| 427 | $url = $this->rdfVocabulary->getGeoShapeURI( $dataValue->getValue() ); |
||
| 428 | return [ '<' . $url . '>', false ]; |
||
| 429 | case 'tabular-data': |
||
| 430 | $url = $this->rdfVocabulary->getTabularDataURI( $dataValue->getValue() ); |
||
| 431 | return [ '<' . $url . '>', false ]; |
||
| 432 | case 'url': |
||
| 433 | $url = $dataValue->getValue(); |
||
| 434 | if ( !preg_match( '/^[^<>"{}\\\\|^`\\x00-\\x20]*$/D', $url ) ) { |
||
| 435 | // not a valid URL for SPARQL (see SPARQL spec, production 139 IRIREF): it could not be wrapped in <…> safely |
||
| 436 | // such a URL should never reach us, so just throw |
||
| 437 | throw new InvalidArgumentException( 'invalid URL: ' . $url ); |
||
| 438 | } |
||
| 439 | return [ '<' . $url . '>', false ]; |
||
| 440 | case 'wikibase-item': |
||
| 441 | case 'wikibase-property': |
||
| 442 | /** @var EntityIdValue $dataValue */ |
||
| 443 | '@phan-var EntityIdValue $dataValue'; |
||
| 444 | return [ 'wd:' . $dataValue->getEntityId()->getSerialization(), false ]; |
||
| 445 | case 'monolingualtext': |
||
| 446 | /** @var MonolingualTextValue $dataValue */ |
||
| 447 | '@phan-var MonolingualTextValue $dataValue'; |
||
| 448 | $lang = $dataValue->getLanguageCode(); |
||
| 449 | if ( !preg_match( '/^[a-zA-Z]+(-[a-zA-Z0-9]+)*$/D', $lang ) ) { |
||
| 450 | // not a valid language tag for SPARQL (see SPARQL spec, production 145 LANGTAG): it could not be appended after '@' safely |
||
| 451 | // such a language tag should never reach us, so just throw |
||
| 452 | throw new InvalidArgumentException( 'invalid language tag: ' . $lang ); |
||
| 453 | } |
||
| 454 | return [ $this->stringLiteral( $dataValue->getText() ) . '@' . $lang, false ]; |
||
| 455 | case 'globe-coordinate': |
||
| 456 | case 'quantity': |
||
| 457 | case 'time': |
||
| 458 | return [ 'wdv:' . $dataValue->getHash(), true ]; // the only case that refers to a full value node, identified by the hash of the value |
||
| 459 | default: |
||
| 460 | throw new InvalidArgumentException( 'unknown data type: ' . $dataType ); |
||
| 461 | } |
||
| 462 | } |
||
| 463 | // @codingStandardsIgnoreEnd |
||
| 464 | |||
| 465 | /** |
||
| 466 | * @param string $text the text to check; its SHA-256 hash is the key within the cached map |
||
| 467 | * @param string $regex the regular expression; its SHA-256 hash is part of the cache key |
||
| 468 | * |
||
| 469 | * @return bool the cached or freshly computed result of matchesRegularExpressionWithSparql |
||
| 470 | * @throws SparqlHelperException if the query times out or some other error occurs |
||
| 471 | * @throws ConstraintParameterException if the $regex is invalid (also rethrown from its cached, serialized form) |
||
| 472 | */ |
||
| 473 | public function matchesRegularExpression( $text, $regex ) { |
||
| 474 | // caching wrapper around matchesRegularExpressionWithSparql: one map per regex is kept in the cache, with one entry per text hash |
||
| 475 | |||
| 476 | $textHash = hash( 'sha256', $text ); |
||
| 477 | $cacheKey = $this->cache->makeKey( |
||
| 478 | 'WikibaseQualityConstraints', // extension |
||
| 479 | 'regex', // action |
||
| 480 | 'WDQS-Java', // regex flavor |
||
| 481 | hash( 'sha256', $regex ) |
||
| 482 | ); |
||
| 483 | $cacheMapSize = $this->config->get( 'WBQualityConstraintsFormatCacheMapSize' ); |
||
| 484 | |||
| 485 | $cacheMapArray = $this->cache->getWithSetCallback( |
||
| 486 | $cacheKey, |
||
| 487 | WANObjectCache::TTL_DAY, |
||
| 488 | function( $cacheMapArray ) use ( $text, $regex, $textHash, $cacheMapSize ) { |
||
| 489 | // Initialize the cache map if not set (the current text is not checked here; the uncached fallback below handles it) |
||
| 490 | if ( $cacheMapArray === false ) { |
||
| 491 | $key = 'wikibase.quality.constraints.regex.cache.refresh.init'; |
||
| 492 | $this->dataFactory->increment( $key ); |
||
| 493 | return []; |
||
| 494 | } |
||
| 495 | |||
| 496 | $key = 'wikibase.quality.constraints.regex.cache.refresh'; |
||
| 497 | $this->dataFactory->increment( $key ); |
||
| 498 | $cacheMap = MapCacheLRU::newFromArray( $cacheMapArray, $cacheMapSize ); |
||
| 499 | if ( $cacheMap->has( $textHash ) ) { |
||
| 500 | $key = 'wikibase.quality.constraints.regex.cache.refresh.hit'; |
||
| 501 | $this->dataFactory->increment( $key ); |
||
| 502 | $cacheMap->get( $textHash ); // ping cache (return value unused; presumably marks the entry as recently used – confirm against MapCacheLRU) |
||
| 503 | } else { |
||
| 504 | $key = 'wikibase.quality.constraints.regex.cache.refresh.miss'; |
||
| 505 | $this->dataFactory->increment( $key ); |
||
| 506 | try { |
||
| 507 | $matches = $this->matchesRegularExpressionWithSparql( $text, $regex ); |
||
| 508 | } catch ( ConstraintParameterException $e ) { |
||
| 509 | $matches = $this->serializeConstraintParameterException( $e ); |
||
| 510 | } catch ( SparqlHelperException $e ) { |
||
| 511 | // don’t cache this: leave the map as it is (the uncached fallback below will run the query again) |
||
| 512 | return $cacheMap->toArray(); |
||
| 513 | } |
||
| 514 | $cacheMap->set( |
||
| 515 | $textHash, |
||
| 516 | $matches, |
||
| 517 | 3 / 8 // NOTE(review): presumably the rank at which the new entry is inserted – confirm against MapCacheLRU::set |
||
| 518 | ); |
||
| 519 | } |
||
| 520 | |||
| 521 | return $cacheMap->toArray(); |
||
| 522 | }, |
||
| 523 | [ |
||
| 524 | // Once map is > 1 sec old, consider refreshing |
||
| 525 | 'ageNew' => 1, |
||
| 526 | // Update 5 seconds after "ageNew" given a 1 query/sec cache check rate |
||
| 527 | 'hotTTR' => 5, |
||
| 528 | // avoid querying cache servers multiple times in a request |
||
| 529 | // (e. g. when checking format of a reference URL used multiple times on an entity) |
||
| 530 | 'pcTTL' => WANObjectCache::TTL_PROC_LONG, |
||
| 531 | ] |
||
| 532 | ); |
||
| 533 | |||
| 534 | if ( isset( $cacheMapArray[$textHash] ) ) { |
||
| 535 | $key = 'wikibase.quality.constraints.regex.cache.hit'; |
||
| 536 | $this->dataFactory->increment( $key ); |
||
| 537 | $matches = $cacheMapArray[$textHash]; |
||
| 538 | if ( is_bool( $matches ) ) { |
||
| 539 | return $matches; |
||
| 540 | } elseif ( is_array( $matches ) && |
||
| 541 | $matches['type'] == ConstraintParameterException::class ) { |
||
| 542 | throw $this->deserializeConstraintParameterException( $matches ); |
||
| 543 | } else { |
||
| 544 | throw new MWException( |
||
| 545 | 'Value of unknown type in object cache (' . |
||
| 546 | 'cache key: ' . $cacheKey . ', ' . |
||
| 547 | 'cache map key: ' . $textHash . ', ' . |
||
| 548 | 'value type: ' . gettype( $matches ) . ')' |
||
| 549 | ); |
||
| 550 | } |
||
| 551 | } else { |
||
| 552 | $key = 'wikibase.quality.constraints.regex.cache.miss'; |
||
| 553 | $this->dataFactory->increment( $key ); |
||
| 554 | return $this->matchesRegularExpressionWithSparql( $text, $regex ); // uncached fallback: the map has no entry for this text |
||
| 555 | } |
||
| 556 | } |
||
| 557 | |||
| 558 | private function serializeConstraintParameterException( ConstraintParameterException $cpe ) { |
||
| 559 | // flatten the exception into a plain array (type marker + serialized violation message) |
||
| 560 | $serializedMessage = $this->violationMessageSerializer->serialize( $cpe->getViolationMessage() ); |
||
| 561 | $type = ConstraintParameterException::class; |
||
| 562 | return [ 'type' => $type, 'violationMessage' => $serializedMessage ]; |
||
| 563 | } |
||
| 564 | |||
| 565 | private function deserializeConstraintParameterException( array $serialization ) { |
||
| 566 | // rebuild the exception from the 'violationMessage' entry of its array form |
||
| 567 | $serializedMessage = $serialization['violationMessage']; |
||
| 568 | $violationMessage = $this->violationMessageDeserializer->deserialize( $serializedMessage ); |
||
| 569 | return new ConstraintParameterException( $violationMessage ); |
||
| 570 | } |
||
| 571 | |||
| 572 | /** |
||
| 573 | * This function is only public for testing purposes; |
||
| 574 | * use matchesRegularExpression, which is equivalent but caches results. |
||
| 575 | * The regex is wrapped as ^(?:…)$ before it is sent, so it has to match the whole text. |
||
| 576 | * @param string $text |
||
| 577 | * @param string $regex |
||
| 578 | * |
||
| 579 | * @return bool true if the ?matches value of the query result is the string 'true' |
||
| 580 | * @throws SparqlHelperException if the query times out or some other error occurs |
||
| 581 | * @throws ConstraintParameterException if the $regex is invalid |
||
| 582 | */ |
||
| 583 | public function matchesRegularExpressionWithSparql( $text, $regex ) { |
||
| 584 | $textStringLiteral = $this->stringLiteral( $text ); |
||
| 585 | $regexStringLiteral = $this->stringLiteral( '^(?:' . $regex . ')$' ); |
||
| 586 | |||
| 587 | $query = <<<EOF |
||
| 588 | SELECT (REGEX($textStringLiteral, $regexStringLiteral) AS ?matches) {} |
||
| 589 | EOF; |
||
| 590 | |||
| 591 | $result = $this->runQuery( $query, false ); |
||
| 592 | |||
| 593 | $vars = $result->getArray()['results']['bindings'][0]; |
||
| 594 | if ( array_key_exists( 'matches', $vars ) ) { |
||
| 595 | // true or false ⇒ regex okay, text matches or not |
||
| 596 | return $vars['matches']['value'] === 'true'; |
||
| 597 | } else { |
||
| 598 | // empty result (no ?matches binding): regex broken, report it as an invalid constraint parameter |
||
| 599 | throw new ConstraintParameterException( |
||
| 600 | ( new ViolationMessage( 'wbqc-violation-message-parameter-regex' ) ) |
||
| 601 | ->withInlineCode( $regex, Role::CONSTRAINT_PARAMETER_VALUE ) |
||
| 602 | ); |
||
| 603 | } |
||
| 604 | } |
||
| 605 | |||
| 606 | /** |
||
| 607 | * Check whether the text content of an error response indicates a query timeout. |
||
| 608 | * This is the case if the name of any configured timeout exception class occurs in the content. |
||
| 609 | * @param string $responseContent |
||
| 610 | * |
||
| 611 | * @return bool false if no configured class name occurs in the content, or if no class names are configured |
||
| 612 | */ |
||
| 613 | public function isTimeout( $responseContent ) { |
||
| 614 | $timeoutClasses = $this->config->get( 'WBQualityConstraintsSparqlTimeoutExceptionClasses' ); |
||
| 615 | foreach ( $timeoutClasses as $fqn ) { // an empty list must not turn every response into a timeout |
||
| 616 | if ( $fqn !== '' && strpos( $responseContent, $fqn ) !== false ) { // literal substring search, empty names ignored |
||
| 617 | return true; |
||
| 618 | } |
||
| 619 | } |
||
| 620 | return false; |
||
| 621 | } |
||
| 622 | |||
| 623 | /** |
||
| 624 | * Extract the max-age from the headers of a response that was served from a cache. |
||
| 625 | * If no max-age can be read, only report whether the response was cached at all. |
||
| 626 | * |
||
| 627 | * @param array $responseHeaders header name => list of values, see MWHttpRequest::getResponseHeaders() |
||
| 628 | * |
||
| 629 | * @return int|bool the max-age (in seconds), true if cached without a readable max-age, |
||
| 630 | * or false if the response was not a cache hit |
||
| 631 | */ |
||
| 632 | public function getCacheMaxAge( $responseHeaders ) { |
||
| 633 | // no cache status header: not reported as cached |
||
| 634 | if ( !array_key_exists( 'x-cache-status', $responseHeaders ) ) { |
||
| 635 | return false; |
||
| 636 | } |
||
| 637 | // only the statuses "hit" and "hit-..." count as cached |
||
| 638 | if ( !preg_match( '/^hit(?:-.*)?$/', $responseHeaders['x-cache-status'][0] ) ) { |
||
| 639 | return false; |
||
| 640 | } |
||
| 641 | // cached, but without a Cache-Control header there is no max-age to report |
||
| 642 | if ( !array_key_exists( 'cache-control', $responseHeaders ) ) { |
||
| 643 | return true; |
||
| 644 | } |
||
| 645 | $cacheControl = $responseHeaders['cache-control'][0]; |
||
| 646 | $found = []; |
||
| 647 | $hasMaxAge = preg_match( '/\bmax-age=(\d+)\b/', $cacheControl, $found ); |
||
| 648 | return $hasMaxAge ? intval( $found[1] ) : true; |
||
| 649 | } |
||
| 650 | |||
| 651 | /** |
||
| 652 | * Get the delay date of a 429 headered response, which is caused by |
||
| 653 | * throttling of too many SPARQL requests. The header format is defined |
||
| 654 | * in RFC7231 see: https://tools.ietf.org/html/rfc7231#section-7.1.3 |
||
| 655 | * |
||
| 656 | * @param MWHttpRequest $request |
||
| 657 | * |
||
| 658 | * @return int|ConvertibleTimestamp |
||
| 659 | * or SparqlHelper::NO_RETRY_AFTER if there is no Retry-After header |
||
| 660 | * or SparqlHelper::EMPTY_RETRY_AFTER if there is an empty Retry-After |
||
| 661 | * or SparqlHelper::INVALID_RETRY_AFTER if there is something wrong with the format |
||
| 662 | */ |
||
| 663 | public function getThrottling( MWHttpRequest $request ) { |
||
| 687 | |||
| 688 | private function getTimestampInFuture( DateInterval $delta ) { |
||
| 692 | |||
| 693 | /** |
||
| 694 | * Runs a query against the configured endpoint and returns the results. |
||
| 695 | * TODO: See if Sparql Client in core can be used instead of rolling our own |
||
| 696 | * |
||
| 697 | * @param string $query The query, unencoded (plain string). |
||
| 698 | * @param bool $needsPrefixes Whether the query requires prefixes or they can be omitted. |
||
| 699 | * |
||
| 700 | * @return CachedQueryResults |
||
| 701 | * |
||
| 702 | * @throws SparqlHelperException if the query times out or some other error occurs |
||
| 703 | */ |
||
| 704 | public function runQuery( $query, $needsPrefixes = true ) { |
||
| 792 | |||
| 793 | /** |
||
| 794 | * Handle a potential “too many requests” error. |
||
| 795 | * |
||
| 796 | * @param MWHttpRequest $request |
||
| 797 | * @throws TooManySparqlRequestsException |
||
| 798 | */ |
||
| 799 | private function guardAgainstTooManyRequestsError( MWHttpRequest $request ): void { |
||
| 825 | |||
| 826 | } |
||
| 827 |
If a method or function can return several different types of values, and you are not sure that only one of them can occur in this context, we recommend adding an additional type check:
If this is a common case that PHP Analyzer should handle natively, please let us know by opening an issue.