Duplicate code is one of the most pungent code smells. A commonly used rule is to restructure code once it is duplicated in three or more places.
Common duplication problems and their corresponding solutions are:
| 1 | <?php |
||
| 15 | /** |
||
| 16 | * Consumer that extracts Open Graph data from either a URL or a HTML string. |
||
| 17 | */ |
||
| 18 | class Consumer |
||
| 19 | { |
||
| 20 | private ?ClientInterface $client; |
||
|
|
|||
| 21 | private ?RequestFactoryInterface $requestFactory; |
||
| 22 | |||
| 23 | /** |
||
| 24 | * When enabled, crawler will read content of title and meta description if no |
||
| 25 | * Open Graph data is provided by target page. |
||
| 26 | */ |
||
| 27 | public bool $useFallbackMode = false; |
||
| 28 | |||
| 29 | /** |
||
| 30 | * When enabled, crawler will throw exceptions for some crawling errors like unexpected |
||
| 31 | * Open Graph elements. |
||
| 32 | */ |
||
| 33 | public bool $debug = false; |
||
| 34 | |||
| 35 | /** Stores the optional HTTP client and request factory on the consumer. |
||
| 36 | * @param ClientInterface|null $client A PSR-18 ClientInterface implementation; loadUrl() throws a LogicException without it. |
||
| 37 | * @param RequestFactoryInterface|null $requestFactory A PSR-17 RequestFactoryInterface implementation; loadUrl() uses it to create the GET request. |
||
| 38 | */ |
||
| 39 | 16 | public function __construct(?ClientInterface $client = null, ?RequestFactoryInterface $requestFactory = null) |
|
| 40 | { |
||
| 41 | 16 | $this->client = $client; |
|
| 42 | 16 | $this->requestFactory = $requestFactory; |
|
| 43 | 16 | } |
|
| 44 | |||
| 45 | /** |
||
| 46 | * Fetches HTML content from the given URL and then crawls it for Open Graph data. |
||
| 47 | * |
||
| 48 | * @param string $url URL to be crawled; also passed to loadHtml() as the fallback URL. |
||
| 49 | * |
||
| 50 | * @return ObjectBase |
||
| 51 | * @throws LogicException If the consumer was created without a client or a request factory. |
||
| 52 | * @throws ClientExceptionInterface |
||
| 53 | */ |
||
| 54 | public function loadUrl(string $url): ObjectBase |
||
| 55 | { |
||
| 56 | if ($this->client === null || $this->requestFactory === null) { |
||
| 57 | throw new LogicException( |
||
| 58 | "To use loadUrl() you must provide \$client and \$requestFactory when instantiating the consumer." |
||
| 59 | ); |
||
| 60 | } |
||
| 61 | |||
| 62 | $request = $this->requestFactory->createRequest("GET", $url); |
||
| 63 | $response = $this->client->sendRequest($request); |
||
| 64 | |||
| 65 | return $this->loadHtml($response->getBody()->getContents(), $url); |
||
| 66 | } |
||
| 67 | |||
| 68 | /** |
||
| 69 | * Crawls the given HTML string for OpenGraph data. |
||
| 70 | * |
||
| 71 | * @param string $html HTML string, usually whole content of crawled web resource. |
||
| 72 | * @param string|null $fallbackUrl URL assigned to the result when fallback mode is enabled and no URL was extracted. |
||
| 73 | * |
||
| 74 | * @return ObjectBase |
||
| 75 | */ |
||
| 76 | 16 | public function loadHtml(string $html, ?string $fallbackUrl = null): ObjectBase |
|
| 77 | { |
||
| 78 | // Extract all data that can be found |
||
| 79 | 16 | $page = $this->extractOpenGraphData($html); |
|
| 80 | |||
| 81 | // Fall back to the caller-supplied URL when no URL was extracted |
||
| 82 | 13 | if ($this->useFallbackMode && $page->url === null) { |
|
| 83 | 1 | $page->url = $fallbackUrl; |
|
| 84 | } |
||
| 85 | |||
| 86 | // Return result |
||
| 87 | 13 | return $page; |
|
| 88 | } |
||
| 89 | |||
| 90 | 16 | private function extractOpenGraphData(string $content): ObjectBase |
|
| 91 | { |
||
| 92 | 16 | $crawler = new Crawler; |
|
| 93 | 16 | $crawler->addHTMLContent($content, 'UTF-8'); |
|
| 94 | |||
| 95 | 16 | $properties = []; |
|
| 96 | 16 | foreach(['name', 'property'] as $t) |
|
| 97 | { |
||
| 98 | // Get all meta-tags starting with "og:" |
||
| 99 | 16 | $ogMetaTags = $crawler->filter("meta[{$t}^='og:']"); |
|
| 100 | // Create clean property array |
||
| 101 | 16 | $props = Linq::from($ogMetaTags) |
|
| 102 | 16 | ->select( |
|
| 103 | function (DOMElement $tag) use ($t) { |
||
| 104 | 13 | $name = strtolower(trim($tag->getAttribute($t))); |
|
| 105 | 13 | $value = trim($tag->getAttribute("content")); |
|
| 106 | 13 | return new Property($name, $value); |
|
| 107 | 16 | } |
|
| 108 | ) |
||
| 109 | 16 | ->toArray(); |
|
| 110 | 16 | $properties = array_merge($properties, $props); |
|
| 111 | |||
| 112 | } |
||
| 113 | |||
| 114 | // Create new object of the correct type |
||
| 115 | 16 | $typeProperty = Linq::from($properties) |
|
| 116 | 16 | ->firstOrNull( |
|
| 117 | function (Property $property) { |
||
| 118 | 13 | return $property->key === Property::TYPE; |
|
| 119 | 16 | } |
|
| 120 | ); |
||
| 121 | 16 | switch ($typeProperty !== null ? $typeProperty->value : null) { |
|
| 122 | default: |
||
| 123 | 16 | $object = new Website(); |
|
| 124 | 16 | break; |
|
| 125 | } |
||
| 126 | |||
| 127 | // Assign all properties to the object |
||
| 128 | 16 | $object->assignProperties($properties, $this->debug); |
|
| 129 | |||
| 130 | // Fallback for url |
||
| 131 | 13 | if ($this->useFallbackMode && !$object->url) { |
|
| 132 | 2 | $urlElement = $crawler->filter("link[rel='canonical']")->first(); |
|
| 133 | 2 | if ($urlElement->count() > 0) { |
|
| 134 | 1 | $object->url = trim($urlElement->attr("href")); |
|
| 135 | } |
||
| 136 | } |
||
| 137 | |||
| 138 | // Fallback for title |
||
| 139 | 13 | if ($this->useFallbackMode && !$object->title) { |
|
| 140 | 2 | $titleElement = $crawler->filter("title")->first(); |
|
| 141 | 2 | if ($titleElement->count() > 0) { |
|
| 157 |