DefaultFinder::find()   C
last analyzed

Complexity

Conditions 12
Paths 11

Size

Total Lines 32
Code Lines 19

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 19
dl 0
loc 32
rs 6.9666
c 0
b 0
f 0
cc 12
nc 11
nop 0

How to fix   Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
declare(strict_types=1);
4
5
namespace WebThumbnailer\Finder;
6
7
use WebThumbnailer\Application\ConfigManager;
8
use WebThumbnailer\Application\WebAccess\WebAccess;
9
use WebThumbnailer\Application\WebAccess\WebAccessCUrl;
10
use WebThumbnailer\Application\WebAccess\WebAccessFactory;
11
use WebThumbnailer\Utils\ImageUtils;
12
use WebThumbnailer\Utils\UrlUtils;
13
14
/**
15
 * This finder isn't linked to any domain.
16
 * It will return the resource if it is an image (by extension, or by content).
17
 * Otherwise, it'll try to retrieve an OpenGraph resource.
18
 */
19
class DefaultFinder extends FinderCommon
20
{
21
    /** @var WebAccess instance. */
22
    protected $webAccess;
23
24
    /**
     * Create the default (domain-independent) finder for the given URL.
     *
     * The domain and URL are stored on the instance, and the web access
     * implementation is obtained from WebAccessFactory for that URL.
     * Neither $rules nor $options is read by this constructor.
     *
     * @inheritdoc
     * @param mixed[]|null $rules   All existing rules loaded from JSON files.
     * @param mixed[]|null $options Options provided by the user to retrieve a thumbnail.
     */
    public function __construct(string $domain, string $url, ?array $rules, ?array $options)
    {
        $this->domain = $domain;
        $this->url = $url;
        $this->webAccess = WebAccessFactory::getWebAccess($url);
    }
35
36
    /**
     * Generic finder.
     *
     * Resolution order:
     *   1. the URL itself, when its file extension is an image extension;
     *   2. the URL itself, when no thumbnail was extracted and the downloaded content is an image;
     *   3. false, when no thumbnail was extracted and the first response header does not contain '200';
     *   4. the thumbnail extracted by the cURL write callback while downloading;
     *   5. the og:image meta tag found in the downloaded content, or false if there is none.
     *
     * @inheritdoc
     */
    public function find()
    {
        $extension = UrlUtils::getUrlFileExtension($this->url);
        if (ImageUtils::isImageExtension($extension)) {
            return $this->url;
        }

        $usesCurl = $this->webAccess instanceof WebAccessCUrl;

        // Both variables are handed by reference to the cURL callback, which fills them during the download.
        $content = null;
        $thumbnail = null;
        $callback = null;
        if ($usesCurl) {
            $callback = $this->getCurlCallback($content, $thumbnail);
        }

        [$headers, $content] = $this->webAccess->getContent(
            $this->url,
            (int) ConfigManager::get('settings.default.timeout', 30),
            (int) ConfigManager::get('settings.default.max_img_dl', 16777216),
            $callback,
            $content
        );

        $hasThumbnail = ! empty($thumbnail);
        $hasContent = ! empty($content);

        if (! $hasThumbnail) {
            // The resource itself is an image even though its extension did not say so.
            if ($hasContent && ImageUtils::isImageString($content)) {
                return $this->url;
            }

            // Nothing was extracted and the request did not succeed.
            if (! empty($headers) && strpos($headers[0], '200') === false) {
                return false;
            }
        }

        // With curl, the thumb is extracted during the download
        if ($usesCurl && $hasThumbnail) {
            return $thumbnail;
        }

        if (! $hasContent) {
            return false;
        }

        return static::extractMetaTag($content);
    }
74
75
    /**
     * Get a callback for curl write function.
     *
     * The returned closure keeps references to $content and $thumbnail, so the caller
     * can read both variables once the transfer is over.
     *
     * @param string|null $content   A variable reference in which the downloaded content should be stored.
     * @param string|null $thumbnail A variable reference in which extracted thumb URL should be stored.
     *
     * @return callable CURLOPT_WRITEFUNCTION callback
     */
    protected function getCurlCallback(?string &$content, ?string &$thumbnail): callable
    {
        $url = $this->url;
        $isRedirected = false;

        /**
         * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).
         *
         * While downloading the remote page, we check that the HTTP code is 200 and that the content type
         * is either an image or 'text/html'. Then we try to extract the thumbnail URL from each chunk
         * and stop the download as soon as we have it.
         *
         * Note that when using CURLOPT_WRITEFUNCTION, we have to manually handle the content retrieved,
         * hence the $content reference variable.
         *
         * @param resource $ch   cURL resource
         * @param string   $data chunk of data being downloaded
         *
         * @return int|false length of $data or false if we need to stop the download
         */
        return function ($ch, $data) use ($url, &$content, &$thumbnail, &$isRedirected) {
            $content .= $data;
            $responseCode = curl_getinfo($ch, CURLINFO_RESPONSE_CODE);

            // NOTE(review): only 301 and 302 are treated as redirections; any other non-200 code
            // (including 303/307/308) stops the download below. Confirm this is intended.
            if (in_array($responseCode, [301, 302], true)) {
                $isRedirected = true;
                return strlen($data);
            }
            if (!empty($responseCode) && $responseCode !== 200) {
                return false;
            }

            // After a redirection, the content type will keep the previous request value
            // until it finds the next content-type header. Until then it is left as null,
            // which disables every content type check below for this chunk.
            $contentType = null;
            if (! $isRedirected || strpos(strtolower($data), 'content-type') !== false) {
                $contentType = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
            }

            // we look for image, and ignore application/octet-stream,
            // which is the default content type for any binary
            // @see https://developer.mozilla.org/fr/docs/Web/HTTP/Basics_of_HTTP/MIME_types
            if (
                !empty($contentType)
                && strpos($contentType, 'image/') !== false
                && strpos($contentType, 'application/octet-stream') === false
            ) {
                // The resource itself is an image: use its URL as the thumbnail.
                $thumbnail = $url;
                return false;
            } elseif (
                !empty($contentType)
                && strpos($contentType, 'text/html') === false
                && strpos($contentType, 'application/octet-stream') === false
            ) {
                // Neither an image nor HTML (nor an unspecified binary): nothing to extract.
                return false;
            }

            // Only the current chunk is searched here, not the accumulated $content.
            if (empty($thumbnail)) {
                $thumbnail = self::extractMetaTag($data);
            }

            // We got everything we want, stop the download.
            if (!empty($responseCode) && !empty($contentType) && !empty($thumbnail)) {
                return false;
            }

            return strlen($data);
        };
    }
145
146
    /**
     * Applies the regexp on the HTML $content to extract the thumb URL.
     *
     * Looks for a `<meta>` tag whose `property`, `name` or `itemprop` attribute is `og:image`
     * and returns the value of its `content` attribute. Both attribute orders are supported.
     * Matching is case-insensitive, so `<META PROPERTY="og:image" ...>` is found as well.
     *
     * @param string $content Downloaded HTML content
     *
     * @return string|false Extracted thumb URL or false if not found.
     */
    public static function extractMetaTag(string $content)
    {
        $properties = implode('|', ['property', 'name', 'itemprop']);

        // Try to retrieve OpenGraph image.
        // The `i` modifier is required because HTML tag and attribute names are case-insensitive.
        $ogRegex = '#<meta[^>]+(?:' . $properties . ')=["\']?og:image["\'\s][^>]*content=["\']?(.*?)["\'\s>]#i';
        // If the attributes are not in the order property => content (e.g. Github)
        // New regex to keep this readable... more or less.
        $ogRegexReverse = '#<meta[^>]+content=["\']?([^"\'\s]+)[^>]+(?:' . $properties . ')=["\']?og:image["\'\s/>]#i';

        // The property => content order is tried first, the reverse order only as a fallback.
        foreach ([$ogRegex, $ogRegexReverse] as $regex) {
            if (preg_match($regex, $content, $matches) > 0) {
                return $matches[1];
            }
        }

        return false;
    }
172
173
    /**
     * The default finder always reports hotlinking as allowed.
     *
     * @inheritdoc
     *
     * @return bool Always true.
     */
    public function isHotlinkAllowed(): bool
    {
        return true;
    }
178
179
    /**
     * The default finder does not validate anything: $rules is ignored.
     *
     * @inheritdoc
     *
     * @return bool Always true.
     */
    public function checkRules(?array $rules): bool
    {
        return true;
    }
184
185
    /**
     * Intentionally empty: the default finder loads nothing from $rules.
     *
     * @inheritdoc
     */
    public function loadRules(?array $rules): void
    {
    }
189
190
    /**
     * Name of this finder.
     *
     * @inheritdoc
     *
     * @return string Always 'default'.
     */
    public function getName(): string
    {
        return 'default';
    }
195
}
196