Completed
Push — master ( e15538...7938c9 )
by Arthur
02:07
created

DefaultFinder::find()   C

Complexity

Conditions 11
Paths 11

Size

Total Lines 32
Code Lines 19

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 11
eloc 19
nc 11
nop 0
dl 0
loc 32
rs 5.2653
c 0
b 0
f 0

How to fix   Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
namespace WebThumbnailer\Finder;
4
5
use WebThumbnailer\Application\ConfigManager;
6
use WebThumbnailer\Application\WebAccess\WebAccess;
7
use WebThumbnailer\Application\WebAccess\WebAccessCUrl;
8
use WebThumbnailer\Application\WebAccess\WebAccessFactory;
9
use WebThumbnailer\Utils\ImageUtils;
10
use WebThumbnailer\Utils\UrlUtils;
11
12
/**
 * Class DefaultFinder
 *
 * This finder isn't linked to any domain.
 * It will return the resource if it is an image (by extension, or by content).
 * Otherwise, it'll try to retrieve an OpenGraph resource.
 *
 * @package WebThumbnailer\Finder
 */
class DefaultFinder extends FinderCommon
{
    /**
     * @var WebAccess instance used to download the remote resource.
     */
    protected $webAccess;

    /**
     * Builds the finder and picks the WebAccess implementation for the given URL.
     *
     * $rules and $options are accepted to match the parent signature but are not used here.
     *
     * @inheritdoc
     */
    public function __construct($domain, $url, $rules, $options)
    {
        $this->webAccess = WebAccessFactory::getWebAccess($url);
        $this->url = $url;
        $this->domain = $domain;
    }

    /**
     * Generic finder.
     *
     * Resolution order:
     *   1. the URL itself if its file extension is an image extension;
     *   2. the URL itself if the downloaded content is an image;
     *   3. false if no thumbnail was found and the first response header is not a 200;
     *   4. with cURL, the thumbnail extracted by the write callback during the download;
     *   5. otherwise the og:image meta tag found in the downloaded content, or false.
     *
     * @inheritdoc
     */
    public function find()
    {
        if (ImageUtils::isImageExtension(UrlUtils::getUrlFileExtension($this->url))) {
            return $this->url;
        }

        // Both variables are bound by reference into the cURL callback, which fills them while downloading.
        $content = $thumbnail = null;
        $callback = $this->webAccess instanceof WebAccessCUrl
            ? $this->getCurlCallback($content, $thumbnail)
            : null;
        list($headers, $content) = $this->webAccess->getContent(
            $this->url,
            (int) ConfigManager::get('settings.default.timeout', 30),
            (int) ConfigManager::get('settings.default.max_img_dl', 16777216),
            $callback,
            $content
        );

        if (empty($thumbnail)) {
            // The resource is an image served without an image extension.
            if (ImageUtils::isImageString($content)) {
                return $this->url;
            }

            if (! empty($headers) && strpos($headers[0], '200') === false) {
                return false;
            }
        } elseif ($this->webAccess instanceof WebAccessCUrl) {
            // With curl, the thumb is extracted during the download
            return $thumbnail;
        }

        return ! empty($content) ? self::extractMetaTag($content) : false;
    }

    /**
     * Get a callback for curl write function.
     *
     * @param string $content   A variable reference in which the downloaded content should be stored.
     * @param string $thumbnail A variable reference in which extracted thumb URL should be stored.
     *
     * @return \Closure CURLOPT_WRITEFUNCTION callback
     */
    protected function getCurlCallback(&$content, &$thumbnail)
    {
        $url = $this->url;
        $isRedirected = false;

        /**
         * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).
         *
         * While downloading the remote resource, we check the HTTP code and the content type:
         *   - an image content type means the URL itself is the thumbnail;
         *   - any other content type than 'text/html' (or the generic 'application/octet-stream') is rejected;
         *   - otherwise we look for an og:image meta tag in the chunk, and stop the download once it is found.
         *
         * Note that when using CURLOPT_WRITEFUNCTION, we have to manually handle the content retrieved,
         * hence the $content reference variable.
         *
         * The handle is received by value: cURL does not pass it by reference, and declaring it as such
         * raises a "must be passed by reference" warning on recent PHP versions.
         *
         * @param resource $ch   cURL resource
         * @param string   $data chunk of data being downloaded
         *
         * @return int|bool length of $data or false if we need to stop the download
         */
        return function ($ch, $data) use ($url, &$content, &$thumbnail, &$isRedirected) {
            $content .= $data;
            $responseCode = curl_getinfo($ch, CURLINFO_RESPONSE_CODE);

            // Keep reading through any redirection, not only 301/302: otherwise a 303/307/308 hop
            // would be handled as a failure by the non-200 check below and abort the download.
            if (!empty($responseCode) && in_array($responseCode, [301, 302, 303, 307, 308], true)) {
                $isRedirected = true;
                return strlen($data);
            }
            if (!empty($responseCode) && $responseCode !== 200) {
                return false;
            }

            // After a redirection, the content type will keep the previous request value
            // until it finds the next content-type header.
            $contentType = null;
            if (! $isRedirected || stripos($data, 'content-type') !== false) {
                $contentType = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
            }

            if (!empty($contentType)) {
                // we look for image, and ignore application/octet-stream,
                // which is the default content type for any binary
                // @see https://developer.mozilla.org/fr/docs/Web/HTTP/Basics_of_HTTP/MIME_types
                $isOctetStream = strpos($contentType, 'application/octet-stream') !== false;
                if (! $isOctetStream && strpos($contentType, 'image/') !== false) {
                    $thumbnail = $url;
                    return false;
                }
                if (! $isOctetStream && strpos($contentType, 'text/html') === false) {
                    return false;
                }
            }

            if (empty($thumbnail)) {
                $thumbnail = DefaultFinder::extractMetaTag($data);
            }
            // We got everything we want, stop the download.
            if (!empty($responseCode) && !empty($contentType) && !empty($thumbnail)) {
                return false;
            }

            return strlen($data);
        };
    }

    /**
     * Applies the regexp on the HTML $content to extract the thumb URL.
     *
     * Matching is case-insensitive, as HTML tag and attribute names are.
     * A tag with an empty content attribute is not considered a match.
     *
     * @param string $content Downloaded HTML content
     *
     * @return string|bool Extracted thumb URL or false if not found.
     */
    public static function extractMetaTag($content)
    {
        $propertiesKey = ['property', 'name', 'itemprop'];
        $properties = implode('|', $propertiesKey);
        $patterns = [
            // Try to retrieve OpenGraph image.
            '#<meta[^>]+(?:'. $properties .')=["\']?og:image["\'\s][^>]*content=["\']?(.*?)["\'\s>]#i',
            // If the attributes are not in the order property => content (e.g. Github)
            // New regex to keep this readable... more or less.
            '#<meta[^>]+content=["\']?([^"\'\s]+)[^>]+(?:'. $properties .')=["\']?og:image["\'\s/>]#i',
        ];

        foreach ($patterns as $pattern) {
            if (preg_match($pattern, $content, $matches) > 0 && $matches[1] !== '') {
                return $matches[1];
            }
        }

        return false;
    }

    /**
     * Hotlinking is always reported as allowed by this finder.
     *
     * @inheritdoc
     */
    public function isHotlinkAllowed()
    {
        return true;
    }

    /**
     * No-op: this finder performs no rule validation.
     *
     * @inheritdoc
     */
    public function checkRules($rules)
    {
    }

    /**
     * No-op: this finder loads no rules.
     *
     * @inheritdoc
     */
    public function loadRules($rules)
    {
    }

    /**
     * @inheritdoc
     *
     * @return string Always 'default'.
     */
    public function getName()
    {
        return 'default';
    }
}
202