Passed
Push — master ( 09d1e9...66abdb )
by Darko
09:57
created

ImdbScraper::fetchById()   F

Complexity

Conditions 29
Paths > 20000

Size

Total Lines 142
Code Lines 91

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
eloc 91
c 1
b 0
f 0
dl 0
loc 142
rs 0
cc 29
nc 351131
nop 1

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
namespace App\Services;
4
5
use GuzzleHttp\Client;
6
use GuzzleHttp\Exception\GuzzleException;
7
use Illuminate\Support\Facades\Cache;
8
use Illuminate\Support\Facades\Log;
9
use voku\helper\HtmlDomParser;
0 ignored issues
show
Bug introduced by
The type voku\helper\HtmlDomParser was not found. Maybe you did not declare it correctly or list all dependencies?

The issue could also be caused by a filter entry in the build configuration. If the path has been excluded in your configuration, e.g. excluded_paths: ["lib/*"], you can move it to the dependency path list as follows:

filter:
    dependency_paths: ["lib/*"]

For further information see https://scrutinizer-ci.com/docs/tools/php/php-scrutinizer/#list-dependency-paths

Loading history...
10
11
/**
 * Scrapes movie metadata from IMDb title pages and queries the IMDb
 * suggestion endpoint, caching both successful and failed lookups.
 */
class ImdbScraper
{
    /** HTTP client used for every outgoing request. */
    protected Client $client;

    /**
     * Build the HTTP client with a 10 second timeout and a bot User-Agent.
     */
    public function __construct()
    {
        $this->client = new Client([
            'timeout' => 10,
            'headers' => [
                'User-Agent' => 'Mozilla/5.0 (compatible; NNTmuxBot/1.0; +https://github.com/NNTmux)',
            ],
        ]);
    }

    /**
     * Fetch a movie by IMDB numeric ID.
     *
     * Successful results are cached for 7 days; failures (empty page, missing
     * title, any thrown error) are cached as false for 6 hours.
     *
     * @param  string  $id  Numeric part without 'tt'. Non-digit characters are stripped.
     * @return array{imdbid: string, title: string, year: string, plot: string, rating: string, cover: string, genre: list<string>, director: list<string>, actors: list<string>, language: string, type: string}|false
     *         The scraped fields, or false when the ID is empty or the lookup failed.
     */
    public function fetchById(string $id): array|false
    {
        $id = (string) preg_replace('/[^0-9]/', '', $id);
        if ($id === '') {
            return false;
        }

        $cacheKey = 'imdb_scrape_id_'.$id;

        // One read instead of has()+get(): an entry expiring between the two
        // calls would otherwise surface as null and break the return type.
        $cached = Cache::get($cacheKey);
        if ($cached === false || is_array($cached)) {
            return $cached;
        }

        try {
            $response = $this->client->get('https://www.imdb.com/title/tt'.$id.'/');
            $dom = HtmlDomParser::str_get_html((string) $response->getBody());

            $title = $dom ? $this->firstText($dom, 'h1') : '';
            if ($title === '') {
                Cache::put($cacheKey, false, now()->addHours(6));

                return false;
            }

            $data = [
                'imdbid' => $id,
                'title' => $title,
                'year' => $this->extractYear($dom),
                'plot' => $this->extractPlot($dom),
                'rating' => $this->extractRating($dom),
                'cover' => $this->extractCover($dom),
                'genre' => $this->collectTexts($dom, "div[data-testid='genres'] a"),
                'director' => $this->extractDirectors($dom),
                // Top billed cast only.
                'actors' => $this->collectTexts(
                    $dom,
                    "div[data-testid='title-cast-item'] a[data-testid='title-cast-item__actor']",
                    10
                ),
                'language' => implode(', ', $this->collectTexts($dom, "li[data-testid='title-details-languages'] a")),
                'type' => 'movie',
            ];
            Cache::put($cacheKey, $data, now()->addDays(7));

            return $data;
        } catch (GuzzleException|\Throwable $e) {
            Log::debug('IMDb fetch error tt'.$id.': '.$e->getMessage());
            Cache::put($cacheKey, false, now()->addHours(6));

            return false;
        }
    }

    /**
     * Search IMDb suggestion API for a title.
     *
     * Results are cached for 12 hours; a failed request caches an empty list
     * for 6 hours.
     *
     * @return list<array{imdbid: string, title: mixed, year: string}>
     *         At most 25 entries whose IMDb id starts with 'tt' (prefix removed).
     */
    public function search(string $query): array
    {
        $query = trim($query);
        if ($query === '') {
            return [];
        }

        // Keep ASCII letters, digits and spaces only. Trim again because the
        // stripping can expose leading/trailing spaces (e.g. "! foo").
        $norm = strtolower(trim((string) preg_replace('/[^a-z0-9 ]/i', '', $query)));
        if ($norm === '') {
            // Nothing usable is left (e.g. "!!!"): there is no prefix
            // character to build the suggestion URL from.
            return [];
        }

        $slug = str_replace(' ', '_', $norm);
        $prefix = substr($slug, 0, 1);
        $cacheKey = 'imdb_search_'.md5($slug);

        // Single read; see fetchById() for why has()+get() is avoided.
        $cached = Cache::get($cacheKey);
        if (is_array($cached)) {
            return $cached;
        }

        $url = 'https://v2.sg.media-imdb.com/suggestion/'.urlencode($prefix).'/'.urlencode($slug).'.json';
        try {
            $response = $this->client->get($url);
            $json = json_decode((string) $response->getBody(), true);
            $rows = is_array($json) ? ($json['d'] ?? []) : [];

            $results = [];
            foreach (is_array($rows) ? $rows : [] as $row) {
                $imdbId = is_array($row) ? ($row['id'] ?? null) : null;
                if (! is_string($imdbId) || ! str_starts_with($imdbId, 'tt')) {
                    continue;
                }
                $title = $row['l'] ?? '';
                if ($title === '') {
                    continue;
                }
                $results[] = [
                    'imdbid' => substr($imdbId, 2),
                    'title' => $title,
                    'year' => (string) ($row['y'] ?? ''),
                ];
                if (count($results) >= 25) {
                    break;
                }
            }
            Cache::put($cacheKey, $results, now()->addHours(12));

            return $results;
        } catch (GuzzleException|\Throwable $e) {
            Log::debug('IMDb search error '.$query.': '.$e->getMessage());
            Cache::put($cacheKey, [], now()->addHours(6));

            return [];
        }
    }

    /**
     * Trimmed text of the first node matching $selector, or '' when there is none.
     *
     * @param  object  $dom  Parsed document as returned by HtmlDomParser::str_get_html().
     */
    private function firstText(object $dom, string $selector): string
    {
        $node = $dom->findOne($selector);

        return $node ? trim((string) $node->text()) : '';
    }

    /**
     * Trimmed, non-empty texts of every node matching $selector, in document order.
     *
     * @param  object  $dom  Parsed document as returned by HtmlDomParser::str_get_html().
     * @param  int|null  $limit  Stop after this many texts; null collects all.
     * @return list<string>
     */
    private function collectTexts(object $dom, string $selector, ?int $limit = null): array
    {
        $texts = [];
        foreach ($dom->find($selector) as $node) {
            $text = trim((string) $node->text());
            if ($text === '') {
                continue;
            }
            $texts[] = $text;
            if ($limit !== null && count($texts) >= $limit) {
                break;
            }
        }

        return $texts;
    }

    /**
     * First 19xx/20xx year found in the release-date row: the link inside the
     * row is tried first, then the row's whole text.
     */
    private function extractYear(object $dom): string
    {
        $selectors = [
            "span[data-testid='title-details-releasedate'] a",
            "span[data-testid='title-details-releasedate']",
        ];
        foreach ($selectors as $selector) {
            if (preg_match('/(19|20)\d{2}/', $this->firstText($dom, $selector), $m) === 1) {
                return $m[0];
            }
        }

        return '';
    }

    /**
     * Plot text from the 'plot-l' span, falling back to 'plot-xl'.
     *
     * The fallback keys off the extracted text being empty rather than the
     * node being falsy, so it still runs if the parser returns a placeholder
     * object for a missing node.
     */
    private function extractPlot(object $dom): string
    {
        $plot = $this->firstText($dom, "span[data-testid='plot-l']");

        return $plot !== '' ? $plot : $this->firstText($dom, "span[data-testid='plot-xl']");
    }

    /**
     * Aggregate rating reduced to digits and dots; '' when absent.
     */
    private function extractRating(object $dom): string
    {
        $raw = $this->firstText($dom, "div[data-testid='hero-rating-bar__aggregate-rating__score'] span");

        return (string) preg_replace('/[^0-9\.]/', '', $raw);
    }

    /**
     * Poster image URL; '' when no poster image is found.
     */
    private function extractCover(object $dom): string
    {
        $node = $dom->findOne("div[data-testid='hero-media__poster'] img");
        $cover = $node ? (string) $node->getAttribute('src') : '';
        if ($cover !== '' && str_contains($cover, '._V1_')) {
            // Attempt higher res by removing size suffix; keep the original
            // URL if the regex engine reports an error.
            $cover = preg_replace('/\._V1_.*\./', '.', $cover) ?? $cover;
        }

        return $cover;
    }

    /**
     * Names linked from the principal-credit row containing "Director",
     * or from the row containing "Creator" when no director is listed.
     *
     * @return list<string>
     */
    private function extractDirectors(object $dom): array
    {
        $directors = $this->collectTexts($dom, "li[data-testid='title-pc-principal-credit']:contains(Director) a");
        if ($directors !== []) {
            return $directors;
        }

        return $this->collectTexts($dom, "li[data-testid='title-pc-principal-credit']:contains(Creator) a");
    }
}
226