Data18Pipe::getBaseUrl()   A
last analyzed

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
eloc 1
c 1
b 0
f 0
dl 0
loc 3
rs 10
cc 1
nc 1
nop 0
1
<?php
2
3
namespace App\Services\AdultProcessing\Pipes;
4
5
use App\Services\AdultProcessing\AdultProcessingPassable;
6
use App\Services\AdultProcessing\AdultProcessingResult;
7
8
/**
 * Data18 provider pipe.
 *
 * Handles movie information extraction from data18.com
 * Data18 is a comprehensive adult movie database with good structured data.
 *
 * Flow: search the site for the cleaned title, keep the most similar result,
 * fetch its detail page, then combine JSON-LD structured data with values
 * scraped from the HTML (HTML values only fill in what JSON-LD did not provide).
 */
class Data18Pipe extends AbstractAdultProviderPipe
{
    protected int $priority = 15; // Second priority after AEBN

    private const BASE_URL = 'https://www.data18.com';
    private const SEARCH_URL = '/search/?k=';

    /** Absolute URL of the detail page selected by the last search. */
    protected string $directUrl = '';

    /** Title of the search result selected by the last search. */
    protected string $title = '';

    /** Raw HTML of the last successfully fetched detail page. */
    protected string $response = '';

    /** Value returned by extractJsonLd() for the last detail page; null by default. */
    protected ?array $jsonLdData = null;

    /**
     * Machine-readable provider identifier.
     */
    public function getName(): string
    {
        return 'data18';
    }

    /**
     * Human-readable provider name.
     */
    public function getDisplayName(): string
    {
        return 'Data18';
    }

    /**
     * Root URL of the provider site (no trailing slash).
     */
    protected function getBaseUrl(): string
    {
        return self::BASE_URL;
    }

    /**
     * Look up the passable's clean title on Data18 and build a processing result.
     *
     * Order of operations: cached search result, site search, detail page fetch,
     * data extraction. "Not found" outcomes (no search match, no usable data) are
     * cached as `false`; a failed detail-page fetch is reported as failed and is
     * not cached.
     *
     * @param AdultProcessingPassable $passable Supplies the cleaned title to search for.
     *
     * @return AdultProcessingResult matched, notFound or failed result for this provider.
     */
    protected function process(AdultProcessingPassable $passable): AdultProcessingResult
    {
        $movie = $passable->getCleanTitle();

        // Check cache first; a cached `false` is a remembered "not found".
        $cached = $this->getCachedSearch($movie);
        if ($cached !== null) {
            if ($cached === false) {
                return AdultProcessingResult::notFound($this->getName());
            }

            return AdultProcessingResult::matched(
                $cached['title'] ?? $movie,
                $this->getName(),
                $cached
            );
        }

        $searchResult = $this->search($movie);

        if ($searchResult === false) {
            $this->cacheSearchResult($movie, false);
            $this->outputNotFound();

            return AdultProcessingResult::notFound($this->getName());
        }

        $this->title = $searchResult['title'];
        $this->directUrl = $searchResult['url'];

        // Fetch the movie details page. The result is checked in a local variable
        // BEFORE it is stored: $response is declared `string`, so assigning a
        // `false` failure value to it would be coerced to '' (or throw under
        // strict_types) and a `=== false` check on the property could never fire.
        $html = $this->fetchHtml($this->directUrl, $this->cookie);
        if (!is_string($html) || $html === '') {
            return AdultProcessingResult::failed('Failed to fetch movie details page', $this->getName());
        }
        $this->response = $html;

        // Try to extract JSON-LD data first
        $this->jsonLdData = $this->extractJsonLd($this->response);

        $this->getHtmlParser()->loadHtml($this->response);

        $movieInfo = $this->getMovieInfo();

        if ($movieInfo === false) {
            $this->cacheSearchResult($movie, false);

            return AdultProcessingResult::notFound($this->getName());
        }

        // Cache the successful result
        $this->cacheSearchResult($movie, $movieInfo);

        $this->outputMatch($this->title);

        return AdultProcessingResult::matched(
            $this->title,
            $this->getName(),
            $movieInfo
        );
    }

    /**
     * Search Data18 for a title and return the most similar movie link.
     *
     * Container selectors are tried in order; the first selector that yields at
     * least one candidate link ends the scan. The best candidate is returned only
     * if its similarity reaches $this->minimumSimilarity.
     *
     * @param string $movie Title to search for.
     *
     * @return array{title: string, url: string, similarity: mixed}|false
     *         Best match with an absolute URL, or false when nothing suitable was found.
     */
    protected function search(string $movie): array|false
    {
        // Explicit blank check: empty() would also reject the legitimate title "0".
        if (trim($movie) === '') {
            return false;
        }

        $searchUrl = self::BASE_URL . self::SEARCH_URL . urlencode($movie);
        $response = $this->fetchHtml($searchUrl, $this->cookie);

        if ($response === false) {
            return false;
        }

        $this->getHtmlParser()->loadHtml($response);

        $bestMatch = null;
        $highestSimilarity = 0;

        // Try multiple container selectors for search results
        $containerSelectors = [
            'div.gen12',
            'div.movie-card',
            'div[class*="result"]',
            'a[href*="/movies/"]',
        ];

        foreach ($containerSelectors as $containerSelector) {
            $results = $this->getHtmlParser()->find($containerSelector);

            if (empty($results)) {
                continue;
            }

            foreach ($results as $result) {
                // The matched node is either the movie link itself or a container around it.
                $isMovieLink = isset($result->href) && str_contains($result->href, '/movies/');
                $link = $isMovieLink ? $result : $result->findOne('a[href*="/movies/"]');

                if (!$link || !isset($link->href)) {
                    continue;
                }

                // Prefer the title attribute, but fall back to the link text when the
                // attribute is missing OR present-but-blank.
                $title = $this->firstNonEmptyText($link->title ?? null, $link->plaintext ?? null);
                if ($title === '') {
                    continue;
                }

                $similarity = $this->calculateSimilarity($movie, $title);

                if ($similarity > $highestSimilarity) {
                    $highestSimilarity = $similarity;
                    $bestMatch = [
                        'title' => $title,
                        'url' => $this->toAbsoluteUrl((string) $link->href),
                        'similarity' => $similarity,
                    ];
                }
            }

            if ($bestMatch !== null) {
                break;
            }
        }

        if ($bestMatch !== null && $highestSimilarity >= $this->minimumSimilarity) {
            return $bestMatch;
        }

        return false;
    }

    /**
     * Assemble the movie data for the currently loaded detail page.
     *
     * Precedence, highest first: JSON-LD values, then the title/URL from the
     * search step, then values scraped from the HTML. HTML values are a fallback:
     * they only fill keys that are still missing or empty, so they can no longer
     * overwrite structured data.
     *
     * @return array<string, mixed>|false Collected data, or false when neither a
     *                                    title nor a box cover could be determined.
     */
    protected function getMovieInfo(): array|false
    {
        $results = [];

        if (!empty($this->directUrl)) {
            if (!empty($this->title)) {
                $results['title'] = $this->title;
            }
            $results['directurl'] = $this->directUrl;
        }

        // Try to get data from JSON-LD first (most reliable)
        if ($this->jsonLdData !== null) {
            $results = array_merge($results, $this->extractFromJsonLd());
        }

        // Get all the movie data (HTML fallback)
        $fallbacks = [
            $this->extractSynopsis(),
            $this->extractProductInfo(true),
            $this->extractCast(),
            $this->extractGenres(),
        ];

        foreach ($fallbacks as $fallback) {
            foreach ($fallback as $key => $value) {
                if (empty($results[$key])) {
                    $results[$key] = $value;
                }
            }
        }

        // Covers are handled as a pair so the back cover always belongs to the
        // box cover that was actually kept.
        if (empty($results['boxcover'])) {
            $results = array_merge($results, $this->extractCovers());
        } elseif (empty($results['backcover']) && is_string($results['boxcover'])) {
            $backCover = $this->guessBackCoverUrl($results['boxcover']);
            if ($backCover !== null) {
                $results['backcover'] = $backCover;
            }
        }

        if (empty($results['title']) && empty($results['boxcover'])) {
            return false;
        }

        return $results;
    }

    /**
     * Extract data from JSON-LD structured data.
     *
     * Reads name, description, image, duration, director, actor and genre from
     * $this->jsonLdData. Image may be a URL string, an object with a `url` /
     * `contentUrl` field, or a list of either; director and actor may be a
     * string, a single object with a `name`, or a list of either.
     *
     * @return array<string, mixed> Subset of: title, synopsis, boxcover, duration,
     *                              director, cast, genres. Empty when no JSON-LD is set.
     */
    protected function extractFromJsonLd(): array
    {
        $data = $this->jsonLdData;
        if ($data === null) {
            return [];
        }

        $results = [];

        // Title
        if (!empty($data['name'])) {
            $results['title'] = $data['name'];
        }

        // Synopsis/Description
        if (!empty($data['description'])) {
            $results['synopsis'] = $data['description'];
        }

        // Image/Cover
        $cover = $this->jsonLdImageUrl($data['image'] ?? null);
        if ($cover !== '') {
            $results['boxcover'] = $cover;
        }

        // Duration (stored exactly as given in the JSON-LD)
        if (!empty($data['duration'])) {
            $results['duration'] = $data['duration'];
        }

        // Director: first usable name; the key is omitted rather than set to ''.
        $directors = $this->jsonLdNames($data['director'] ?? null);
        if ($directors !== []) {
            $results['director'] = $directors[0];
        }

        // Actors
        $cast = $this->jsonLdNames($data['actor'] ?? null);
        if ($cast !== []) {
            $results['cast'] = $cast;
        }

        // Genre
        if (!empty($data['genre'])) {
            $genres = $data['genre'];
            $results['genres'] = is_array($genres) ? $genres : [$genres];
        }

        return $results;
    }

    /**
     * Find the box cover on the loaded page and derive a back-cover URL from it.
     *
     * @return array<string, string> `boxcover` and, when the URL follows a known
     *                               front/back naming pattern, `backcover`. Empty
     *                               when no cover image was found.
     */
    protected function extractCovers(): array
    {
        // Try multiple selectors
        $selectors = [
            'img[itemprop=image]',
            'img.cover',
            'img[src*="cover"]',
            'div.cover img',
            'a.cover img',
            'meta[property="og:image"]',
        ];

        // <img> carries the URL in src, <meta> in content.
        $coverUrl = $this->firstValueForSelectors($selectors, ['src', 'content']);
        if ($coverUrl === '') {
            return [];
        }

        $coverUrl = $this->toAbsoluteUrl($coverUrl);
        $res = ['boxcover' => $coverUrl];

        // Try to find back cover
        $backCover = $this->guessBackCoverUrl($coverUrl);
        if ($backCover !== null) {
            $res['backcover'] = $backCover;
        }

        return $res;
    }

    /**
     * Find the synopsis on the loaded page.
     *
     * Element text is preferred; the `content` attribute is used when the text
     * is blank, which is what makes the <meta> selectors usable.
     *
     * @return array<string, string> `synopsis` when found, otherwise empty.
     */
    protected function extractSynopsis(): array
    {
        $selectors = [
            'div[itemprop=description]',
            'span[itemprop=description]',
            'div.synopsis',
            'div.description',
            'p.synopsis',
            'meta[name="description"]',
            'meta[property="og:description"]',
        ];

        $synopsis = $this->firstValueForSelectors($selectors, ['plaintext', 'content']);

        return $synopsis !== '' ? ['synopsis' => $synopsis] : [];
    }

    /**
     * Collect performer names from the loaded page.
     *
     * @return array<string, list<string>> `cast` as a de-duplicated list (names of
     *                                     at least 3 bytes), otherwise empty.
     */
    protected function extractCast(): array
    {
        $selectors = [
            'a[href*="/name/"]',
            'a[href*="/pornstars/"]',
            'span[itemprop=actor] a',
            'div.cast a',
            'div.performers a',
        ];

        $cast = $this->collectUniqueTexts($selectors, 3);

        return $cast !== [] ? ['cast' => $cast] : [];
    }

    /**
     * Collect genre/category labels from the loaded page.
     *
     * @return array<string, list<string>> `genres` as a de-duplicated list (labels
     *                                     of at least 2 bytes), otherwise empty.
     */
    protected function extractGenres(): array
    {
        $selectors = [
            'a[href*="/category/"]',
            'a[href*="/genre/"]',
            'span[itemprop=genre]',
            'div.categories a',
            'div.tags a',
        ];

        $genres = $this->collectUniqueTexts($selectors, 2);

        return $genres !== [] ? ['genres' => $genres] : [];
    }

    /**
     * Extract studio, release date and director from the loaded page.
     *
     * A matched element with blank text no longer ends the lookup; the next
     * selector is tried and the key is omitted if nothing usable is found.
     *
     * @param bool $extras Currently unused; kept so existing callers keep working.
     *
     * @return array<string, string> Subset of: studio, releasedate, director.
     */
    protected function extractProductInfo(bool $extras = false): array
    {
        $res = [];

        // Look for studio
        $studio = $this->firstValueForSelectors([
            'a[href*="/studio/"]',
            'a[href*="/studios/"]',
            'span.studio a',
        ], ['plaintext']);
        if ($studio !== '') {
            $res['studio'] = $studio;
        }

        // Look for release date (datetime attribute first, visible text second)
        $releaseDate = $this->firstValueForSelectors([
            'span[itemprop=datePublished]',
            'time[datetime]',
            'span.date',
        ], ['datetime', 'plaintext']);
        if ($releaseDate !== '') {
            $res['releasedate'] = $releaseDate;
        }

        // Look for director
        $director = $this->firstValueForSelectors([
            'a[href*="/director/"]',
            'span.director a',
        ], ['plaintext']);
        if ($director !== '') {
            $res['director'] = $director;
        }

        return $res;
    }

    /**
     * Return the first candidate that is non-blank after trimming, or ''.
     *
     * Null, arrays and non-stringable objects are skipped.
     */
    private function firstNonEmptyText(mixed ...$candidates): string
    {
        foreach ($candidates as $candidate) {
            if (!is_scalar($candidate) && !($candidate instanceof \Stringable)) {
                continue;
            }

            $text = trim((string) $candidate);
            if ($text !== '') {
                return $text;
            }
        }

        return '';
    }

    /**
     * Walk the selectors in order and return the first non-blank value read from
     * the given element properties (also in order), or '' when none is found.
     *
     * @param list<string> $selectors  CSS selectors passed to the HTML parser.
     * @param list<string> $properties Element properties/attributes to read.
     */
    private function firstValueForSelectors(array $selectors, array $properties): string
    {
        foreach ($selectors as $selector) {
            $element = $this->getHtmlParser()->findOne($selector);
            if (!$element) {
                continue;
            }

            foreach ($properties as $property) {
                $value = $this->firstNonEmptyText($element->{$property} ?? null);
                if ($value !== '') {
                    return $value;
                }
            }
        }

        return '';
    }

    /**
     * Return the de-duplicated texts of the first selector that yields any text
     * of at least $minLength bytes.
     *
     * @param list<string> $selectors CSS selectors passed to the HTML parser.
     * @param int          $minLength Minimum byte length of a kept text.
     *
     * @return list<string>
     */
    private function collectUniqueTexts(array $selectors, int $minLength): array
    {
        foreach ($selectors as $selector) {
            $elements = $this->getHtmlParser()->find($selector);
            if (empty($elements)) {
                continue;
            }

            $texts = [];
            foreach ($elements as $element) {
                $text = trim($element->plaintext ?? '');
                if (strlen($text) >= $minLength) {
                    $texts[] = $text;
                }
            }

            if ($texts !== []) {
                // array_unique() keeps the original keys; re-index so the result
                // stays a list (and JSON-encodes as an array, not an object).
                return array_values(array_unique($texts));
            }
        }

        return [];
    }

    /**
     * Turn a scraped href/src into an absolute URL on the Data18 site.
     *
     * Handles protocol-relative (`//host/...`), absolute (`http(s)://...`) and
     * relative paths with or without a leading slash.
     */
    private function toAbsoluteUrl(string $url): string
    {
        if (str_starts_with($url, '//')) {
            return 'https:' . $url;
        }

        if (preg_match('~^https?://~i', $url) === 1) {
            return $url;
        }

        return self::BASE_URL . '/' . ltrim($url, '/');
    }

    /**
     * Derive a back-cover URL from a front-cover URL via known naming patterns.
     *
     * The replacement is applied only to the part after the host, so hosts such
     * as "*.cloudfront.net" are not rewritten.
     *
     * @return string|null The derived URL, or null when no pattern applied.
     */
    private function guessBackCoverUrl(string $coverUrl): ?string
    {
        $search = ['front', '_f.', '_1.'];
        $replace = ['back', '_b.', '_2.'];

        if (preg_match('~^(https?://[^/?#]+)(.*)$~is', $coverUrl, $parts) === 1) {
            $backCover = $parts[1] . str_replace($search, $replace, $parts[2]);
        } else {
            $backCover = str_replace($search, $replace, $coverUrl);
        }

        return $backCover !== $coverUrl ? $backCover : null;
    }

    /**
     * Resolve a JSON-LD `image` value to a URL string, or '' when none is usable.
     *
     * Accepts a string, an object with `url` or `contentUrl`, or a list of either
     * (the first entry is used).
     */
    private function jsonLdImageUrl(mixed $image): string
    {
        if (is_string($image)) {
            return $image;
        }

        if (!is_array($image)) {
            return '';
        }

        foreach (['url', 'contentUrl'] as $key) {
            if (!empty($image[$key]) && is_string($image[$key])) {
                return $image[$key];
            }
        }

        return isset($image[0]) ? $this->jsonLdImageUrl($image[0]) : '';
    }

    /**
     * Normalise a JSON-LD person-like value to a list of non-blank names.
     *
     * Accepts a string, a single object with a `name`, or a list of strings
     * and/or such objects.
     *
     * @return list<string>
     */
    private function jsonLdNames(mixed $value): array
    {
        if (is_string($value)) {
            $value = [$value];
        } elseif (!is_array($value)) {
            return [];
        } elseif ($value !== [] && !array_key_exists(0, $value)) {
            // A decoded JSON object (string keys) is ONE entry, not a list; iterating
            // it directly would treat every field value (e.g. "@type") as a name.
            $value = [$value];
        }

        $names = [];
        foreach ($value as $entry) {
            $name = is_array($entry) ? ($entry['name'] ?? '') : $entry;
            if (is_string($name) && trim($name) !== '') {
                $names[] = trim($name);
            }
        }

        return $names;
    }
}
492
493