AnimeProcessor::fixEncoding()   A
last analyzed

Complexity

Conditions 5
Paths 6

Size

Total Lines 36
Code Lines 13

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 13
dl 0
loc 36
rs 9.5222
c 0
b 0
f 0
cc 5
nc 6
nop 1
1
<?php
2
3
declare(strict_types=1);
4
5
namespace App\Services;
6
7
use App\Models\AnidbInfo;
8
use App\Models\AnidbTitle;
9
use App\Models\Category;
10
use App\Models\Release;
11
use App\Models\Settings;
12
use Blacklight\ColorCLI;
13
use Blacklight\PopulateAniList as PaList;
14
15
class AnimeProcessor
16
{
17
    private const PROC_EXTFAIL = -1; // Release Anime title/episode # could not be extracted from searchname
18
19
    private const PROC_NOMATCH = -2; // AniList ID was not found in anidb table using extracted title
20
21
    /** @var bool Whether to echo messages to CLI */
22
    public bool $echooutput;
23
24
    public PaList $palist;
25
26
    /** @var int number of AniDB releases to process */
27
    private int $aniqty;
28
29
    /** @var int|null The status of the release being processed */
30
    private ?int $status;
31
32
    protected ColorCLI $colorCli;
33
34
    /**
35
     * Simple cache of looked up titles -> anidbid to reduce repeat queries within one run.
36
     *
37
     * @var array<string,int>
38
     */
39
    private array $titleCache = [];
40
41
    /**
42
     * Simple cache of looked up titles -> anilist_id to reduce repeat queries within one run.
43
     *
44
     * @var array<string,int>
45
     */
46
    private array $anilistIdCache = [];
0 ignored issues
show
introduced by
The private property $anilistIdCache is not used, and could be removed.
Loading history...
47
48
    /**
     * Build the processor: decide whether CLI output is enabled, create the
     * AniList populator and the CLI colour helper, and read the batch size.
     *
     * @param  bool  $echooutput  Request CLI output; only honoured when the
     *                            'nntmux.echocli' config value is truthy as well.
     *
     * @throws \Exception
     */
    public function __construct(bool $echooutput = true)
    {
        // config() is only consulted when the caller actually asked for output.
        $this->echooutput = $echooutput ? (bool) config('nntmux.echocli') : false;

        $this->palist = new PaList();
        $this->colorCli = new ColorCLI();

        // A missing or non-positive 'maxanidbprocessed' setting falls back to 100.
        $configuredLimit = (int) Settings::settingValue('maxanidbprocessed');
        if ($configuredLimit > 0) {
            $this->aniqty = $configuredLimit;
        } else {
            $this->aniqty = 100;
        }

        $this->status = null;
    }
61
62
    /**
     * Public entry point: runs the anime lookup unless the 'lookupanidb'
     * setting evaluates to 0.
     *
     * @param  string  $groupID  (Optional) ID of a group to restrict the work to.
     * @param  string  $guidChar  (Optional) Leading character(s) of the release GUID to restrict the work to.
     *
     * @throws \Exception
     */
    public function process(string $groupID = '', string $guidChar = ''): void
    {
        $lookupEnabled = (int) Settings::settingValue('lookupanidb') !== 0;

        if ($lookupEnabled) {
            $this->processAnimeReleases($groupID, $guidChar);
        }
    }
78
79
    /**
     * Fetch a batch of unprocessed anime releases and try to match each one.
     *
     * Selects releases in the TV_ANIME category whose `anidbid` is still NULL,
     * newest `postdate` first, capped at the configured batch size. Releases that
     * cannot be matched get the (negative) failure status written into `anidbid`
     * so they are not selected again by the NULL filter on the next run.
     *
     * @param  string  $groupID  (Optional) ID of a group to restrict the work to.
     * @param  string  $guidChar  (Optional) Prefix matched against `leftguid` with LIKE.
     *
     * @throws \Exception
     */
    public function processAnimeReleases(string $groupID = '', string $guidChar = ''): void
    {
        $query = Release::query()
            ->whereNull('anidbid')
            ->where('categories_id', Category::TV_ANIME);

        if ($guidChar !== '') {
            $query->where('leftguid', 'like', $guidChar.'%');
        }

        if ($groupID !== '') {
            $query->where('groups_id', $groupID);
        }

        $results = $query->orderByDesc('postdate')
            ->limit($this->aniqty)
            ->get();

        if ($results->count() === 0) {
            // Respect the echo flag like every other CLI message in this class.
            if ($this->echooutput) {
                $this->colorCli->info('No anidb releases to process.');
            }

            return;
        }

        // NOTE(review): the original author states AniList rate limiting is handled
        // inside PopulateAniList; that is not visible from this file.
        foreach ($results as $release) {
            if ($this->matchAnimeRelease($release) === false) {
                // matchAnimeRelease() stores the failure reason in $this->status
                // (PROC_EXTFAIL or PROC_NOMATCH) before returning false.
                Release::query()->where('id', $release->id)->update(['anidbid' => $this->status]);
            }
        }
    }
119
120
    /**
     * Derive the anime title from a release search name.
     *
     * The name is normalised (encoding artefacts, separators, leading group tags,
     * bracketed language tags) and then cut at the first episode/quality marker
     * found. Only the title is returned; no episode number is extracted.
     *
     * @return array{title?: string} ['title' => string] on success, [] when no
     *                               title is left (status is set to PROC_EXTFAIL).
     */
    private function extractTitleEpisode(string $cleanName = ''): array
    {
        // Repair encoding artefacts, then turn "_" and "." separators into single spaces.
        $s = $this->fixEncoding($cleanName);
        $s = str_replace(['_', '.'], ' ', $s);
        $s = trim((string) preg_replace('/\s+/', ' ', (string) $s));

        // Drop one or more leading "[Group]" tags.
        $s = trim((string) preg_replace('/^(?:\[[^\]]+\]\s*)+/', '', $s));

        // Blank out language/sub/dub tags in square brackets and in parentheses.
        $s = preg_replace('/\[(?:ENG|JAP|JPN|SUB|DUB|MULTI|RAW|HARDSUB|SOFTSUB|HARDDUB|SOFTDUB|ITA|SPA|FRE|GER|RUS|CHI|KOR)\]/i', ' ', $s);
        $s = preg_replace('/\((?:ENG|JAP|JPN|SUB|DUB|MULTI|RAW|HARDSUB|SOFTSUB|HARDDUB|SOFTDUB|ITA|SPA|FRE|GER|RUS|CHI|KOR)\)/i', ' ', $s);

        // Markers that end the title, tried in priority order; the first hit wins.
        $titleEndMarkers = [
            // 1) " S01E01" / " S1E1"
            '/\sS\d+E\d+/i',
            // 2) " 1x18" (season x episode)
            '/\s\d+x\d+/i',
            // 3) " - NNN"
            '/\s-\s*(\d{1,3})\b/',
            // 4) " E012", " Ep 12", " Episode 12"
            '/\sE(?:p(?:isode)?)?\s*0*(\d{1,3})\b/i',
            // 5) Movie / OVA / complete-series keywords
            '/\b(Movie|OVA|Complete Series|Complete|Full Series)\b/i',
            // 6) Bracketed source / resolution / codec token
            '/\[(?:BD|BDRip|BluRay|Blu-Ray|\d{3,4}[ipx]|HEVC|x264|x265|H264|H265)\]/i',
        ];

        // Without any marker the whole normalised string is the title.
        $title = $s;
        foreach ($titleEndMarkers as $marker) {
            if (preg_match($marker, $s, $m, PREG_OFFSET_CAPTURE)) {
                // PREG_OFFSET_CAPTURE yields byte offsets, which is what substr() expects.
                $title = substr($s, 0, (int) $m[0][1]);
                break;
            }
        }

        $title = $this->cleanTitle((string) $title);

        if ($title !== '') {
            return ['title' => $title];
        }

        $this->status = self::PROC_EXTFAIL;

        return [];
    }
186
187
    /**
     * Strip common mojibake artefacts and control characters from a string.
     *
     * This is a lossy clean-up, not a repair: artefact sequences are deleted
     * rather than converted back to the characters they once represented.
     *
     * Steps, in order:
     *  1. Drop byte sequences that are not valid UTF-8 (see the note in the body).
     *  2. Delete known artefact sequences built around "â", "Â" and "Ã".
     *  3. Delete ASCII control characters except tab, LF and CR.
     *  4. Normalise to Unicode NFC when the intl extension is available.
     *  5. Delete any remaining code points U+00C0..U+00C5.
     *
     * @param  string  $text  Raw text, possibly containing invalid UTF-8.
     * @return string The cleaned text; always valid UTF-8.
     */
    private function fixEncoding(string $text): string
    {
        // Every pattern below uses the /u modifier, and preg_replace() returns null
        // when its subject is not valid UTF-8. Under strict_types that null would
        // raise a TypeError on the next call, so invalid bytes are removed first.
        if (! mb_check_encoding($text, 'UTF-8')) {
            // 'none' makes mbstring drop invalid bytes instead of inserting "?".
            $previousSubstitute = mb_substitute_character();
            mb_substitute_character('none');
            $text = mb_scrub($text, 'UTF-8');
            mb_substitute_character($previousSubstitute);
        }

        $artifactPatterns = [
            // "â" + "Â" followed by any run of "_", whitespace or "Â"
            '/âÂ[_\sÂ]*/u',
            // "Ã" followed by one of a few typical second characters
            '/Ã[¢Â©€£]/u',
            // Stand-alone runs of "Â"
            '/Â+/u',
            // Any remaining "Ã" together with the non-space characters after it
            '/Ã[^\s]*/u',
            // Control characters except \t (09), \n (0A) and \r (0D)
            '/[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]/u',
        ];

        foreach ($artifactPatterns as $pattern) {
            // Keep the previous value if PCRE reports an error (null result).
            $text = preg_replace($pattern, '', $text) ?? $text;
        }

        // The former "double-encoding" conversion that lived here was unreachable:
        // it only ran when an "Ã" was still present, and the pattern list above has
        // already removed every "Ã". It has been dropped without changing results.

        // Normalise to NFC if ext-intl is loaded; normalizer_normalize() may return false.
        if (function_exists('normalizer_normalize')) {
            $normalized = normalizer_normalize($text, \Normalizer::FORM_C);
            if (is_string($normalized)) {
                $text = $normalized;
            }
        }

        // In /u mode \xC0-\xC5 are code points (À Á Â Ã Ä Å), not raw bytes, so this
        // also removes legitimate uses of those letters.
        // NOTE(review): kept as-is to preserve existing matching behaviour.
        return preg_replace('/[\xC0-\xC1\xC2-\xC5]/u', '', $text) ?? $text;
    }
227
228
    /**
     * Strip tags, language codes, episode markers and stray separators from a title.
     *
     * The substitutions run strictly in the order listed; several of them are
     * anchored to the end of the string, so the order matters.
     *
     * @param  string  $title  Candidate title cut from a release name.
     * @return string The cleaned, whitespace-normalised title (may be empty).
     */
    private function cleanTitle(string $title): string
    {
        // Remove encoding artefacts before any pattern matching.
        $title = $this->fixEncoding($title);

        // [pattern, replacement] pairs, applied top to bottom.
        $steps = [
            // Bracketed and parenthesised tags (language, quality, ...)
            ['/\[[^\]]+\]/', ' '],
            ['/\([^)]+\)/', ' '],
            // Stand-alone language / sub / dub codes
            ['/\b(ENG|JAP|JPN|SUB|DUB|MULTI|RAW|HARDSUB|SOFTSUB|HARDDUB|SOFTDUB|ITA|SPA|FRE|GER|RUS|CHI|KOR)\b/i', ' '],
            // Metadata words. "Mosaic-less" must precede "Mosaic": alternation is tried
            // left to right, so the shorter word would otherwise match and leave "-less".
            ['/\b(JAV|Uncensored|Censored|Mosaic-less|Mosaic|HD|SD|FHD|UHD)\b/i', ' '],
            // Six-digit date-like tokens (e.g. 091919)
            ['/\b\d{6}\b/', ' '],
            // Trailing numbers such as "_01", "-001" or " 12"
            ['/[-_]\s*\d{1,4}\s*$/i', ''],
            ['/\s+\d{1,4}\s*$/i', ''],
            // " - 1x18" and anything after it
            ['/\s*-\s*\d+x\d+.*$/i', ''],
            // " S01E01" and anything after it
            ['/\s+S\d+E\d+.*$/i', ''],
            // " - NNN" optionally followed by " - Episode Title", then a dangling dash
            ['/\s*-\s*\d{1,4}(?:\s*-\s*.*)?\s*$/i', ''],
            ['/\s*-\s*$/i', ''],
            // Trailing " E12", " Ep 12", " Episode 12"
            ['/\s+E(?:p(?:isode)?)?\s*0*\d{1,4}\s*$/i', ''],
            // Quality / resolution / source tags
            ['/\b(480p|720p|1080p|2160p|4K|BD|BDRip|BluRay|Blu-Ray|HEVC|x264|x265|H264|H265|WEB|WEBRip|DVDRip|TVRip)\b/i', ' '],
            // Common release tags
            ['/\b(PROPER|REPACK|RIP|ISO|CRACK|BETA|ALPHA|FINAL|COMPLETE|FULL)\b/i', ' '],
            // Trailing volume / chapter markers
            ['/\s+Vol\.?\s*\d*\s*$/i', ''],
            ['/\s+Ch\.?\s*\d*\s*$/i', ''],
            // Trailing dash or underscore
            ['/\s*[-_]\s*$/', ''],
            // Collapse whitespace runs
            ['/\s+/', ' '],
        ];

        foreach ($steps as [$pattern, $replacement]) {
            // preg_replace() returns null on a PCRE error; keep the previous value then.
            $title = preg_replace($pattern, $replacement, $title) ?? $title;
        }

        return trim($title);
    }
284
285
    /**
     * Resolve a title to an AnidbTitle row.
     *
     * Lookup order: per-run cache, case-insensitive exact match in the local
     * table, LIKE "%name%" match in the local table, then a remote AniList
     * search whose first hit is imported via PopulateAniList::populateTable().
     *
     * @param  string  $searchName  Title to look up. It is used verbatim inside the
     *                              LIKE pattern, so "%" and "_" act as wildcards;
     *                              matchAnimeRelease() relies on this for its retry.
     * @return AnidbTitle|null The matching row, or null when nothing was found or
     *                         the remote search threw.
     */
    private function getAnidbByName(string $searchName = ''): ?AnidbTitle
    {
        if ($searchName === '') {
            return null;
        }

        // Multibyte-aware lower-casing: strtolower() only folds ASCII, which would
        // never equal SQL LOWER(title) for titles with upper-case non-ASCII letters.
        $key = mb_strtolower($searchName, 'UTF-8');

        // Cache hit: re-read the row for the remembered id.
        if (isset($this->titleCache[$key])) {
            return AnidbTitle::query()->select(['anidbid', 'title'])->where('anidbid', $this->titleCache[$key])->first();
        }

        // Exact (case-insensitive) match in the local table.
        $exact = AnidbTitle::query()->whereRaw('LOWER(title) = ?', [$key])->select(['anidbid', 'title'])->first();
        if ($exact) {
            $this->titleCache[$key] = (int) $exact->anidbid;

            return $exact;
        }

        // Substring match in the local table.
        $partial = AnidbTitle::query()->where('title', 'like', '%'.$searchName.'%')->select(['anidbid', 'title'])->first();
        if ($partial) {
            $this->titleCache[$key] = (int) $partial->anidbid;

            return $partial;
        }

        // Nothing local: ask AniList for the single best hit.
        try {
            $searchResults = $this->palist->searchAnime($searchName, 1);
            $anilistId = empty($searchResults) ? null : ($searchResults[0]['id'] ?? null);

            if ($anilistId) {
                // Reuse the local id already linked to this AniList id, if any.
                $anidbid = AnidbInfo::query()->where('anilist_id', $anilistId)->value('anidbid');

                if (! $anidbid) {
                    // Unknown entry: the AniList id doubles as the local anidbid.
                    $anidbid = $anilistId;
                    $this->palist->populateTable('info', $anilistId);
                }

                // Keep the cache strictly array<string,int>, whatever the driver returned.
                $anidbid = (int) $anidbid;

                // Read back the English title stored for that id.
                $title = AnidbTitle::query()
                    ->where('anidbid', $anidbid)
                    ->where('lang', 'en')
                    ->value('title');

                if ($title) {
                    $this->titleCache[$key] = $anidbid;

                    return AnidbTitle::query()
                        ->where('anidbid', $anidbid)
                        ->where('title', $title)
                        ->first();
                }
            }
        } catch (\Exception $e) {
            // Best effort: a failed remote search is reported and treated as "not found".
            if ($this->echooutput) {
                $this->colorCli->error('AniList search failed: '.$e->getMessage());
            }
        }

        return null;
    }
359
360
    /**
     * Match one release to a local anime id and store the id on the release.
     *
     * Matching is by title only; no episode information is used. On failure
     * $this->status is set to PROC_EXTFAIL (no usable title) or PROC_NOMATCH
     * (title not found) and false is returned.
     *
     * @param  mixed  $release  Row from the releases query; `searchname` and `id` are read.
     * @return bool True when the release was matched and updated.
     *
     * @throws \Exception
     */
    private function matchAnimeRelease($release): bool
    {
        $cleanArr = $this->extractTitleEpisode((string) $release->searchname);
        if (empty($cleanArr)) {
            // extractTitleEpisode() has already set PROC_EXTFAIL.
            return false;
        }

        $title = $cleanArr['title'];

        if ($this->echooutput) {
            $this->colorCli->info('Looking Up: Title: '.$title);
        }

        $anidbTitle = $this->getAnidbByName($title);
        if (! $anidbTitle) {
            // Retry with whitespace turned into "%" so the LIKE lookup matches more loosely.
            $tmpName = (string) preg_replace('/\s+/', '%', $title);

            // A title without whitespace yields the very same string; repeating the
            // lookup would only duplicate the local queries and the remote search.
            if ($tmpName !== $title) {
                $anidbTitle = $this->getAnidbByName($tmpName);
            }
        }

        if (! $anidbTitle || ! is_numeric($anidbTitle->anidbid) || (int) $anidbTitle->anidbid <= 0) {
            $this->status = self::PROC_NOMATCH;

            return false;
        }

        $anidbId = (int) $anidbTitle->anidbid;
        $type = 'Local';

        // Refresh the info row from AniList when it is missing or stale.
        $info = AnidbInfo::query()->where('anidbid', $anidbId)->first();
        if (! $info || $this->shouldUpdateInfo($anidbId)) {
            // Prefer the AniList id already stored on the info row.
            $anilistId = $info->anilist_id ?? null;

            if (! $anilistId) {
                // No stored id: search AniList by title and take the first hit.
                try {
                    $searchResults = $this->palist->searchAnime($title, 1);
                    if (! empty($searchResults)) {
                        $anilistId = $searchResults[0]['id'] ?? null;
                    }
                } catch (\Exception $e) {
                    // Best effort: the release is still matched with the local data.
                    if ($this->echooutput) {
                        $this->colorCli->warning('AniList search failed: '.$e->getMessage());
                    }
                }
            }

            if ($anilistId) {
                $this->palist->populateTable('info', $anilistId);
                $type = 'Remote';
            }
        }

        $this->updateRelease($anidbId, (int) $release->id);

        if ($this->echooutput) {
            $this->colorCli->headerOver('Matched '.$type.' AniList ID: ');
            $this->colorCli->primary((string) $anidbId);
            $this->colorCli->alternateOver('   Title: ');
            $this->colorCli->primary($anidbTitle->title);
        }

        return true;
    }
434
435
    /**
     * Write the matched anime id into the `anidbid` column of one release.
     *
     * @param  int  $anidbId  Id to store.
     * @param  int  $relId  Primary key of the release row to update.
     */
    private function updateRelease(int $anidbId, int $relId): void
    {
        $target = Release::query()->where('id', $relId);

        $target->update(['anidbid' => $anidbId]);
    }
439
440
    /**
     * Tell whether the info row for an id is missing or older than one week.
     *
     * @param  int  $anidbId  Id whose info row is checked.
     * @return bool True when no row exists or its `updated` value is before now minus one week.
     */
    private function shouldUpdateInfo(int $anidbId): bool
    {
        $row = AnidbInfo::query()
            ->where('anidbid', $anidbId)
            ->first(['updated']);

        // A missing row short-circuits to true; otherwise compare against the one-week cutoff.
        return $row === null || $row->updated < now()->subWeek();
    }
452
}
453