Passed
Push — master ( 7f6d76...ae3f18 )
by Darko
13:15
created

MiscCategorizer   A

Complexity

Total Complexity 26

Size/Duplication

Total Lines 110
Duplicated Lines 0 %

Importance

Changes 2
Bugs 0 Features 0
Metric Value
wmc 26
eloc 43
c 2
b 0
f 0
dl 0
loc 110
rs 10

6 Methods

Rating   Name   Duplication   Size   Complexity  
A checkHash() 0 23 5
A categorize() 0 25 5
A checkDataset() 0 16 6
A getName() 0 3 1
A checkArchive() 0 7 2
B checkObfuscated() 0 22 7
1
<?php
2
3
namespace App\Services\Categorization\Categorizers;
4
5
use App\Models\Category;
6
use App\Services\Categorization\CategorizationResult;
7
use App\Services\Categorization\ReleaseContext;
8
9
/**
10
 * Categorizer for miscellaneous content and hash detection.
11
 * This runs last as a fallback.
12
 */
13
class MiscCategorizer extends AbstractCategorizer
{
    protected int $priority = 100; // Lowest priority - run last

    /**
     * Name under which this categorizer identifies itself.
     */
    public function getName(): string
    {
        return 'Misc';
    }

    /**
     * Classify a release by its name alone.
     *
     * The checks run in a fixed order (hash, archive, dataset, obfuscated) and
     * the first one that yields a result wins; if none does, noMatch() is returned.
     *
     * @param ReleaseContext $context Only its releaseName property is read here.
     */
    public function categorize(ReleaseContext $context): CategorizationResult
    {
        $name = $context->releaseName;

        // Each check returns null on a miss, so `??` short-circuits on the first hit.
        return $this->checkHash($name)
            ?? $this->checkArchive($name)
            ?? $this->checkDataset($name)
            ?? $this->checkObfuscated($name)
            ?? $this->noMatch();
    }

    /**
     * Detect a standalone run of hexadecimal characters that looks like a hash.
     *
     * @param string $name Release name to inspect.
     *
     * @return CategorizationResult|null An OTHER_HASHED match, or null when no hex run of 32-128 characters is found.
     */
    protected function checkHash(string $name): ?CategorizationResult
    {
        // Ordered most specific first: [pattern, second matched() argument, third matched() argument].
        // Every pattern is delimited by \b, so a 40-character run cannot satisfy the 32-character rule.
        $signatures = [
            ['/\b[a-f0-9]{32}\b/i', 0.8, 'hash_md5'],        // MD5 (32 hex characters)
            ['/\b[a-f0-9]{40}\b/i', 0.85, 'hash_sha1'],      // SHA-1 (40 hex characters)
            ['/\b[a-f0-9]{64}\b/i', 0.9, 'hash_sha256'],     // SHA-256 (64 hex characters)
            ['/\b[a-f0-9]{32,128}\b/i', 0.75, 'hash_generic'], // Any other long hex run
        ];

        foreach ($signatures as [$pattern, $score, $label]) {
            if (preg_match($pattern, $name)) {
                return $this->matched(Category::OTHER_HASHED, $score, $label);
            }
        }

        return null;
    }

    /**
     * Detect names that end in a known archive or disk-image extension.
     *
     * @param string $name Release name to inspect.
     *
     * @return CategorizationResult|null An OTHER_MISC match, or null when the name has no such suffix.
     */
    protected function checkArchive(string $name): ?CategorizationResult
    {
        $archiveSuffix = '/\.(zip|rar|7z|tar|gz|bz2|xz|tgz|tbz2|cab|iso|img|dmg|pkg|archive)$/i';

        if (!preg_match($archiveSuffix, $name)) {
            return null;
        }

        return $this->matched(Category::OTHER_MISC, 0.5, 'archive');
    }

    /**
     * Detect dataset / dump style names that do not mention a media keyword.
     *
     * @param string $name Release name to inspect.
     *
     * @return CategorizationResult|null An OTHER_MISC match, or null when nothing matches
     *                                   or the name contains a media keyword.
     */
    protected function checkDataset(string $name): ?CategorizationResult
    {
        // Both rules below require the absence of media keywords, so test that once up front.
        if (preg_match('/\b(movie|tv|show|audio|video|book|game)\b/i', $name)) {
            return null;
        }

        // Dataset/dump patterns that aren't media
        if (preg_match('/\b(sql|csv|dump|backup|dataset|collection)\b/i', $name)) {
            return $this->matched(Category::OTHER_MISC, 0.6, 'dataset');
        }

        // Data leaks/dumps: needs a subject word AND an action word.
        if (preg_match('/\b(leak|breach|data|database)\b/i', $name) &&
            preg_match('/\b(dump|export|backup)\b/i', $name)) {
            return $this->matched(Category::OTHER_MISC, 0.6, 'data_dump');
        }

        return null;
    }

    /**
     * Detect names that look obfuscated or randomly generated.
     *
     * @param string $name Release name to inspect.
     *
     * @return CategorizationResult|null An OTHER_HASHED or OTHER_MISC match, or null when the name looks ordinary.
     */
    protected function checkObfuscated(string $name): ?CategorizationResult
    {
        // Release names consisting only of uppercase letters and numbers
        if (preg_match('/^[A-Z0-9]{15,}$/', $name)) {
            return $this->matched(Category::OTHER_HASHED, 0.7, 'obfuscated_uppercase');
        }

        // Mixed-case alphanumeric strings without separators (common obfuscation pattern)
        // These look like random strings: e.g., "AA7Jl2toE8Q53yNZmQ5R6G"
        //
        // The year test uses digit lookarounds rather than \b: the name is already known to be
        // one unbroken alphanumeric run, so \b could only match at its two ends and a
        // \b-delimited 4-digit year could never be found inside it.
        if (preg_match('/^[a-zA-Z0-9]{15,}$/', $name) &&
            !preg_match('/(?<!\d)(?:19|20)\d{2}(?!\d)/', $name) &&
            !preg_match('/^[A-Z][a-z]+([A-Z][a-z]+)+$/', $name)) { // Exclude CamelCase words
            return $this->matched(Category::OTHER_HASHED, 0.7, 'obfuscated_mixed_alphanumeric');
        }

        // Names without any lowercase letter: a run of 5+ uppercase letters, digits, dots,
        // underscores or hyphens, optionally surrounded by non-letter characters, and not
        // ending in a well-known file extension.
        // NOTE(review): this also accepts ordinary all-uppercase dotted names - confirm that is intended.
        if (preg_match('/^[^a-zA-Z]*[A-Z0-9\._\-]{5,}[^a-zA-Z]*$/', $name) &&
            !preg_match('/\.(mkv|avi|mp4|mp3|flac|pdf|epub|exe|iso)$/i', $name)) {
            return $this->matched(Category::OTHER_MISC, 0.5, 'obfuscated_pattern');
        }

        return null;
    }
}
125
126