Passed
Push — master ( 402528...b9a071 )
by Darko
11:56
created

MiscCategorizer::looksLikeRandomString()   C

Complexity

Conditions 14
Paths 13

Size

Total Lines 50
Code Lines 21

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
eloc 21
c 1
b 0
f 0
dl 0
loc 50
rs 6.2666
cc 14
nc 13
nop 1

How to fix   Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method: move a cohesive part of the long method into its own well-named method.

1
<?php
2
3
namespace App\Services\Categorization\Categorizers;
4
5
use App\Models\Category;
6
use App\Services\Categorization\CategorizationResult;
7
use App\Services\Categorization\ReleaseContext;
8
9
/**
 * Categorizer for miscellaneous content and hash detection.
 *
 * Its priority is set to 1 so that it runs before the other categorizers and
 * catches hash-like / obfuscated release names before group-based or
 * content-based rules can mis-categorize them. (How the priority value is
 * ordered is decided by the surrounding framework, not by this class.)
 */
class MiscCategorizer extends AbstractCategorizer
{
    /**
     * Match label => regex quantifier for the hex run accepted by checkHash().
     * Entries are tried in order, so the exact-length digests win over the
     * generic 32-128 character fallback.
     */
    private const HASH_QUANTIFIERS = [
        'hash_md5' => '32',
        'hash_sha1' => '40',
        'hash_sha256' => '64',
        'hash_generic' => '32,128',
    ];

    /** Words that mark a name as regular media, disqualifying the dataset rules. */
    private const MEDIA_KEYWORDS = '/\b(movie|tv|show|audio|video|book|game)\b/i';

    /**
     * A 19xx/20xx year that is not part of a longer digit run.
     *
     * Digit look-arounds are used instead of \b on purpose: the strings tested
     * with this pattern contain only letters and digits, so \b could only match
     * at the very start/end and a year embedded in the name was never detected.
     */
    private const EMBEDDED_YEAR = '/(?<!\d)(?:19|20)\d{2}(?!\d)/';

    protected int $priority = 1; // Highest priority - run first to catch hashes

    /**
     * Name of this categorizer.
     */
    public function getName(): string
    {
        return 'Misc';
    }

    /**
     * Run the checks in a fixed order (hash, archive, dataset, obfuscated) and
     * return the first match, or the "no match" result when none applies.
     */
    public function categorize(ReleaseContext $context): CategorizationResult
    {
        $name = $context->releaseName;

        // Each helper returns a result object or null, so ?? short-circuits on
        // the first match exactly like the sequential if/return chain would.
        return $this->checkHash($name)
            ?? $this->checkArchive($name)
            ?? $this->checkDataset($name)
            ?? $this->checkObfuscated($name)
            ?? $this->noMatch();
    }

    /**
     * Detect a hex digest (MD5, SHA-1, SHA-256 or any 32-128 character hex run)
     * that is delimited by the start/end of the name, quotes, whitespace,
     * brackets, slashes or dashes (a trailing dot is also accepted).
     *
     * @return CategorizationResult|null OTHER_HASHED match, or null when no digest is found
     */
    protected function checkHash(string $name): ?CategorizationResult
    {
        foreach (self::HASH_QUANTIFIERS as $label => $quantifier) {
            $pattern = '/(?:^|["\'\s\[\]\/\-])([a-f0-9]{' . $quantifier . '})(?:["\'\s\[\]\/\-\.]|$)/i';

            if (preg_match($pattern, $name)) {
                return $this->matched(Category::OTHER_HASHED, 0.95, $label);
            }
        }

        return null;
    }

    /**
     * Detect names that end in a known archive / disk-image extension.
     *
     * @return CategorizationResult|null OTHER_MISC match, or null
     */
    protected function checkArchive(string $name): ?CategorizationResult
    {
        if (preg_match('/\.(zip|rar|7z|tar|gz|bz2|xz|tgz|tbz2|cab|iso|img|dmg|pkg|archive)$/i', $name)) {
            return $this->matched(Category::OTHER_MISC, 0.5, 'archive');
        }

        return null;
    }

    /**
     * Detect dataset / data-dump style names that do not mention a media keyword.
     *
     * @return CategorizationResult|null OTHER_MISC match, or null
     */
    protected function checkDataset(string $name): ?CategorizationResult
    {
        // Both rules below exclude names that mention media, so test that once.
        if (preg_match(self::MEDIA_KEYWORDS, $name)) {
            return null;
        }

        // Dataset/dump patterns that aren't media
        if (preg_match('/\b(sql|csv|dump|backup|dataset|collection)\b/i', $name)) {
            return $this->matched(Category::OTHER_MISC, 0.6, 'dataset');
        }

        // Data leaks/dumps: needs a "data" word AND an "export" style word
        if (preg_match('/\b(leak|breach|data|database)\b/i', $name) &&
            preg_match('/\b(dump|export|backup)\b/i', $name)) {
            return $this->matched(Category::OTHER_MISC, 0.6, 'data_dump');
        }

        return null;
    }

    /**
     * Detect obfuscated release names: separator-free alphanumeric blobs,
     * random filenames inside a usenet subject line, or names without any
     * lowercase letters.
     *
     * @return CategorizationResult|null OTHER_HASHED / OTHER_MISC match, or null
     */
    protected function checkObfuscated(string $name): ?CategorizationResult
    {
        // Release names consisting only of uppercase letters and numbers
        if (preg_match('/^[A-Z0-9]{15,}$/', $name)) {
            return $this->matched(Category::OTHER_HASHED, 0.7, 'obfuscated_uppercase');
        }

        // Mixed-case alphanumeric strings without separators (common obfuscation pattern)
        // These look like random strings: e.g., "AA7Jl2toE8Q53yNZmQ5R6G"
        if (preg_match('/^[a-zA-Z0-9]{15,}$/', $name) &&
            !preg_match(self::EMBEDDED_YEAR, $name) && // A year suggests a real title
            !preg_match('/^[A-Z][a-z]+([A-Z][a-z]+)+$/', $name)) { // Exclude CamelCase words
            return $this->matched(Category::OTHER_HASHED, 0.7, 'obfuscated_mixed_alphanumeric');
        }

        // Obfuscated filename embedded in usenet subject line format
        // Matches patterns like: [XX/XX] - "RANDOMSTRING.partXX.rar" or "RANDOMSTRING.7z.001"
        if (preg_match('/\[\d+\/\d+\]\s*-\s*"([a-zA-Z0-9]{12,})\.(part\d+\.rar|7z\.\d{3}|rar|zip|vol\d+\+\d+\.par2|par2)"/i', $name, $matches)) {
            $filename = $matches[1];

            // The capture group only admits letters and digits, so no separator
            // check is needed here; a year or word-like structure still marks
            // the filename as a real title rather than a random one.
            if (!preg_match(self::EMBEDDED_YEAR, $filename) &&
                $this->looksLikeRandomString($filename)) {
                return $this->matched(Category::OTHER_HASHED, 0.85, 'obfuscated_usenet_filename');
            }
        }

        // No lowercase letters at all: a run of 5+ uppercase/digit/dot/underscore/dash
        // characters, optionally surrounded by non-letters, and not a known file type
        if (preg_match('/^[^a-zA-Z]*[A-Z0-9\._\-]{5,}[^a-zA-Z]*$/', $name) &&
            !preg_match('/\.(mkv|avi|mp4|mp3|flac|pdf|epub|exe|iso)$/i', $name)) {
            return $this->matched(Category::OTHER_MISC, 0.5, 'obfuscated_pattern');
        }

        return null;
    }

    /**
     * Check if a string looks like a random/obfuscated string rather than a real title.
     *
     * Heuristics, in order:
     *  - letters of a single case only: random when at least 12 characters long;
     *  - more than 30% of adjacent character pairs change character class and
     *    no consonant-vowel word pattern is present;
     *  - a run of five or more consonants;
     *  - uppercase, lowercase and digits all present.
     */
    protected function looksLikeRandomString(string $str): bool
    {
        // If it's all uppercase or all lowercase with no pattern, likely random
        if (preg_match('/^[A-Z]+$/', $str) || preg_match('/^[a-z]+$/', $str)) {
            return strlen($str) >= 12;
        }

        // max(1, ...) keeps the division safe for empty and one-character input
        $transitionRatio = $this->countCharClassTransitions($str) / max(1, strlen($str) - 1);

        // Consonant-vowel-consonant-vowel runs are typical of real words
        $hasWordPattern = preg_match('/[bcdfghjklmnpqrstvwxyz]{1,2}[aeiou][bcdfghjklmnpqrstvwxyz]{1,2}[aeiou]/i', $str) === 1;

        // If transition ratio is high and no word patterns, likely random
        if ($transitionRatio > 0.3 && !$hasWordPattern) {
            return true;
        }

        // Check for sequences of consonants that are unlikely in real words
        if (preg_match('/[bcdfghjklmnpqrstvwxyz]{5,}/i', $str)) {
            return true;
        }

        // If mixed case and digits with no clear structure
        return preg_match('/[A-Z]/', $str) && preg_match('/[a-z]/', $str) && preg_match('/\d/', $str);
    }

    /**
     * Count adjacent character pairs whose character class differs, where the
     * classes are: digit, uppercase letter, anything else.
     */
    private function countCharClassTransitions(string $str): int
    {
        $transitions = 0;
        $previousClass = null;
        $length = strlen($str);

        for ($i = 0; $i < $length; $i++) {
            $char = $str[$i];

            if (ctype_digit($char)) {
                $class = 'digit';
            } elseif (ctype_upper($char)) {
                $class = 'upper';
            } else {
                $class = 'other';
            }

            if ($previousClass !== null && $class !== $previousClass) {
                $transitions++;
            }

            $previousClass = $class;
        }

        return $transitions;
    }
}
196