|
1
|
|
|
<?php |
|
2
|
|
|
|
|
3
|
|
|
namespace App\Services\Categorization\Categorizers; |
|
4
|
|
|
|
|
5
|
|
|
use App\Models\Category; |
|
6
|
|
|
use App\Services\Categorization\CategorizationResult; |
|
7
|
|
|
use App\Services\Categorization\ReleaseContext; |
|
8
|
|
|
|
|
9
|
|
|
/**
 * Categorizer for miscellaneous content and hash detection.
 *
 * This runs FIRST with high priority to detect hashes early and prevent
 * them from being incorrectly categorized by group-based or content-based rules.
 *
 * The checks are pure string heuristics on the release name, tried in this
 * order: hex hashes, archive extensions, dataset/dump keywords, obfuscated names.
 * Results are built with matched()/noMatch(), which are not defined in this
 * class (presumably inherited from AbstractCategorizer).
 */
class MiscCategorizer extends AbstractCategorizer
{
    /**
     * Match reason => hex-digit quantifier, tried in declaration order.
     *
     * The fixed-length entries come first so that the more specific reason wins;
     * the generic range is the fallback for any other delimited hex run.
     */
    private const HASH_LENGTHS = [
        'hash_md5' => '32',
        'hash_sha1' => '40',
        'hash_sha256' => '64',
        'hash_generic' => '32,128',
    ];

    /** Keywords that mark a name as media, vetoing the dataset/dump rules. */
    private const MEDIA_KEYWORDS = '/\b(movie|tv|show|audio|video|book|game)\b/i';

    /**
     * A 19xx/20xx year that is not part of a longer digit run.
     *
     * Digit lookarounds are used instead of \b on purpose: the strings tested
     * with this pattern are purely alphanumeric, where \b can only match at the
     * very start or end, so a \b-delimited year could never be found inside them.
     */
    private const YEAR_PATTERN = '/(?<!\d)(?:19|20)\d{2}(?!\d)/';

    protected int $priority = 1; // Highest priority - run first to catch hashes

    public function getName(): string
    {
        return 'Misc';
    }

    /**
     * Run every check in order and return the first match, or noMatch().
     *
     * @param ReleaseContext $context Only its releaseName property is read here.
     */
    public function categorize(ReleaseContext $context): CategorizationResult
    {
        $name = $context->releaseName;

        // Each check returns null when it does not apply, so ?? falls through
        // lazily to the next one; later checks are not evaluated after a match.
        return $this->checkHash($name)
            ?? $this->checkArchive($name)
            ?? $this->checkDataset($name)
            ?? $this->checkObfuscated($name)
            ?? $this->noMatch();
    }

    /**
     * Detect a hex hash (MD5, SHA-1, SHA-256 or any 32-128 hex run) in the name.
     *
     * The hash must be delimited: preceded by start-of-string, a quote,
     * whitespace, bracket, slash or dash, and followed by one of those, a dot,
     * or end-of-string.
     *
     * @return CategorizationResult|null OTHER_HASHED at 0.95 confidence, or null.
     */
    protected function checkHash(string $name): ?CategorizationResult
    {
        foreach (self::HASH_LENGTHS as $reason => $quantifier) {
            $pattern = '/(?:^|["\'\s\[\]\/\-])([a-f0-9]{' . $quantifier . '})(?:["\'\s\[\]\/\-\.]|$)/i';

            if (preg_match($pattern, $name)) {
                return $this->matched(Category::OTHER_HASHED, 0.95, $reason);
            }
        }

        return null;
    }

    /**
     * Detect names that end in a known archive / disk-image extension.
     *
     * @return CategorizationResult|null OTHER_MISC at 0.5 confidence, or null.
     */
    protected function checkArchive(string $name): ?CategorizationResult
    {
        if (preg_match('/\.(zip|rar|7z|tar|gz|bz2|xz|tgz|tbz2|cab|iso|img|dmg|pkg|archive)$/i', $name)) {
            return $this->matched(Category::OTHER_MISC, 0.5, 'archive');
        }

        return null;
    }

    /**
     * Detect dataset / data-dump style names that carry no media keyword.
     *
     * @return CategorizationResult|null OTHER_MISC at 0.6 confidence, or null.
     */
    protected function checkDataset(string $name): ?CategorizationResult
    {
        // Both rules below require the absence of media keywords, so test once.
        if (preg_match(self::MEDIA_KEYWORDS, $name)) {
            return null;
        }

        // Dataset/dump patterns that aren't media
        if (preg_match('/\b(sql|csv|dump|backup|dataset|collection)\b/i', $name)) {
            return $this->matched(Category::OTHER_MISC, 0.6, 'dataset');
        }

        // Data leaks/dumps (be careful with these): needs one word from each list
        if (preg_match('/\b(leak|breach|data|database)\b/i', $name)
            && preg_match('/\b(dump|export|backup)\b/i', $name)) {
            return $this->matched(Category::OTHER_MISC, 0.6, 'data_dump');
        }

        return null;
    }

    /**
     * Detect obfuscated / random-looking release names.
     *
     * @return CategorizationResult|null OTHER_HASHED or OTHER_MISC depending on
     *                                   the rule that fired, or null.
     */
    protected function checkObfuscated(string $name): ?CategorizationResult
    {
        // Release names consisting only of uppercase letters and numbers
        if (preg_match('/^[A-Z0-9]{15,}$/', $name)) {
            return $this->matched(Category::OTHER_HASHED, 0.7, 'obfuscated_uppercase');
        }

        // Mixed-case alphanumeric strings without separators (common obfuscation pattern)
        // These look like random strings: e.g., "AA7Jl2toE8Q53yNZmQ5R6G"
        if (preg_match('/^[a-zA-Z0-9]{15,}$/', $name)
            && !preg_match(self::YEAR_PATTERN, $name) // An embedded year suggests a real title
            && !preg_match('/^[A-Z][a-z]+([A-Z][a-z]+)+$/', $name)) { // Exclude CamelCase words
            return $this->matched(Category::OTHER_HASHED, 0.7, 'obfuscated_mixed_alphanumeric');
        }

        // Obfuscated filename embedded in usenet subject line format
        // Matches patterns like: [XX/XX] - "RANDOMSTRING.partXX.rar" or "RANDOMSTRING.7z.001"
        // The filename inside quotes is random alphanumeric with no meaningful words
        $usenetPattern = '/\[\d+\/\d+\]\s*-\s*"([a-zA-Z0-9]{12,})\.(part\d+\.rar|7z\.\d{3}|rar|zip|vol\d+\+\d+\.par2|par2)"/i';
        if (preg_match($usenetPattern, $name, $matches)) {
            // The capture group is alphanumeric-only, so it cannot contain separators;
            // it only remains to rule out an embedded year and to confirm that the
            // string looks random rather than like a real title.
            $filename = $matches[1];

            if (!preg_match(self::YEAR_PATTERN, $filename)
                && $this->looksLikeRandomString($filename)) {
                return $this->matched(Category::OTHER_HASHED, 0.85, 'obfuscated_usenet_filename');
            }
        }

        // No lowercase letters anywhere, and at least one run of 5+ characters drawn
        // from uppercase letters, digits, '.', '_' and '-'; names ending in a
        // well-known media/document extension are left for other categorizers.
        if (preg_match('/^[^a-zA-Z]*[A-Z0-9\._\-]{5,}[^a-zA-Z]*$/', $name)
            && !preg_match('/\.(mkv|avi|mp4|mp3|flac|pdf|epub|exe|iso)$/i', $name)) {
            return $this->matched(Category::OTHER_MISC, 0.5, 'obfuscated_pattern');
        }

        return null;
    }

    /**
     * Check if a string looks like a random/obfuscated string rather than a real title.
     * Uses multiple heuristics to detect randomly generated filenames.
     *
     * @param string $str Candidate filename stem (the only caller in this class
     *                    passes a 12+ character alphanumeric string).
     *
     * @return bool True when any of the randomness heuristics fires.
     */
    protected function looksLikeRandomString(string $str): bool
    {
        // Single-case, letters-only strings are judged on length alone.
        if (preg_match('/^[A-Z]+$/', $str) || preg_match('/^[a-z]+$/', $str)) {
            return strlen($str) >= 12;
        }

        // Count character type transitions (upper to lower, letter to digit, etc.)
        // Random strings have more irregular transitions
        $transitions = 0;
        $len = strlen($str);
        for ($i = 1; $i < $len; $i++) {
            $prevIsUpper = ctype_upper($str[$i - 1]);
            $currIsUpper = ctype_upper($str[$i]);
            $prevIsDigit = ctype_digit($str[$i - 1]);
            $currIsDigit = ctype_digit($str[$i]);

            // A case flip only counts between two non-digits; entering or
            // leaving a digit run always counts.
            $caseFlip = $prevIsUpper !== $currIsUpper && !$prevIsDigit && !$currIsDigit;
            $digitFlip = $prevIsDigit !== $currIsDigit;

            if ($caseFlip || $digitFlip) {
                $transitions++;
            }
        }

        // max(1, ...) guards the division for strings shorter than two characters.
        $transitionRatio = $transitions / max(1, $len - 1);

        // Check for common English word patterns (consonant-vowel patterns)
        $hasWordPattern = preg_match(
            '/[bcdfghjklmnpqrstvwxyz]{1,2}[aeiou][bcdfghjklmnpqrstvwxyz]{1,2}[aeiou]/i',
            $str
        );

        // If transition ratio is high and no word patterns, likely random
        if ($transitionRatio > 0.3 && !$hasWordPattern) {
            return true;
        }

        // Check for sequences of consonants that are unlikely in real words
        if (preg_match('/[bcdfghjklmnpqrstvwxyz]{5,}/i', $str)) {
            return true;
        }

        // If mixed case and digits with no clear structure
        if (preg_match('/[A-Z]/', $str) && preg_match('/[a-z]/', $str) && preg_match('/\d/', $str)) {
            return true;
        }

        return false;
    }
}
|
196
|
|
|
|