<?php

namespace detox\dataset;

/**
 * English word and phrase lists exposed through SetContract.
 *
 * Both lists are maps from a numeric-string key to a list of lowercase
 * terms. The keys read like per-term weights (the inline note below says
 * entries at or under 0.1 are close to noise), but how they are applied is
 * decided by whatever consumes this set, not by this class.
 *
 * The keys must stay quoted strings: PHP truncates float array keys to
 * integers, so unquoted 0.9 and 0.8 would both collapse onto key 0.
 */
class EnglishSet implements SetContract
{
    /**
     * Single words, grouped by numeric-string key.
     *
     * @var array<string, string[]>
     */
    private $words = [
        '1.0' => [
            // simple bad words
            'fuck',
            'slut',
            'dick',
            'faggot',
            'bitch',
            'douchebag',
            'dickhead',
            'jerk',
            'cunt',
            'shit',
            'piss',
            'crap',
            'cock',
            'twat',
            'arse',
            'arsehole',
            'tosser',
            'wanker',
            'bastard',
            'honkey',
            'nigger',
            'flikker',
            'scumbag',
            // porn industry
            'bukakke', // legacy misspelling, kept so earlier matches still hit
            'bukkake',
            'dildo',
            'strapon',
            'shag',
            'sex',
            'blowjob',
            'bdsm',
            'bbd',
            'milf',
            'anal',
            'vagina',
        ],
        '0.9' => ['ugly', 'stupid', 'dumb', 'boobs', 'pish', 'fanny', 'slag', 'squirt', 'torture', 'ass', 'nitwit', 'whiffet'],
        '0.8' => ['silly', 'pussy', 'sick', 'git', 'poop', 'slaughter', 'sperm'],
        // NOTE(review): 'nonentity' is listed here and again under '0.5';
        // which key applies depends on the consumer - confirm and drop one.
        '0.7' => ['shallow', 'tit', 'tits', 'foolish', 'nonce', 'bugger', 'naught', 'prick', 'schmuck', 'nonentity', 'idiot'],
        '0.6' => [
            'rednack', // legacy misspelling, kept so earlier matches still hit
            'redneck',
            'mindless',
            'fat',
            'nude',
            'wft', // legacy misspelling, kept so earlier matches still hit
            'wtf',
            'snot',
            'bloodbath',
            'massacre',
            'massacrer',
        ],
        '0.5' => ['bully', 'sneaky', 'greedy', 'creep', 'kill', 'revenge', 'catfight', 'die', 'death', 'nought', 'nonentity'],
        '0.4' => ['superficial', 'numb', 'clown', 'villager', 'flatter', 'murder', 'nothingness'],
        '0.3' => ['fake', 'strange', 'ignorant', 'critical', 'nuts', 'cum', 'genitals', 'retaliation', 'freak', 'kick'],
        '0.2' => ['useless', 'thoughtless', 'crazy', 'bollocks', 'bit', 'hit', 'exterminate', 'gangster'],
        // <= 0.1 is almost noise for detox
        '0.1' => ['punch', 'insect', 'annihilate', 'steal', 'kidnap'],
        '0.06' => ['dude', 'pal', 'yo'],
    ];

    /**
     * Multi-word phrases, grouped by numeric-string key in the same way as
     * the single-word list.
     *
     * @var array<string, string[]>
     */
    private $phrases = [
        '1.0' => ['dirty sanchez', 'gang bang', 'piss off', 'blow job', 'kick ass'],
        '0.9' => ['swinger party', 'bloody hell', 'bugger off', 'black on white', 'double penetration'],
        '0.8' => [
            'get staffed', // legacy misspelling, kept so earlier matches still hit
            'get stuffed',
            'get lost',
        ],
        '0.7' => ['screw you', 'screw u', 'get off'],
        '0.3' => ['white supremacy', 'black supremacy', 'ku klux klan'],
        '0.2' => [
            'black people',
            'white people',
            'asian people',
            'indian people',
            'spanish people',
            'mexican people',
            'black ppl',
            'white ppl',
            'asian ppl',
            'indian ppl',
            'spanish ppl',
            'mexican ppl',
        ],
    ];

    /**
     * Returns the single-word list unchanged.
     *
     * @return array<string, string[]> numeric-string key => words
     */
    public function getWords(): array
    {
        return $this->words;
    }

    /**
     * Returns the phrase list unchanged.
     *
     * @return array<string, string[]> numeric-string key => phrases
     */
    public function getPhrases(): array
    {
        return $this->phrases;
    }
}