|
1
|
|
|
<?php |
|
2
|
|
|
|
|
3
|
|
|
namespace App\Console\Commands; |
|
4
|
|
|
|
|
5
|
|
|
use App\Models\Student; |
|
6
|
|
|
use Illuminate\Console\Command; |
|
7
|
|
|
use Illuminate\Support\Facades\Http; |
|
8
|
|
|
|
|
9
|
|
|
/**
 * Artisan command that guesses a gender for every student whose `gender_id`
 * is still NULL, by sending the first word of their first name to the
 * genderize.io API and storing the answer when the API is confident enough.
 */
class genderizeStudents extends Command
{
    protected $signature = 'academico:genderize-students';

    protected $description = 'Guess gender for existing students, based on their firstname';

    /** Number of students (and therefore names) sent to the API per request. */
    private const CHUNK_SIZE = 10;

    /** A prediction is only used when its probability is strictly above this value. */
    private const MIN_PROBABILITY = 0.9;

    /** A prediction is only used when its sample count is strictly above this value. */
    private const MIN_COUNT = 10;

    /** `gender_id` values written for each API answer. */
    private const GENDER_IDS = [
        'female' => 1,
        'male' => 2,
    ];

    /**
     * Accented character => ASCII replacement table.
     *
     * WordPress implementation, from
     * https://github.com/WordPress/WordPress/blob/a2693fd8602e3263b5925b9d799ddd577202167d/wp-includes/formatting.php#L1528
     */
    private const ACCENT_MAP = [
        // Decompositions for Latin-1 Supplement
        'ª' => 'a', 'º' => 'o',
        'À' => 'A', 'Á' => 'A',
        'Â' => 'A', 'Ã' => 'A',
        'Ä' => 'A', 'Å' => 'A',
        'Æ' => 'AE', 'Ç' => 'C',
        'È' => 'E', 'É' => 'E',
        'Ê' => 'E', 'Ë' => 'E',
        'Ì' => 'I', 'Í' => 'I',
        'Î' => 'I', 'Ï' => 'I',
        'Ð' => 'D', 'Ñ' => 'N',
        'Ò' => 'O', 'Ó' => 'O',
        'Ô' => 'O', 'Õ' => 'O',
        'Ö' => 'O', 'Ù' => 'U',
        'Ú' => 'U', 'Û' => 'U',
        'Ü' => 'U', 'Ý' => 'Y',
        'Þ' => 'TH', 'ß' => 's',
        'à' => 'a', 'á' => 'a',
        'â' => 'a', 'ã' => 'a',
        'ä' => 'a', 'å' => 'a',
        'æ' => 'ae', 'ç' => 'c',
        'è' => 'e', 'é' => 'e',
        'ê' => 'e', 'ë' => 'e',
        'ì' => 'i', 'í' => 'i',
        'î' => 'i', 'ï' => 'i',
        'ð' => 'd', 'ñ' => 'n',
        'ò' => 'o', 'ó' => 'o',
        'ô' => 'o', 'õ' => 'o',
        'ö' => 'o', 'ø' => 'o',
        'ù' => 'u', 'ú' => 'u',
        'û' => 'u', 'ü' => 'u',
        'ý' => 'y', 'þ' => 'th',
        'ÿ' => 'y', 'Ø' => 'O',
        // Decompositions for Latin Extended-A
        'Ā' => 'A', 'ā' => 'a',
        'Ă' => 'A', 'ă' => 'a',
        'Ą' => 'A', 'ą' => 'a',
        'Ć' => 'C', 'ć' => 'c',
        'Ĉ' => 'C', 'ĉ' => 'c',
        'Ċ' => 'C', 'ċ' => 'c',
        'Č' => 'C', 'č' => 'c',
        'Ď' => 'D', 'ď' => 'd',
        'Đ' => 'D', 'đ' => 'd',
        'Ē' => 'E', 'ē' => 'e',
        'Ĕ' => 'E', 'ĕ' => 'e',
        'Ė' => 'E', 'ė' => 'e',
        'Ę' => 'E', 'ę' => 'e',
        'Ě' => 'E', 'ě' => 'e',
        'Ĝ' => 'G', 'ĝ' => 'g',
        'Ğ' => 'G', 'ğ' => 'g',
        'Ġ' => 'G', 'ġ' => 'g',
        'Ģ' => 'G', 'ģ' => 'g',
        'Ĥ' => 'H', 'ĥ' => 'h',
        'Ħ' => 'H', 'ħ' => 'h',
        'Ĩ' => 'I', 'ĩ' => 'i',
        'Ī' => 'I', 'ī' => 'i',
        'Ĭ' => 'I', 'ĭ' => 'i',
        'Į' => 'I', 'į' => 'i',
        'İ' => 'I', 'ı' => 'i',
        // IJ ligatures (U+0132 / U+0133). Written as escapes so the keys
        // cannot be silently flattened to the plain ASCII letters "IJ"/"ij",
        // which would turn these entries into no-ops.
        "\u{0132}" => 'IJ', "\u{0133}" => 'ij',
        'Ĵ' => 'J', 'ĵ' => 'j',
        'Ķ' => 'K', 'ķ' => 'k',
        'ĸ' => 'k', 'Ĺ' => 'L',
        'ĺ' => 'l', 'Ļ' => 'L',
        'ļ' => 'l', 'Ľ' => 'L',
        'ľ' => 'l', 'Ŀ' => 'L',
        'ŀ' => 'l', 'Ł' => 'L',
        'ł' => 'l', 'Ń' => 'N',
        'ń' => 'n', 'Ņ' => 'N',
        'ņ' => 'n', 'Ň' => 'N',
        'ň' => 'n',
        // "n preceded by apostrophe": both the precomposed character (U+0149)
        // and its decomposed spelling (U+02BC followed by "n").
        "\u{0149}" => 'n', "\u{02BC}n" => 'n',
        'Ŋ' => 'N', 'ŋ' => 'n',
        'Ō' => 'O', 'ō' => 'o',
        'Ŏ' => 'O', 'ŏ' => 'o',
        'Ő' => 'O', 'ő' => 'o',
        'Œ' => 'OE', 'œ' => 'oe',
        'Ŕ' => 'R', 'ŕ' => 'r',
        'Ŗ' => 'R', 'ŗ' => 'r',
        'Ř' => 'R', 'ř' => 'r',
        'Ś' => 'S', 'ś' => 's',
        'Ŝ' => 'S', 'ŝ' => 's',
        'Ş' => 'S', 'ş' => 's',
        'Š' => 'S', 'š' => 's',
        'Ţ' => 'T', 'ţ' => 't',
        'Ť' => 'T', 'ť' => 't',
        'Ŧ' => 'T', 'ŧ' => 't',
        'Ũ' => 'U', 'ũ' => 'u',
        'Ū' => 'U', 'ū' => 'u',
        'Ŭ' => 'U', 'ŭ' => 'u',
        'Ů' => 'U', 'ů' => 'u',
        'Ű' => 'U', 'ű' => 'u',
        'Ų' => 'U', 'ų' => 'u',
        'Ŵ' => 'W', 'ŵ' => 'w',
        'Ŷ' => 'Y', 'ŷ' => 'y',
        'Ÿ' => 'Y', 'Ź' => 'Z',
        'ź' => 'z', 'Ż' => 'Z',
        'ż' => 'z', 'Ž' => 'Z',
        'ž' => 'z', 'ſ' => 's',
        // Decompositions for Latin Extended-B
        'Ș' => 'S', 'ș' => 's',
        'Ț' => 'T', 'ț' => 't',
        // Euro Sign
        '€' => 'E',
        // GBP (Pound) Sign
        '£' => '',
        // Vowels with diacritic (Vietnamese)
        // unmarked
        'Ơ' => 'O', 'ơ' => 'o',
        'Ư' => 'U', 'ư' => 'u',
        // grave accent
        'Ầ' => 'A', 'ầ' => 'a',
        'Ằ' => 'A', 'ằ' => 'a',
        'Ề' => 'E', 'ề' => 'e',
        'Ồ' => 'O', 'ồ' => 'o',
        'Ờ' => 'O', 'ờ' => 'o',
        'Ừ' => 'U', 'ừ' => 'u',
        'Ỳ' => 'Y', 'ỳ' => 'y',
        // hook
        'Ả' => 'A', 'ả' => 'a',
        'Ẩ' => 'A', 'ẩ' => 'a',
        'Ẳ' => 'A', 'ẳ' => 'a',
        'Ẻ' => 'E', 'ẻ' => 'e',
        'Ể' => 'E', 'ể' => 'e',
        'Ỉ' => 'I', 'ỉ' => 'i',
        'Ỏ' => 'O', 'ỏ' => 'o',
        'Ổ' => 'O', 'ổ' => 'o',
        'Ở' => 'O', 'ở' => 'o',
        'Ủ' => 'U', 'ủ' => 'u',
        'Ử' => 'U', 'ử' => 'u',
        'Ỷ' => 'Y', 'ỷ' => 'y',
        // tilde
        'Ẫ' => 'A', 'ẫ' => 'a',
        'Ẵ' => 'A', 'ẵ' => 'a',
        'Ẽ' => 'E', 'ẽ' => 'e',
        'Ễ' => 'E', 'ễ' => 'e',
        'Ỗ' => 'O', 'ỗ' => 'o',
        'Ỡ' => 'O', 'ỡ' => 'o',
        'Ữ' => 'U', 'ữ' => 'u',
        'Ỹ' => 'Y', 'ỹ' => 'y',
        // acute accent
        'Ấ' => 'A', 'ấ' => 'a',
        'Ắ' => 'A', 'ắ' => 'a',
        'Ế' => 'E', 'ế' => 'e',
        'Ố' => 'O', 'ố' => 'o',
        'Ớ' => 'O', 'ớ' => 'o',
        'Ứ' => 'U', 'ứ' => 'u',
        // dot below
        'Ạ' => 'A', 'ạ' => 'a',
        'Ậ' => 'A', 'ậ' => 'a',
        'Ặ' => 'A', 'ặ' => 'a',
        'Ẹ' => 'E', 'ẹ' => 'e',
        'Ệ' => 'E', 'ệ' => 'e',
        'Ị' => 'I', 'ị' => 'i',
        'Ọ' => 'O', 'ọ' => 'o',
        'Ộ' => 'O', 'ộ' => 'o',
        'Ợ' => 'O', 'ợ' => 'o',
        'Ụ' => 'U', 'ụ' => 'u',
        'Ự' => 'U', 'ự' => 'u',
        'Ỵ' => 'Y', 'ỵ' => 'y',
        // Vowels with diacritic (Chinese, Hanyu Pinyin)
        'ɑ' => 'a',
        // macron
        'Ǖ' => 'U', 'ǖ' => 'u',
        // acute accent
        'Ǘ' => 'U', 'ǘ' => 'u',
        // caron
        'Ǎ' => 'A', 'ǎ' => 'a',
        'Ǐ' => 'I', 'ǐ' => 'i',
        'Ǒ' => 'O', 'ǒ' => 'o',
        'Ǔ' => 'U', 'ǔ' => 'u',
        'Ǚ' => 'U', 'ǚ' => 'u',
        // grave accent
        'Ǜ' => 'U', 'ǜ' => 'u',
    ];

    /**
     * Replace accented characters with their closest ASCII equivalent.
     *
     * @param  string  $string  Text that may contain accented characters.
     * @return string  The text with every character listed in ACCENT_MAP replaced.
     */
    private function remove_accents($string)
    {
        // Cast guards against non-string input (e.g. a `false` from strtok()).
        return strtr((string) $string, self::ACCENT_MAP);
    }

    /**
     * Extract the name that is sent to the API for one student: the first
     * space-delimited word of the first name, with accents removed.
     *
     * @param  mixed  $firstname  Raw first name as stored on the student.
     * @return string  The normalised name, or '' when there is nothing usable.
     */
    private function firstGivenName($firstname): string
    {
        // strtok() yields false for an empty or all-space string.
        $token = strtok((string) $firstname, ' ');

        return $token === false ? '' : $this->remove_accents($token);
    }

    /**
     * Process every student without a gender, CHUNK_SIZE at a time: one API
     * request per chunk, then one update per student for which the API
     * returned a sufficiently confident "male" or "female" prediction.
     * Students whose gender cannot be determined are left untouched.
     */
    public function handle()
    {
        Student::whereNull('gender_id')->chunkById(self::CHUNK_SIZE, function ($students) {
            // student id => normalised first name (students without a usable name are left out)
            $names = [];
            foreach ($students as $student) {
                $name = $this->firstGivenName($student->firstname);
                if ($name !== '') {
                    $names[$student->id] = $name;
                }
            }

            if ($names === []) {
                // Nothing to ask the API for; move on to the next chunk.
                return;
            }

            // Encode each name so characters such as "&" or "#" cannot break the query string.
            $queryString = implode('&', array_map(
                static fn (string $name): string => 'name[]=' . rawurlencode($name),
                array_unique($names),
            ));

            $response = Http::get('https://api.genderize.io/?' . $queryString);

            if (! $response->successful()) {
                $this->error('genderize.io request failed with HTTP status ' . $response->status() . '; stopping.');

                // Returning false stops chunkById: later chunks would most likely fail the same way.
                return false;
            }

            // lower-cased name => predicted gender, keeping only confident predictions.
            // Lower-casing makes the lookup tolerant of the API echoing a name in a different case.
            $genders = [];
            foreach ($response->collect() as $prediction) {
                if (! is_array($prediction) || ! isset($prediction['name'], $prediction['gender'])) {
                    continue;
                }

                if (($prediction['probability'] ?? 0) > self::MIN_PROBABILITY
                    && ($prediction['count'] ?? 0) > self::MIN_COUNT
                ) {
                    $genders[mb_strtolower((string) $prediction['name'])] = $prediction['gender'];
                }
            }

            foreach ($students as $student) {
                if (! isset($names[$student->id])) {
                    continue;
                }

                $gender = $genders[mb_strtolower($names[$student->id])] ?? null;
                $genderId = is_string($gender) ? (self::GENDER_IDS[$gender] ?? null) : null;

                // gender_id is already NULL for these students, so only write real answers.
                if ($genderId !== null) {
                    $student->update(['gender_id' => $genderId]);
                }
            }
        });
    }
}
|
220
|
|
|
|