1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace App\Console\Commands; |
4
|
|
|
|
5
|
|
|
use App\Models\Student; |
6
|
|
|
use Illuminate\Console\Command; |
7
|
|
|
use Illuminate\Support\Facades\Http; |
8
|
|
|
|
9
|
|
|
/**
 * Artisan command that fills in the missing `gender_id` of students by sending
 * the first word of their first name to the genderize.io API.
 *
 * Only predictions with a probability above 0.9 and a sample count above 10
 * are applied; every other student is left untouched.
 */
class genderizeStudents extends Command
{
    protected $signature = 'academico:genderize-students';

    protected $description = 'Guess gender for existing students, based on their firstname';

    /**
     * Replace accented / decorated Latin characters with plain ASCII equivalents.
     *
     * WordPress implementation, from
     * https://github.com/WordPress/WordPress/blob/a2693fd8602e3263b5925b9d799ddd577202167d/wp-includes/formatting.php#L1528
     *
     * @param  mixed  $string  Text to transliterate; it is cast to string first.
     * @return string The input with every character of the table replaced.
     */
    private function remove_accents($string)
    {
        $string = (string) $string;

        // Every key of the table is non-ASCII, so pure-ASCII input can be returned as is.
        if ($string === '' || ! preg_match('/[\x80-\xff]/', $string)) {
            return $string;
        }

        $chars = [
            // Decompositions for Latin-1 Supplement
            'ª' => 'a', 'º' => 'o',
            'À' => 'A', 'Á' => 'A', 'Â' => 'A', 'Ã' => 'A', 'Ä' => 'A', 'Å' => 'A',
            'Æ' => 'AE', 'Ç' => 'C',
            'È' => 'E', 'É' => 'E', 'Ê' => 'E', 'Ë' => 'E',
            'Ì' => 'I', 'Í' => 'I', 'Î' => 'I', 'Ï' => 'I',
            'Ð' => 'D', 'Ñ' => 'N',
            'Ò' => 'O', 'Ó' => 'O', 'Ô' => 'O', 'Õ' => 'O', 'Ö' => 'O',
            'Ù' => 'U', 'Ú' => 'U', 'Û' => 'U', 'Ü' => 'U',
            'Ý' => 'Y', 'Þ' => 'TH', 'ß' => 's',
            'à' => 'a', 'á' => 'a', 'â' => 'a', 'ã' => 'a', 'ä' => 'a', 'å' => 'a',
            'æ' => 'ae', 'ç' => 'c',
            'è' => 'e', 'é' => 'e', 'ê' => 'e', 'ë' => 'e',
            'ì' => 'i', 'í' => 'i', 'î' => 'i', 'ï' => 'i',
            'ð' => 'd', 'ñ' => 'n',
            'ò' => 'o', 'ó' => 'o', 'ô' => 'o', 'õ' => 'o', 'ö' => 'o', 'ø' => 'o',
            'ù' => 'u', 'ú' => 'u', 'û' => 'u', 'ü' => 'u',
            'ý' => 'y', 'þ' => 'th', 'ÿ' => 'y', 'Ø' => 'O',
            // Decompositions for Latin Extended-A
            'Ā' => 'A', 'ā' => 'a', 'Ă' => 'A', 'ă' => 'a', 'Ą' => 'A', 'ą' => 'a',
            'Ć' => 'C', 'ć' => 'c', 'Ĉ' => 'C', 'ĉ' => 'c', 'Ċ' => 'C', 'ċ' => 'c', 'Č' => 'C', 'č' => 'c',
            'Ď' => 'D', 'ď' => 'd', 'Đ' => 'D', 'đ' => 'd',
            'Ē' => 'E', 'ē' => 'e', 'Ĕ' => 'E', 'ĕ' => 'e', 'Ė' => 'E', 'ė' => 'e',
            'Ę' => 'E', 'ę' => 'e', 'Ě' => 'E', 'ě' => 'e',
            'Ĝ' => 'G', 'ĝ' => 'g', 'Ğ' => 'G', 'ğ' => 'g', 'Ġ' => 'G', 'ġ' => 'g', 'Ģ' => 'G', 'ģ' => 'g',
            'Ĥ' => 'H', 'ĥ' => 'h', 'Ħ' => 'H', 'ħ' => 'h',
            'Ĩ' => 'I', 'ĩ' => 'i', 'Ī' => 'I', 'ī' => 'i', 'Ĭ' => 'I', 'ĭ' => 'i',
            'Į' => 'I', 'į' => 'i', 'İ' => 'I', 'ı' => 'i',
            // IJ ligatures (U+0132 / U+0133), written as escapes so they survive editors
            // that expand the ligature into two plain letters.
            "\u{0132}" => 'IJ', "\u{0133}" => 'ij',
            'Ĵ' => 'J', 'ĵ' => 'j',
            'Ķ' => 'K', 'ķ' => 'k', 'ĸ' => 'k',
            'Ĺ' => 'L', 'ĺ' => 'l', 'Ļ' => 'L', 'ļ' => 'l', 'Ľ' => 'L', 'ľ' => 'l',
            'Ŀ' => 'L', 'ŀ' => 'l', 'Ł' => 'L', 'ł' => 'l',
            'Ń' => 'N', 'ń' => 'n', 'Ņ' => 'N', 'ņ' => 'n', 'Ň' => 'N', 'ň' => 'n',
            // U+0149 (n preceded by apostrophe) plus its two-character decomposed spelling.
            "\u{0149}" => 'n', 'ʼn' => 'n',
            'Ŋ' => 'N', 'ŋ' => 'n',
            'Ō' => 'O', 'ō' => 'o', 'Ŏ' => 'O', 'ŏ' => 'o', 'Ő' => 'O', 'ő' => 'o',
            'Œ' => 'OE', 'œ' => 'oe',
            'Ŕ' => 'R', 'ŕ' => 'r', 'Ŗ' => 'R', 'ŗ' => 'r', 'Ř' => 'R', 'ř' => 'r',
            'Ś' => 'S', 'ś' => 's', 'Ŝ' => 'S', 'ŝ' => 's', 'Ş' => 'S', 'ş' => 's', 'Š' => 'S', 'š' => 's',
            'Ţ' => 'T', 'ţ' => 't', 'Ť' => 'T', 'ť' => 't', 'Ŧ' => 'T', 'ŧ' => 't',
            'Ũ' => 'U', 'ũ' => 'u', 'Ū' => 'U', 'ū' => 'u', 'Ŭ' => 'U', 'ŭ' => 'u',
            'Ů' => 'U', 'ů' => 'u', 'Ű' => 'U', 'ű' => 'u', 'Ų' => 'U', 'ų' => 'u',
            'Ŵ' => 'W', 'ŵ' => 'w',
            'Ŷ' => 'Y', 'ŷ' => 'y', 'Ÿ' => 'Y',
            'Ź' => 'Z', 'ź' => 'z', 'Ż' => 'Z', 'ż' => 'z', 'Ž' => 'Z', 'ž' => 'z',
            'ſ' => 's',
            // Decompositions for Latin Extended-B
            'Ș' => 'S', 'ș' => 's', 'Ț' => 'T', 'ț' => 't',
            // Euro Sign
            '€' => 'E',
            // GBP (Pound) Sign
            '£' => '',
            // Vowels with diacritic (Vietnamese)
            // unmarked
            'Ơ' => 'O', 'ơ' => 'o', 'Ư' => 'U', 'ư' => 'u',
            // grave accent
            'Ầ' => 'A', 'ầ' => 'a', 'Ằ' => 'A', 'ằ' => 'a',
            'Ề' => 'E', 'ề' => 'e',
            'Ồ' => 'O', 'ồ' => 'o', 'Ờ' => 'O', 'ờ' => 'o',
            'Ừ' => 'U', 'ừ' => 'u',
            'Ỳ' => 'Y', 'ỳ' => 'y',
            // hook
            'Ả' => 'A', 'ả' => 'a', 'Ẩ' => 'A', 'ẩ' => 'a', 'Ẳ' => 'A', 'ẳ' => 'a',
            'Ẻ' => 'E', 'ẻ' => 'e', 'Ể' => 'E', 'ể' => 'e',
            'Ỉ' => 'I', 'ỉ' => 'i',
            'Ỏ' => 'O', 'ỏ' => 'o', 'Ổ' => 'O', 'ổ' => 'o', 'Ở' => 'O', 'ở' => 'o',
            'Ủ' => 'U', 'ủ' => 'u', 'Ử' => 'U', 'ử' => 'u',
            'Ỷ' => 'Y', 'ỷ' => 'y',
            // tilde
            'Ẫ' => 'A', 'ẫ' => 'a', 'Ẵ' => 'A', 'ẵ' => 'a',
            'Ẽ' => 'E', 'ẽ' => 'e', 'Ễ' => 'E', 'ễ' => 'e',
            'Ỗ' => 'O', 'ỗ' => 'o', 'Ỡ' => 'O', 'ỡ' => 'o',
            'Ữ' => 'U', 'ữ' => 'u',
            'Ỹ' => 'Y', 'ỹ' => 'y',
            // acute accent
            'Ấ' => 'A', 'ấ' => 'a', 'Ắ' => 'A', 'ắ' => 'a',
            'Ế' => 'E', 'ế' => 'e',
            'Ố' => 'O', 'ố' => 'o', 'Ớ' => 'O', 'ớ' => 'o',
            'Ứ' => 'U', 'ứ' => 'u',
            // dot below
            'Ạ' => 'A', 'ạ' => 'a', 'Ậ' => 'A', 'ậ' => 'a', 'Ặ' => 'A', 'ặ' => 'a',
            'Ẹ' => 'E', 'ẹ' => 'e', 'Ệ' => 'E', 'ệ' => 'e',
            'Ị' => 'I', 'ị' => 'i',
            'Ọ' => 'O', 'ọ' => 'o', 'Ộ' => 'O', 'ộ' => 'o', 'Ợ' => 'O', 'ợ' => 'o',
            'Ụ' => 'U', 'ụ' => 'u', 'Ự' => 'U', 'ự' => 'u',
            'Ỵ' => 'Y', 'ỵ' => 'y',
            // Vowels with diacritic (Chinese, Hanyu Pinyin)
            'ɑ' => 'a',
            // macron
            'Ǖ' => 'U', 'ǖ' => 'u',
            // acute accent
            'Ǘ' => 'U', 'ǘ' => 'u',
            // caron
            'Ǎ' => 'A', 'ǎ' => 'a', 'Ǐ' => 'I', 'ǐ' => 'i', 'Ǒ' => 'O', 'ǒ' => 'o',
            'Ǔ' => 'U', 'ǔ' => 'u', 'Ǚ' => 'U', 'ǚ' => 'u',
            // grave accent
            'Ǜ' => 'U', 'ǜ' => 'u',
        ];

        return strtr($string, $chars);
    }

    /**
     * Walk all students without a gender in chunks of 10, ask genderize.io for a
     * prediction for each first name and store the result when it is confident enough.
     *
     * The run stops at the first unsuccessful HTTP response (for example when the
     * API quota is exhausted) instead of sending further requests that would fail too.
     *
     * @return void
     */
    public function handle()
    {
        Student::whereNull('gender_id')->chunkById(10, function ($students) {
            // student id => accent-free first word of the first name; blank names are dropped
            // because strtok() yields false for them and an empty name cannot be looked up.
            $names = $students
                ->mapWithKeys(fn ($student) => [
                    $student->id => $this->remove_accents((string) strtok((string) $student->firstname, ' ')),
                ])
                ->filter(fn ($name) => $name !== '');

            if ($names->isEmpty()) {
                return; // nothing to look up in this chunk, carry on with the next one
            }

            // Encode every name so characters such as "&", "#" or "+" cannot break the query string.
            $queryString = $names
                ->unique()
                ->map(fn ($name) => 'name[]='.rawurlencode($name))
                ->implode('&');

            $response = Http::get('https://api.genderize.io/?'.$queryString);

            if ($response->failed()) {
                $this->error('genderize.io request failed with HTTP status '.$response->status().', stopping.');

                return false; // returning false makes chunkById() stop iterating
            }

            // Keep confident predictions only, indexed by the name they were returned for.
            // NOTE(review): relies on the API echoing each name exactly as it was sent — confirm.
            $genders = $response->collect()
                ->where('probability', '>', 0.9)
                ->where('count', '>', 10)
                ->pluck('gender', 'name');

            foreach ($students as $student) {
                $name = $names->get($student->id);

                // NOTE(review): 1 / 2 are presumably the ids of the "female" / "male" gender records — verify.
                $genderId = match ($name === null ? null : $genders->get($name)) {
                    'male' => 2,
                    'female' => 1,
                    default => null,
                };

                // gender_id is already null for every student in this query, so only real results are written.
                if ($genderId !== null) {
                    $student->update(['gender_id' => $genderId]);
                }
            }
        });
    }
}
371
|
|
|
|