|
1
|
|
|
""" |
|
2
|
|
|
Initialization of CrowdTruth metrics |
|
3
|
|
|
""" |
|
4
|
1 |
|
import logging |
|
5
|
1 |
|
import math |
|
6
|
|
|
|
|
7
|
1 |
|
from collections import Counter |
|
8
|
|
|
|
|
9
|
1 |
|
import numpy as np |
|
10
|
1 |
|
import pandas as pd |
|
11
|
|
|
|
|
12
|
1 |
|
# Lower bound used to clamp denominators and scores so divisions never hit zero.
SMALL_NUMBER_CONST = 0.00000001


class Metrics:
    """
    Computes and applies the CrowdTruth metrics for evaluating units, workers and annotations.

    All methods are static. ``run`` is the entry point that iterates the
    individual metrics until the scores stop changing.
    """

    @staticmethod
    def _weighted_cosine(vector_a, vector_b, aqs):
        """
        Computes the cosine similarity of two annotation vectors, weighted by AQS.

        The annotations of ``vector_a`` drive the iteration; each one is looked up
        in ``vector_b`` and in ``aqs``. The denominator is clamped to
        ``SMALL_NUMBER_CONST`` so that all-zero vectors do not divide by zero.
        """
        dot_product = 0.0
        norm_a = 0.0
        norm_b = 0.0
        for ann in vector_a:
            value_a = vector_a[ann]
            value_b = vector_b[ann]
            dot_product += aqs[ann] * (value_a * value_b)
            norm_a += aqs[ann] * (value_a * value_a)
            norm_b += aqs[ann] * (value_b * value_b)

        denominator = math.sqrt(norm_a * norm_b)
        if denominator < SMALL_NUMBER_CONST:
            denominator = SMALL_NUMBER_CONST
        return dot_product / denominator

    # Unit Quality Score
    @staticmethod
    def unit_quality_score(unit_id, unit_work_ann_dict, wqs, aqs):
        """
        Computes the unit quality score.

        The unit quality score (UQS) is computed as the average cosine similarity between
        all worker vectors for a given unit, weighted by the worker quality (WQS) and the
        annotation quality (AQS). The goal is to capture the degree of agreement in annotating
        the media unit.

        Through the weighted average, workers and annotations with lower quality will have
        less of an impact on the final score.

        To weigh the metrics with the annotation quality, we compute weighted_cosine, the
        weighted version of the cosine similarity.

        Args:
            unit_id: Unit id.
            unit_work_ann_dict: A dictionary that contains all the workers judgments for the unit.
            wqs: Dict of worker_id (string) that contains the worker quality score (float)
            aqs: Dict of annotation_id (string) that contains the annotation quality score (float)

        Returns:
            The quality score (UQS) of the given unit. With fewer than two workers there
            are no pairs to compare and the result is 0.0.
        """
        vectors = unit_work_ann_dict[unit_id]
        worker_ids = list(vectors)

        uqs_numerator = 0.0
        uqs_denominator = 0.0
        # Visit every unordered pair of workers exactly once.
        for position, worker_i in enumerate(worker_ids):
            for worker_j in worker_ids[position + 1:]:
                weighted_cosine = Metrics._weighted_cosine(
                    vectors[worker_i], vectors[worker_j], aqs)
                uqs_numerator += weighted_cosine * wqs[worker_i] * wqs[worker_j]
                uqs_denominator += wqs[worker_i] * wqs[worker_j]

        if uqs_denominator < SMALL_NUMBER_CONST:
            uqs_denominator = SMALL_NUMBER_CONST
        return uqs_numerator / uqs_denominator

    # Worker - Unit Agreement
    @staticmethod
    def worker_unit_agreement(worker_id, unit_ann_dict, work_unit_ann_dict, uqs, aqs, wqs):
        """
        Computes the worker agreement on a unit.

        The worker unit agreement (WUA) is the average cosine distance between the annotations
        of a worker i and all the other annotations for the units they have worked on,
        weighted by the unit and annotation quality. It calculates how much a worker disagrees
        with the crowd on a unit basis.

        Through the weighted average, units and annotations with lower quality will have less
        of an impact on the final score.

        Args:
            worker_id: Worker id.
            unit_ann_dict: Dictionary of units and their aggregated annotations.
            work_unit_ann_dict: Dictionary of units (and its annotation) annotated by the worker.
            uqs: Dict unit_id that contains the unit quality scores (float).
            aqs: Dict of annotation_id (string) that contains the annotation quality scores (float).
            wqs: Quality score of the given worker. This is a single number (not a dict):
                every entry of the worker vector is multiplied by it before the worker's
                own contribution is subtracted from the aggregated unit vector.

        Returns:
            The worker unit agreement score for the given worker.
        """
        wsa_numerator = 0.0
        wsa_denominator = 0.0

        for unit_id, worker_vector in work_unit_ann_dict[worker_id].items():
            unit_vector = unit_ann_dict[unit_id]

            numerator = 0.0
            denominator_w = 0.0
            denominator_s = 0.0
            for ann in worker_vector:
                worker_value = worker_vector[ann] * wqs
                # What the rest of the crowd contributed to this annotation.
                rest_value = unit_vector[ann] - worker_value

                numerator += aqs[ann] * worker_value * rest_value
                denominator_w += aqs[ann] * (worker_value * worker_value)
                denominator_s += aqs[ann] * (rest_value * rest_value)

            denominator = math.sqrt(denominator_w * denominator_s)
            if denominator < SMALL_NUMBER_CONST:
                # Unlike the other metrics, a degenerate pair yields the small
                # constant itself rather than a clamped division.
                weighted_cosine = SMALL_NUMBER_CONST
            else:
                weighted_cosine = numerator / denominator

            wsa_numerator += weighted_cosine * uqs[unit_id]
            wsa_denominator += uqs[unit_id]

        if wsa_denominator < SMALL_NUMBER_CONST:
            wsa_denominator = SMALL_NUMBER_CONST
        return wsa_numerator / wsa_denominator

    # Worker - Worker Agreement
    @staticmethod
    def worker_worker_agreement(worker_id, work_unit_ann_dict, unit_work_ann_dict, wqs, uqs, aqs):
        """
        Computes the agreement between every two workers.

        The worker-worker agreement (WWA) is the average cosine distance between the
        annotations of a worker i and all other workers that have worked on the same media
        units as worker i, weighted by the worker and annotation qualities.

        The metric gives an indication as to whether there are consistently like-minded
        workers. This is useful for identifying communities of thought.

        Through the weighted average, workers and annotations with lower quality will have
        less of an impact on the final score of the given worker.

        Args:
            worker_id: Worker id.
            work_unit_ann_dict: Dictionary of worker annotation vectors on annotated units.
            unit_work_ann_dict: Dictionary of unit annotation vectors.
            wqs: Dict of worker_id (string) that contains the worker quality scores (float).
            uqs: Dict unit_id that contains the unit quality scores (float).
            aqs: Dict of annotation_id (string) that contains the annotation quality scores (float).

        Returns:
            The worker worker agreement score for the given worker.
        """
        wwa_numerator = 0.0
        wwa_denominator = 0.0

        own_vectors = work_unit_ann_dict[worker_id]
        for unit_id in list(own_vectors):
            own_vector = own_vectors[unit_id]
            vectors_on_unit = unit_work_ann_dict[unit_id]
            for other_id in vectors_on_unit:
                if other_id == worker_id:
                    continue
                weighted_cosine = Metrics._weighted_cosine(
                    own_vector, vectors_on_unit[other_id], aqs)
                wwa_numerator += weighted_cosine * wqs[other_id] * uqs[unit_id]
                wwa_denominator += wqs[other_id] * uqs[unit_id]

        if wwa_denominator < SMALL_NUMBER_CONST:
            wwa_denominator = SMALL_NUMBER_CONST
        return wwa_numerator / wwa_denominator

    # Unit - Annotation Score (UAS)
    @staticmethod
    def unit_annotation_score(unit_id, annotation, unit_work_annotation_dict, wqs):
        """
        Computes the unit annotation score.

        The unit - annotation score (UAS) calculates the likelihood that annotation a
        is expressed in unit u. It is the ratio of the number of workers that picked
        annotation a over all workers that annotated the unit, weighted by the worker quality.

        Args:
            unit_id: Unit id.
            annotation: Annotation.
            unit_work_annotation_dict: Dictionary of unit annotation vectors.
            wqs: Dict of worker_id (string) that contains the worker quality scores (float).

        Returns:
            The unit annotation score for the given unit and annotation.
        """
        uas_numerator = 0.0
        uas_denominator = 0.0

        worker_vectors = unit_work_annotation_dict[unit_id]
        for worker_id in worker_vectors:
            uas_numerator += worker_vectors[worker_id][annotation] * wqs[worker_id]
            uas_denominator += wqs[worker_id]

        if uas_denominator < SMALL_NUMBER_CONST:
            uas_denominator = SMALL_NUMBER_CONST
        return uas_numerator / uas_denominator

    @staticmethod
    def compute_ann_quality_factors(numerator, denominator, work_unit_ann_dict_worker_i, \
                                    work_unit_ann_dict_worker_j, ann, uqs):
        """
        Computes the factors for each unit annotation.

        Only units present in both workers' dictionaries contribute. For each such
        unit the numerator grows by ``uqs * value_i * value_j`` and the denominator
        by ``uqs * value_j``, where ``value_x`` is worker x's entry for ``ann``.

        Args:
            numerator: Current numerator
            denominator: Current denominator
            work_unit_ann_dict_worker_i: Dict of worker i annotation vectors on annotated units.
            work_unit_ann_dict_worker_j: Dict of worker j annotation vectors on annotated units.
            ann: Annotation value
            uqs: Dict unit_id that contains the unit quality scores (float).

        Returns:
            The annotation quality factors, as the tuple (numerator, denominator).
        """
        for unit_id, vector_i in work_unit_ann_dict_worker_i.items():
            if unit_id in work_unit_ann_dict_worker_j:
                value_j = work_unit_ann_dict_worker_j[unit_id][ann]
                numerator += uqs[unit_id] * vector_i[ann] * value_j
                denominator += uqs[unit_id] * value_j
        return numerator, denominator

    @staticmethod
    def aqs_dict(annotations, aqs_numerator, aqs_denominator):
        """
        Create the dictionary of annotation quality score values.

        Args:
            annotations: Dictionary of annotations.
            aqs_numerator: Annotation numerator.
            aqs_denominator: Annotation denominator.

        Returns:
            The dictionary of annotation quality scores. Every score is at least
            ``SMALL_NUMBER_CONST`` so later divisions by it cannot hit zero.
        """
        aqs = dict()
        for ann in annotations:
            if aqs_denominator[ann] > SMALL_NUMBER_CONST:
                # prevent division by zero by storing very small value instead
                aqs[ann] = max(aqs_numerator[ann] / aqs_denominator[ann], SMALL_NUMBER_CONST)
            else:
                aqs[ann] = SMALL_NUMBER_CONST
        return aqs

    # Annotation Quality Score (AQS)
    @staticmethod
    def annotation_quality_score(annotations, work_unit_ann_dict, uqs, wqs):
        """
        Computes the annotation quality score.

        The annotation quality score AQS calculates the agreement of selecting an annotation a,
        over all the units it appears in. Therefore, it is only applicable to closed tasks, where
        the same annotation set is used for all units. It is based on the probability that if a
        worker j annotates annotation a in a unit, worker i will also annotate it.

        The annotation quality score is the weighted average of these probabilities for all
        possible pairs of workers. Through the weighted average, units and workers with lower
        quality will have less of an impact on the final score of the annotation.

        Args:
            annotations: Possible annotations.
            work_unit_ann_dict: Dictionary of worker annotation vectors on annotated units.
            uqs: Dict unit_id that contains the unit quality scores (float).
            wqs: Dict of worker_id (string) that contains the worker quality scores (float).

        Returns:
            Dict mapping every annotation to its annotation quality score.
        """
        aqs_numerator = dict.fromkeys(annotations, 0.0)
        aqs_denominator = dict.fromkeys(annotations, 0.0)

        for worker_i, units_i in work_unit_ann_dict.items():
            unit_ids_i = set(units_i)
            for worker_j, units_j in work_unit_ann_dict.items():
                # Pairs without a shared unit cannot contribute; skip them early.
                if worker_i == worker_j or unit_ids_i.isdisjoint(units_j):
                    continue

                for ann in annotations:
                    numerator, denominator = Metrics.compute_ann_quality_factors(
                        0.0, 0.0, units_i, units_j, ann, uqs)

                    if denominator > 0:
                        aqs_numerator[ann] += wqs[worker_i] * wqs[worker_j] * \
                                              numerator / denominator
                        aqs_denominator[ann] += wqs[worker_i] * wqs[worker_j]

        return Metrics.aqs_dict(annotations, aqs_numerator, aqs_denominator)

    @staticmethod
    def _expanded_vector(worker, unit):
        """ expand the vector of a worker on a given unit to all annotations of the unit """
        vector = Counter()
        for ann in unit:
            vector[ann] = worker[ann] if ann in worker else 0
        return vector

    @staticmethod
    def _init_aqs(config, unit_ann_dict):
        """ initialize aqs depending on whether or not it is an open ended task """
        if not config.open_ended_task:
            # Closed task: the annotation set is read from the first unit only.
            first_unit = list(unit_ann_dict.keys())[0]
            return {ann: 1.0 for ann in list(unit_ann_dict[first_unit].keys())}
        return {ann: 1.0 for unit_id in unit_ann_dict for ann in unit_ann_dict[unit_id]}

    @staticmethod
    def _delta_stats(new_scores, old_scores):
        """ return (largest, average) absolute change between two score dicts """
        largest = 0.0
        total = 0.0
        for key in new_scores:
            change = abs(new_scores[key] - old_scores[key])
            largest = max(largest, change)
            total += change
        return largest, total / (len(old_scores) * 1.0)

    @staticmethod
    def _reconstruct_unit_ann_dict(unit_ann_dict, work_unit_ann_dict, wqs_new):
        """ reconstruct unit_ann_dict with worker scores """
        new_unit_ann_dict = {unit_id: dict.fromkeys(ann_dict, 0.0)
                             for unit_id, ann_dict in unit_ann_dict.items()}
        for work_id, unit_vectors in work_unit_ann_dict.items():
            wqs_work_id = wqs_new[work_id]
            for unit_id, ann_dict in unit_vectors.items():
                for ann, score in ann_dict.items():
                    new_unit_ann_dict[unit_id][ann] += score * wqs_work_id
        return new_unit_ann_dict

    @staticmethod
    def _save_unit_ann_score(unit_ann_dict, unit_work_ann_dict, iteration_value):
        """ save the unit annotation score for print """
        uas = Counter()
        for unit_id in unit_ann_dict:
            uas[unit_id] = Counter()
            for ann in unit_ann_dict[unit_id]:
                uas[unit_id][ann] = Metrics.unit_annotation_score(
                    unit_id, ann, unit_work_ann_dict, iteration_value)
        return uas

    @staticmethod
    def run(results, config, max_delta=0.001):
        """
        Iteratively run the CrowdTruth metrics until the scores are stable.

        Each iteration recomputes AQS (closed tasks only), UQS and WQS from the scores
        of the previous iteration. Iteration stops once the largest absolute change of
        any score in an iteration drops below ``max_delta``. At least one iteration is
        always performed, because the ``*_initial`` output columns are taken from it.

        Args:
            results: Dict holding the 'judgments', 'units' and 'workers' data frames, plus
                'annotations' for closed tasks. The score columns are written into these
                frames in place.
            config: Object exposing ``output`` (a dict; its first value names the
                annotation column) and ``open_ended_task`` (bool).
            max_delta: Convergence threshold; must be positive. Previously the loop was
                hard-wired to 0.001, so other values were either ignored or made the
                function fail before producing results.

        Returns:
            The ``results`` dict with the score columns added.

        Raises:
            ValueError: If ``max_delta`` is not positive.
        """
        if max_delta <= 0:
            raise ValueError("max_delta must be a positive number")

        judgments = results['judgments'].copy()
        units = results['units'].copy()

        # unit_work_ann_dict, work_unit_ann_dict, unit_ann_dict
        # to be done: change to use all vectors in one unit
        col = list(config.output.values())[0]
        unit_ann_dict = dict(units.copy()[col])

        # fill judgment vectors with unit keys
        for index, row in judgments.iterrows():
            judgments.at[index, col] = Metrics._expanded_vector(
                row[col], units.at[row['unit'], col])

        unit_work_ann_dict = {
            name: group.set_index('worker')[col].to_dict()
            for name, group in judgments[['unit', 'worker', col]].copy().groupby('unit')}
        work_unit_ann_dict = {
            name: group.set_index('unit')[col].to_dict()
            for name, group in judgments[['worker', 'unit', col]].copy().groupby('worker')}

        # initialize data structures; entry 0 of every history is the all-ones start
        uqs_list = [dict.fromkeys(unit_work_ann_dict, 1.0)]
        wqs_list = [dict.fromkeys(work_unit_ann_dict, 1.0)]
        wwa_list = [dict.fromkeys(work_unit_ann_dict, 1.0)]
        wsa_list = [dict.fromkeys(work_unit_ann_dict, 1.0)]
        aqs = Metrics._init_aqs(config, unit_ann_dict)
        aqs_list = [aqs.copy()]

        # compute metrics until stable values
        iterations = 0
        delta = math.inf
        while delta >= max_delta:
            prev_uqs = uqs_list[-1]
            prev_wqs = wqs_list[-1]
            prev_aqs = aqs_list[-1]

            delta = 0.0
            avg_aqs_delta = 0.0

            if not config.open_ended_task:
                # compute annotation quality score (aqs)
                aqs_new = Metrics.annotation_quality_score(
                    list(aqs.keys()), work_unit_ann_dict, prev_uqs, prev_wqs)
                step_delta, avg_aqs_delta = Metrics._delta_stats(aqs_new, prev_aqs)
                delta = max(delta, step_delta)

            # compute unit quality score (uqs)
            uqs_new = dict()
            for unit_id in unit_work_ann_dict:
                uqs_new[unit_id] = Metrics.unit_quality_score(
                    unit_id, unit_work_ann_dict, prev_wqs, prev_aqs)
            step_delta, avg_uqs_delta = Metrics._delta_stats(uqs_new, prev_uqs)
            delta = max(delta, step_delta)

            # compute worker quality score (WQS)
            wwa_new = dict()
            wsa_new = dict()
            wqs_new = dict()
            for worker_id in work_unit_ann_dict:
                wwa_new[worker_id] = Metrics.worker_worker_agreement(
                    worker_id, work_unit_ann_dict, unit_work_ann_dict,
                    prev_wqs, prev_uqs, prev_aqs)
                wsa_new[worker_id] = Metrics.worker_unit_agreement(
                    worker_id, unit_ann_dict, work_unit_ann_dict,
                    prev_uqs, prev_aqs, prev_wqs[worker_id])
                wqs_new[worker_id] = wwa_new[worker_id] * wsa_new[worker_id]
            step_delta, avg_wqs_delta = Metrics._delta_stats(wqs_new, prev_wqs)
            delta = max(delta, step_delta)

            # save results for current iteration
            uqs_list.append(uqs_new.copy())
            wqs_list.append(wqs_new.copy())
            wwa_list.append(wwa_new.copy())
            wsa_list.append(wsa_new.copy())
            if not config.open_ended_task:
                aqs_list.append(aqs_new.copy())
            iterations += 1

            # The aggregated unit vectors are re-weighted with the new worker scores.
            unit_ann_dict = Metrics._reconstruct_unit_ann_dict(
                unit_ann_dict, work_unit_ann_dict, wqs_new)

            logging.info("%s iterations; max d= %s ; wqs d= %s; uqs d= %s; aqs d= %s",
                         iterations, delta, avg_wqs_delta, avg_uqs_delta, avg_aqs_delta)

        uas = Metrics._save_unit_ann_score(unit_ann_dict, unit_work_ann_dict, wqs_list[-1])
        uas_initial = Metrics._save_unit_ann_score(unit_ann_dict, unit_work_ann_dict, wqs_list[0])

        results['units']['uqs'] = pd.Series(uqs_list[-1])
        results['units']['unit_annotation_score'] = pd.Series(uas)
        results['workers']['wqs'] = pd.Series(wqs_list[-1])
        results['workers']['wwa'] = pd.Series(wwa_list[-1])
        results['workers']['wsa'] = pd.Series(wsa_list[-1])
        if not config.open_ended_task:
            results['annotations']['aqs'] = pd.Series(aqs_list[-1])

        # "initial" values are the scores after the first iteration (history entry 1)
        results['units']['uqs_initial'] = pd.Series(uqs_list[1])
        results['units']['unit_annotation_score_initial'] = pd.Series(uas_initial)
        results['workers']['wqs_initial'] = pd.Series(wqs_list[1])
        results['workers']['wwa_initial'] = pd.Series(wwa_list[1])
        results['workers']['wsa_initial'] = pd.Series(wsa_list[1])
        if not config.open_ended_task:
            results['annotations']['aqs_initial'] = pd.Series(aqs_list[1])
        return results