GitHub Access Token became invalid

It seems like the GitHub access token used for retrieving details about this repository from GitHub became invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.
Completed
Push — master ( aeff40...59002d )
by Anca
17:59
created

Metrics.worker_sentence_agreement()   B

Complexity

Conditions 5

Size

Total Lines 44
Code Lines 30

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 25
CRAP Score 5

Importance

Changes 0
Metric Value
eloc 30
dl 0
loc 44
rs 8.6933
c 0
b 0
f 0
ccs 25
cts 25
cp 1
cc 5
nop 6
crap 5
1
2
3 1
import logging
4 1
import math
5
# import datetime
6
# import itertools
7
# import pdb
8
9 1
from collections import Counter
10
# from pprint import pprint
11
# from datetime import datetime
12
# from collections import defaultdict
13
14 1
import numpy as np
15 1
import pandas as pd
16
17
# import crowdtruth.models.unit
18
19 1
SMALL_NUMBER_CONST = 0.00000001
20
21 1
class Metrics():
22
23
24
    # Sentence Quality Score
25 1
    @staticmethod
    def sentence_quality_score(sentence_id, sent_work_rel_dict, wqs, rqs):
        '''
        Compute the sentence (unit) quality score: the average weighted cosine
        similarity between the annotation vectors of every pair of workers on
        this sentence, weighted by the quality scores of the two workers.

        sentence_id: id of the sentence (unit) being scored
        sent_work_rel_dict: dict of sentence_id -> worker_id -> relation vector
        wqs: dict of worker_id (string) -> worker quality score (float)
        rqs: dict of relation_id (string) -> relation quality (float)
        '''
        sqs_numerator = 0.0
        sqs_denominator = 0.0
        worker_ids = list(sent_work_rel_dict[sentence_id].keys())

        # iterate over every unordered pair of workers on this sentence
        for worker_i in range(len(worker_ids) - 1):
            for worker_j in range(worker_i + 1, len(worker_ids)):
                numerator = 0.0
                denominator_i = 0.0
                denominator_j = 0.0

                worker_i_vector = sent_work_rel_dict[sentence_id][worker_ids[worker_i]]
                worker_j_vector = sent_work_rel_dict[sentence_id][worker_ids[worker_j]]

                # relation-quality-weighted dot product and norms for the
                # cosine similarity of the two annotation vectors
                for relation in worker_i_vector:
                    worker_i_vector_rel = worker_i_vector[relation]
                    worker_j_vector_rel = worker_j_vector[relation]
                    numerator += rqs[relation] * (worker_i_vector_rel * worker_j_vector_rel)
                    denominator_i += rqs[relation] * (worker_i_vector_rel * worker_i_vector_rel)
                    denominator_j += rqs[relation] * (worker_j_vector_rel * worker_j_vector_rel)

                # guard against a zero denominator (e.g. an all-zero annotation
                # vector), consistent with worker_sentence_agreement; the
                # original divided unconditionally and could raise
                # ZeroDivisionError
                denominator = math.sqrt(denominator_i * denominator_j)
                if denominator < SMALL_NUMBER_CONST:
                    weighted_cosine = SMALL_NUMBER_CONST
                else:
                    weighted_cosine = numerator / denominator

                sqs_numerator += weighted_cosine * wqs[worker_ids[worker_i]] * wqs[worker_ids[worker_j]]
                sqs_denominator += wqs[worker_ids[worker_i]] * wqs[worker_ids[worker_j]]

        # prevent division by zero (e.g. only one worker on the sentence)
        if sqs_denominator < SMALL_NUMBER_CONST:
            sqs_denominator = SMALL_NUMBER_CONST
        return sqs_numerator / sqs_denominator
63
64
65
    # Worker - Sentence Agreement
66 1
    @staticmethod
    def worker_sentence_agreement(worker_id, sent_rel_dict, work_sent_rel_dict, sqs, rqs, wqs):
        '''
        Compute the worker-sentence agreement: for every sentence the worker
        annotated, take the weighted cosine similarity between the worker's
        (quality-scaled) annotation vector and the aggregated sentence vector
        minus the worker's own contribution, then average these similarities
        weighted by sentence quality.

        worker_id: id of the worker being scored
        sent_rel_dict: dict of sentence_id -> aggregated relation vector
        work_sent_rel_dict: dict of worker_id -> sentence_id -> relation vector
        sqs (sentence quality score): dict sentence_id -> sentence quality (float)
        rqs: dict of relation_id (string) -> relation quality (float)
        wqs: quality score of the given worker (float)
        '''
        agreement_sum = 0.0
        weight_sum = 0.0

        for sentence_id, worker_vector in work_sent_rel_dict[worker_id].items():
            dot = 0.0
            norm_worker = 0.0
            norm_rest = 0.0
            sentence_vector = sent_rel_dict[sentence_id]

            for relation in worker_vector:
                scaled_worker_rel = worker_vector[relation] * wqs
                # the sentence vector minus this worker's scaled contribution
                rest_rel = sentence_vector[relation] - scaled_worker_rel

                dot += rqs[relation] * scaled_worker_rel * rest_rel
                norm_worker += rqs[relation] * (scaled_worker_rel * scaled_worker_rel)
                norm_rest += rqs[relation] * (rest_rel * rest_rel)

            # fall back to a tiny similarity when either vector has zero norm
            norm_product = math.sqrt(norm_worker * norm_rest)
            if norm_product < SMALL_NUMBER_CONST:
                cosine = SMALL_NUMBER_CONST
            else:
                cosine = dot / norm_product

            agreement_sum += cosine * sqs[sentence_id]
            weight_sum += sqs[sentence_id]

        # prevent division by zero when the worker annotated nothing of weight
        if weight_sum < SMALL_NUMBER_CONST:
            weight_sum = SMALL_NUMBER_CONST
        return agreement_sum / weight_sum
110
111
112
    # Worker - Worker Agreement
113 1
    @staticmethod
    def worker_worker_agreement(worker_id, work_sent_rel_dict, sent_work_rel_dict, wqs, sqs, rqs):
        '''
        Compute the worker-worker agreement: the average weighted cosine
        similarity between the given worker's annotation vectors and those of
        every other worker on the sentences they share, weighted by the other
        worker's quality score and the sentence quality score.

        worker_id: id of the worker being scored
        work_sent_rel_dict: dict of worker_id -> sentence_id -> relation vector
        sent_work_rel_dict: dict of sentence_id -> worker_id -> relation vector
        wqs: dict of worker_id (string) -> worker quality score (float)
        sqs (sentence quality score): dict sentence_id -> sentence quality (float)
        rqs: dict of relation_id (string) -> relation quality (float)
        '''
        wwa_numerator = 0.0
        wwa_denominator = 0.0

        worker_vectors = work_sent_rel_dict[worker_id]

        for sentence_id in list(worker_vectors.keys()):
            own_vector = worker_vectors[sentence_id]
            sentence_workers = sent_work_rel_dict[sentence_id]
            for other_worker_id in sentence_workers:
                if worker_id != other_worker_id:
                    numerator = 0.0
                    denominator_w = 0.0
                    denominator_ow = 0.0

                    other_vector = sentence_workers[other_worker_id]
                    # relation-quality-weighted dot product and norms for the
                    # cosine similarity of the two annotation vectors
                    for relation in own_vector:
                        other_rel = other_vector[relation]
                        own_rel = own_vector[relation]

                        numerator += rqs[relation] * (own_rel * other_rel)
                        denominator_w += rqs[relation] * (own_rel * own_rel)
                        denominator_ow += rqs[relation] * (other_rel * other_rel)

                    # guard against a zero denominator (all-zero annotation
                    # vectors), consistent with worker_sentence_agreement; the
                    # original divided unconditionally and could raise
                    # ZeroDivisionError
                    denominator = math.sqrt(denominator_w * denominator_ow)
                    if denominator < SMALL_NUMBER_CONST:
                        weighted_cosine = SMALL_NUMBER_CONST
                    else:
                        weighted_cosine = numerator / denominator

                    wwa_numerator += weighted_cosine * wqs[other_worker_id] * sqs[sentence_id]
                    wwa_denominator += wqs[other_worker_id] * sqs[sentence_id]

        # prevent division by zero when the worker shares no sentences
        if wwa_denominator < SMALL_NUMBER_CONST:
            wwa_denominator = SMALL_NUMBER_CONST
        return wwa_numerator / wwa_denominator
159
160
161
162
    # Sentence - Relation Score
163 1
    @staticmethod
    def sentence_relation_score(sentence_id, relation, sent_work_rel_dict, wqs):
        '''
        Compute the sentence-relation score: the worker-quality-weighted
        average of the scores this sentence received for the given relation.

        sentence_id: id of the sentence (unit) being scored
        relation: id of the relation (annotation) being scored
        sent_work_rel_dict: dict of sentence_id -> worker_id -> relation vector
        wqs: dict of worker_id (string) -> worker quality score (float)
        '''
        weighted_total = 0.0
        weight_total = 0.0

        workers = sent_work_rel_dict[sentence_id]
        for worker_id, relation_vector in workers.items():
            weighted_total += relation_vector[relation] * wqs[worker_id]
            weight_total += wqs[worker_id]

        # prevent division by zero when the total worker weight vanishes
        if weight_total < SMALL_NUMBER_CONST:
            weight_total = SMALL_NUMBER_CONST
        return weighted_total / weight_total
175
176
177
    # Relation Quality Score
178 1
    @staticmethod
    def relation_quality_score(relations, work_sent_rel_dict, sqs, wqs):
        '''
        Compute the relation (annotation) quality score for every relation:
        for each ordered pair of distinct workers that annotated at least one
        common sentence, accumulate the sentence-quality-weighted agreement of
        worker i with worker j on the relation, weighted by both workers'
        quality scores.

        relations: iterable of relation ids to score
        work_sent_rel_dict: dict of worker_id -> sentence_id -> relation vector
        sqs (sentence quality score): dict sentence_id -> sentence quality (float)
        wqs: dict of worker_id (string) -> worker quality score (float)

        Returns a dict of relation_id -> relation quality score (float);
        scores are clamped below by SMALL_NUMBER_CONST so downstream metrics
        never divide by zero.
        '''
        rqs_numerator = dict()
        rqs_denominator = dict()

        for relation in relations:
            rqs_numerator[relation] = 0.0
            rqs_denominator[relation] = 0.0

        for worker_i, work_sent_rel_dict_worker_i in work_sent_rel_dict.items():
            # set of sentence ids worker i annotated; a set disjointness test
            # replaces the previous numpy intersect1d (same boolean, cheaper)
            work_sent_rel_dict_i_keys = set(work_sent_rel_dict_worker_i.keys())
            for worker_j, work_sent_rel_dict_worker_j in work_sent_rel_dict.items():
                # only distinct worker pairs with at least one shared sentence
                if worker_i != worker_j and \
                        not work_sent_rel_dict_i_keys.isdisjoint(work_sent_rel_dict_worker_j):

                    for relation in relations:
                        numerator = 0.0
                        denominator = 0.0

                        for sentence_id, work_sent_rel_dict_worker_i_sent in work_sent_rel_dict_worker_i.items():
                            if sentence_id in work_sent_rel_dict_worker_j:
                                work_sent_rel_dict_worker_j_sent = work_sent_rel_dict_worker_j[sentence_id]

                                work_sent_rel_dict_worker_j_sent_rel = work_sent_rel_dict_worker_j_sent[relation]
                                numerator += sqs[sentence_id] * (work_sent_rel_dict_worker_i_sent[relation] *
                                                                 work_sent_rel_dict_worker_j_sent_rel)
                                denominator += sqs[sentence_id] * work_sent_rel_dict_worker_j_sent_rel

                        if denominator > 0:
                            rqs_numerator[relation] += wqs[worker_i] * wqs[worker_j] * numerator / denominator
                            rqs_denominator[relation] += wqs[worker_i] * wqs[worker_j]

        rqs = dict()
        for relation in relations:
            if rqs_denominator[relation] > SMALL_NUMBER_CONST:
                rqs[relation] = rqs_numerator[relation] / rqs_denominator[relation]

                # prevent division by zero downstream by storing a very small
                # value instead of zero
                if rqs[relation] < SMALL_NUMBER_CONST:
                    rqs[relation] = SMALL_NUMBER_CONST
            else:
                rqs[relation] = SMALL_NUMBER_CONST
        return rqs
229
230 1
    @staticmethod
    def run(results, config, max_delta = 0.001):
        '''
        Iteratively compute the CrowdTruth quality metrics (sentence/unit,
        worker and relation quality scores) until they converge, then write
        the final and first-iteration scores back into the results frames.

        results: dict holding 'judgments', 'units', 'workers' (and, for
            closed tasks, 'annotations') pandas DataFrames — per usage below
        config: must provide 'output' (dict; its first value names the
            annotation-vector column) and 'open_ended_task' (bool)
        max_delta: convergence threshold — iterate while the largest score
            change in an iteration is >= this value

        Returns the same results dict with the quality-score columns added.
        '''
        judgments = results['judgments'].copy()
        units = results['units'].copy()

        # TODO: change to use all vectors in one unit
        col = list(config.output.values())[0]
        sent_rel_dict = dict(units.copy()[col])

        def expanded_vector(worker, unit):
            # align a worker's sparse annotation vector with the full key set
            # of the unit vector, filling in missing relations with 0
            vector = Counter()
            for rel in unit:
                if rel in worker:
                    vector[rel] = worker[rel]
                else:
                    vector[rel] = 0
            return vector

        # fill judgment vectors with unit keys
        for index, row in judgments.iterrows():
            judgments.at[index, col] = expanded_vector(row[col], units.at[row['unit'], col])

        # sentence_id -> worker_id -> relation vector
        sent_work_rel_dict = judgments[['unit', 'worker', col]].copy().groupby('unit')
        sent_work_rel_dict = {name: group.set_index('worker')[col].to_dict()
                              for name, group in sent_work_rel_dict}

        # worker_id -> sentence_id -> relation vector
        work_sent_rel_dict = judgments[['worker', 'unit', col]].copy().groupby('worker')
        work_sent_rel_dict = {name: group.set_index('unit')[col].to_dict()
                              for name, group in work_sent_rel_dict}

        # initialize data structures; every score starts out at 1.0
        sqs_list = list()
        wqs_list = list()
        wwa_list = list()
        wsa_list = list()
        rqs_list = list()

        sqs = dict((sentence_id, 1.0) for sentence_id in sent_work_rel_dict)
        wqs = dict((worker_id, 1.0) for worker_id in work_sent_rel_dict)
        wwa = dict((worker_id, 1.0) for worker_id in work_sent_rel_dict)
        wsa = dict((worker_id, 1.0) for worker_id in work_sent_rel_dict)

        sqs_list.append(sqs.copy())
        wqs_list.append(wqs.copy())
        wwa_list.append(wwa.copy())
        wsa_list.append(wsa.copy())

        # initialize RQS depending on whether or not it is an open ended task
        rqs = dict()
        if not config.open_ended_task:
            # closed task: all units share one relation set; read it from any unit
            rqs_keys = list(sent_rel_dict[list(sent_rel_dict.keys())[0]].keys())
            for relation in rqs_keys:
                rqs[relation] = 1.0
        else:
            # open-ended task: collect every relation seen on any unit
            for sentence_id in sent_rel_dict:
                for relation in sent_rel_dict[sentence_id]:
                    rqs[relation] = 1.0
        rqs_list.append(rqs.copy())

        sqs_len = len(list(sqs.keys())) * 1.0
        wqs_len = len(list(wqs.keys())) * 1.0
        rqs_len = len(list(rqs.keys())) * 1.0

        # compute metrics until stable values.
        # BUGFIX: the loop previously compared against a hard-coded 0.001,
        # ignoring the max_delta argument (and never iterating at all when a
        # smaller threshold was passed); use the argument as the threshold.
        delta_threshold = max_delta
        iterations = 0
        while max_delta >= delta_threshold:
            sqs_new = dict()
            wqs_new = dict()
            wwa_new = dict()
            wsa_new = dict()

            avg_sqs_delta = 0.0
            avg_wqs_delta = 0.0
            avg_rqs_delta = 0.0
            max_delta = 0.0

            if not config.open_ended_task:
                # compute relation quality score (RQS)
                rqs_new = Metrics.relation_quality_score(list(rqs.keys()), work_sent_rel_dict,
                                                         sqs_list[-1], wqs_list[-1])
                for relation in rqs_new:
                    max_delta = max(max_delta, abs(rqs_new[relation] - rqs_list[-1][relation]))
                    avg_rqs_delta += abs(rqs_new[relation] - rqs_list[-1][relation])
                avg_rqs_delta /= rqs_len

            # compute sentence quality score (SQS)
            for sentence_id in sent_work_rel_dict:
                sqs_new[sentence_id] = Metrics.sentence_quality_score(
                    sentence_id, sent_work_rel_dict,
                    wqs_list[-1], rqs_list[-1])
                max_delta = max(max_delta, abs(sqs_new[sentence_id] - sqs_list[-1][sentence_id]))
                avg_sqs_delta += abs(sqs_new[sentence_id] - sqs_list[-1][sentence_id])
            avg_sqs_delta /= sqs_len

            # compute worker quality score (WQS)
            for worker_id in work_sent_rel_dict:
                wwa_new[worker_id] = Metrics.worker_worker_agreement(
                    worker_id, work_sent_rel_dict,
                    sent_work_rel_dict,
                    wqs_list[-1],
                    sqs_list[-1],
                    rqs_list[-1])
                # BUGFIX: the worker quality was previously read from
                # wqs_list[len(rqs_list) - 1]; rqs_list does not grow for
                # open-ended tasks, so the latest worker scores were never used
                wsa_new[worker_id] = Metrics.worker_sentence_agreement(
                    worker_id,
                    sent_rel_dict,
                    work_sent_rel_dict,
                    sqs_list[-1],
                    rqs_list[-1],
                    wqs_list[-1][worker_id])
                wqs_new[worker_id] = wwa_new[worker_id] * wsa_new[worker_id]
                max_delta = max(max_delta, abs(wqs_new[worker_id] - wqs_list[-1][worker_id]))
                avg_wqs_delta += abs(wqs_new[worker_id] - wqs_list[-1][worker_id])
            avg_wqs_delta /= wqs_len

            # save results for current iteration
            sqs_list.append(sqs_new.copy())
            wqs_list.append(wqs_new.copy())
            wwa_list.append(wwa_new.copy())
            wsa_list.append(wsa_new.copy())
            if not config.open_ended_task:
                rqs_list.append(rqs_new.copy())
            iterations += 1

            # reconstruct sent_rel_dict as the worker-quality-weighted sum of
            # all worker annotation vectors, using the new worker scores
            new_sent_rel_dict = dict()
            for sent_id, rel_dict in sent_rel_dict.items():
                new_sent_rel_dict[sent_id] = dict()
                for relation in rel_dict:
                    new_sent_rel_dict[sent_id][relation] = 0.0
            for work_id, srd in work_sent_rel_dict.items():
                wqs_work_id = wqs_new[work_id]
                for sent_id, rel_dict in srd.items():
                    for relation, score in rel_dict.items():
                        new_sent_rel_dict[sent_id][relation] += score * wqs_work_id
            sent_rel_dict = new_sent_rel_dict

            logging.info(str(iterations) + " iterations; max d= " + str(max_delta) + " ; wqs d= " + str(avg_wqs_delta) + "; sqs d= " + str(avg_sqs_delta) + "; rqs d= " + str(avg_rqs_delta))

        # sentence-relation scores with the final worker quality scores
        srs = Counter()
        for sentence_id in sent_rel_dict:
            srs[sentence_id] = Counter()
            for relation in sent_rel_dict[sentence_id]:
                srs[sentence_id][relation] = Metrics.sentence_relation_score(
                    sentence_id, relation, sent_work_rel_dict, wqs_list[-1])

        # sentence-relation scores with the initial (all-1.0) worker scores
        srs_initial = Counter()
        for sentence_id in sent_rel_dict:
            srs_initial[sentence_id] = Counter()
            for relation in sent_rel_dict[sentence_id]:
                srs_initial[sentence_id][relation] = Metrics.sentence_relation_score(
                    sentence_id, relation, sent_work_rel_dict, wqs_list[0])

        # final (converged) scores
        results['units']['uqs'] = pd.Series(sqs_list[-1])
        results['units']['unit_annotation_score'] = pd.Series(srs)
        results['workers']['wqs'] = pd.Series(wqs_list[-1])
        results['workers']['wwa'] = pd.Series(wwa_list[-1])
        results['workers']['wsa'] = pd.Series(wsa_list[-1])
        if not config.open_ended_task:
            results['annotations']['aqs'] = pd.Series(rqs_list[-1])

        # scores after the first iteration, reported as the "initial" metrics
        results['units']['uqs_initial'] = pd.Series(sqs_list[1])
        results['units']['unit_annotation_score_initial'] = pd.Series(srs_initial)
        results['workers']['wqs_initial'] = pd.Series(wqs_list[1])
        results['workers']['wwa_initial'] = pd.Series(wwa_list[1])
        results['workers']['wsa_initial'] = pd.Series(wsa_list[1])
        if not config.open_ended_task:
            results['annotations']['aqs_initial'] = pd.Series(rqs_list[1])
        return results
418