GitHub Access Token became invalid

It seems like the GitHub access token used for retrieving details about this repository from GitHub became invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.
Completed
Push — master ( fbc1dc...b92d79 )
by Oana
18:54
created

Metrics.sentence_quality_score()   B

Complexity

Conditions 5

Size

Total Lines 39
Code Lines 25

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 23
CRAP Score 5

Importance

Changes 0
Metric Value
eloc 25
dl 0
loc 39
rs 8.8133
c 0
b 0
f 0
ccs 23
cts 23
cp 1
cc 5
nop 4
crap 5
1 1
import logging
2 1
import math
3
4 1
from collections import Counter
5
6 1
import numpy as np
7 1
import pandas as pd
8
9 1
SMALL_NUMBER_CONST = 0.00000001
10
11 1
class Metrics():
12
13
14
    # Sentence Quality Score
15 1
    @staticmethod
    def sentence_quality_score(sentence_id, sent_work_rel_dict, wqs, rqs):
        '''
        Compute the quality score of one sentence.

        Every unordered pair of workers that annotated the sentence
        contributes the cosine similarity of their two annotation vectors,
        where each relation is weighted by its quality (rqs). The pair
        cosines are averaged, each pair weighted by the product of the two
        workers' quality scores (wqs).

        sentence_id: key of the sentence in sent_work_rel_dict
        sent_work_rel_dict: dict sentence_id -> worker_id -> relation -> score
        wqs: dict of worker_id (string) -> worker quality score
        rqs: dict of relation_id (string) -> relation quality (float)

        Returns the weighted average cosine (float); 0.0 when the sentence
        has fewer than two workers.
        '''
        sqs_numerator = 0.0
        sqs_denominator = 0.0
        worker_vectors = sent_work_rel_dict[sentence_id]
        worker_ids = list(worker_vectors.keys())

        for position, first_id in enumerate(worker_ids):
            first_vector = worker_vectors[first_id]
            # only pair with workers further down the list, so that every
            # unordered pair is visited exactly once
            for second_id in worker_ids[position + 1:]:
                second_vector = worker_vectors[second_id]

                numerator = 0.0
                denominator_i = 0.0
                denominator_j = 0.0
                for relation in first_vector:
                    first_score = first_vector[relation]
                    second_score = second_vector[relation]
                    numerator += rqs[relation] * (first_score * second_score)
                    denominator_i += rqs[relation] * (first_score * first_score)
                    denominator_j += rqs[relation] * (second_score * second_score)

                # a worker whose vector is all zeros makes the norm zero;
                # clamp it like the other denominators in this class instead
                # of raising ZeroDivisionError
                norm = math.sqrt(denominator_i * denominator_j)
                if norm < SMALL_NUMBER_CONST:
                    norm = SMALL_NUMBER_CONST
                weighted_cosine = numerator / norm

                sqs_numerator += weighted_cosine * wqs[first_id] * wqs[second_id]
                sqs_denominator += wqs[first_id] * wqs[second_id]

        if sqs_denominator < SMALL_NUMBER_CONST:
            sqs_denominator = SMALL_NUMBER_CONST
        return sqs_numerator / sqs_denominator
54
55
56
    # Worker - Sentence Agreement
57 1
    @staticmethod
    def worker_sentence_agreement(worker_id, sent_rel_dict, work_sent_rel_dict, sqs, rqs, wqs):
        '''
        Compute the agreement of one worker with the sentences they annotated.

        For each sentence of the worker, the worker's vector is scaled by the
        worker's quality and compared (cosine, relations weighted by rqs)
        with the sentence vector from which that scaled worker vector has
        been subtracted. The per-sentence cosines are averaged, weighted by
        sentence quality.

        worker_id: key of the worker in work_sent_rel_dict
        sent_rel_dict: dict sentence_id -> relation -> score
        work_sent_rel_dict: dict worker_id -> sentence_id -> relation -> score
        sqs (sentence quality score): dict sentence_id -> sentence quality (float)
        rqs: dict of relation_id (string) -> relation quality (float)
        wqs: quality score of the given worker

        Returns the weighted average cosine (float).
        '''
        annotated = work_sent_rel_dict[worker_id]
        weighted_total = 0.0
        quality_total = 0.0

        for sentence_id in annotated:
            own_vector = annotated[sentence_id]
            unit_vector = sent_rel_dict[sentence_id]

            dot = 0.0
            own_norm = 0.0
            rest_norm = 0.0
            for relation in own_vector:
                own = own_vector[relation] * wqs
                # what is left of the sentence vector without this worker
                rest = unit_vector[relation] - own
                rel_quality = rqs[relation]

                dot += rel_quality * own * rest
                own_norm += rel_quality * (own * own)
                rest_norm += rel_quality * (rest * rest)

            norm = math.sqrt(own_norm * rest_norm)
            # degenerate vectors get the minimal score instead of dividing by zero
            cosine = SMALL_NUMBER_CONST if norm < SMALL_NUMBER_CONST else dot / norm

            weighted_total += cosine * sqs[sentence_id]
            quality_total += sqs[sentence_id]

        return weighted_total / max(quality_total, SMALL_NUMBER_CONST)
101
102
    # Worker - Worker Agreement
103 1
    @staticmethod
    def worker_worker_agreement(worker_id, work_sent_rel_dict, sent_work_rel_dict, wqs, sqs, rqs):
        '''
        Compute the agreement of one worker with the other workers.

        For every sentence the worker annotated and every other worker on
        that sentence, the cosine similarity of the two annotation vectors
        is computed with relations weighted by rqs. The cosines are averaged,
        each weighted by the other worker's quality times the sentence
        quality.

        worker_id: key of the worker in work_sent_rel_dict
        work_sent_rel_dict: dict worker_id -> sentence_id -> relation -> score
        sent_work_rel_dict: dict sentence_id -> worker_id -> relation -> score
        wqs: dict of worker_id (string) -> worker quality (float)
        sqs (sentence quality score): dict sentence_id -> sentence quality (float)
        rqs: dict of relation_id (string) -> relation quality (float)

        Returns the weighted average cosine (float); 0.0 when the worker
        shares no sentence with anybody else.
        '''
        wwa_numerator = 0.0
        wwa_denominator = 0.0

        for sentence_id, own_vector in work_sent_rel_dict[worker_id].items():
            for other_worker_id, other_vector in sent_work_rel_dict[sentence_id].items():
                if other_worker_id == worker_id:
                    continue

                numerator = 0.0
                denominator_w = 0.0
                denominator_ow = 0.0
                for relation in own_vector:
                    other_score = other_vector[relation]
                    own_score = own_vector[relation]

                    numerator += rqs[relation] * (own_score * other_score)
                    denominator_w += rqs[relation] * (own_score * own_score)
                    denominator_ow += rqs[relation] * (other_score * other_score)

                # an all-zero vector on either side makes the norm zero;
                # clamp it like the other denominators in this class instead
                # of raising ZeroDivisionError
                norm = math.sqrt(denominator_w * denominator_ow)
                if norm < SMALL_NUMBER_CONST:
                    norm = SMALL_NUMBER_CONST
                weighted_cosine = numerator / norm

                wwa_numerator += weighted_cosine * wqs[other_worker_id] * sqs[sentence_id]
                wwa_denominator += wqs[other_worker_id] * sqs[sentence_id]

        if wwa_denominator < SMALL_NUMBER_CONST:
            wwa_denominator = SMALL_NUMBER_CONST
        return wwa_numerator / wwa_denominator
148
149
150
151
    # Sentence - Relation Score
152 1
    @staticmethod
    def sentence_relation_score(sentence_id, relation, sent_work_rel_dict, wqs):
        '''
        Compute the score of one relation on one sentence.

        The score is the average of the workers' values for the relation on
        this sentence, each worker weighted by their quality.

        sentence_id: key of the sentence in sent_work_rel_dict
        relation: the relation to score
        sent_work_rel_dict: dict sentence_id -> worker_id -> relation -> score
        wqs: dict of workers_id (string) -> worker quality (float)

        Returns the weighted average (float).
        '''
        weighted_votes = 0.0
        quality_total = 0.0

        for worker_id, vector in sent_work_rel_dict[sentence_id].items():
            vote = vector[relation]
            quality = wqs[worker_id]
            weighted_votes += vote * quality
            quality_total += quality

        # never divide by (almost) zero
        return weighted_votes / max(quality_total, SMALL_NUMBER_CONST)
170
171
172
    # Relation Quality Score
173 1
    @staticmethod
    def relation_quality_score(relations, work_sent_rel_dict, sqs, wqs):
        '''
        Compute the quality score of every relation.

        For every ordered pair of distinct workers (i, j) that share at least
        one sentence, and for every relation, the ratio

            sum(sqs[s] * score_i[s] * score_j[s]) / sum(sqs[s] * score_j[s])

        is taken over the shared sentences s. Pairs whose denominator is not
        positive are skipped. The ratios are averaged per relation, each pair
        weighted by the product of the two workers' quality scores.

        relations: list of relation ids to score
        work_sent_rel_dict: dict worker_id -> sentence_id -> relation -> score
        sqs (sentence quality score): dict sentence_id -> sentence quality (float)
        wqs: dict of workers_id (string) -> worker quality (float)

        Returns a dict relation -> quality (float); every value is at least
        SMALL_NUMBER_CONST so that it can safely be used as a weight.
        '''
        rqs_numerator = dict((relation, 0.0) for relation in relations)
        rqs_denominator = dict((relation, 0.0) for relation in relations)

        for worker_i, sentences_i in work_sent_rel_dict.items():
            for worker_j, sentences_j in work_sent_rel_dict.items():
                if worker_i == worker_j:
                    continue

                # the shared sentences do not depend on the relation, so find
                # them once per worker pair (in worker i's order, which keeps
                # the floating point accumulation order stable)
                shared = [sentence_id for sentence_id in sentences_i
                          if sentence_id in sentences_j]
                if not shared:
                    continue

                for relation in relations:
                    numerator = 0.0
                    denominator = 0.0
                    for sentence_id in shared:
                        score_j = sentences_j[sentence_id][relation]
                        numerator += sqs[sentence_id] * (
                            sentences_i[sentence_id][relation] * score_j)
                        denominator += sqs[sentence_id] * score_j

                    if denominator > 0:
                        rqs_numerator[relation] += wqs[worker_i] * wqs[worker_j] * \
                                                    numerator / denominator
                        rqs_denominator[relation] += wqs[worker_i] * wqs[worker_j]

        rqs = dict()
        for relation in relations:
            if rqs_denominator[relation] > SMALL_NUMBER_CONST:
                rqs[relation] = rqs_numerator[relation] / rqs_denominator[relation]

                # prevent division by zero by storing very small value instead
                if rqs[relation] < SMALL_NUMBER_CONST:
                    rqs[relation] = SMALL_NUMBER_CONST
            else:
                rqs[relation] = SMALL_NUMBER_CONST
        return rqs
228
229
230 1
    @staticmethod
    def run(results, config, max_delta=0.001):
        '''
        iteratively run the CrowdTruth metrics

        Sentence, worker and (for closed tasks) relation quality scores all
        start at 1.0 and are recomputed from the previous iteration's values
        until no score changes by max_delta or more. At least one iteration
        is always performed.

        results: dict of pandas data frames. 'judgments' needs the columns
            'unit', 'worker' and the annotation vector column; 'units' is
            indexed by unit id and needs the annotation vector column;
            'workers' and (closed tasks only) 'annotations' receive the
            computed scores.
        config: object with the attributes `output` (dict whose first value
            names the annotation vector column) and `open_ended_task`
            (relation quality is only computed when it is false)
        max_delta: convergence threshold, must be positive

        Returns `results`, modified in place with these extra columns:
        'uqs' and 'unit_annotation_score' on units, 'wqs', 'wwa' and 'wsa' on
        workers, 'aqs' on annotations (closed tasks only). The matching
        '*_initial' columns hold the values after the first iteration, except
        'unit_annotation_score_initial', which is computed with the initial
        worker qualities (all 1.0).

        Raises ValueError when max_delta is not positive.
        '''
        # a non-positive (or NaN) threshold could never be reached
        if not max_delta > 0:
            raise ValueError("max_delta must be a positive convergence threshold")

        judgments = results['judgments'].copy()
        units = results['units'].copy()

        # sent_work_rel_dict, work_sent_rel_dict, sent_rel_dict
        # to be done: change to use all vectors in one unit
        col = list(config.output.values())[0]
        sent_rel_dict = dict(units.copy()[col])

        def expanded_vector(worker, unit):
            '''
            expand the vector of a worker on a given unit
            '''
            vector = Counter()
            for rel in unit:
                vector[rel] = worker[rel] if rel in worker else 0
            return vector

        def score_deltas(new_scores, old_scores, max_delta):
            """ return the updated max delta and the average absolute change """
            total = 0.0
            for key in new_scores:
                delta = abs(new_scores[key] - old_scores[key])
                max_delta = max(max_delta, delta)
                total += delta
            average = total / len(new_scores) if new_scores else 0.0
            return max_delta, average

        def reconstruct_sent_rel_dict(sent_rel_dict, work_sent_rel_dict, wqs_new):
            """ reconstruct sent_rel_dict with worker scores """
            new_sent_rel_dict = dict()
            for sent_id, rel_dict in sent_rel_dict.items():
                new_sent_rel_dict[sent_id] = dict((relation, 0.0) for relation in rel_dict)
            for work_id, srd in work_sent_rel_dict.items():
                wqs_work_id = wqs_new[work_id]
                for sent_id, rel_dict in srd.items():
                    for relation, score in rel_dict.items():
                        new_sent_rel_dict[sent_id][relation] += score * wqs_work_id
            return new_sent_rel_dict

        def save_unit_rel_score(sent_rel_dict, sent_work_rel_dict, iteration_value):
            """ save the unit relation score for print """
            srs = Counter()
            for sentence_id in sent_rel_dict:
                srs[sentence_id] = Counter()
                for relation in sent_rel_dict[sentence_id]:
                    srs[sentence_id][relation] = Metrics.sentence_relation_score(
                        sentence_id, relation, sent_work_rel_dict, iteration_value)
            return srs

        # fill judgment vectors with unit keys
        for index, row in judgments.iterrows():
            judgments.at[index, col] = expanded_vector(row[col], units.at[row['unit'], col])

        sent_work_rel_dict = {name: group.set_index('worker')[col].to_dict()
                              for name, group in judgments[['unit', 'worker', col]].groupby('unit')}
        work_sent_rel_dict = {name: group.set_index('unit')[col].to_dict()
                              for name, group in judgments[['worker', 'unit', col]].groupby('worker')}

        # initialize data structures; every list holds one dict per iteration
        sqs_list = [dict((sentence_id, 1.0) for sentence_id in sent_work_rel_dict)]
        wqs_list = [dict((worker_id, 1.0) for worker_id in work_sent_rel_dict)]
        wwa_list = [dict((worker_id, 1.0) for worker_id in work_sent_rel_dict)]
        wsa_list = [dict((worker_id, 1.0) for worker_id in work_sent_rel_dict)]

        # initialize RQS depending on whether or not it is an open ended task
        closed_task = not config.open_ended_task
        rqs = dict()
        if closed_task:
            # closed task: every unit has the same relations, take the first
            for relation in sent_rel_dict[list(sent_rel_dict.keys())[0]].keys():
                rqs[relation] = 1.0
        else:
            for sentence_id in sent_rel_dict:
                for relation in sent_rel_dict[sentence_id]:
                    rqs[relation] = 1.0
        rqs_list = [rqs.copy()]

        # compute metrics until stable values
        threshold = max_delta
        iterations = 0
        while True:
            # every score of this iteration is computed from the values of
            # the previous iteration
            prev_sqs = sqs_list[-1]
            prev_wqs = wqs_list[-1]
            prev_rqs = rqs_list[-1]

            max_delta = 0.0
            avg_rqs_delta = 0.0

            if closed_task:
                # compute relation quality score (RQS)
                rqs_new = Metrics.relation_quality_score(
                    list(rqs.keys()), work_sent_rel_dict, prev_sqs, prev_wqs)
                max_delta, avg_rqs_delta = score_deltas(rqs_new, prev_rqs, max_delta)

            # compute sentence quality score (SQS)
            sqs_new = dict()
            for sent_id in sent_work_rel_dict:
                sqs_new[sent_id] = Metrics.sentence_quality_score(
                    sent_id, sent_work_rel_dict, prev_wqs, prev_rqs)
            max_delta, avg_sqs_delta = score_deltas(sqs_new, prev_sqs, max_delta)

            # compute worker quality score (WQS)
            wwa_new = dict()
            wsa_new = dict()
            wqs_new = dict()
            for worker_id in work_sent_rel_dict:
                wwa_new[worker_id] = Metrics.worker_worker_agreement(
                    worker_id, work_sent_rel_dict, sent_work_rel_dict,
                    prev_wqs, prev_sqs, prev_rqs)
                # the worker's own previous quality; must come from the latest
                # wqs entry even when rqs_list does not grow (open ended tasks)
                wsa_new[worker_id] = Metrics.worker_sentence_agreement(
                    worker_id, sent_rel_dict, work_sent_rel_dict,
                    prev_sqs, prev_rqs, prev_wqs[worker_id])
                wqs_new[worker_id] = wwa_new[worker_id] * wsa_new[worker_id]
            max_delta, avg_wqs_delta = score_deltas(wqs_new, prev_wqs, max_delta)

            # save results for current iteration
            sqs_list.append(sqs_new)
            wqs_list.append(wqs_new)
            wwa_list.append(wwa_new)
            wsa_list.append(wsa_new)
            if closed_task:
                rqs_list.append(rqs_new)
            iterations += 1

            sent_rel_dict = reconstruct_sent_rel_dict(sent_rel_dict, work_sent_rel_dict, wqs_new)

            logging.info("%s iterations; max d= %s ; wqs d= %s; sqs d= %s; rqs d= %s",
                         iterations, max_delta, avg_wqs_delta, avg_sqs_delta, avg_rqs_delta)

            if max_delta < threshold:
                break

        srs = save_unit_rel_score(sent_rel_dict, sent_work_rel_dict, wqs_list[-1])
        srs_initial = save_unit_rel_score(sent_rel_dict, sent_work_rel_dict, wqs_list[0])

        results['units']['uqs'] = pd.Series(sqs_list[-1])
        results['units']['unit_annotation_score'] = pd.Series(srs)
        results['workers']['wqs'] = pd.Series(wqs_list[-1])
        results['workers']['wwa'] = pd.Series(wwa_list[-1])
        results['workers']['wsa'] = pd.Series(wsa_list[-1])
        if closed_task:
            results['annotations']['aqs'] = pd.Series(rqs_list[-1])

        results['units']['uqs_initial'] = pd.Series(sqs_list[1])
        results['units']['unit_annotation_score_initial'] = pd.Series(srs_initial)
        results['workers']['wqs_initial'] = pd.Series(wqs_list[1])
        results['workers']['wwa_initial'] = pd.Series(wwa_list[1])
        results['workers']['wsa_initial'] = pd.Series(wsa_list[1])
        if closed_task:
            results['annotations']['aqs_initial'] = pd.Series(rqs_list[1])
        return results
441