GitHub Access Token became invalid

It seems that the GitHub access token used for retrieving details about this repository from GitHub has become invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.
Completed
Push — master ( d6d549...fbc1dc )
by Oana
20:56
created

crowdtruth.models.metrics.Metrics.compute_sqs()   A

Complexity

Conditions 2

Size

Total Lines 13
Code Lines 11

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 7
CRAP Score 2

Importance

Changes 0
Metric Value
eloc 11
dl 0
loc 13
ccs 7
cts 7
cp 1
rs 9.85
c 0
b 0
f 0
cc 2
nop 8
crap 2

How to fix   Many Parameters   

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

There are several approaches to avoid long parameter lists:

1 1
import logging
2 1
import math
3
4 1
from collections import Counter
5
6 1
import numpy as np
7 1
import pandas as pd
8
9 1
# Lower bound substituted for (near-)zero denominators and scores so that the
# divisions in the metrics below never divide by zero.
SMALL_NUMBER_CONST = 0.00000001


class Metrics():
    """
    Static helpers that iteratively compute the CrowdTruth quality scores.

    Naming used throughout the class:
        sqs -- sentence (unit) quality score
        wqs -- worker quality score
        rqs -- relation (annotation) quality score
        wwa -- worker-worker agreement
        wsa -- worker-sentence agreement
        srs -- sentence-relation score

    The nested dictionaries are indexed as follows:
        sent_work_rel_dict[sentence_id][worker_id][relation] -> score
        work_sent_rel_dict[worker_id][sentence_id][relation] -> score
        sent_rel_dict[sentence_id][relation] -> score
    """

    @staticmethod
    def _weighted_cosine(numerator, denominator_a, denominator_b):
        """
        Divide ``numerator`` by ``sqrt(denominator_a * denominator_b)``.

        The denominator is clamped to SMALL_NUMBER_CONST, so a vector whose
        weighted squared norm is zero yields a finite value instead of raising
        ZeroDivisionError.
        """
        denominator = math.sqrt(denominator_a * denominator_b)
        if denominator < SMALL_NUMBER_CONST:
            denominator = SMALL_NUMBER_CONST
        return numerator / denominator

    # Sentence Quality Score
    @staticmethod
    def sentence_quality_score(sentence_id, sent_work_rel_dict, wqs, rqs):
        """
        Compute the quality score of one sentence.

        Every unordered pair of workers that annotated the sentence contributes
        the cosine of their two vectors (relations weighted by ``rqs``); the
        pair is itself weighted by the product of both worker quality scores.

        sentence_id: key of the sentence in ``sent_work_rel_dict``
        sent_work_rel_dict: dict sentence_id -> worker_id -> relation -> score
        wqs: dict of worker_id (string) -> worker quality score
        rqs: dict of relation_id (string) -> relation quality (float)

        Returns the weighted average cosine as a float (0.0 when the sentence
        has fewer than two workers).
        """
        sqs_numerator = 0.0
        sqs_denominator = 0.0
        worker_vectors = sent_work_rel_dict[sentence_id]
        worker_ids = list(worker_vectors.keys())

        for position, worker_i in enumerate(worker_ids):
            vector_i = worker_vectors[worker_i]
            # only pairs (i, j) with j after i: each pair is counted once
            for worker_j in worker_ids[position + 1:]:
                vector_j = worker_vectors[worker_j]

                numerator = 0.0
                denominator_i = 0.0
                denominator_j = 0.0
                for relation in vector_i:
                    score_i = vector_i[relation]
                    score_j = vector_j[relation]
                    numerator += rqs[relation] * (score_i * score_j)
                    denominator_i += rqs[relation] * (score_i * score_i)
                    denominator_j += rqs[relation] * (score_j * score_j)

                weighted_cosine = Metrics._weighted_cosine(
                    numerator, denominator_i, denominator_j)

                sqs_numerator += weighted_cosine * wqs[worker_i] * wqs[worker_j]
                sqs_denominator += wqs[worker_i] * wqs[worker_j]

        if sqs_denominator < SMALL_NUMBER_CONST:
            sqs_denominator = SMALL_NUMBER_CONST
        return sqs_numerator / sqs_denominator

    # Worker - Sentence Agreement
    @staticmethod
    def worker_sentence_agreement(worker_id, sent_rel_dict, work_sent_rel_dict, sqs, rqs, wqs):
        """
        Compute how much one worker agrees with the sentences they annotated.

        For each sentence the worker's vector (scaled by ``wqs``) is compared,
        by weighted cosine, with the sentence vector minus that scaled worker
        vector; the per-sentence cosines are averaged using ``sqs`` as weights.

        worker_id: key of the worker in ``work_sent_rel_dict``
        sent_rel_dict: dict sentence_id -> relation -> score
        work_sent_rel_dict: dict worker_id -> sentence_id -> relation -> score
        sqs (sentence quality score): dict sentence_id -> sentence quality (float)
        rqs: dict of relation_id (string) -> relation quality (float)
        wqs: quality score of the given worker (a single number, not a dict)

        Returns the weighted average cosine as a float.
        """
        wsa_numerator = 0.0
        wsa_denominator = 0.0

        for sentence_id, worker_vector in work_sent_rel_dict[worker_id].items():
            numerator = 0.0
            denominator_w = 0.0
            denominator_s = 0.0
            sentence_vector = sent_rel_dict[sentence_id]

            for relation in worker_vector:
                worker_score = worker_vector[relation] * wqs
                # sentence score with this worker's scaled score taken out
                rest_score = sentence_vector[relation] - worker_score

                numerator += rqs[relation] * worker_score * rest_score
                denominator_w += rqs[relation] * (worker_score * worker_score)
                denominator_s += rqs[relation] * (rest_score * rest_score)

            denominator = math.sqrt(denominator_w * denominator_s)
            if denominator < SMALL_NUMBER_CONST:
                weighted_cosine = SMALL_NUMBER_CONST
            else:
                weighted_cosine = numerator / denominator

            wsa_numerator += weighted_cosine * sqs[sentence_id]
            wsa_denominator += sqs[sentence_id]

        if wsa_denominator < SMALL_NUMBER_CONST:
            wsa_denominator = SMALL_NUMBER_CONST
        return wsa_numerator / wsa_denominator

    # Worker - Worker Agreement
    @staticmethod
    def worker_worker_agreement(worker_id, work_sent_rel_dict, sent_work_rel_dict, wqs, sqs, rqs):
        """
        Compute how much one worker agrees with the other workers.

        For every sentence of the worker and every other worker on that
        sentence, the weighted cosine of both vectors is accumulated, weighted
        by the other worker's quality and the sentence quality.

        worker_id: key of the worker in ``work_sent_rel_dict``
        work_sent_rel_dict: dict worker_id -> sentence_id -> relation -> score
        sent_work_rel_dict: dict sentence_id -> worker_id -> relation -> score
        wqs: dict of worker_id (string) -> worker quality score
        sqs (sentence quality score): dict sentence_id -> sentence quality (float)
        rqs: dict of relation_id (string) -> relation quality (float)

        Returns the weighted average cosine as a float.
        """
        wwa_numerator = 0.0
        wwa_denominator = 0.0

        for sentence_id, own_vector in work_sent_rel_dict[worker_id].items():
            for other_worker_id, other_vector in sent_work_rel_dict[sentence_id].items():
                if worker_id == other_worker_id:
                    continue

                numerator = 0.0
                denominator_w = 0.0
                denominator_ow = 0.0
                for relation in own_vector:
                    other_score = other_vector[relation]
                    own_score = own_vector[relation]

                    numerator += rqs[relation] * (own_score * other_score)
                    denominator_w += rqs[relation] * (own_score * own_score)
                    denominator_ow += rqs[relation] * (other_score * other_score)

                weighted_cosine = Metrics._weighted_cosine(
                    numerator, denominator_w, denominator_ow)

                wwa_numerator += weighted_cosine * wqs[other_worker_id] * sqs[sentence_id]
                wwa_denominator += wqs[other_worker_id] * sqs[sentence_id]

        if wwa_denominator < SMALL_NUMBER_CONST:
            wwa_denominator = SMALL_NUMBER_CONST
        return wwa_numerator / wwa_denominator

    # Sentence - Relation Score
    @staticmethod
    def sentence_relation_score(sentence_id, relation, sent_work_rel_dict, wqs):
        """
        Compute the worker-quality-weighted average score of one relation
        on one sentence.

        sentence_id: key of the sentence in ``sent_work_rel_dict``
        relation: relation whose score is requested
        sent_work_rel_dict: dict sentence_id -> worker_id -> relation -> score
        wqs: dict of workers_id (string) -> worker quality (float)

        Returns the weighted average as a float.
        """
        srs_numerator = 0.0
        srs_denominator = 0.0

        for worker_id, worker_vector in sent_work_rel_dict[sentence_id].items():
            srs_numerator += worker_vector[relation] * wqs[worker_id]
            srs_denominator += wqs[worker_id]

        if srs_denominator < SMALL_NUMBER_CONST:
            srs_denominator = SMALL_NUMBER_CONST
        return srs_numerator / srs_denominator

    # Relation Quality Score
    @staticmethod
    def relation_quality_score(relations, work_sent_rel_dict, sqs, wqs):
        """
        Compute the quality score of every relation.

        For each ordered pair of distinct workers sharing at least one
        sentence, and for each relation, the sentence-quality-weighted ratio
        between "both workers scored the relation" and "worker j scored the
        relation" is accumulated, weighted by both worker quality scores.

        relations: iterable of relation ids to score
        work_sent_rel_dict: dict worker_id -> sentence_id -> relation -> score
        sqs (sentence quality score): dict sentence_id -> sentence quality (float)
        wqs: dict of workers_id (string) -> worker quality (float)

        Returns a dict relation -> quality; every value is at least
        SMALL_NUMBER_CONST.
        """
        rqs_numerator = dict()
        rqs_denominator = dict()
        for relation in relations:
            rqs_numerator[relation] = 0.0
            rqs_denominator[relation] = 0.0

        for worker_i, sentences_i in work_sent_rel_dict.items():
            for worker_j, sentences_j in work_sent_rel_dict.items():
                if worker_i == worker_j:
                    continue

                # computed once per worker pair instead of once per relation
                shared_sentences = [sentence_id for sentence_id in sentences_i
                                    if sentence_id in sentences_j]
                if not shared_sentences:
                    continue

                for relation in relations:
                    numerator = 0.0
                    denominator = 0.0
                    for sentence_id in shared_sentences:
                        score_j = sentences_j[sentence_id][relation]
                        score_i = sentences_i[sentence_id][relation]
                        numerator += sqs[sentence_id] * (score_i * score_j)
                        denominator += sqs[sentence_id] * score_j

                    # pairs where worker j never scored the relation carry no signal
                    if denominator > 0:
                        rqs_numerator[relation] += wqs[worker_i] * wqs[worker_j] * \
                                                    numerator / denominator
                        rqs_denominator[relation] += wqs[worker_i] * wqs[worker_j]

        rqs = dict()
        for relation in relations:
            if rqs_denominator[relation] > SMALL_NUMBER_CONST:
                rqs[relation] = rqs_numerator[relation] / rqs_denominator[relation]

                # prevent division by zero by storing very small value instead
                if rqs[relation] < SMALL_NUMBER_CONST:
                    rqs[relation] = SMALL_NUMBER_CONST
            else:
                rqs[relation] = SMALL_NUMBER_CONST
        return rqs

    @staticmethod
    def compute_rqs(rqs, work_sent_rel_dict, sqs_list, wqs_list, rqs_list, rqs_len, max_delta, avg_rqs_delta):
        """
        Compute the relation quality scores (RQS) for the next iteration.

        The latest entries of ``sqs_list`` and ``wqs_list`` are the inputs; the
        latest entry of ``rqs_list`` is the baseline for the deltas.

        Returns a tuple ``(rqs_new, max_delta, avg_rqs_delta)`` where
        ``max_delta`` is raised to the largest absolute change seen and
        ``avg_rqs_delta`` is the accumulated absolute change divided by
        ``rqs_len``.
        """
        rqs_new = Metrics.relation_quality_score(list(rqs.keys()), work_sent_rel_dict,
                                                 sqs_list[-1], wqs_list[-1])
        rqs_previous = rqs_list[-1]
        for rel in rqs_new:
            delta = abs(rqs_new[rel] - rqs_previous[rel])
            max_delta = max(max_delta, delta)
            avg_rqs_delta += delta
        avg_rqs_delta /= rqs_len

        return rqs_new, max_delta, avg_rqs_delta

    @staticmethod
    def compute_sqs(sqs_new, sent_work_rel_dict, wqs_list, rqs_list, sqs_list, sqs_len, max_delta, avg_sqs_delta):
        """
        Compute the sentence quality scores (SQS) for the next iteration.

        ``sqs_new`` is filled in place (and also returned) with one score per
        sentence of ``sent_work_rel_dict``, using the latest entries of
        ``wqs_list`` and ``rqs_list``; the latest entry of ``sqs_list`` is the
        baseline for the deltas.

        Returns a tuple ``(sqs_new, max_delta, avg_sqs_delta)``.
        """
        sqs_previous = sqs_list[-1]
        for sent_id in sent_work_rel_dict:
            sqs_new[sent_id] = Metrics.sentence_quality_score(sent_id, sent_work_rel_dict,
                                                              wqs_list[-1], rqs_list[-1])
            delta = abs(sqs_new[sent_id] - sqs_previous[sent_id])
            max_delta = max(max_delta, delta)
            avg_sqs_delta += delta
        avg_sqs_delta /= sqs_len

        return sqs_new, max_delta, avg_sqs_delta

    @staticmethod
    def compute_wqs(wwa_new, wsa_new, wqs_new, work_sent_rel_dict, sent_rel_dict, sent_work_rel_dict, wqs_list, \
                    sqs_list, rqs_list, wqs_len, max_delta, avg_wqs_delta):
        """
        Compute the worker quality scores (WQS) for the next iteration.

        For every worker, the worker-worker agreement and worker-sentence
        agreement are computed from the latest entries of ``wqs_list``,
        ``sqs_list`` and ``rqs_list``; their product is the new worker quality.
        ``wwa_new``, ``wsa_new`` and ``wqs_new`` are filled in place and also
        returned.

        Returns a tuple
        ``(wwa_new, wsa_new, wqs_new, max_delta, avg_wqs_delta)``.
        """
        wqs_previous = wqs_list[-1]
        for worker_id in work_sent_rel_dict:
            wwa_new[worker_id] = Metrics.worker_worker_agreement(
                worker_id, work_sent_rel_dict,
                sent_work_rel_dict,
                wqs_previous,
                sqs_list[-1],
                rqs_list[-1])
            # The worker's own quality must come from the latest wqs entry.
            # Indexing wqs_list by the length of rqs_list picked a stale entry
            # whenever rqs_list was not extended (open-ended tasks).
            wsa_new[worker_id] = Metrics.worker_sentence_agreement(
                worker_id,
                sent_rel_dict,
                work_sent_rel_dict,
                sqs_list[-1],
                rqs_list[-1],
                wqs_previous[worker_id])
            wqs_new[worker_id] = wwa_new[worker_id] * wsa_new[worker_id]

            delta = abs(wqs_new[worker_id] - wqs_previous[worker_id])
            max_delta = max(max_delta, delta)
            avg_wqs_delta += delta
        avg_wqs_delta /= wqs_len

        return wwa_new, wsa_new, wqs_new, max_delta, avg_wqs_delta

    @staticmethod
    def reconstruct_sent_rel_dict(sent_rel_dict, work_sent_rel_dict, wqs_new):
        """
        Rebuild ``sent_rel_dict`` with every worker score scaled by ``wqs_new``.

        The result has the same sentence and relation keys as
        ``sent_rel_dict``; each value is the sum over workers of
        ``score * wqs_new[worker]``. The input dictionaries are not modified.
        """
        new_sent_rel_dict = dict()
        for sent_id, rel_dict in sent_rel_dict.items():
            new_sent_rel_dict[sent_id] = {relation: 0.0 for relation in rel_dict}

        for work_id, srd in work_sent_rel_dict.items():
            wqs_work_id = wqs_new[work_id]
            for sent_id, rel_dict in srd.items():
                for relation, score in rel_dict.items():
                    new_sent_rel_dict[sent_id][relation] += score * wqs_work_id

        return new_sent_rel_dict

    @staticmethod
    def save_unit_rel_score(sent_rel_dict, sent_work_rel_dict, iteration_value):
        """
        Compute the sentence-relation score of every (sentence, relation) pair
        in ``sent_rel_dict``.

        ``iteration_value`` is the worker quality dict passed on to
        ``sentence_relation_score``. Returns a Counter of Counters indexed as
        ``srs[sentence_id][relation]``.
        """
        srs = Counter()
        for sentence_id, rel_dict in sent_rel_dict.items():
            sentence_scores = Counter()
            for relation in rel_dict:
                sentence_scores[relation] = Metrics.sentence_relation_score(
                    sentence_id, relation, sent_work_rel_dict, iteration_value)
            srs[sentence_id] = sentence_scores
        return srs

    @staticmethod
    def run(results, config, max_delta=0.001):
        """
        Iteratively run the CrowdTruth metrics until the scores are stable.

        results: dict holding the 'judgments', 'units', 'workers' and (for
            closed tasks) 'annotations' tables; score columns are added to
            'units', 'workers' and 'annotations' in place.
        config: object providing ``output`` (a mapping whose first value names
            the annotation column) and ``open_ended_task`` (relation quality is
            only computed when this is false).
        max_delta: starting value of the convergence delta. Iteration goes on
            while the largest score change of the previous pass is >= 0.001.

        Returns ``results``.
        """
        judgments = results['judgments'].copy()
        units = results['units'].copy()

        # sent_work_rel_dict, work_sent_rel_dict, sent_rel_dict
        # to be done: change to use all vectors in one unit
        col = list(config.output.values())[0]
        sent_rel_dict = dict(units.copy()[col])

        def expanded_vector(worker, unit):
            """
            Expand the vector of a worker on a given unit: one entry per key
            of the unit, 0 where the worker has no value.
            """
            vector = Counter()
            for rel in unit:
                vector[rel] = worker[rel] if rel in worker else 0
            return vector

        # fill judgment vectors with unit keys
        for index, row in judgments.iterrows():
            judgments.at[index, col] = expanded_vector(row[col], units.at[row['unit'], col])

        sent_work_rel_dict = judgments[['unit', 'worker', col]].copy().groupby('unit')
        sent_work_rel_dict = {name: group.set_index('worker')[col].to_dict()
                              for name, group in sent_work_rel_dict}

        work_sent_rel_dict = judgments[['worker', 'unit', col]].copy().groupby('worker')
        work_sent_rel_dict = {name: group.set_index('unit')[col].to_dict()
                              for name, group in work_sent_rel_dict}

        # initialize data structures: every score starts at 1.0
        sqs = dict((sentence_id, 1.0) for sentence_id in sent_work_rel_dict)
        wqs = dict((worker_id, 1.0) for worker_id in work_sent_rel_dict)
        wwa = dict((worker_id, 1.0) for worker_id in work_sent_rel_dict)
        wsa = dict((worker_id, 1.0) for worker_id in work_sent_rel_dict)

        sqs_list = [sqs.copy()]
        wqs_list = [wqs.copy()]
        wwa_list = [wwa.copy()]
        wsa_list = [wsa.copy()]

        # initialize RQS depending on whether or not it is an open ended task
        rqs = dict()
        if not config.open_ended_task:
            # closed task: the relations of the first sentence are used for all
            first_sentence_id = list(sent_rel_dict.keys())[0]
            for relation in list(sent_rel_dict[first_sentence_id].keys()):
                rqs[relation] = 1.0
        else:
            for sentence_id in sent_rel_dict:
                for relation in sent_rel_dict[sentence_id]:
                    rqs[relation] = 1.0
        rqs_list = [rqs.copy()]

        sqs_len = len(sqs) * 1.0
        wqs_len = len(wqs) * 1.0
        rqs_len = len(rqs) * 1.0

        # compute metrics until stable values
        iterations = 0
        while max_delta >= 0.001:
            sqs_new = dict()
            wqs_new = dict()
            wwa_new = dict()
            wsa_new = dict()

            avg_sqs_delta = 0.0
            avg_wqs_delta = 0.0
            avg_rqs_delta = 0.0
            max_delta = 0.0

            # bound on every path; stays None for open-ended tasks
            rqs_new = None
            if not config.open_ended_task:
                # compute relation quality score (RQS)
                rqs_new, max_delta, avg_rqs_delta = Metrics.compute_rqs(
                    rqs, work_sent_rel_dict,
                    sqs_list, wqs_list, rqs_list, rqs_len, max_delta, avg_rqs_delta)

            # compute sentence quality score (SQS)
            sqs_new, max_delta, avg_sqs_delta = Metrics.compute_sqs(
                sqs_new, sent_work_rel_dict,
                wqs_list, rqs_list, sqs_list, sqs_len, max_delta, avg_sqs_delta)

            # compute worker quality score (WQS)
            wwa_new, wsa_new, wqs_new, max_delta, avg_wqs_delta = Metrics.compute_wqs(
                wwa_new, wsa_new, wqs_new,
                work_sent_rel_dict, sent_rel_dict, sent_work_rel_dict, wqs_list,
                sqs_list, rqs_list, wqs_len, max_delta, avg_wqs_delta)

            # save results for current iteration
            sqs_list.append(sqs_new.copy())
            wqs_list.append(wqs_new.copy())
            wwa_list.append(wwa_new.copy())
            wsa_list.append(wsa_new.copy())
            if rqs_new is not None:
                rqs_list.append(rqs_new.copy())
            iterations += 1

            sent_rel_dict = Metrics.reconstruct_sent_rel_dict(sent_rel_dict, work_sent_rel_dict, wqs_new)

            logging.info("%s iterations; max d= %s ; wqs d= %s; sqs d= %s; rqs d= %s",
                         iterations, max_delta, avg_wqs_delta, avg_sqs_delta, avg_rqs_delta)

        srs = Metrics.save_unit_rel_score(sent_rel_dict, sent_work_rel_dict, wqs_list[-1])
        srs_initial = Metrics.save_unit_rel_score(sent_rel_dict, sent_work_rel_dict, wqs_list[0])

        # final scores: last iteration
        results['units']['uqs'] = pd.Series(sqs_list[-1])
        results['units']['unit_annotation_score'] = pd.Series(srs)
        results['workers']['wqs'] = pd.Series(wqs_list[-1])
        results['workers']['wwa'] = pd.Series(wwa_list[-1])
        results['workers']['wsa'] = pd.Series(wsa_list[-1])
        if not config.open_ended_task:
            results['annotations']['aqs'] = pd.Series(rqs_list[-1])

        # "initial" scores: result of the first iteration (index 1; index 0 is all 1.0)
        results['units']['uqs_initial'] = pd.Series(sqs_list[1])
        results['units']['unit_annotation_score_initial'] = pd.Series(srs_initial)
        results['workers']['wqs_initial'] = pd.Series(wqs_list[1])
        results['workers']['wwa_initial'] = pd.Series(wwa_list[1])
        results['workers']['wsa_initial'] = pd.Series(wsa_list[1])
        if not config.open_ended_task:
            results['annotations']['aqs_initial'] = pd.Series(rqs_list[1])
        return results
446