"""Coreference evaluation metrics (MUC, B-cubed, CEAFe, LEA)."""

import numpy as np

from collections import Counter
from scipy.optimize import linear_sum_assignment as linear_assignment


def f1(p_num, p_den, r_num, r_den, beta=1):
    """Compute the F-beta score from precision/recall numerators and denominators."""
    p = 0 if p_den == 0 else p_num / float(p_den)
    r = 0 if r_den == 0 else r_num / float(r_den)
    return 0 if p + r == 0 else (1 + beta * beta) * p * r / (beta * beta * p + r)


class CorefEvaluator(object):
    """Aggregates MUC, B-cubed, and CEAFe and reports their unweighted average."""

    def __init__(self):
        self.evaluators = [Evaluator(m) for m in (muc, b_cubed, ceafe)]

    def update(self, predicted, gold, mention_to_predicted, mention_to_gold):
        for e in self.evaluators:
            e.update(predicted, gold, mention_to_predicted, mention_to_gold)

    def get_f1(self):
        return sum(e.get_f1() for e in self.evaluators) / len(self.evaluators)

    def get_recall(self):
        return sum(e.get_recall() for e in self.evaluators) / len(self.evaluators)

    def get_precision(self):
        return sum(e.get_precision() for e in self.evaluators) / len(self.evaluators)

    def get_prf(self):
        return self.get_precision(), self.get_recall(), self.get_f1()


class Evaluator(object):
    """Accumulates precision/recall counts for a single metric across documents."""

    def __init__(self, metric, beta=1):
        self.p_num = 0
        self.p_den = 0
        self.r_num = 0
        self.r_den = 0
        self.metric = metric
        self.beta = beta

    def update(self, predicted, gold, mention_to_predicted, mention_to_gold):
        if self.metric == ceafe:
            # CEAFe scores predicted clusters against gold clusters directly.
            pn, pd, rn, rd = self.metric(predicted, gold)
        else:
            # Mention-aligned metrics are computed once per direction:
            # predicted vs. gold for precision, gold vs. predicted for recall.
            pn, pd = self.metric(predicted, mention_to_gold)
            rn, rd = self.metric(gold, mention_to_predicted)
        self.p_num += pn
        self.p_den += pd
        self.r_num += rn
        self.r_den += rd

    def get_f1(self):
        return f1(self.p_num, self.p_den, self.r_num, self.r_den, beta=self.beta)

    def get_recall(self):
        return 0 if self.r_num == 0 else self.r_num / float(self.r_den)

    def get_precision(self):
        return 0 if self.p_num == 0 else self.p_num / float(self.p_den)

    def get_prf(self):
        return self.get_precision(), self.get_recall(), self.get_f1()

    def get_counts(self):
        return self.p_num, self.p_den, self.r_num, self.r_den


def evaluate_documents(documents, metric, beta=1):
    """Score a collection of documents with a single metric.

    Each document is expected to be a (predicted, gold, mention_to_predicted,
    mention_to_gold) tuple matching the arguments of Evaluator.update.
    """
    evaluator = Evaluator(metric, beta=beta)
    for document in documents:
        evaluator.update(*document)
    return evaluator.get_precision(), evaluator.get_recall(), evaluator.get_f1()


def b_cubed(clusters, mention_to_gold):
    """B-cubed numerator/denominator for one direction.

    Called with predicted clusters and mention_to_gold for precision, and
    with gold clusters and mention_to_predicted for recall. Singleton
    clusters are skipped.
    """
    num, dem = 0, 0

    for c in clusters:
        if len(c) == 1:
            continue

        gold_counts = Counter()
        correct = 0
        for m in c:
            if m in mention_to_gold:
                gold_counts[tuple(mention_to_gold[m])] += 1
        for c2, count in gold_counts.items():
            if len(c2) != 1:
                correct += count * count

        num += correct / float(len(c))
        dem += len(c)

    return num, dem


def muc(clusters, mention_to_gold):
    """MUC numerator/denominator: coreference links recovered vs. links present."""
    tp, p = 0, 0
    for c in clusters:
        p += len(c) - 1
        tp += len(c)
        linked = set()
        for m in c:
            if m in mention_to_gold:
                linked.add(mention_to_gold[m])
            else:
                # A mention with no counterpart forms its own partition.
                tp -= 1
        tp -= len(linked)
    return tp, p


def phi4(c1, c2):
    """CEAFe similarity: normalized mention overlap between two clusters."""
    return 2 * len([m for m in c1 if m in c2]) / float(len(c1) + len(c2))


def ceafe(clusters, gold_clusters):
    """CEAFe: optimal one-to-one cluster alignment via the Hungarian algorithm."""
    clusters = [c for c in clusters if len(c) != 1]
    scores = np.zeros((len(gold_clusters), len(clusters)))
    for i in range(len(gold_clusters)):
        for j in range(len(clusters)):
            scores[i, j] = phi4(gold_clusters[i], clusters[j])
    # linear_sum_assignment minimizes cost, so negate the similarity matrix.
    matching_row, matching_col = linear_assignment(-scores)
    similarity = sum(scores[matching_row, matching_col])
    return similarity, len(clusters), similarity, len(gold_clusters)


def lea(clusters, mention_to_gold):
    """LEA: link-based entity-aware score, weighting each cluster by its size."""
    num, dem = 0, 0

    for c in clusters:
        if len(c) == 1:
            continue

        common_links = 0
        all_links = len(c) * (len(c) - 1) / 2.0
        for i, m in enumerate(c):
            if m in mention_to_gold:
                for m2 in c[i + 1:]:
                    if m2 in mention_to_gold and mention_to_gold[m] == mention_to_gold[m2]:
                        common_links += 1

        num += len(c) * common_links / float(all_links)
        dem += len(c)

    return num, dem
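

# ---------------------------------------------------------------------------
# Illustrative usage (a minimal sketch, not part of the original module).
# The mention spans and clusters below are made-up toy data; they only show
# the expected input format: clusters are tuples of (start, end) spans, and
# the mention_to_* dicts map each span to the cluster tuple that contains it.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    gold = [((0, 1), (5, 6)), ((10, 11), (14, 15))]
    predicted = [((0, 1), (5, 6)), ((10, 11), (20, 21))]

    mention_to_gold = {m: c for c in gold for m in c}
    mention_to_predicted = {m: c for c in predicted for m in c}

    evaluator = CorefEvaluator()
    evaluator.update(predicted, gold, mention_to_predicted, mention_to_gold)
    p, r, f = evaluator.get_prf()
    print("Avg P/R/F1 over MUC, B^3, CEAFe: %.3f %.3f %.3f" % (p, r, f))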