1
|
|
|
from . import __version__ |
|
|
|
|
2
|
|
|
import logging |
|
|
|
|
3
|
|
|
|
4
|
|
|
import KafNafParserPy |
|
|
|
|
5
|
|
|
from KafNafParserPy import KafNafParser |
|
|
|
|
6
|
|
|
from lxml.etree import XMLSyntaxError |
|
|
|
|
7
|
|
|
import itertools |
|
|
|
|
8
|
|
|
import tensorflow.compat.v1 as tf |
|
|
|
|
9
|
|
|
tf.disable_v2_behavior() |
10
|
|
|
|
11
|
|
|
logger = logging.getLogger('e2edutch') |
12
|
|
|
this_name = 'Coreference resolution based on e2e model' |
|
|
|
|
13
|
|
|
|
14
|
|
|
|
15
|
|
|
def get_naf(input_filename):
    """Read a NAF document from *input_filename*.

    First tries to parse the file as NAF/KAF XML. If XML parsing fails
    but the content looks like a NAF document ("<NAF" ... "</NAF>"),
    the parse error is logged and re-raised. Otherwise the content is
    treated as raw text and wrapped in a fresh NAF object
    (version "3.0", language "nl").

    Parameters
    ----------
    input_filename : str
        Path to a NAF XML file or a plain-text file.

    Returns
    -------
    KafNafParser

    Raises
    ------
    XMLSyntaxError
        When the file looks like NAF but the XML is malformed.
    """
    try:
        naf = KafNafParser(input_filename)
    except XMLSyntaxError:
        # Renamed from `input` to avoid shadowing the builtin.
        with open(input_filename) as input_file:
            raw_text = input_file.read()
        if "<NAF" in raw_text and "</NAF>" in raw_text:
            # Looks like it should be a NAF file but something is wrong.
            logger.exception("Error parsing NAF file")
            raise
        # Plain text: wrap it in a minimal NAF document.
        naf = KafNafParser(type="NAF")
        naf.set_version("3.0")
        naf.set_language("nl")
        naf.lang = "nl"
        naf.raw = raw_text
        naf.set_raw(naf.raw)
    return naf
32
|
|
|
|
33
|
|
|
|
34
|
|
|
def get_naf_from_sentences(sentences):
    """Build a NAF object from pre-tokenized sentences.

    Parameters
    ----------
    sentences : list of list of str
        Each inner list is one sentence's tokens.

    Creates a NAF document (version "3.0", language "nl") whose raw text
    joins tokens with spaces and sentences with newlines, then adds a
    text layer (one Cwf per token, with character offsets into the raw
    text) and a term layer (one Cterm per token).

    Returns
    -------
    (KafNafParser, list of list of str)
        The NAF object and per-sentence lists of term identifiers.
    """
    naf_obj = KafNafParser(type="NAF")
    naf_obj.set_version("3.0")
    naf_obj.set_language("nl")
    naf_obj.lang = "nl"
    naf_obj.raw = '\n'.join([' '.join(s) for s in sentences])
    naf_obj.set_raw(naf_obj.raw)

    # Create text layer
    wcount = 1
    offsets = {}
    txt = naf_obj.get_raw()
    # Position in txt just past the previously placed token. The original
    # code searched from the previous token's *start* offset, which gave
    # consecutive identical tokens (e.g. "the the") the same offset.
    search_pos = 0
    token_ids = []
    for sid, sentence in enumerate(sentences):
        token_ids_sub = []
        for token in sentence:
            token_obj = KafNafParserPy.Cwf(type=naf_obj.get_type())
            token_id = 'w{}'.format(wcount)
            token_length = len(token)
            offsets[wcount] = txt.find(token, search_pos)
            search_pos = offsets[wcount] + token_length
            token_obj.set_id(token_id)
            token_obj.set_length(str(token_length))
            token_obj.set_para('1')
            token_obj.set_sent(str(sid + 1))
            token_obj.set_text(token)
            token_obj.set_offset(str(offsets[wcount]))
            token_ids_sub.append(token_id)
            wcount += 1
            naf_obj.add_wf(token_obj)
        token_ids.append(token_ids_sub)

    # Create term layer: one term per token, spanning exactly that token.
    logger.info('Creating the term layer...')
    term_ids = []
    count_terms = 0
    for sentence, token_ids_sub in zip(sentences, token_ids):
        term_ids_sub = []
        for token, token_id in zip(sentence, token_ids_sub):
            new_term_id = 't_' + str(count_terms)
            count_terms += 1
            term_ids_sub.append(new_term_id)
            term_obj = KafNafParserPy.Cterm(type=naf_obj.get_type())
            term_obj.set_id(new_term_id)
            new_span = KafNafParserPy.Cspan()
            new_span.create_from_ids([token_id])
            term_obj.set_span(new_span)
            naf_obj.add_term(term_obj)
        term_ids.append(term_ids_sub)

    return naf_obj, term_ids
84
|
|
|
|
85
|
|
|
|
86
|
|
|
def create_coref_layer(knaf_obj, clusters, term_ids):
    """Add a coreference layer to *knaf_obj* from model clusters.

    Parameters
    ----------
    knaf_obj : KafNafParser
        The NAF object to extend (modified in place).
    clusters : iterable
        Each cluster is a list of (start, end) index pairs into the
        flattened term list; *end* is inclusive.
    term_ids : list of list of str
        Per-sentence term identifiers in document order.

    Returns
    -------
    KafNafParser
        The same object, with one Ccoreference entry per cluster.
    """
    term_ids_list = list(itertools.chain.from_iterable(term_ids))
    for cluster_id, cluster in enumerate(clusters):
        coref_obj = KafNafParserPy.Ccoreference(type=knaf_obj.get_type())
        coref_obj.set_id('co{}'.format(cluster_id + 1))
        coref_obj.set_type('entity')
        for start, end in cluster:
            span_term_ids = term_ids_list[start:end + 1]
            coref_obj.add_span(span_term_ids)
            # Surface text of the span (idiomatic join, replacing the
            # original nested append loops). Currently unused:
            # TODO: output span_text as comment
            span_text = ' '.join(
                knaf_obj.get_token(word_id).get_text()
                for term_id in span_term_ids
                for word_id in knaf_obj.get_term(term_id).get_span_ids())
        knaf_obj.add_coreference(coref_obj)
    return knaf_obj
104
|
|
|
|
105
|
|
|
|
106
|
|
|
def add_linguistic_processors(in_obj):
    """Register this tool in the NAF header's linguistic processors.

    Attaches a Clp record (tool name, package version, current
    timestamp) under the 'coreferences' layer.

    Returns the same NAF object, modified in place.
    """
    processor = KafNafParserPy.Clp()
    processor.set_name(this_name)
    processor.set_version(__version__)
    processor.set_timestamp()
    in_obj.add_linguistic_processor('coreferences', processor)
    return in_obj
116
|
|
|
|
117
|
|
|
|
118
|
|
|
def get_jsonlines(knaf_obj):
    """Convert a NAF object into a jsonlines-style dict for the e2e model.

    Walks the term layer, collecting for every token a
    (sentence id, term id, token id, token text) tuple, then groups
    consecutive tuples by sentence id (itertools.groupby only merges
    adjacent runs, so terms are assumed to appear in document order).

    Returns
    -------
    (dict, list, list)
        A dict with keys 'doc_key', 'sentences' and an empty
        'clusters' list, plus per-sentence term-id and token-id lists
        aligned with 'sentences'.
    """
    rows = []
    for term in knaf_obj.get_terms():
        for tok_id in term.get_span_ids():
            tok = knaf_obj.get_token(tok_id)
            rows.append(
                (tok.get_sent(), term.get_id(), tok_id, tok.get_text()))

    sentences = []
    term_ids = []
    tok_ids = []
    for _sent_id, group in itertools.groupby(rows, key=lambda row: row[0]):
        group = list(group)
        sentences.append([text for _, _, _, text in group])
        term_ids.append([tid for _, tid, _, _ in group])
        tok_ids.append([wid for _, _, wid, _ in group])

    jsonlines_obj = {
        'doc_key': str(knaf_obj.get_filename()),
        'sentences': sentences,
        'clusters': [],
    }
    return jsonlines_obj, term_ids, tok_ids
141
|
|
|
|