import os
import errno
import codecs
import collections
import shutil
import logging
from pathlib import Path

import pkg_resources
import numpy as np
import pyhocon
import tensorflow.compat.v1 as tf

tf.disable_v2_behavior()

logger = logging.getLogger('e2edutch')


def initialize_from_env(model_name='final', cfg_file=None, model_cfg_file=None):
    '''Read configuration files.

    Read the configuration files cfg_file and model_cfg_file from the
    provided filenames. If none are given, use the default config files
    shipped with e2edutch: cfg/defaults.conf for cfg_file, and
    cfg/models.conf for model_cfg_file.

    Configure TensorFlow to use a GPU or CPU based on the value of the
    GPU environment variable.

    Returns a config dict.
    '''
    if "GPU" in os.environ:
        set_gpus(int(os.environ["GPU"]))
    else:
        set_gpus()

    logger.info('Running model: {}'.format(model_name))

    if cfg_file is None:
        cfg_file = pkg_resources.resource_filename(
            "e2edutch", 'cfg/defaults.conf')
    if model_cfg_file is None:
        model_cfg_file = pkg_resources.resource_filename(
            "e2edutch", 'cfg/models.conf')
    config_base = pyhocon.ConfigFactory.parse_file(cfg_file)
    config_model = pyhocon.ConfigFactory.parse_file(model_cfg_file)[model_name]
    config = pyhocon.ConfigTree.merge_configs(config_model, config_base)

    # Override datapath from environment, if set
    if os.environ.get('E2E_HOME', None) is not None:
        config['datapath'] = os.environ['E2E_HOME']

    # Finally, provide a fallback for datapath
    if config.get('datapath', None) is None:
        config['datapath'] = Path(__file__).parent / "data"

    config['log_root'] = config['datapath']
    config['log_dir'] = model_name

    mkdirs(os.path.join(config['log_root'], config['log_dir']))

    logger.debug(pyhocon.HOCONConverter.convert(config, 'hocon'))
    return config


def copy_checkpoint(source, target):
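    """Copy a TensorFlow checkpoint's index and data files from source to target."""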
    for ext in (".index", ".data-00000-of-00001"):
        shutil.copyfile(source + ext, target + ext)


def make_summary(value_dict):
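    """Wrap a dict of scalar values in a tf.Summary for TensorBoard logging."""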
    return tf.Summary(value=[tf.Summary.Value(
        tag=k, simple_value=v) for k, v in value_dict.items()])


def create_example(text, doc_key='example'):
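    """Tokenize text with the stanza Dutch pipeline and return a minimal
    example dict (doc_key, empty clusters, tokenized sentences)."""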
    import stanza
    stanza.download('nl')
    nlp = stanza.Pipeline('nl', processors='tokenize')
    doc = nlp(text)
    sentences = [[token['text'] for token in sentence]
                 for sentence in doc.to_dict()]
    return {
        "doc_key": doc_key,
        "clusters": [],
        "sentences": sentences
    }


def flatten(l):
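    """Flatten a list of lists into a single list."""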
    return [item for sublist in l for item in sublist]


def set_gpus(*gpus):
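    """Expose only the given GPU ids via CUDA_VISIBLE_DEVICES and enable
    memory growth on them; with no arguments, hide all GPUs (CPU only)."""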
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(g) for g in gpus)
    logger.info("Setting CUDA_VISIBLE_DEVICES to: {}".format(
        os.environ["CUDA_VISIBLE_DEVICES"]))
    for gpu in tf.config.experimental.list_physical_devices('GPU'):
        tf.config.experimental.set_memory_growth(gpu, True)


def mkdirs(path):
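    """Create the directory path if it does not exist yet and return it."""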
    try:
        os.makedirs(path)
    except OSError as exception:
        if exception.errno != errno.EEXIST:
            raise
    return path


def load_char_dict(char_vocab_path):
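    """Load a character vocabulary file (one character per line) into a dict
    mapping characters to indices; unknown characters map to 0 (<unk>)."""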
    vocab = [u"<unk>"]
    with codecs.open(char_vocab_path, encoding="utf-8") as f:
        vocab.extend(l.strip() for l in f.readlines())
    char_dict = collections.defaultdict(int)
    char_dict.update({c: i for i, c in enumerate(vocab)})
    return char_dict


def maybe_divide(x, y):
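    """Return x / y as a float, or 0 if y is 0."""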
    return 0 if y == 0 else x / float(y)


def projection(inputs, output_size, initializer=None):
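    """Linear projection layer: an ffnn with no hidden layers and no dropout."""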
    return ffnn(inputs, 0, -1, output_size, dropout=None,
                output_weights_initializer=initializer)


def highway(inputs, num_layers, dropout):
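    """Apply num_layers highway layers: a sigmoid gate mixes a (dropped-out)
    ReLU transform of the inputs with the untransformed inputs."""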
    for i in range(num_layers):
        with tf.variable_scope("highway_{}".format(i)):
            j, f = tf.split(projection(inputs, 2 * shape(inputs, -1)), 2, -1)
            f = tf.sigmoid(f)
            j = tf.nn.relu(j)
            if dropout is not None:
                j = tf.nn.dropout(j, dropout)
            inputs = f * j + (1 - f) * inputs
    return inputs


def shape(x, dim):
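    """Return the size of dimension dim of x: the static size if known,
    otherwise the dynamic tf.shape value."""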
    return x.get_shape()[dim].value or tf.shape(x)[dim]


def ffnn(inputs, num_hidden_layers, hidden_size, output_size,
         dropout, output_weights_initializer=None):
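    """Feed-forward network with ReLU hidden layers. Rank-3 inputs are
    flattened to rank 2 and reshaped back after the output layer."""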
    if len(inputs.get_shape()) > 3:
        raise ValueError("FFNN with rank {} not supported".format(
            len(inputs.get_shape())))

    if len(inputs.get_shape()) == 3:
        batch_size = shape(inputs, 0)
        seqlen = shape(inputs, 1)
        emb_size = shape(inputs, 2)
        current_inputs = tf.reshape(inputs, [batch_size * seqlen, emb_size])
    else:
        current_inputs = inputs

    for i in range(num_hidden_layers):
        hidden_weights = tf.get_variable("hidden_weights_{}".format(i),
                                         [shape(current_inputs, 1),
                                          hidden_size])
        hidden_bias = tf.get_variable(
            "hidden_bias_{}".format(i), [hidden_size])
        current_outputs = tf.nn.relu(tf.nn.xw_plus_b(
            current_inputs, hidden_weights, hidden_bias))

        if dropout is not None:
            current_outputs = tf.nn.dropout(current_outputs, dropout)
        current_inputs = current_outputs

    output_weights = tf.get_variable("output_weights", [shape(
        current_inputs, 1), output_size],
        initializer=output_weights_initializer)
    output_bias = tf.get_variable("output_bias", [output_size])
    outputs = tf.nn.xw_plus_b(current_inputs, output_weights, output_bias)

    if len(inputs.get_shape()) == 3:
        outputs = tf.reshape(outputs, [batch_size, seqlen, output_size])
    return outputs


def cnn(inputs, filter_sizes, num_filters):
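    """1-D convolutions over the input sequence for each filter size,
    max-pooled over time and concatenated into one vector per word."""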
    # num_words = shape(inputs, 0)
    # num_chars = shape(inputs, 1)
    input_size = shape(inputs, 2)
    outputs = []
    for i, filter_size in enumerate(filter_sizes):
        with tf.variable_scope("conv_{}".format(i)):
            w = tf.get_variable("w", [filter_size, input_size, num_filters])
            b = tf.get_variable("b", [num_filters])
        # [num_words, num_chars - filter_size, num_filters]
        conv = tf.nn.conv1d(inputs, w, stride=1, padding="VALID")
        # [num_words, num_chars - filter_size, num_filters]
        h = tf.nn.relu(tf.nn.bias_add(conv, b))
        pooled = tf.reduce_max(h, 1)  # [num_words, num_filters]
        outputs.append(pooled)
    # [num_words, num_filters * len(filter_sizes)]
    return tf.concat(outputs, 1)


def batch_gather(emb, indices):
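    """Gather emb[b, indices[b, i]] for every batch element b by flattening
    the batch and sequence dimensions and offsetting the indices per batch."""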
    batch_size = shape(emb, 0)
    seqlen = shape(emb, 1)
    if len(emb.get_shape()) > 2:
        emb_size = shape(emb, 2)
    else:
        emb_size = 1
    # [batch_size * seqlen, emb]
    flattened_emb = tf.reshape(emb, [batch_size * seqlen, emb_size])
    offset = tf.expand_dims(tf.range(batch_size)
                            * seqlen, 1)  # [batch_size, 1]
    # [batch_size, num_indices, emb]
    gathered = tf.gather(flattened_emb, indices + offset)
    if len(emb.get_shape()) == 2:
        gathered = tf.squeeze(gathered, 2)  # [batch_size, num_indices]
    return gathered


class RetrievalEvaluator(object):
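    """Accumulate gold and predicted sets and report recall, precision and F1."""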
    def __init__(self):
        self._num_correct = 0
        self._num_gold = 0
        self._num_predicted = 0

    def update(self, gold_set, predicted_set):
        self._num_correct += len(gold_set & predicted_set)
        self._num_gold += len(gold_set)
        self._num_predicted += len(predicted_set)

    def recall(self):
        return maybe_divide(self._num_correct, self._num_gold)

    def precision(self):
        return maybe_divide(self._num_correct, self._num_predicted)

    def metrics(self):
        recall = self.recall()
        precision = self.precision()
        f1 = maybe_divide(2 * recall * precision, precision + recall)
        return recall, precision, f1


class EmbeddingDictionary(object):
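    """Word-embedding lookup loaded from a whitespace-separated text file.
    Unknown words map to a zero vector; lookups are lowercased for uncased
    embeddings and optionally L2-normalized."""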
    def __init__(self, info, datapath='', normalize=True, maybe_cache=None):
        self._size = info["size"]
        self._normalize = normalize
        self._path = os.path.join(datapath, info["path"])
        self._cased = info["cased"] if "cased" in info else True
        if maybe_cache is not None and maybe_cache._path == self._path:
            assert self._size == maybe_cache._size
            self._embeddings = maybe_cache._embeddings
        else:
            self._embeddings = self.load_embedding_dict(self._path)

    @property
    def size(self):
        return self._size

    def load_embedding_dict(self, path):
        logger.info("Loading word embeddings from {}...".format(path))
        default_embedding = np.zeros(self.size)
        embedding_dict = collections.defaultdict(lambda: default_embedding)
        if len(path) > 0:
            vocab_size = None
            with open(path) as f:
                for i, line in enumerate(f.readlines()):
                    word_end = line.find(" ")
                    word = line[:word_end]
                    embedding = np.fromstring(
                        line[word_end + 1:], np.float32, sep=" ")
                    assert len(embedding) == self.size
                    embedding_dict[word] = embedding
            if vocab_size is not None:
                assert vocab_size == len(embedding_dict)
            logger.info("Done loading word embeddings.")
        return embedding_dict

    def __getitem__(self, key):
        if not self._cased:
            key = key.lower()
        embedding = self._embeddings[key]
        if self._normalize:
            embedding = self.normalize(embedding)
        return embedding

    def normalize(self, v):
        norm = np.linalg.norm(v)
        if norm > 0:
            return v / norm
        else:
            return v


class CustomLSTMCell(tf.nn.rnn_cell.RNNCell):
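    """LSTM cell with coupled input and forget gates, a learned initial state,
    block-orthonormal weight initialization, and a fixed recurrent dropout mask."""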
    def __init__(self, num_units, batch_size, dropout):
        self._num_units = num_units
        self._dropout = dropout
        self._dropout_mask = tf.nn.dropout(
            tf.ones([batch_size, self.output_size]), dropout)
        self._initializer = self._block_orthonormal_initializer(
            [self.output_size] * 3)
        initial_cell_state = tf.get_variable(
            "lstm_initial_cell_state", [1, self.output_size])
        initial_hidden_state = tf.get_variable(
            "lstm_initial_hidden_state", [1, self.output_size])
        self._initial_state = tf.nn.rnn_cell.LSTMStateTuple(
            initial_cell_state, initial_hidden_state)

    @property
    def state_size(self):
        return tf.nn.rnn_cell.LSTMStateTuple(
            self.output_size, self.output_size)

    @property
    def output_size(self):
        return self._num_units

    @property
    def initial_state(self):
        return self._initial_state

    def __call__(self, inputs, state, scope=None):
        """Long short-term memory cell (LSTM)."""
        with tf.variable_scope(scope or type(self).__name__):  # CustomLSTMCell
            c, h = state
            h *= self._dropout_mask
            concat = projection(
                tf.concat([inputs, h], 1), 3 * self.output_size,
                initializer=self._initializer)
            i, j, o = tf.split(concat, num_or_size_splits=3, axis=1)
            i = tf.sigmoid(i)
            new_c = (1 - i) * c + i * tf.tanh(j)
            new_h = tf.tanh(new_c) * tf.sigmoid(o)
            new_state = tf.nn.rnn_cell.LSTMStateTuple(new_c, new_h)
            return new_h, new_state

    def _orthonormal_initializer(self, scale=1.0):
        def _initializer(shape, dtype=tf.float32, partition_info=None):
            M1 = np.random.randn(shape[0], shape[0]).astype(np.float32)
            M2 = np.random.randn(shape[1], shape[1]).astype(np.float32)
            Q1, R1 = np.linalg.qr(M1)
            Q2, R2 = np.linalg.qr(M2)
            Q1 = Q1 * np.sign(np.diag(R1))
            Q2 = Q2 * np.sign(np.diag(R2))
            n_min = min(shape[0], shape[1])
            params = np.dot(Q1[:, :n_min], Q2[:n_min, :]) * scale
            return params
        return _initializer

    def _block_orthonormal_initializer(self, output_sizes):
        def _initializer(shape, dtype=np.float32, partition_info=None):
            assert len(shape) == 2
            assert sum(output_sizes) == shape[1]
            initializer = self._orthonormal_initializer()
            params = np.concatenate(
                [initializer([shape[0], o], dtype, partition_info) for o in output_sizes], 1)
            return params
        return _initializer