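"""Pre-compute BERT embeddings for jsonlines datasets and cache them in an HDF5 file."""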
import numpy as np
import h5py
import json
import logging
import argparse

from e2edutch import bert


def cache_dataset(data_path, out_file, tokenizer, model):
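    """Encode every document in a jsonlines file and store it in out_file.

    Each document becomes an HDF5 group named after its doc_key, with one
    dataset per sentence holding that sentence's BERT embeddings.
    """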
    with open(data_path) as in_file:
        for doc_num, line in enumerate(in_file):
            example = json.loads(line)
            sentences = example["sentences"]
            bert_final = bert.encode_sentences(sentences, tokenizer, model)
            # shape: (num_sent, max_sent_len, lm_size, 1)
            text_len = np.array([len(s) for s in sentences])
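            # "/" is the group separator in HDF5 paths, so escape it in the key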
            file_key = example["doc_key"].replace("/", ":")
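            # Drop any stale cache entry for this document before rewriting it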
            if file_key in out_file:
                del out_file[file_key]

            group = out_file.create_group(file_key)
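            # One dataset per sentence, trimmed to the sentence's actual length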
            for i, (e, l) in enumerate(zip(bert_final, text_len)):
                e = np.array(e[:l, :, :])
                group[str(i)] = e
            if doc_num % 10 == 0:
                logging.info("Cached {} documents in {}".format(
                    doc_num + 1, data_path))
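
# A consumer can read the cached embeddings back per document and sentence;
# a minimal sketch (hypothetical reader code, mirroring the layout written
# by cache_dataset above):
#
#   with h5py.File("data/bertje_cache.hdf5", "r") as cache:
#       doc = cache["some:doc_key"]  # "/" in the original doc_key became ":"
#       emb = doc["0"][...]          # sentence 0: (sent_len, lm_size, 1)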


def get_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument('model_name', choices=['bertje', 'bert-nl', 'robbert'])
    parser.add_argument('datapath')
    parser.add_argument('input_files', nargs='+')
    return parser


def main(args=None):
    args = get_parser().parse_args(args)
    model_name = args.model_name
    datapath = args.datapath
    tokenizer, model = bert.load_bert(model_name)
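    # Append mode: caches already written for other documents/files are kept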
    with h5py.File("{}/{}_cache.hdf5".format(datapath, model_name), "a") as out_file:
        for json_filename in args.input_files:
            cache_dataset(json_filename, out_file, tokenizer, model)
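
# Example invocation (assuming this script is saved as cache.py; the input
# files are jsonlines documents with "sentences" and "doc_key" fields):
#
#   python cache.py bertje data train.jsonlines dev.jsonlines
#
# This writes or updates data/bertje_cache.hdf5 with one group per document.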


if __name__ == "__main__":
    main()