Total Complexity | 8 |
Total Lines | 36 |
Duplicated Lines | 0 % |
Changes | 0 |
1 | from __future__ import absolute_import |
||
2 | from __future__ import division |
||
3 | from __future__ import print_function |
||
4 | |||
5 | import json |
||
6 | import argparse |
||
7 | import logging |
||
8 | |||
9 | |||
def get_char_vocab(input_filenames, output_file):
    """Write the sorted set of characters found in JSON-lines files.

    Every non-blank line of every input file is parsed as a JSON object
    whose "sentences" entry is iterated as sentences, each of which is
    iterated as words; every character of every word is collected. The
    distinct characters are then written to ``output_file`` in sorted
    order, one per line, and the number written is logged at INFO level.

    Args:
        input_filenames: Iterable of paths of the files to read.
        output_file: Object with a ``write`` method accepting text. It is
            not closed by this function.

    Raises:
        ValueError: If a non-blank line is not valid JSON (raised by
            ``json.loads``).
        KeyError: If a parsed object has no "sentences" key.
    """
    vocab = set()
    for filename in input_filenames:
        with open(filename) as f:
            # Iterate the file lazily rather than materialising every line
            # with readlines().
            for line in f:
                # A blank line carries no record and would make json.loads
                # raise, so skip it.
                if not line.strip():
                    continue
                for sentence in json.loads(line)["sentences"]:
                    for word in sentence:
                        # set.update on a string adds each of its characters.
                        vocab.update(word)
    vocab = sorted(vocab)
    for char in vocab:
        output_file.write(char)
        output_file.write(u"\n")
    # Lazy %-style arguments: same rendered message, formatted only if emitted.
    logging.info("Wrote %d characters", len(vocab))
||
23 | |||
24 | |||
def get_parser():
    """Build the command-line parser for this script.

    Returns:
        An ``argparse.ArgumentParser`` taking one or more positional
        ``input_filenames`` followed by a positional ``output_file``.
        ``output_file`` uses ``argparse.FileType('w')``, so argparse opens
        it for writing while parsing the arguments.
    """
    parser = argparse.ArgumentParser(
        description="Collect the distinct characters used in the "
                    "\"sentences\" of JSON-lines files and write them, "
                    "sorted, one per line.")
    parser.add_argument('input_filenames', nargs='+',
                        help="one or more JSON-lines files to scan")
    parser.add_argument('output_file',
                        type=argparse.FileType('w'),
                        help="file the character vocabulary is written to "
                             "(opened in 'w' mode)")
    return parser
||
31 | |||
32 | |||
if __name__ == "__main__":
    # The root logger defaults to WARNING, so without configuration the
    # INFO-level summary logged by get_char_vocab would never be shown.
    logging.basicConfig(level=logging.INFO)
    args = get_parser().parse_args()
    try:
        get_char_vocab(args.input_filenames, args.output_file)
    finally:
        # argparse.FileType opened this file during parsing; close it
        # explicitly so the output is flushed even if processing fails.
        args.output_file.close()
||
36 |