| Metric | Value |
| --- | --- |
| Total Complexity | 8 |
| Total Lines | 36 |
| Duplicated Lines | 0 % |
| Changes | 0 |
```python
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import json
import argparse
import logging


def get_char_vocab(input_filenames, output_file):
    """Collect every character seen in the tokenized sentences and write one per line."""
    vocab = set()
    for filename in input_filenames:
        with open(filename) as f:
            for line in f:
                for sentence in json.loads(line)["sentences"]:
                    for word in sentence:
                        # A string is an iterable of characters, so this adds
                        # each character of the word to the set.
                        vocab.update(word)
    vocab = sorted(vocab)
    for char in vocab:
        output_file.write(char)
        output_file.write(u"\n")
    logging.info("Wrote {} characters".format(len(vocab)))


def get_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument('input_filenames', nargs='+')
    parser.add_argument('output_file', type=argparse.FileType('w'))
    return parser


if __name__ == "__main__":
    # Configure logging so the INFO summary message is actually emitted.
    logging.basicConfig(level=logging.INFO)
    args = get_parser().parse_args()
    get_char_vocab(args.input_filenames, args.output_file)
```
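
For reference, here is a minimal usage sketch. It assumes the listing above is saved as `get_char_vocab.py` (a hypothetical module name) and that each input line is a JSON object whose `"sentences"` field is a list of tokenized sentences; the file name and tokens below are made up for illustration.

```python
# Minimal usage sketch -- assumes the script above is importable as get_char_vocab.py
# and that each input line is a JSON object with a "sentences" list of token lists.
import io
import json

from get_char_vocab import get_char_vocab  # hypothetical module name

# Write a single JSON line in the assumed input format.
with open("example.jsonlines", "w") as f:
    f.write(json.dumps({"sentences": [["Hello", "world"], ["Hi"]]}) + "\n")

# Collect the character vocabulary into an in-memory buffer instead of a file.
out = io.StringIO()
get_char_vocab(["example.jsonlines"], out)

# Prints the unique characters, one per line, in sorted order:
# H, d, e, i, l, o, r, w
print(out.getvalue())
```

From the command line, the equivalent would be something like `python get_char_vocab.py example.jsonlines char_vocab.txt`, where both file names are placeholders.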