char_vocab   A
last analyzed

Complexity

Total Complexity 8

Size/Duplication

Total Lines 36
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
wmc 8
eloc 29
dl 0
loc 36
rs 10
c 0
b 0
f 0

2 Functions

Rating   Name   Duplication   Size   Complexity  
B get_char_vocab() 0 13 7
A get_parser() 0 6 1
1
from __future__ import absolute_import
2
from __future__ import division
3
from __future__ import print_function
4
5
import json
6
import argparse
7
import logging
8
9
10
def get_char_vocab(input_filenames, output_file):
    """Write the sorted set of characters found in the given JSON-lines files.

    Every non-blank line of every input file is parsed as a JSON object.
    Its "sentences" entry is iterated as a sequence of sentences, each a
    sequence of words, and every character of every word is added to the
    vocabulary. The distinct characters are then written to ``output_file``
    in sorted order, one per line, and the total count is logged at INFO
    level.

    Args:
        input_filenames: Iterable of paths to the files to read.
        output_file: Writable text file object that receives the vocabulary.

    Raises:
        KeyError: If a parsed line has no "sentences" entry.
        ValueError: If a non-blank line is not valid JSON.
    """
    vocab = set()
    for filename in input_filenames:
        with open(filename) as f:
            # Iterate the file object directly so lines are streamed rather
            # than loaded into memory all at once.
            for line in f:
                # A blank line (typically a trailing one) is not valid JSON
                # and carries no data, so skip it instead of crashing.
                if not line.strip():
                    continue
                for sentence in json.loads(line)["sentences"]:
                    for word in sentence:
                        # set.update() on a string adds each character.
                        vocab.update(word)
    chars = sorted(vocab)
    for char in chars:
        output_file.write(char)
        output_file.write(u"\n")
    logging.info("Wrote {} characters".format(len(chars)))
23
24
25
def get_parser():
    """Build the command-line parser for this script.

    The parser takes one or more positional ``input_filenames`` followed by
    a positional ``output_file``, which argparse opens for writing in text
    mode.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    writable_text_file = argparse.FileType('w')
    cli = argparse.ArgumentParser()
    cli.add_argument('input_filenames', nargs='+')
    cli.add_argument('output_file', type=writable_text_file)
    return cli
31
32
33
if __name__ == "__main__":
    # The root logger defaults to WARNING, which would silently drop the
    # INFO-level summary emitted by get_char_vocab; enable INFO when this
    # file is run as a script.
    logging.basicConfig(level=logging.INFO)
    args = get_parser().parse_args()
    get_char_vocab(args.input_filenames, args.output_file)
36