filter_embeddings   A
last analyzed

Complexity

Total Complexity 1

Size/Duplication

Total Lines 48
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
wmc 1
eloc 38
dl 0
loc 48
rs 10
c 0
b 0
f 0

1 Function

Rating   Name   Duplication   Size   Complexity  
A get_parser() 0 10 1
1
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys
import json
import argparse
import logging

def get_parser():
12
    parser = argparse.ArgumentParser()
13
14
    parser.add_argument('embedding_file',
15
                        type=argparse.FileType('r'))
16
    parser.add_argument('-c', '--cased', default=False)
17
    parser.add_argument('doc_files', nargs='+')
18
    parser.add_argument('out_file',
19
                        type=argparse.FileType('w'))
20
    return parser
if __name__ == "__main__":
24
    args = get_parser().parse_args()
25
    words_to_keep = set()
26
    for json_filename in args.doc_files:
27
        with open(json_filename) as json_file:
28
            for line in json_file.readlines():
29
                for sentence in json.loads(line)["sentences"]:
30
                    if args.cased:
31
                        words_to_keep.update(sentence)
32
                    else:
33
                        words_to_keep.update([w.lower() for w in sentence])
34
35
    logging.info("Found {} words in {} dataset(s).".format(
36
        len(words_to_keep), len(sys.argv) - 3))
37
38
    total_lines = 0
39
    kept_lines = 0
40
    for line in args.embedding_file.readlines():
41
        total_lines += 1
42
        word = line.split()[0]
43
        if word in words_to_keep:
44
            kept_lines += 1
45
            args.out_file.write(line)
46
47
    logging.info("Kept {} out of {} lines.".format(kept_lines, total_lines))
48