1
|
|
|
#!/usr/bin/env python |
2
|
|
|
# -*- coding: utf-8 -*- |
3
|
|
|
|
4
|
|
|
|
5
|
|
|
SENT_MARK = "</s>" |
6
|
|
|
NULL_MARK = "<null>" |
7
|
|
|
UNK_MARK = "<unk>" |
8
|
|
|
|
9
|
|
|
import numpy as np |
10
|
|
|
import logging as loggers |
11
|
|
|
|
12
|
|
|
logging = loggers.getLogger(__name__) |
13
|
|
|
|
14
|
|
|
|
15
|
|
|
class Vocab(object): |
16
|
|
|
|
17
|
|
|
def __init__(self, is_lang=True, char_based=False, null_mark=False): |
18
|
|
|
self.vocab_map = {} |
19
|
|
|
self.reversed_map = None |
20
|
|
|
self.size = 0 |
21
|
|
|
self._char_based = char_based |
22
|
|
|
self.null_mark = null_mark |
23
|
|
|
if null_mark: |
24
|
|
|
self.add(NULL_MARK) |
25
|
|
|
if is_lang: |
26
|
|
|
self.add(SENT_MARK) |
27
|
|
|
self.add(UNK_MARK) |
28
|
|
|
|
29
|
|
|
def add(self, word): |
30
|
|
|
if word not in self.vocab_map: |
31
|
|
|
self.vocab_map[word] = self.size |
32
|
|
|
self.size += 1 |
33
|
|
|
|
34
|
|
|
def index(self, word): |
35
|
|
|
if word in self.vocab_map: |
36
|
|
|
return self.vocab_map[word] |
37
|
|
|
else: |
38
|
|
|
return self.vocab_map[UNK_MARK] |
39
|
|
|
|
40
|
|
|
def word(self, index): |
41
|
|
|
if not self.reversed_map: |
42
|
|
|
self.reversed_map = {} |
43
|
|
|
for k in self.vocab_map: |
44
|
|
|
self.reversed_map[self.vocab_map[k]] = k |
45
|
|
|
return self.reversed_map[index] |
46
|
|
|
|
47
|
|
|
def transform(self, word): |
48
|
|
|
v = np.zeros(self.size, dtype=int) |
49
|
|
|
v[self.index(word)] = 1 |
50
|
|
|
return v |
51
|
|
|
|
52
|
|
|
def transform_index(self, index): |
53
|
|
|
v = np.zeros(self.size, dtype=int) |
54
|
|
|
v[index] = 1 |
55
|
|
|
return v |
56
|
|
|
|
57
|
|
|
def _load_fixed_size(self, path, max_size): |
58
|
|
|
from collections import Counter |
59
|
|
|
logging.info("fixed size: %d" % max_size) |
60
|
|
|
counter = Counter() |
61
|
|
|
for line in open(path).readlines(): |
62
|
|
|
line = line.strip() |
63
|
|
|
words = line.split(" ") if not self._char_based else line |
64
|
|
|
counter.update(words) |
65
|
|
|
for w, _ in counter.most_common(max_size): |
66
|
|
|
self.add(w) |
67
|
|
|
|
68
|
|
|
def load(self, path, max_size=-1): |
69
|
|
|
logging.info("load data from %s" % path) |
70
|
|
|
if max_size > 0: |
71
|
|
|
self._load_fixed_size(path, max_size) |
72
|
|
|
return |
73
|
|
|
for line in open(path).xreadlines(): |
74
|
|
|
line = line.strip() |
75
|
|
|
words = line.split(" ") if not self._char_based else line |
76
|
|
|
map(self.add, words) |
77
|
|
|
logging.info("vocab size: %d" % self.size) |
78
|
|
|
|
79
|
|
|
@property |
80
|
|
|
def sent_index(self): |
81
|
|
|
return self.index(SENT_MARK) |
82
|
|
|
|
83
|
|
|
@property |
84
|
|
|
def sent_vector(self): |
85
|
|
|
return self.transform(SENT_MARK) |
86
|
|
|
|
87
|
|
|
|
88
|
|
|
|