1
|
|
|
from collections import Counter |
|
|
|
|
2
|
|
|
import itertools |
3
|
|
|
|
4
|
|
|
def _accumulate_terms(tokenized_corpus): |
|
|
|
|
5
|
|
|
global_terms=set() |
|
|
|
|
6
|
|
|
document_term_counts = {} |
7
|
|
|
doc_lengths = {} |
8
|
|
|
global_term_frequency_counter = Counter() |
9
|
|
|
for doc_id, doc in tokenized_corpus: |
10
|
|
|
doc_terms = set(doc) |
11
|
|
|
global_terms.update(doc_terms) |
12
|
|
|
doc_lengths[doc_id] = len(doc) |
13
|
|
|
document_term_counts[doc_id] = len(doc_terms) |
14
|
|
|
global_term_frequency_counter.update(doc) |
15
|
|
|
id_term_map = {} |
16
|
|
|
global_term_frequency = {} |
17
|
|
|
for term_id, term in enumerate(global_terms): |
18
|
|
|
id_term_map[term_id] = term |
19
|
|
|
global_term_frequency[term_id] = global_term_frequency_counter[term] |
20
|
|
|
|
21
|
|
|
return id_term_map, document_term_counts, doc_lengths, global_term_frequency |
22
|
|
|
|
23
|
|
|
|
24
|
|
|
class VectorizerOutput(object):
    """Hold the results of vectorizing a tokenized corpus.

    Can be constructed in two mutually exclusive ways:

    1. From a ``tokenized_corpus`` plus a ``vectorizer_func`` — vocabulary
       statistics are accumulated and the vectorizer is applied.
    2. From precomputed components (``id_term_map``,
       ``document_term_counts``, ``doc_lengths``, ``term_frequency``,
       ``vectors``) — e.g. when loading previously saved output.

    Raises ``ValueError`` if neither complete set of arguments is given.
    """

    def __init__(self, tokenized_corpus=None, vectorizer_func=None,
                 id_term_map=None, document_term_counts=None, doc_lengths=None,
                 term_frequency=None, vectors=None):
        """Build vectorizer output from a corpus or from saved components.

        Parameters
        ----------
        tokenized_corpus : iterable of (doc_id, tokens), optional
            Source documents; may be a one-shot generator.
        vectorizer_func : callable, optional
            Called as ``vectorizer_func(corpus_iter, self)``; must return a
            mapping of doc_id -> vector.
        id_term_map : dict, optional
            Precomputed term id -> term mapping.
        document_term_counts : dict, optional
            Precomputed doc_id -> distinct-term count mapping.
        doc_lengths : dict, optional
            Precomputed doc_id -> token count mapping.
        term_frequency : dict, optional
            Precomputed term id -> corpus-wide frequency mapping.
        vectors : dict, optional
            Precomputed doc_id -> vector mapping.
        """
        if tokenized_corpus and vectorizer_func and not vectors:
            # tee so the corpus (possibly a one-shot generator) can be
            # consumed twice: once for stats, once for vectorizing.
            iter1, iter2 = itertools.tee(tokenized_corpus)
            self._id_term_map, self._document_term_counts, self._doc_lengths, \
                self._term_frequency = _accumulate_terms(iter1)
            # `term_id` (not `id`) to avoid shadowing the builtin.
            self._term_id_map = {term: term_id
                                 for term_id, term in self._id_term_map.items()}
            self._vectors = vectorizer_func(iter2, self)
        elif (id_term_map and document_term_counts and doc_lengths and
              term_frequency and vectors):
            self._id_term_map = id_term_map
            self._term_id_map = {term: term_id
                                 for term_id, term in self._id_term_map.items()}
            self._document_term_counts = document_term_counts
            self._doc_lengths = doc_lengths
            self._term_frequency = term_frequency
            self._vectors = vectors
        else:
            raise ValueError(
                "Must provide either tokenized corpora and vectorizer func, "
                "or global term collection, document term counts, and vectors.")

    def get_vectors(self):
        """Yield ``(doc_id, vector)`` pairs for every vectorized document."""
        for doc_id, vector in self._vectors.items():
            yield doc_id, vector

    def __len__(self):
        """Return the number of vectorized documents."""
        return len(self._vectors)

    @property
    def id_term_map(self):
        """dict: term id -> term string."""
        return self._id_term_map

    @property
    def term_id_map(self):
        """dict: term string -> term id (inverse of ``id_term_map``)."""
        return self._term_id_map

    @property
    def global_term_count(self):
        """int: number of distinct terms in the vocabulary."""
        return len(self.id_term_map)

    @property
    def document_term_counts(self):
        """dict: doc_id -> number of distinct terms in that document."""
        return self._document_term_counts

    @property
    def doc_lengths(self):
        """dict: doc_id -> total token count of that document."""
        return self._doc_lengths

    @property
    def term_frequency(self):
        """dict: term id -> total occurrences across the corpus."""
        return self._term_frequency

    @property
    def vectors(self):
        """dict: doc_id -> vector produced by the vectorizer function."""
        return self._vectors
|
|
|
|
82
|
|
|
|
The coding style of this project requires that you add a docstring to this code element. Below, you find an example for methods:
If you would like to know more about docstrings, we recommend to read PEP-257: Docstring Conventions.