# -*- coding: utf-8 -*-

import logging

import numpy as np

from .base_model_output import ModelOutput
from ._registry import register

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.WARNING)


def _rand_mat(rows, cols):
    """Return a (rows, cols) array of random values whose rows each sum to 1."""
    out = np.random.random((rows, cols))
    for row in out:
        row /= row.sum()
    return out


def _cal_p_dw(words_in_docs, word_cts_in_docs, topic_array, zw, dz, beta, p_dw):
    """Fill p_dw with the count-weighted, tempered probability of each
    (document, word) pair and return it.
    """
    for (d, doc_id, words) in words_in_docs:
        p_dw[d, words] = (word_cts_in_docs[doc_id] *
                          (zw[:, words] * np.expand_dims(dz[d, :], 1)) ** beta).sum(axis=0)
    return p_dw


def _e_step(words_in_docs, dw_z, topic_array, zw, dz, beta, p_dw):
    """E-step: update dw_z, the tempered posterior over topics for each
    (document, word) pair, normalized by p_dw.
    """
    for (d, _, words) in words_in_docs:
        dw_z[d, words, :] = (((zw[:, words].T * dz[d, :]) ** beta) /
                             np.expand_dims(p_dw[d, words], 1))
    return dw_z


def _m_step(words_in_docs, word_cts_in_docs, topic_array, zw, dw_z, dz):
    """M-step: re-estimate the topic-word matrix zw and the document-topic
    matrix dz from the posteriors in dw_z, weighted by word counts.
    """
    zw[:] = 0
    for (d, doc_id, words) in words_in_docs:
        zw[:, words] += word_cts_in_docs[doc_id] * dw_z[d, words].T
    # normalize by sum of topic word weights
    zw /= np.expand_dims(zw.sum(axis=1), 1)
    for (d, doc_id, words) in words_in_docs:
        dz[d] = (word_cts_in_docs[doc_id] * dw_z[d, words].T).sum(axis=1)
    dz /= np.expand_dims(dz.sum(axis=1), 1)
    return zw, dz
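

# For reference, a sketch of the quantities the three helpers above compute,
# written out from the code itself (not from project documentation). With
# n(d, w) the count of word w in document d and beta the tempering exponent:
#
#     p_dw[d, w]    = n(d, w) * sum_z (P(w|z) * P(z|d))**beta
#     dw_z[d, w, z] = (P(w|z) * P(z|d))**beta / p_dw[d, w]
#     P(w|z) (new) propto sum_d n(d, w) * dw_z[d, w, z]
#     P(z|d) (new) propto sum_w n(d, w) * dw_z[d, w, z]
#
# This is the tempered-EM recursion for PLSA, with the word counts folded
# into p_dw and applied again in the M-step.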


def _cal_likelihood(words_in_docs, word_cts_in_docs, p_dw):
    """Return the log-likelihood of the corpus under the current p_dw."""
    likelihood = 0
    for (d, doc_id, words) in words_in_docs:
        likelihood += sum(word_cts_in_docs[doc_id] * np.log(p_dw[d][words]))
    return likelihood
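

# The value accumulated above is sum_d sum_w n(d, w) * log(p_dw[d, w]);
# _PLSA below stops iterating once its relative change drops below 1e-8.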


def _get_topic_term_matrix(zw, ntopics, id_term_map):
    """Return the topic-word matrix as a dict mapping "topic<n>" labels
    to per-term weight lists.
    """
    # NOTE: id_term_map is currently unused; topics are keyed by index only.
    labeled_zw = {"topic" + str(topicno): zw[topicno].tolist() for topicno in range(ntopics)}
    return labeled_zw


def _get_doc_topic_matrix(dz, ntopics, vectorized_corpus):
    """Return the document-topic matrix as a dict mapping document ids
    to per-topic weight lists.
    """
    labeled_dz = {doc_id: dz[i].tolist()
                  for i, (doc_id, vector) in enumerate(vectorized_corpus.get_vectors())}
    return labeled_dz
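

# Illustrative shapes only (the values below are made up): with ntopics=2,
# the two helpers above return dicts like
#     {"topic0": [0.4, 0.6, ...], "topic1": [0.7, 0.3, ...]}   # topic -> term weights
#     {"doc_a": [0.9, 0.1], "doc_b": [0.2, 0.8]}               # doc id -> topic weights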


def _PLSA(vectorized_corpus, ntopics, max_iter):
    """Fit a PLSA model with tempered EM.

    Returns a (topic_term_matrix, doc_topic_matrix) pair of labeled dicts.
    Iteration stops after max_iter rounds or once the relative change in
    log-likelihood falls below 1e-8.
    """
    cur = 0
    topic_array = np.arange(ntopics, dtype=np.int32)
    # topic-word matrix
    zw = _rand_mat(ntopics, vectorized_corpus.global_term_count)
    # document-topic matrix
    dz = _rand_mat(len(vectorized_corpus), ntopics)
    dw_z = np.zeros((len(vectorized_corpus), vectorized_corpus.global_term_count, ntopics))
    p_dw = np.zeros((len(vectorized_corpus), vectorized_corpus.global_term_count))
    beta = 0.8
    words_in_docs = [(d, doc_id, [word_id for word_id, _ in doc.items()])
                     for d, (doc_id, doc) in enumerate(vectorized_corpus.get_vectors())]
    word_cts_in_docs = {doc_id: [ct for _, ct in doc.items()]
                        for doc_id, doc in vectorized_corpus.get_vectors()}
    for i in range(max_iter):
        p_dw = _cal_p_dw(words_in_docs, word_cts_in_docs, topic_array, zw, dz, beta, p_dw)
        dw_z = _e_step(words_in_docs, dw_z, topic_array, zw, dz, beta, p_dw)
        zw, dz = _m_step(words_in_docs, word_cts_in_docs, topic_array, zw, dw_z, dz)
        likelihood = _cal_likelihood(words_in_docs, word_cts_in_docs, p_dw)
        # stop once the relative change in log-likelihood is negligible
        if cur != 0 and abs((likelihood - cur) / cur) < 1e-8:
            break
        cur = likelihood
    topic_term_matrix = _get_topic_term_matrix(zw, ntopics, vectorized_corpus.id_term_map)
    doc_topic_matrix = _get_doc_topic_matrix(dz, ntopics, vectorized_corpus)
    return topic_term_matrix, doc_topic_matrix


@register
def plsa(vectorized_corpus, ntopics, max_iter=100, **kwargs):
    """Return a ModelOutput wrapping the _PLSA trainer for vectorized_corpus.

    The model is trained with ntopics topics and at most max_iter EM
    iterations; extra keyword arguments are forwarded to ModelOutput.
    """
    return ModelOutput(vectorized_corpus=vectorized_corpus, model_func=_PLSA,
                       ntopics=ntopics, max_iter=max_iter, **kwargs)
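

# A minimal usage sketch, not part of the library. _StubCorpus is a
# hypothetical stand-in that supplies only the corpus attributes this module
# reads (len(), global_term_count, id_term_map, get_vectors()); the project's
# real vectorized-corpus class may look different. Calling _example_usage()
# exercises the EM loop end to end.
def _example_usage():
    class _StubCorpus(object):
        def __init__(self, docs, nterms):
            self._docs = docs  # list of (doc_id, {word_id: count}) pairs
            self.global_term_count = nterms
            self.id_term_map = {i: 'term%d' % i for i in range(nterms)}

        def __len__(self):
            return len(self._docs)

        def get_vectors(self):
            return list(self._docs)

    corpus = _StubCorpus([('doc_a', {0: 2, 1: 1}), ('doc_b', {1: 3, 2: 1})],
                         nterms=3)
    # Two topics, a handful of EM iterations; returns labeled weight dicts.
    return _PLSA(corpus, ntopics=2, max_iter=20)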