from functools import partial

from six.moves import UserDict

from topik.singleton_registry import _base_register_decorator


# This subclass serves to establish a new singleton instance of functions
# for this particular step in topic modeling.  No implementation necessary.
class TokenizerRegistry(UserDict, object):
    """Uses the Borg design pattern.  Core idea is that there is a global
    registry for each step's possible methods.
    """
    __shared_state = {}

    def __init__(self, *args, **kwargs):
        """Share state across all instances (Borg), then init the dict."""
        self.__dict__ = self.__shared_state
        super(TokenizerRegistry, self).__init__(*args, **kwargs)
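
# A minimal sketch of the Borg behavior established above (illustrative
# only; the registered function and its key are hypothetical).  Because
# every instance shares one __dict__, a tokenizer registered through any
# handle is visible through all others:
#
#     a = TokenizerRegistry()
#     b = TokenizerRegistry()
#     a['fake_tokenizer'] = lambda corpus: corpus
#     assert b['fake_tokenizer'] is a['fake_tokenizer']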

# a nicer, more pythonic handle to our singleton instance
registered_tokenizers = TokenizerRegistry()

# fill in the registration function
register = partial(_base_register_decorator, registered_tokenizers)
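
# A hedged sketch of registering a custom tokenizer.  The exact contract
# comes from topik.singleton_registry._base_register_decorator; the function
# name and the assumed token-list output shape below are hypothetical:
#
#     @register
#     def my_tokenizer(corpus, **kwargs):
#         for text in corpus:
#             yield text.lower().split()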


def tokenize(corpus, method="simple", **kwargs):
    """Break documents up into component words, optionally eliminating stopwords.

    Output from this function is used as input to vectorization steps.

    corpus: iterable corpus object containing the text to be processed.
        Each iteration call should return a new document's content.
    method: string id of tokenizer to use.  For keys, see
        topik.tokenizers.registered_tokenizers (which is a dictionary of
        functions).
    kwargs: arbitrary dictionary of extra parameters, passed through to the
        chosen tokenizer.
    """
    return registered_tokenizers[method](corpus, **kwargs)
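
# Example usage, a sketch assuming the default "simple" tokenizer has been
# registered and that the corpus is any iterable yielding one document's
# text per iteration (per the docstring above):
#
#     docs = ["The quick brown fox.", "Topic modeling with topik."]
#     tokens = tokenize(docs, method="simple")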