import gensim
import logging

# imports used only for doctests

from topik.tokenizers._registry import register


def _simple_document(text, min_length=1, stopwords=None):
    """A text tokenizer that simply lowercases, matches alphabetic
    characters and removes stopwords. For use on individual text documents.

    Parameters
    ----------
    text : str
        A single document's text to be tokenized
    min_length : int
        Minimum length of any single word
    stopwords : None or iterable of str
        Collection of words to ignore as tokens

    Examples
    --------
    >>> text = "frank FRANK the frank dog cat"
    >>> tokenized_text = _simple_document(text)
    >>> tokenized_text == ["frank", "frank", "frank", "dog", "cat"]
    True
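
    An added illustrative call (not part of the original doctest) showing
    the ``min_length`` and ``stopwords`` parameters together; passing an
    explicit ``stopwords`` collection replaces the default gensim set:

    >>> _simple_document(text, min_length=4, stopwords=["cat"])
    ['frank', 'frank', 'frank']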
    """
    if not stopwords:
        from gensim.parsing.preprocessing import STOPWORDS as stopwords
    # logging.debug("Tokenizing text: {}".format(text))
    return [word for word in gensim.utils.tokenize(text, lower=True)
            if word not in stopwords and len(word) >= min_length]


@register
def simple(raw_corpus, min_length=1, stopwords=None):
    """A text tokenizer that simply lowercases, matches alphabetic
    characters and removes stopwords.

    Parameters
    ----------
    raw_corpus : iterable of tuple of (doc_id(str/int), doc_text(str))
        Body of documents to examine
    min_length : int
        Minimum length of any single word
    stopwords : None or iterable of str
        Collection of words to ignore as tokens

    Examples
    --------
    >>> sample_corpus = [("doc1", "frank FRANK the frank dog cat"),
    ...                  ("doc2", "frank a dog of the llama")]
    >>> tokenized_corpora = simple(sample_corpus)
    >>> next(tokenized_corpora) == ("doc1",
    ...     ["frank", "frank", "frank", "dog", "cat"])
    True
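
    An added illustrative check (not part of the original doctest),
    consuming the second document from the same generator:

    >>> next(tokenized_corpora) == ("doc2", ["frank", "dog", "llama"])
    True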
    """
    for doc_id, doc_text in raw_corpus:
        # logging.debug("Tokenizing doc_id: {}".format(doc_id))
        yield doc_id, _simple_document(doc_text, min_length=min_length,
                                       stopwords=stopwords)
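

# Illustrative usage (added sketch, not part of the original module). It shows
# one way the tokenizer could be driven directly; ``sample`` and ``tokenized``
# are hypothetical names. ``simple`` is a generator, so the corpus is
# materialized into a dict when the tokenized documents need to be reused.
if __name__ == "__main__":
    sample = [("doc1", "frank FRANK the frank dog cat"),
              ("doc2", "frank a dog of the llama")]
    tokenized = dict(simple(sample, min_length=2))
    print(tokenized["doc2"])  # expected: ['frank', 'dog', 'llama']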