import gensim
import logging

# imports used only for doctests
from topik.tokenizers._registry import register


def _simple_document(text, min_length=1, stopwords=None):
    """A text tokenizer that simply lowercases, matches alphabetic
    characters and removes stopwords.  For use on individual text documents.

    Parameters
    ----------
    text : str
        A single document's text to be tokenized
    min_length : int
        Minimum length of any single word
    stopwords : None or iterable of str
        Collection of words to ignore as tokens

    Examples
    --------
    >>> text = "frank FRANK the frank dog cat"
    >>> tokenized_text = _simple_document(text)
    >>> tokenized_text == ["frank", "frank", "frank", "dog", "cat"]
    True
    """
    if not stopwords:
        from gensim.parsing.preprocessing import STOPWORDS as stopwords
    # logging.debug("Tokenizing text: {}".format(text))
    return [word for word in gensim.utils.tokenize(text, lower=True)
            if word not in stopwords and len(word) >= min_length]


@register
def simple(raw_corpus, min_length=1, stopwords=None):
    """A text tokenizer that simply lowercases, matches alphabetic
    characters and removes stopwords.

    Parameters
    ----------
    raw_corpus : iterable of tuple of (doc_id(str/int), doc_text(str))
        Body of documents to examine
    min_length : int
        Minimum length of any single word
    stopwords : None or iterable of str
        Collection of words to ignore as tokens

    Examples
    --------
    >>> sample_corpus = [("doc1", "frank FRANK the frank dog cat"),
    ...                  ("doc2", "frank a dog of the llama")]
    >>> tokenized_corpora = simple(sample_corpus)
    >>> next(tokenized_corpora) == ("doc1",
    ...     ["frank", "frank", "frank", "dog", "cat"])
    True
    """
    for doc_id, doc_text in raw_corpus:
        # logging.debug("Tokenizing doc_id: {}".format(doc_id))
        yield doc_id, _simple_document(doc_text, min_length=min_length,
                                       stopwords=stopwords)
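

# A minimal usage sketch: runs the `simple` tokenizer over a tiny in-memory
# corpus and prints each (doc_id, tokens) pair. It assumes gensim is installed
# and that the @register decorator returns the function unchanged; the corpus
# literal below is illustrative only.
if __name__ == "__main__":
    sample_corpus = [("doc1", "frank FRANK the frank dog cat"),
                     ("doc2", "frank a dog of the llama")]
    # `simple` is a generator, so iterate over it to materialize results.
    for doc_id, tokens in simple(sample_corpus, min_length=1):
        print(doc_id, tokens)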