Completed
Pull Request — master (#74) by Mike
created 01:31

topik.tokenizers._collect_bigrams_and_trigrams()   Rating: A

Complexity: Conditions 4
Size: Total Lines 54
Duplication: Lines 0, Ratio 0 %

Metric   Value
cc       4
dl       0
loc      54
rs       9.0306

How to fix: Long Method

Small methods make your code easier to understand, particularly when combined with a good name. Moreover, when a method is small, finding a good name for it is usually much easier.

For example, if you find yourself adding comments to a method's body, that is usually a sign that the commented part should be extracted into a new method, with the comment serving as a starting point for the new method's name.

Commonly applied refactorings include Extract Method.
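A minimal sketch of that Extract Method step (the function names and the currency formatting are purely illustrative, not taken from this project):

# Before: one function with a comment marking a distinct step
def summarize_orders(orders):
    total = sum(amount for _, amount in orders)
    # format the total as a currency string
    return '$%.2f' % total

# After: the commented step becomes a small, well-named helper
def format_currency(amount):
    """Format an amount as a currency string."""
    return '$%.2f' % amount

def summarize_orders(orders):
    total = sum(amount for _, amount in orders)
    return format_currency(total)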

import itertools

Coding Style: This module should have a docstring.

The coding style of this project requires that you add a docstring to this code element. Below you will find an example for methods:

class SomeClass:
    def some_method(self):
        """Do x and return foo."""

If you would like to know more about docstrings, we recommend reading PEP-257: Docstring Conventions.
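Since the finding here is about a module-level docstring rather than a method, the fix would be a triple-quoted string at the very top of the file, before the imports. A minimal sketch (the wording is only a suggestion):

"""Tokenizers that extract collocations (bigrams and trigrams) from a corpus."""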
import re

from topik.tokenizers.simple import _simple_document
from topik.tokenizers._registry import register

# sample_corpus for doctests
sample_corpus = [
Coding Style Naming: The name sample_corpus does not conform to the constant naming conventions ((([A-Z_][A-Z0-9_]*)|(__.*__))$).

This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site.
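Two common ways to address this, sketched here as suggestions rather than required changes: rename the module-level constant to upper case, or keep the lowercase name that the doctests rely on and suppress the check for this one identifier.

# Option 1: follow the constant naming convention (doctests would need updating)
SAMPLE_CORPUS = [("doc1", "..."), ("doc2", "...")]

# Option 2: keep the existing name and silence Pylint for this line
sample_corpus = [("doc1", "..."), ("doc2", "...")]  # pylint: disable=invalid-name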
            ("doc1", str(u"Frank the Swank-Tank walked his sassy unicorn, Brony,"
                         u" to prancercise class daily.  Prancercise was "
                         u"a tremendously popular pastime of sassy "
                         u"unicorns and retirees alike.")),
            ("doc2", str(u"Prancercise is a form of both art and fitniss, "
                         u"originally invented by sassy unicorns. It has "
                         u"recently been popularized by such retired "
                         u"celebrities as Frank The Swank-Tank."))]


# TODO: replace min_freqs with freq_bounds like ngrams takes.  Unify format across the board.
Coding Style: TODO and FIXME comments should generally be avoided.
def _collect_bigrams_and_trigrams(raw_corpus, top_n=10000, min_length=1, min_freqs=None,
                                  stopwords=None, stop_regex=None):
    """collects bigrams and trigrams from collection of documents.  Input to collocation tokenizer.

    bigrams are pairs of words that recur in the collection; trigrams are triplets.

    Parameters
    ----------
    raw_corpus : iterable of tuple of (doc_id(str/int), doc_text(str))
        body of documents to examine
    top_n : int
        limit results to this many entries
    min_length : int
        Minimum length of any single word
    min_freqs : iterable of int
        threshold of when to consider a pair of words as a recognized n-gram,
        starting with bigrams.
    stopwords : None or iterable of str
        Collection of words to ignore as tokens
    stop_regex : str
        A regular expression of content to remove from text before tokenizing.
        Potentially useful for ignoring code (HTML tags).

    Examples
    --------
    >>> patterns = _collect_bigrams_and_trigrams(sample_corpus, min_freqs=[2, 2])
    >>> patterns[0].pattern
    u'(frank swank|swank tank|sassy unicorns)'
    >>> patterns[1].pattern
    u'(frank swank tank)'
    """

    from nltk.collocations import TrigramCollocationFinder
Configuration: The import nltk.collocations could not be resolved.

This can be caused by one of the following:

1. Missing dependencies

This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary install commands, for example:

# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3

Tip: We are currently not using virtualenv to run Pylint, so when installing your modules make sure to use the command for the correct Python version.

2. Missing __init__.py files

This error could also result from missing __init__.py files in your module folders. Make sure that you place one such file in each sub-folder.
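For this particular module the unresolved imports come from nltk, so, assuming the generic recipe above applies to this repository, the install step would presumably look something like:

# .scrutinizer.yml (hypothetical adaptation of the snippet above)
before_commands:
    - sudo pip install nltk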
    from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures

Configuration: The import nltk.metrics could not be resolved. (The possible causes and fixes are the same as for nltk.collocations above.)

    # generator of documents, turn each element to its list of words
    doc_texts = (_simple_document(doc_text, min_length=min_length, stopwords=stopwords,
                                  stop_regex=stop_regex)
                 for doc_id, doc_text in raw_corpus)
    # generator, concatenate (chain) all words into a single sequence, lazily
    words = itertools.chain.from_iterable(doc_texts)
    tcf = TrigramCollocationFinder.from_words(iter(words))

    bcf = tcf.bigram_finder()
    bcf.apply_freq_filter(min_freqs[0])
    bigrams = [' '.join(w) for w in bcf.nbest(BigramAssocMeasures.pmi, top_n)]

    tcf.apply_freq_filter(min_freqs[1])
    trigrams = [' '.join(w) for w in tcf.nbest(TrigramAssocMeasures.chi_sq, top_n)]

    bigrams_patterns = re.compile('(%s)' % '|'.join(bigrams), re.UNICODE)
    trigrams_patterns = re.compile('(%s)' % '|'.join(trigrams), re.UNICODE)

    return bigrams_patterns, trigrams_patterns


def _collocation_document(text, patterns, min_length=1, stopwords=None, stop_regex=None):
    """A text tokenizer that includes collocations(bigrams and trigrams).

    A collocation is sequence of words or terms that co-occur more often
    than would be expected by chance.  This function breaks a raw document
    up into tokens based on a pre-established collection of bigrams and
    trigrams.  This collection is derived from a body of many documents, and
    must be obtained in a prior step using the collect_bigrams_and_trigrams
    function.

    Uses nltk.collocations.TrigramCollocationFinder to
    find trigrams and bigrams.

    Parameters
    ----------
    text : str
        A single document's text to be tokenized
    patterns: tuple of compiled regex object to find n-grams
        Obtained from collect_bigrams_and_trigrams function
    min_length : int
        Minimum length of any single word
    stopwords : None or iterable of str
        Collection of words to ignore as tokens
    stop_regex : str
        A regular expression of content to remove from text before tokenizing.
        Potentially useful for ignoring code (HTML tags).

    Examples
    --------
    >>> patterns = _collect_bigrams_and_trigrams(sample_corpus, min_freqs=[2, 2])
    >>> text = sample_corpus[0][1]
    >>> tokenized_text = _collocation_document(text,patterns)
    >>> tokenized_text == [
    ...     u'frank_swank', u'tank', u'walked', u'sassy', u'unicorn', u'brony',
    ...     u'prancercise', u'class', u'daily', u'prancercise', u'tremendously',
    ...     u'popular', u'pastime', u'sassy_unicorns', u'retirees', u'alike']
    True
    """
    text = ' '.join(_simple_document(text, min_length=min_length, stopwords=stopwords, stop_regex=stop_regex))
    for pattern in patterns:
        text = re.sub(pattern, lambda match: match.group(0).replace(' ', '_'), text)
    return text.split()

@register
def ngrams(raw_corpus, min_length=1, freq_bounds=None, top_n=10000, stopwords=None,
           stop_regex=None):
    """
    A tokenizer that extracts collocations (bigrams and trigrams) from a corpus
    according to the frequency bounds, then tokenizes all documents using those
    extracted phrases.

    Parameters
    ----------
    raw_corpus : iterable of tuple of (doc_id(str/int), doc_text(str))
        body of documents to examine
    min_length : int
        Minimum length of any single word
    freq_bounds : list of tuples of ints
        Currently ngrams supports bigrams and trigrams, so this list should
        contain two tuples (the first for bigrams, the second for trigrams),
        where each tuple consists of a (minimum, maximum) corpus-wide frequency.
    top_n : int
        limit results to this many entries
    stopwords: None or iterable of str
        Collection of words to ignore as tokens
    stop_regex : str
        A regular expression of content to remove from text before tokenizing.
        Potentially useful for ignoring code (HTML tags).

    Examples
    --------
    >>> tokenized_corpora = ngrams(sample_corpus, freq_bounds=[(2,100),(2,100)])
    >>> next(tokenized_corpora) == ('doc1',
    ...     [u'frank_swank', u'tank', u'walked', u'sassy', u'unicorn', u'brony',
    ...     u'prancercise', u'class', u'daily', u'prancercise', u'tremendously',
    ...     u'popular', u'pastime', u'sassy_unicorns', u'retirees', u'alike'])
    True
    """
    if not freq_bounds:
        freq_bounds=[(50, 10000), (20, 10000)]
Coding Style: Exactly one space required around assignment
freq_bounds=[(50, 10000), (20, 10000)]
           ^
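The fix is simply to put one space on each side of the assignment operator:

        freq_bounds = [(50, 10000), (20, 10000)]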
    min_freqs = [freq[0] for freq in freq_bounds]
    patterns = _collect_bigrams_and_trigrams(raw_corpus, top_n=top_n, min_length=min_length, min_freqs=min_freqs,
                                             stopwords=stopwords, stop_regex=stop_regex)
    for doc_id, doc_text in raw_corpus:
        yield doc_id, _collocation_document(doc_text, patterns, min_length=min_length,
                                            stopwords=stopwords, stop_regex=stop_regex)