import itertools
import logging
import re

from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder, QuadgramCollocationFinder
from nltk.metrics.association import BigramAssocMeasures, TrigramAssocMeasures, QuadgramAssocMeasures

from topik.tokenizers.simple import _simple_document
from topik.tokenizers._registry import register

# sample_corpus for doctests
sample_corpus = [
    ("doc1", str(u"Frank the Swank-Tank walked his sassy unicorn, Brony,"
                 u" to prancercise class daily. Prancercise was "
                 u"a tremendously popular pastime of sassy "
                 u"unicorns and retirees alike.")),
    ("doc2", str(u"Prancercise is a form of both art and fitniss, "
                 u"originally invented by sassy unicorns. It has "
                 u"recently been popularized by such retired "
                 u"celebrities as Frank The Swank-Tank."))]

# TODO: replace min_freqs with freq_bounds like ngrams takes. Unify format across the board.
def _collect_ngrams(raw_corpus, top_n=10000, min_length=1, min_freqs=None, stopwords=None):
    """Collect bigrams, trigrams, and quadgrams from a collection of documents.

    The resulting patterns serve as input to the collocation tokenizer.
    Bigrams are pairs of words that recur in the collection; trigrams and
    quadgrams are recurring triplets and quadruplets.

    Parameters
    ----------
    raw_corpus : iterable of tuple of (doc_id(str/int), doc_text(str))
        body of documents to examine
    top_n : int
        limit results to this many entries
    min_length : int
        minimum length of any single word
    min_freqs : iterable of int
        frequency thresholds above which a word sequence is considered a
        recognized n-gram, given in order: bigrams, trigrams, quadgrams
    stopwords : None or iterable of str
        collection of words to ignore as tokens

    Examples
    --------
    >>> patterns = _collect_ngrams(sample_corpus, min_freqs=[2, 2, 2])
    >>> patterns[0].pattern
    u'(frank swank|swank tank|sassy unicorns)'
    >>> patterns[1].pattern
    u'(frank swank tank)'
    """
    if min_freqs is None:
        # mirror the lower bounds of the default freq_bounds in ngrams() below
        min_freqs = [50, 25, 15]

    # generator of documents: turn each document's text into its list of words
    doc_texts = (_simple_document(doc_text, min_length=min_length, stopwords=stopwords)
                 for doc_id, doc_text in raw_corpus)

    # lazily concatenate (chain) all words into a single sequence
    words = itertools.chain.from_iterable(doc_texts)

    # tee the word stream: each collocation finder consumes its own copy
    words_iterators = itertools.tee(words, 3)
    bigrams_patterns = _get_bigrams(words_iterators[0], top_n, min_freqs[0])
    trigrams_patterns = _get_trigrams(words_iterators[1], top_n, min_freqs[1])
    quadgrams_patterns = _get_quadgrams(words_iterators[2], top_n, min_freqs[2])

    return (bigrams_patterns, trigrams_patterns, quadgrams_patterns)

def _get_bigrams(words, top_n, min_freq):
    """Compile the top_n bigrams occurring at least min_freq times in words
    into a single alternation regex, ranked by pointwise mutual information."""
    bcf = BigramCollocationFinder.from_words(iter(words))
    bcf.apply_freq_filter(min_freq)
    bigrams = [' '.join(w) for w in bcf.nbest(BigramAssocMeasures.pmi, top_n)]
    return re.compile('(%s)' % '|'.join(bigrams), re.UNICODE)


def _get_trigrams(words, top_n, min_freq):
    """Compile the top_n trigrams occurring at least min_freq times in words
    into a single alternation regex, ranked by the chi-squared measure."""
    tcf = TrigramCollocationFinder.from_words(iter(words))
    tcf.apply_freq_filter(min_freq)
    trigrams = [' '.join(w) for w in tcf.nbest(TrigramAssocMeasures.chi_sq, top_n)]
    return re.compile('(%s)' % '|'.join(trigrams), re.UNICODE)


def _get_quadgrams(words, top_n, min_freq):
    """Compile the top_n quadgrams occurring at least min_freq times in words
    into a single alternation regex, ranked by the chi-squared measure."""
    qcf = QuadgramCollocationFinder.from_words(iter(words))
    qcf.apply_freq_filter(min_freq)
    quadgrams = [' '.join(w) for w in qcf.nbest(QuadgramAssocMeasures.chi_sq, top_n)]
    return re.compile('(%s)' % '|'.join(quadgrams), re.UNICODE)
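
# Illustrative aside (not part of the original module): each helper above folds
# the collocations it finds into one alternation regex, so a single re.sub pass
# can rewrite every known phrase in a document. With hypothetical values:
#
#     pattern = re.compile(u'(frank swank|sassy unicorns)', re.UNICODE)
#     re.sub(pattern, lambda m: m.group(0).replace(' ', '_'),
#            u'sassy unicorns prancercise')
#     # -> u'sassy_unicorns prancercise'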

def _collocation_document(text, patterns, min_length=1, stopwords=None):
    """A text tokenizer that includes collocations (bigrams, trigrams, and quadgrams).

    A collocation is a sequence of words or terms that co-occur more often
    than would be expected by chance. This function breaks a raw document
    up into tokens based on a pre-established collection of bigrams, trigrams,
    and quadgrams. This collection is derived from a body of many documents, and
    must be obtained in a prior step using the _collect_ngrams function, which
    finds the n-grams with nltk.collocations.(Bi/Tri/Quad)gramCollocationFinder.

    Parameters
    ----------
    text : str
        A single document's text to be tokenized
    patterns : tuple of compiled regex objects used to find n-grams
        Obtained from the _collect_ngrams function
    min_length : int
        Minimum length of any single word
    stopwords : None or iterable of str
        Collection of words to ignore as tokens

    Examples
    --------
    >>> patterns = _collect_ngrams(sample_corpus, min_freqs=[2, 2, 2])
    >>> text = sample_corpus[0][1]
    >>> tokenized_text = _collocation_document(text, patterns)
    >>> tokenized_text == [
    ...     u'frank_swank', u'tank', u'walked', u'sassy', u'unicorn', u'brony',
    ...     u'prancercise', u'class', u'daily', u'prancercise', u'tremendously',
    ...     u'popular', u'pastime', u'sassy_unicorns', u'retirees', u'alike']
    True
    """
    text = ' '.join(_simple_document(text, min_length=min_length, stopwords=stopwords))
    for pattern in patterns:
        # fuse each matched collocation into a single token by replacing its
        # internal spaces with underscores
        text = re.sub(pattern, lambda match: match.group(0).replace(' ', '_'), text)
    return text.split()
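
# Aside (illustrative, not part of the original module): both _collect_ngrams
# and ngrams below rely on itertools.tee because their inputs may be one-shot
# generators. tee hands out independent iterators over the same stream:
#
#     a, b = itertools.tee(iter([1, 2, 3]), 2)
#     list(a)  # [1, 2, 3]
#     list(b)  # [1, 2, 3] -- still available, unlike the exhausted original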

@register
def ngrams(raw_corpus, min_length=1, freq_bounds=None, top_n=10000, stopwords=None):
    '''
    A tokenizer that extracts collocations (bigrams, trigrams, and quadgrams)
    from a corpus according to the given frequency bounds, then tokenizes all
    documents using those extracted phrases.

    Parameters
    ----------
    raw_corpus : iterable of tuple of (doc_id(str/int), doc_text(str))
        body of documents to examine
    min_length : int
        Minimum length of any single word
    freq_bounds : list of tuples of ints
        Currently ngrams supports bigrams, trigrams, and quadgrams, so this
        list should contain three tuples (the first for bigrams, the second
        for trigrams, the third for quadgrams), where each tuple consists of
        a (minimum, maximum) corpus-wide frequency. Only the minimums are
        used at present; see the TODO above _collect_ngrams.
    top_n : int
        limit results to this many entries
    stopwords : None or iterable of str
        Collection of words to ignore as tokens

    Examples
    --------
    >>> tokenized_corpora = ngrams(sample_corpus, freq_bounds=[(2, 100), (2, 100), (2, 100)])
    >>> next(tokenized_corpora) == ('doc1',
    ...     [u'frank_swank', u'tank', u'walked', u'sassy', u'unicorn', u'brony',
    ...      u'prancercise', u'class', u'daily', u'prancercise', u'tremendously',
    ...      u'popular', u'pastime', u'sassy_unicorns', u'retirees', u'alike'])
    True
    '''
    if not freq_bounds:
        freq_bounds = [(50, 10000), (25, 10000), (15, 10000)]
    min_freqs = [freq[0] for freq in freq_bounds]
    # tee the corpus, since collecting patterns exhausts the first copy;
    # tokenization consumes the second
    logging.debug("Collecting (bi/tri/quad)grams from corpus")
    corpus_iterators = itertools.tee(raw_corpus, 2)
    patterns = _collect_ngrams(corpus_iterators[0], top_n=top_n, min_length=min_length,
                               min_freqs=min_freqs, stopwords=stopwords)
    logging.debug("Tokenizing corpus with the collected collocation patterns")
    for doc_id, doc_text in corpus_iterators[1]:
        yield doc_id, _collocation_document(doc_text, patterns, min_length=min_length, stopwords=stopwords)
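
# A minimal usage sketch, not part of the original module: running this file
# directly exercises the ngrams tokenizer on sample_corpus and then runs the
# doctests above. Assumes nltk is installed and topik is importable.
if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    for doc_id, tokens in ngrams(sample_corpus, freq_bounds=[(2, 100), (2, 100), (2, 100)]):
        print(doc_id, tokens)
    import doctest
    doctest.testmod()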