Issues (1006)

topik/tokenizers/entities.py (15 issues)

  1  import logging
This module should have a docstring.

The coding style of this project requires that you add a docstring to this code element. Below is an example for methods:

    class SomeClass:
        def some_method(self):
            """Do x and return foo."""

To learn more about docstrings, we recommend reading PEP-257: Docstring Conventions.
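For this module, the fix is a docstring as the first statement of entities.py; the wording below is only a sketch:

    """Entity-based tokenizers built on TextBlob noun phrase extraction."""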
  2  import itertools
  3  from textblob import TextBlob
The import textblob could not be resolved.

This can be caused by one of the following:

1. Missing dependencies

This error could indicate a configuration issue of Pylint. Make sure your libraries are available by adding the necessary install commands:

    # .scrutinizer.yml
    before_commands:
        - sudo pip install abc    # Python 2
        - sudo pip3 install abc   # Python 3

Tip: We are currently not using virtualenv to run Pylint, so when installing your modules make sure to use the command for the correct Python version.

2. Missing __init__.py files

This error can also result from missing __init__.py files in your module folders. Make sure that you place one in each sub-folder.
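For the paths visible in this report, that would mean a layout like the following (a sketch; only modules imported or named in this file are listed):

    topik/
        __init__.py
        tokenizers/
            __init__.py
            _registry.py
            entities.py
            simple.py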
  4
  5  from topik.tokenizers.simple import _simple_document
  6
  7  from topik.tokenizers._registry import register
  8
  9  # sample corpus below is used only for doctests
 10  sample_corpus = [
Coding Style Naming: The name sample_corpus does not conform to the constant naming conventions ((([A-Z_][A-Z0-9_]*)|(__.*__))$).

This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site.
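Because sample_corpus is a doctest fixture referenced by name in several docstrings below, renaming it to SAMPLE_CORPUS would mean touching every doctest. An alternative (a sketch; assumes a standard pylintrc and note that good-names replaces Pylint's default list) is to whitelist the name:

    # pylintrc
    [BASIC]
    good-names=sample_corpus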
 11              ("doc1", str(u"Frank the Swank-Tank walked his sassy unicorn, Brony,"
 12                           u" to prancercise class daily.  Prancercise was "
 13                           u"a tremendously popular pastime of sassy "
 14                           u"unicorns and retirees alike.")),
 15              ("doc2", str(u"Prancercise is a form of both art and fitniss, "
 16                           u"originally invented by sassy unicorns. It has "
 17                           u"recently been popularized by such retired "
 18                           u"celebrities as Frank The Swank-Tank."))]
 19
 20  def _collect_entities(raw_corpus, freq_min=2, freq_max=10000):
 21      """Return noun phrases from a collection of documents.
 22
 23      Parameters
 24      ----------
 25      raw_corpus : Corpus-derived object or iterable of (doc_id, doc_text) tuples
 26      freq_min : int
 27          Minimum number of occurrences a noun phrase needs in order to be retained. Default is 2.
 28      freq_max : int
 29          Maximum number of occurrences a noun phrase may have before it is discarded. Default is 10000.
 30
 31      Examples
 32      --------
 33      >>> ents = _collect_entities(sample_corpus)
 34      >>> ents == {'swank-tank', 'prancercise', 'sassy unicorns', 'frank'}
 35      True
 36      """
 37
 38      np_counts_total = {}
 39      docs_examined = 0
 40      for doc_id, doc_text in raw_corpus:
The variable doc_id seems to be unused.
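A conventional fix (a sketch; relies on Pylint's default dummy-variables-rgx, which ignores names starting with an underscore) is to rename the unused loop variable:

    for _doc_id, doc_text in raw_corpus: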
 41          if docs_examined > 0 and docs_examined % 1000 == 0:
 42              sorted_phrases = sorted(np_counts_total.items(),
 43                                      key=lambda item: -item[1])
 44              np_counts_total = dict(sorted_phrases)
 45              logging.info("at document #%i, considering %i phrases: %s..." %
Coding Style Best Practice: Specify string format arguments as logging function parameters. Formatting the message eagerly with % builds the string even when INFO logging is disabled; passing the arguments to the call lets logging format lazily.
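The flagged call on lines 45-46 would then read (a direct rewrite of the call above):

    logging.info("at document #%i, considering %i phrases: %s...",
                 docs_examined, len(np_counts_total), sorted_phrases[0])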
 46                           (docs_examined, len(np_counts_total), sorted_phrases[0]))
 47
 48          for np in TextBlob(doc_text).noun_phrases:
Coding Style Naming: The name np does not conform to the variable naming conventions ([a-z_][a-z0-9_]{2,30}$).

This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site.
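The default regex requires at least three characters, so the two-letter name np is rejected (np also conventionally suggests numpy). A rename such as the following (a sketch; noun_phrase is an illustrative name, and the same fix applies to each flagged loop below) satisfies the check:

    for noun_phrase in TextBlob(doc_text).noun_phrases:
        np_counts_total[noun_phrase] = np_counts_total.get(noun_phrase, 0) + 1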
 49              np_counts_total[np] = np_counts_total.get(np, 0) + 1
 50          docs_examined += 1
 51
 52      # Keep only noun phrases whose frequency falls within [freq_min, freq_max].
 53      np_counts = {}
 54      for np, count in np_counts_total.items():
Coding Style Naming: The name np does not conform to the variable naming conventions ([a-z_][a-z0-9_]{2,30}$). (Same check as above.)
 55          if freq_max >= count >= freq_min:
 56              np_counts[np] = count
 57
 58      return set(np_counts)
 59
 60
 61  def _tokenize_entities_document(text, entities, min_length=1, stopwords=None):
Comprehensibility Bug: entities is re-defining a name which is already available in the outer scope (previously defined on line 137).

It is generally bad practice to shadow variables from the outer scope. In most cases this is done unintentionally and can lead to unexpected behavior:

    param = 5

    class Foo:
        def __init__(self, param):   # "param" would be flagged here
            self.param = param
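Here the parameter shadows the entities() tokenizer defined on line 137. One fix (a sketch; the name entity_set is chosen only for illustration) is to rename the parameter in the helper and at each of its uses:

    def _tokenize_entities_document(text, entity_set, min_length=1, stopwords=None):
        ...
        if np in entity_set:
            ...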
 62      """
 63      A text tokenizer that passes through only terms (a.k.a. 'entities')
 64      explicitly contained in the entities argument.
 65
 66      Parameters
 67      ----------
 68      text : str
 69          A single text document to be tokenized
 70      entities : iterable of str
 71          Collection of noun phrases, obtained from the _collect_entities function
 72      min_length : int
 73          Minimum length of any single word
 74      stopwords : None or iterable of str
 75          Collection of words to ignore as tokens
 76
 77      Examples
 78      --------
 79      >>> ents = _collect_entities(sample_corpus)
 80      >>> text = sample_corpus[0][1]
 81      >>> tokenized_text = _tokenize_entities_document(text, ents)
 82      >>> tokenized_text == [
 83      ...     u'frank', u'swank_tank', u'prancercise', u'sassy_unicorns']
 84      True
 85      """
 86      result = []
 87      for np in TextBlob(text).noun_phrases:
Coding Style Naming: The name np does not conform to the variable naming conventions ([a-z_][a-z0-9_]{2,30}$). (Same check as above.)
 88          if np in entities:
 89              # filter out stop words
 90              tmp = "_".join(_simple_document(np, min_length=min_length, stopwords=stopwords))
 91              # if we end up with nothing, don't append an empty string
 92              if tmp:
 93                  result.append(tmp)
 94      return result
 95
 96
 97  def _tokenize_mixed_document(text, entities, min_length=1, stopwords=None):
Comprehensibility Bug: entities is re-defining a name which is already available in the outer scope (previously defined on line 137). (Same issue as above.)
 98      """
 99      A text tokenizer that retrieves entities ('noun phrases') first, then falls back to simple word tokens for the rest of the text.
100
101      Parameters
102      ----------
103      text : str
104          A single text document to be tokenized
105      entities : iterable of str
106          Collection of noun phrases, obtained from the _collect_entities function
107      min_length : int
108          Minimum length of any single word
109      stopwords : None or iterable of str
110          Collection of words to ignore as tokens
111
112      Examples
113      --------
114      >>> ents = _collect_entities(sample_corpus)
115      >>> text = sample_corpus[0][1]
116      >>> tokenized_text = _tokenize_mixed_document(text, ents)
117      >>> tokenized_text == [u'frank', u'swank_tank', u'sassy', u'unicorn',
118      ... u'brony', u'prancercise', u'class', u'prancercise', u'popular',
119      ... u'pastime', u'sassy_unicorns']
120      True
121      """
122      result = []
123      for np in TextBlob(text).noun_phrases:
Coding Style Naming: The name np does not conform to the variable naming conventions ([a-z_][a-z0-9_]{2,30}$). (Same check as above.)
124          if ' ' in np and np not in entities:
125              # break apart the noun phrase; it does not occur often enough in the corpus to be treated as an entity
126              result.extend(_simple_document(np, min_length=min_length, stopwords=stopwords))
127          else:
128              # filter out stop words
129              tmp = "_".join(_simple_document(np, min_length=min_length, stopwords=stopwords))
130              # if we end up with nothing, don't append an empty string
131              if tmp:
132                  result.append(tmp)
133      return result
134
135
136  @register
Comprehensibility Best Practice: The variable register does not seem to be defined.

Note: register is imported near the top of the module (line 7, from topik.tokenizers._registry), so this is most likely a false positive stemming from the unresolved-import problem reported above.
137  def entities(corpus, min_length=1, freq_min=2, freq_max=10000, stopwords=None):
138      """
139      A tokenizer that extracts noun phrases from a corpus, then tokenizes all
140      documents using those extracted phrases.
141
142      Parameters
143      ----------
144      corpus : iterable of (doc_id, doc_text) tuples
145          A collection of documents to be tokenized
146      min_length : int
147          Minimum length of any single word
148      freq_min : int
149          Minimum number of occurrences a phrase needs in order to be considered
150      freq_max : int
151          Maximum number of occurrences of a phrase, beyond which it is ignored
152      stopwords : None or iterable of str
153          Collection of words to ignore as tokens
154
155      Examples
156      --------
157      >>> tokenized_corpora = entities(sample_corpus)
158      >>> next(tokenized_corpora) == ('doc1',
159      ...     [u'frank', u'swank_tank', u'prancercise', u'sassy_unicorns'])
160      True
161      """
162      # Tee the corpus in case it is a generator (a single pass would exhaust it).
163      corpus_iterator = itertools.tee(corpus, 2)
164      entities = _collect_entities(corpus_iterator[0], freq_min=freq_min, freq_max=freq_max)
Comprehensibility Bug: entities is re-defining a name which is already available in the outer scope (previously defined on line 137). (Same issue as above.)
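Inside entities() the local variable shadows the function's own name. Renaming the local (a sketch; entity_set is an illustrative name) resolves the warning:

    entity_set = _collect_entities(corpus_iterator[0], freq_min=freq_min, freq_max=freq_max)
    for doc_id, doc_text in corpus_iterator[1]:
        yield doc_id, _tokenize_entities_document(doc_text, entity_set, min_length=min_length,
                                                  stopwords=stopwords)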
165      for doc_id, doc_text in corpus_iterator[1]:
166          yield doc_id, _tokenize_entities_document(doc_text, entities, min_length=min_length,
167                                                    stopwords=stopwords)
168
169
170  @register
Comprehensibility Best Practice: The variable register does not seem to be defined. (Same likely false positive as above.)
171  def mixed(corpus, min_length=1, freq_min=2, freq_max=10000, stopwords=None):
172      """A text tokenizer that retrieves entities ('noun phrases') first, then falls back to simple word tokens for the rest of the text.
173
174      Parameters
175      ----------
176      corpus : iterable of (doc_id, doc_text) tuples
177          A collection of documents to be tokenized
178      min_length : int
179          Minimum length of any single word
180      freq_min : int
181          Minimum number of occurrences a phrase needs in order to be considered
182      freq_max : int
183          Maximum number of occurrences of a phrase, beyond which it is ignored
184      stopwords : None or iterable of str
185          Collection of words to ignore as tokens
186
187      Examples
188      --------
189      >>> tokenized_corpora = mixed(sample_corpus)
190      >>> next(tokenized_corpora) == ('doc1', [u'frank', u'swank_tank', u'sassy', u'unicorn', u'brony',
191      ...     u'prancercise', u'class', u'prancercise', u'popular', u'pastime', u'sassy_unicorns'])
192      True
193      """
194      corpus_iterators = itertools.tee(corpus, 2)
195      entities = _collect_entities(corpus_iterators[0], freq_min=freq_min, freq_max=freq_max)
Comprehensibility Bug: entities is re-defining a name which is already available in the outer scope (previously defined on line 137). (Same issue as above.)
196      for doc_id, doc_text in corpus_iterators[1]:
197          yield doc_id, _tokenize_mixed_document(doc_text, entities,
198                                                 min_length=min_length,
199                                                 stopwords=stopwords)
200