Completed
Push — master (21b042...505a96) by Mike
created 01:30

topik.simple_run.run_pipeline() (rating: B)

Complexity
    Conditions: 5

Size
    Total Lines: 58

Duplication
    Lines: 0
    Ratio: 0%

Metric                        Value
cc (cyclomatic complexity)    5
dl (duplicated lines)         0
loc (lines of code)           58
rs                            8.3921

How to fix: Long Method

Small methods make your code easier to understand, particularly when combined with a good name. Moreover, when a method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, that is usually a sign that the commented part should be extracted into a new method, with the comment serving as a starting point for naming it.

Commonly applied refactorings include Extract Method, sketched below.
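A minimal Extract Method sketch in Python (the names here are invented for illustration, not taken from topik): the commented block becomes a helper whose name is derived from the comment.

# Before: a comment marks a block that wants to be its own method.
def total_price(items):
    # keep only the items that have a price
    priced = [item for item in items if item.get("price") is not None]
    return sum(item["price"] for item in priced)


# After: the commented block is extracted, and the comment becomes the name.
def keep_priced(items):
    """Return only the items that have a price."""
    return [item for item in items if item.get("price") is not None]


def total_price(items):
    return sum(item["price"] for item in keep_priced(items))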

from __future__ import absolute_import, print_function

Coding Style: this module should have a docstring. The coding style of this project requires that you add a docstring to this code element. Below you find an example for methods:

class SomeClass:
    def some_method(self):
        """Do x and return foo."""

If you would like to know more about docstrings, we recommend reading PEP 257: Docstring Conventions.
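For the module-level warning specifically, a one-line summary at the top of the file would satisfy the check; a minimal sketch (the wording is an assumption, not the project's):

"""Run the full topik pipeline: read, tokenize, vectorize, model, and visualize."""
from __future__ import absolute_import, print_function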

import logging
import os

import numpy as np

from topik.fileio import read_input
from topik import tokenizers, vectorizers, models, visualizers
from topik.visualizers.termite_plot import termite_html

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

BASEDIR = os.path.abspath(os.path.dirname(__file__))

def run_pipeline(data_source, source_type="auto", year_field=None, start_year=None, stop_year=None,
                 content_field=None, tokenizer='simple', vectorizer='bag_of_words', ntopics=10,
                 dir_path='./topic_model', model='lda', termite_plot=False, output_file=False,
                 lda_vis=True, seed=42, **kwargs):

Unused Code: the arguments year_field, start_year, stop_year, and output_file seem to be unused (a possible fix for the year arguments is sketched after the listing).

    """Run your data through all topik functionality and save all results to a specified directory.

    Parameters
    ----------
    data_source : str
        Input data (e.g. a file, a folder, or a Solr/Elasticsearch instance).
    source_type : {'json_stream', 'folder_files', 'json_large', 'solr', 'elastic'}
        The format of your input data. Currently supported: a JSON stream or a folder
        containing text files. Default is 'auto'.
    year_field : str
        The field name (if any) that contains the year associated with each document (for filtering).
    start_year : int
        Beginning of the range filter on year_field values.
    stop_year : int
        End of the range filter on year_field values.
    content_field : str
        The primary text field to parse.
    tokenizer : {'simple', 'collocations', 'entities', 'mixed'}
        The type of tokenizer to use. Default is 'simple'.
    vectorizer : {'bag_of_words', 'tfidf'}
        The type of vectorizer to use. Default is 'bag_of_words'.
    ntopics : int
        Number of topics to find in your data.
    dir_path : str
        Directory path to store all topic modeling results files. Default is './topic_model'.
    model : {'lda', 'plsa'}
        Statistical modeling algorithm to use. Default is 'lda'.
    termite_plot : bool
        Generate a termite plot of your model if True. Default is False.
    lda_vis : bool
        Generate an interactive data visualization of your topics. Default is True.
    seed : int
        Seed for the random number generator, to make results reproducible. Default is 42.
    **kwargs : additional keyword arguments, passed through to each individual step.
    """
    np.random.seed(seed)

    raw_data = read_input(data_source, content_field=content_field,
                          source_type=source_type, **kwargs)
    # Reduce each document to an (id, text) pair, using a hash of the content as the id.
    raw_data = ((hash(item[content_field]), item[content_field]) for item in raw_data)
    tokenized_data = tokenizers.registered_tokenizers[tokenizer](raw_data, **kwargs)
    vectorized_data = vectorizers.registered_vectorizers[vectorizer](tokenized_data, **kwargs)
    model = models.registered_models[model](vectorized_data, ntopics=ntopics, **kwargs)
    if not os.path.exists(dir_path):
        os.mkdir(dir_path)

    if termite_plot:
        termite_html(model, filename="termite.html", plot_title="Termite plot", topn=15)

    if lda_vis:
        visualizers.visualize(model, "lda_vis")
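A hypothetical invocation of the documented API (the path and field name are illustrative, not taken from this report):

from topik.simple_run import run_pipeline

# Model a stream of JSON documents with 10 LDA topics and
# produce the interactive visualization under ./topic_model.
run_pipeline("./data/reviews.json",
             source_type="json_stream",
             content_field="text",
             ntopics=10,
             lda_vis=True)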
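On the unused-argument warnings above: one way the year arguments could be wired in, assuming each raw item is a dict carrying year_field. filter_by_year is our sketch, not a topik helper.

def filter_by_year(items, year_field, start_year=None, stop_year=None):
    """Yield only the items whose year falls within [start_year, stop_year]."""
    for item in items:
        year = item.get(year_field)
        if year is None:
            continue
        if start_year is not None and int(year) < start_year:
            continue
        if stop_year is not None and int(year) > stop_year:
            continue
        yield item

# Inside run_pipeline, this could run just after read_input:
# if year_field is not None:
#     raw_data = filter_by_year(raw_data, year_field, start_year, stop_year)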