topik.fileio.read_input() - Code Metrics - Inspection of "fixed error in LDA (transposed score/topic)" - ContinuumIO/topik - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Pull Request — master (#77)

unknown

created 2016-04-19 13:40 UTC

topik.fileio.read_input() F

↳ Parent: Project

Complexity

Conditions

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Metric	Value
cc	12
dl	0
loc	64
rs	2.7469

How to fix Long Method Complexity

import os
class SomeClass:
    def some_method(self):
        """Do x and return foo."""

from topik.fileio._registry import registered_inputs
from topik.fileio.tests import test_data_path


# this function is the primary API for people using any registered functions.
def read_input(source, source_type="auto", folder_content_field='text', **kwargs):
    """
    Read data from given source into Topik's internal data structures.

    The reader is looked up in ``registered_inputs`` according to
    ``source_type``.  With ``source_type="auto"`` the reader is chosen from
    the ``source`` string itself, checked in this order:

    1. ``"9200"`` appears anywhere in ``source``  -> ``read_elastic``
    2. the extension is ``.js`` or ``.json``      -> ``read_json_stream``,
       falling back to ``read_large_json`` if the stream reader raises
       ``ValueError`` on its first document
    3. ``source`` has no extension at all         -> ``read_document_folder``

    Parameters
    ----------
    source : str
        input data.  Can be file path, directory, or server address.
    source_type : str
        "auto" tries to figure out data type of source.  Can be manually specified instead.
        Values handled by this function are
        ['elastic', 'json_stream', 'large_json', 'folder'].  An explicit
        'json_stream' still falls back to the large_json reader when the
        stream reader raises ValueError.
    folder_content_field : str
        Only used for document_folder source. This argument is used as the key
        (field name), where each document represents the value of that field.
    kwargs : any other arguments to pass to input parsers
        Forwarded to the elastic, json_stream and large_json readers.  The
        folder reader only receives ``content_field``.

    Returns
    -------
    iterable output object

    Raises
    ------
    ValueError
        If ``source_type`` is not one of the handled values and
        auto-detection does not match ``source``.

    Examples
    --------
    >>> loaded_corpus = read_input(
    ...         '{}/test_data_json_stream.json'.format(test_data_path))
    >>> solution_text = (
    ... u'Transition metal oxides are being considered as the next generation '+
    ... u'materials in field such as electronics and advanced catalysts; '+
    ... u'between them is Tantalum (V) Oxide; however, there are few reports '+
    ... u'for the synthesis of this material at the nanometer size which could '+
    ... u'have unusual properties. Hence, in this work we present the '+
    ... u'synthesis of Ta2O5 nanorods by sol gel method using DNA as structure '+
    ... u'directing agent, the size of the nanorods was of the order of 40 to '+
    ... u'100 nm in diameter and several microns in length; this easy method '+
    ... u'can be useful in the preparation of nanomaterials for electronics, '+
    ... u'biomedical applications as well as catalysts.')
    >>> solution_text == next(loaded_corpus)['abstract']
    True
    """
    json_extensions = (".js", ".json")

    # NOTE: the string checks on `source` are deliberately kept behind the
    # `source_type == "auto"` test so that they are only evaluated when
    # auto-detection is actually requested.

    # web addresses default to elasticsearch
    if (source_type == "auto" and "9200" in source) or source_type == "elastic":
        data_iterator = registered_inputs["read_elastic"](source, **kwargs)
    # files must end in .js/.json.  Try json parser first, try large_json parser next.
    elif (source_type == "auto" and os.path.splitext(source)[1] in json_extensions) or source_type == "json_stream":
        try:
            # Probe the stream reader by pulling its first document.  A
            # ValueError here means this is actually a large_json file.
            probe = registered_inputs["read_json_stream"](source, **kwargs)
            # The default keeps an empty stream from leaking StopIteration
            # out of this function; the probed value itself is discarded.
            next(probe, None)
        except ValueError:
            data_iterator = registered_inputs["read_large_json"](source, **kwargs)
        else:
            # Build a fresh reader after the probe so that iteration starts
            # at document 0 rather than document 1.
            data_iterator = registered_inputs["read_json_stream"](source, **kwargs)
    elif source_type == "large_json":
        data_iterator = registered_inputs["read_large_json"](source, **kwargs)
    # folder paths are simple strings that don't end in an extension, or end in a /
    elif (source_type == "auto" and os.path.splitext(source)[1] == "") or source_type == "folder":
        data_iterator = registered_inputs["read_document_folder"](
            source, content_field=folder_content_field)
    else:
        raise ValueError("Unrecognized source type: {}.  Please either manually specify the type, or convert your input"
                         " to a supported type.".format(source))
    return data_iterator



1			import os
			0 ignored issues – show Coding Style introduced 2015-11-23 14:51 UTC by Report Bug Copy Issue Report This module should have a docstring. The coding style of this project requires that you add a docstring to this code element. Below is an example for methods: class SomeClass: def some_method(self): """Do x and return foo.""" If you would like to know more about docstrings, we recommend reading PEP 257: Docstring Conventions. Loading history...
2
3			from topik.fileio._registry import registered_inputs
4			from topik.fileio.tests import test_data_path
			0 ignored issues – show Unused Code introduced 2015-11-23 14:51 UTC by Report Bug Copy Issue Report Unused test_data_path imported from topik.fileio.tests Loading history...
5
6			# this function is the primary API for people using any registered functions.
7			def read_input(source, source_type="auto", folder_content_field='text', **kwargs):
8			"""
9			Read data from given source into Topik's internal data structures.
10
11			Parameters
12			----------
13			source : str
14			input data. Can be file path, directory, or server address.
15			source_type : str
16			"auto" tries to figure out data type of source. Can be manually specified instead.
17			options for manual specification are ['solr', 'elastic', 'json_stream', 'large_json', 'folder']
18			folder_content_field : str
19			Only used for document_folder source. This argument is used as the key
20			(field name), where each document represents the value of that field.
21			kwargs : any other arguments to pass to input parsers
22
23			Returns
24			-------
25			iterable output object
26
27			>> ids, texts = zip(*list(iter(raw_data)))
28			Examples
29			--------
30			>>> loaded_corpus = read_input(
31			... '{}/test_data_json_stream.json'.format(test_data_path))
32			>>> solution_text = (
33			... u'Transition metal oxides are being considered as the next generation '+
34			... u'materials in field such as electronics and advanced catalysts; '+
35			... u'between them is Tantalum (V) Oxide; however, there are few reports '+
36			... u'for the synthesis of this material at the nanometer size which could '+
37			... u'have unusual properties. Hence, in this work we present the '+
38			... u'synthesis of Ta2O5 nanorods by sol gel method using DNA as structure '+
39			... u'directing agent, the size of the nanorods was of the order of 40 to '+
40			... u'100 nm in diameter and several microns in length; this easy method '+
41			... u'can be useful in the preparation of nanomaterials for electronics, '+
42			... u'biomedical applications as well as catalysts.')
43			>>> solution_text == next(loaded_corpus)['abstract']
44			True
45			"""
46			json_extensions = [".js", ".json"]
47
48			# web addresses default to elasticsearch
49			if (source_type == "auto" and "9200" in source) or source_type == "elastic":
50			data_iterator = registered_inputs["read_elastic"](source, **kwargs)
51			# files must end in .json. Try json parser first, try large_json parser next. Fail otherwise.
52			elif (source_type == "auto" and os.path.splitext(source)[1] in json_extensions) or source_type == "json_stream":
53			try:
54			data_iterator = registered_inputs["read_json_stream"](source, **kwargs)
55			# tee the iterator and try to get the first element. If it fails, this is actually a large_json file.
56			next(data_iterator)
57			# reset the iterator after this check so that it starts at document 0 rather than document 1
58			data_iterator = registered_inputs["read_json_stream"](source, **kwargs)
59			except ValueError:
60			data_iterator = registered_inputs["read_large_json"](source, **kwargs)
61			elif source_type == "large_json":
62			data_iterator = registered_inputs["read_large_json"](source, **kwargs)
63			# folder paths are simple strings that don't end in an extension (.+3-4 characters), or end in a /
64			elif (source_type == "auto" and os.path.splitext(source)[1] == "") or source_type == "folder":
65			data_iterator = registered_inputs["read_document_folder"](source,
66			content_field=folder_content_field)
67			else:
68			raise ValueError("Unrecognized source type: {}. Please either manually specify the type, or convert your input"
69			" to a supported type.".format(source))
70			return data_iterator
71
72

ContinuumIO / topik

Pull Request — master (#77)

topik.fileio.read_input() F

Complexity

Size

Duplication

How to fix Long Method Complexity

Long Method

Complexity

Duplication Side-by-Side

Filter issues like