Denoiser.cleanse() - Code Metrics - usnistgov/ocr-pipeline - Measure and Improve Code Quality continuously with Scrutinizer

Denoiser.cleanse() B
last analyzed 2017-09-28 14:20 UTC

↳ Parent: Denoiser

Complexity

Conditions

Size

Total Lines

Duplication

Lines	30
Ratio	100 %

Importance

Changes	1
Bugs	0	Features	1

Metric	Value
cc	2
c	1
b	0
f	1
dl	30
loc	30
rs	8.8571

"""Package containing all the functions and classes needed to clean a file.

.. Authors:
    Philippe Dessauw
    [email protected]

.. Sponsor:
    Alden Dima
    [email protected]
    Information Systems Group
    Software and Systems Division
    Information Technology Laboratory
    National Institute of Standards and Technology
    http://www.nist.gov/itl/ssd/is
"""
from __future__ import division
import logging
from denoiser.models import InlineModel, IndicatorModel, MachineLearningModel
from denoiser.text import Text


class Denoiser(object):
    """Denoiser object, able to clean a file and train related models.

    The constructor builds one `InlineModel`, one `IndicatorModel` and one
    `MachineLearningModel` from the same configuration object and keeps them
    as `inline_model`, `indicator_model` and `learning_model`.
    """

    def __init__(self, app_config):
        """Create the models used by the denoiser.

        Parameters:
            app_config: Application configuration, stored as `self.config`
                and handed unchanged to every model constructor.
        """
        self.config = app_config
        self.logger = logging.getLogger('local')

        self.inline_model = InlineModel(self.config)
        self.indicator_model = IndicatorModel(self.config)
        self.learning_model = MachineLearningModel(self.config)

        self.logger.info("Denoiser initialized")

    @staticmethod
    def _apply_models(text_data, models):
        """Run `load` then `correct` of each model on `text_data`, in order.

        Parameters:
            text_data (`Text`): Text data handed to each model
            models (iterable): Models to apply, in the order they must run
        """
        for model in models:
            model.load(text_data)
            model.correct(text_data)

    def cleanse(self, filename, is_csv=False):
        """Cleanse a file given its name

        Parameters:
            filename (str): Path of the file to cleanse
            is_csv (bool): Specifies if the file is a CSV

        Returns:
            `Text`: Text data
        """
        # Lazy %-style arguments: the message is only built when the record
        # is emitted, and a non-str filename no longer raises TypeError here.
        self.logger.debug("Cleaning %s...", filename)
        text_data = Text(filename)

        # Parse the proper format
        if is_csv:
            text_data.read_csv()
        else:
            text_data.read_txt()

        # Clean the text with the three models, one after the other
        self._apply_models(
            text_data,
            (self.inline_model, self.indicator_model, self.learning_model),
        )

        return text_data

    def train(self, dataset):
        """ Train the denoiser with a set of files

        Every file is read as CSV and run through the inline and indicator
        models; the whole preprocessed set is then passed to
        `learning_model.train`.

        Parameters:
            dataset (list): List of files
        """
        self.logger.debug("Training denoiser...")

        # Generate datastructures from dataset
        text_dataset = [Text(f) for f in dataset]

        # Create datastructures for the whole dataset
        for text_data in text_dataset:
            self.logger.debug("Preprocessing %s", text_data.filename)
            text_data.read_csv()

            self._apply_models(
                text_data, (self.inline_model, self.indicator_model)
            )

        # Train the learning model on the preprocessed dataset
        self.logger.debug("Training learning model...")
        self.learning_model.train(text_dataset)

        self.logger.info("Machine learning model trained")

    def generate_models(self, dataset):
        """ Generates the datastructures given a set of files

        Every file is read as CSV and loaded into the inline model only; no
        correction is applied.

        Parameters:
            dataset (list): List of files

        Returns:
            int: Always 0
        """
        self.logger.debug("Generating datastructures...")
        text_dataset = [Text(f) for f in dataset]

        for text_data in text_dataset:
            self.logger.debug("Processing %s...", text_data.filename)

            text_data.read_csv()
            self.inline_model.load(text_data)

        self.logger.info("Datastructure generated")
        return 0



1		"""Package containing all the functions and classes needed to clean a file.
2
3		.. Authors:
4		Philippe Dessauw
5		[email protected]
6
7		.. Sponsor:
8		Alden Dima
9		[email protected]
10		Information Systems Group
11		Software and Systems Division
12		Information Technology Laboratory
13		National Institute of Standards and Technology
14		http://www.nist.gov/itl/ssd/is
15		"""
16		from __future__ import division
17		import logging
18		from denoiser.models import InlineModel, IndicatorModel, MachineLearningModel
19		from denoiser.text import Text
20
21
22		class Denoiser(object):
23		"""Denoiser object, able to clean a file and train related models
24		"""
25
26		def __init__(self, app_config):
27		self.config = app_config
28		self.logger = logging.getLogger('local')
29
30		self.inline_model = InlineModel(self.config)
31		self.indicator_model = IndicatorModel(self.config)
32		self.learning_model = MachineLearningModel(self.config)
33
34		self.logger.info("Denoiser initialized")
35
36	View Code Duplication	def cleanse(self, filename, is_csv=False):
		0 ignored issues – show Duplication introduced 2017-07-17 12:35 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
37		"""Cleanse a file given its name
38
39		Parameters:
40		filename (str): Path of the file to cleanse
41		is_csv (bool): Specifies if the file is a CSV
42
43		Returns:
44		`Text`: Text data
45		"""
46		self.logger.debug("Cleaning "+filename+"...")
47		text_data = Text(filename)
48
49		# Parse the proper format
50		if is_csv:
51		text_data.read_csv()
52		else:
53		text_data.read_txt()
54
55		# Clean the text
56		self.inline_model.load(text_data)
57		self.inline_model.correct(text_data)
58
59		self.indicator_model.load(text_data)
60		self.indicator_model.correct(text_data)
61
62		self.learning_model.load(text_data)
63		self.learning_model.correct(text_data)
64
65		return text_data
66
67	View Code Duplication	def train(self, dataset):
		0 ignored issues – show Duplication introduced 2017-07-17 12:35 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
68		""" Train the denoiser with a set of files
69
70		Parameters
71		dataset (list): List of files
72		"""
73		self.logger.debug("Training denoiser...")
74
75		# Generate datastructures from dataset
76		text_dataset = [Text(f) for f in dataset]
77
78		# Create datastructures for the whole dataset
79		for text_data in text_dataset:
80		self.logger.debug("Preprocessing "+text_data.filename)
81		text_data.read_csv()
82
83		# print "Loading "+text.filename+"..."
84		self.inline_model.load(text_data)
85		self.inline_model.correct(text_data)
86
87		self.indicator_model.load(text_data)
88		self.indicator_model.correct(text_data)
89
90		# Load existing training data
91		self.logger.debug("Training learning model...")
92		self.learning_model.train(text_dataset)
93
94		self.logger.info("Machine learning model trained")
95
96		def generate_models(self, dataset):
97		""" Generates the datastructures given a set of files
98
99		Parameters
100		dataset (list): List of files
101		"""
102		self.logger.debug("Generating datastructures...")
103		text_dataset = [Text(f) for f in dataset]
104
105		for text_data in text_dataset:
106		self.logger.debug("Processing "+text_data.filename+"...")
107
108		text_data.read_csv()
109		self.inline_model.load(text_data)
110
111		self.logger.info("Datastructure generated")
112		return 0
113
114

usnistgov / ocr-pipeline

Denoiser.cleanse() B last analyzed 2017-09-28 14:20 UTC

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like

Denoiser.cleanse() B
last analyzed 2017-09-28 14:20 UTC