Completed | Branch master (e214b7) | by Philippe | created in 36s

src.pipeline.commands.TXTDenoiser.execute()   (grade: F)

Complexity
    Conditions: 12

Size
    Total Lines: 44

Duplication
    Lines: 0
    Ratio: 0 %

Metric    Value
cc        12
dl        0
loc       44
rs        2.7855

How to fix: Complexity

Complex methods like src.pipeline.commands.TXTDenoiser.execute() often do a lot of different things. To break such a method (or its class) down, we need to identify a cohesive component within it. A common way to find such a component is to look for fields and methods that share the same prefixes or suffixes.

Once you have determined which fields belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often faster.
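As a hedged illustration of Extract Class (all names here are hypothetical, not from the project under review), fields sharing a prefix are moved into their own cohesive component, and the original class delegates to it:

```python
class ReportBefore(object):
    """Before: the "header_" prefix hints at a hidden component."""
    def __init__(self, header_title, header_date, body):
        self.header_title = header_title
        self.header_date = header_date
        self.body = body


class Header(object):
    """The extracted class: cohesive and independently testable."""
    def __init__(self, title, date):
        self.title = title
        self.date = date

    def render(self):
        return "%s (%s)" % (self.title, self.date)


class ReportAfter(object):
    """After: the original class delegates to the extracted component."""
    def __init__(self, header, body):
        self.header = header
        self.body = body

    def render(self):
        return self.header.render() + "\n" + self.body
```

Each extracted component can then be sized and tested on its own, which is what drives the complexity metrics back down.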

"""Package to clean TXT files

.. Authors:
    Philippe Dessauw
    [email protected]

.. Sponsor:
    Alden Dima
    [email protected]
    Information Systems Group
    Software and Systems Division
    Information Technology Laboratory
    National Institute of Standards and Technology
    http://www.nist.gov/itl/ssd/is
"""
import codecs
from os.path import join, isfile, splitext, basename
from os import listdir
from denoiser import Denoiser
from pipeline.command import Command


class TXTDenoiser(Command):
    """Command to clean TXT files
    """

    def __init__(self, filename, logger, config):
        super(TXTDenoiser, self).__init__(filename, logger, config)
        self.denoiser = Denoiser(config)

    def execute(self):
        """Execute the command
        """
        try:
            self.logger.debug("::: Text cleaning :::")
            super(TXTDenoiser, self).get_file()

            txt_dir = join(self.unzipped, "txt")
            txt_files = [join(txt_dir, f) for f in listdir(txt_dir) if isfile(join(txt_dir, f)) and f.endswith(".txt")]

            if len(txt_files) != 1:
                self.logger.error("Incorrect number of text files")
                self.finalize()
                return -1

            text_data = self.denoiser.cleanse(txt_files[0], False)

            # Writing classified lines
            base_filename = splitext(basename(txt_files[0]))[0]
            clean_filename = join(txt_dir, base_filename+".clean.txt")
            garbage_filename = join(txt_dir, base_filename+".grbge.txt")
            unclassified_filename = join(txt_dir, base_filename+".unclss.txt")

            with codecs.open(clean_filename, "wb", encoding="utf-8") as clean_file:
                for line in text_data.get_clean_lines():
                    clean_file.write(line+"\n")

            with codecs.open(garbage_filename, "wb", encoding="utf-8") as garbage_file:
                for line in text_data.get_garbage_lines():
                    garbage_file.write(line+"\n")

            if len(text_data.get_unclassified_lines()) > 0:
                with codecs.open(unclassified_filename, "wb", encoding="utf-8") as unclassified_file:
                    for line in text_data.get_unclassified_lines():
                        unclassified_file.write(line+"\n")
        except Exception, e:
            print e

            self.logger.error("Cleaner has stopped unexpectedly: "+e.message)
            self.finalize()
            return -2

        self.finalize()
        return 0

    def finalize(self):
        """Finalize the job
        """
        super(TXTDenoiser, self).store_file()
        self.logger.debug("::: Text cleaning (END) :::")
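The three near-identical with/for write loops in execute() are the easiest complexity to shave off. A minimal sketch of that Extract Method step (Python 3 syntax; write_lines is a hypothetical helper, not part of the project):

```python
import codecs
import os
import tempfile


def write_lines(path, lines):
    """Write one bucket of classified lines to disk, one line per row.

    Stands in for the three duplicated ``with codecs.open(...)`` loops
    in ``execute()``; each call site shrinks to a single statement.
    """
    with codecs.open(path, "w", encoding="utf-8") as handle:
        for line in lines:
            handle.write(line + "\n")


# Example call sites, mirroring the clean/garbage buckets in execute():
tmp_dir = tempfile.mkdtemp()
write_lines(os.path.join(tmp_dir, "doc.clean.txt"), ["kept line"])
write_lines(os.path.join(tmp_dir, "doc.grbge.txt"), ["n01se", "###"])
```

With the helper in place, the conditions count drops (the duplicated loops disappear, and the `len(...) > 0` guard becomes unnecessary since writing an empty list is a no-op), which addresses both the cc and duplication-prone structure flagged above.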