TXTDenoiser.__init__() - Code Metrics - usnistgov/ocr-pipeline - Measure and Improve Code Quality continuously with Scrutinizer

TXTDenoiser.__init__() A
last analyzed 2017-09-28 14:20 UTC

↳ Parent: TXTDenoiser

Complexity

Conditions

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Importance

Changes	1
Bugs	0	Features	1

Metric	Value
cc	1
c	1
b	0
f	1
dl	0
loc	5
rs	9.4285

"""Package to clean TXT files

.. Authors:
    Philippe Dessauw
    [email protected]

.. Sponsor:
    Alden Dima
    [email protected]
    Information Systems Group
    Software and Systems Division
    Information Technology Laboratory
    National Institute of Standards and Technology
    http://www.nist.gov/itl/ssd/is
"""
import codecs
from os.path import join, isfile, splitext, basename
from os import listdir
from denoiser import Denoiser
from pipeline.command import Command


class TXTDenoiser(Command):
    """Command to clean TXT files

    Picks the single ``.txt`` file found in the ``txt`` sub-directory of ``self.unzipped``, runs it through the
    denoiser and writes the classified lines next to it: ``<name>.clean.txt``, ``<name>.grbge.txt`` and, only when
    some lines are left unclassified, ``<name>.unclss.txt``.

    ``self.logger`` and ``self.unzipped`` are not assigned in this class -- presumably they are provided by
    ``Command`` (to be confirmed against the parent class).
    """

    def __init__(self, filename, logger, config):
        """Initialize the command and build the denoiser

        Parameters:
            filename: Forwarded unchanged to the ``Command`` constructor
            logger: Forwarded unchanged to the ``Command`` constructor
            config: Forwarded to the ``Command`` constructor and used to build the ``Denoiser``
        """
        super(TXTDenoiser, self).__init__(filename, logger, config)
        self.denoiser = Denoiser(config)

        self.logger.debug("Denoiser initialized")

    @staticmethod
    def _write_lines(path, lines):
        """Write every item of ``lines`` to ``path`` (UTF-8 encoded), each followed by a newline
        """
        with codecs.open(path, "wb", encoding="utf-8") as out_file:
            for line in lines:
                out_file.write(line+"\n")

    def execute(self):
        """Execute the command

        Returns:
            int: 0 on success, -1 when the ``txt`` directory does not hold exactly one ``.txt`` file, -2 when any
            exception is raised while listing, cleaning or writing. ``finalize`` is called on every path.
        """
        try:
            self.logger.debug("::: Text cleaning :::")
            # super(TXTDenoiser, self).get_file()

            txt_dir = join(self.unzipped, "txt")

            # NOTE(review): the output files written below also end with ".txt" and live in the same directory, so
            # running this command a second time on the same directory ends up in the "incorrect number" branch.
            txt_files = [join(txt_dir, f) for f in listdir(txt_dir) if isfile(join(txt_dir, f)) and f.endswith(".txt")]

            if len(txt_files) != 1:
                self.logger.error("Incorrect number of text files")
                return -1

            text_data = self.denoiser.cleanse(txt_files[0], False)

            # Writing classified lines
            base_filename = splitext(basename(txt_files[0]))[0]
            self._write_lines(join(txt_dir, base_filename+".clean.txt"), text_data.get_clean_lines())
            self._write_lines(join(txt_dir, base_filename+".grbge.txt"), text_data.get_garbage_lines())

            # The unclassified file is only created when there is something to put in it
            unclassified_lines = text_data.get_unclassified_lines()
            if unclassified_lines:
                self._write_lines(join(txt_dir, base_filename+".unclss.txt"), unclassified_lines)

            return 0
        except Exception as e:
            # `message` is deprecated, empty for IOError/OSError on Python 2 and absent on Python 3: use it when it
            # carries something, otherwise fall back to the string form of the exception.
            reason = getattr(e, "message", "") or str(e)

            self.logger.error("Cleaner has stopped unexpectedly: %s" % (reason,))
            return -2
        finally:
            # Runs after the log calls above and before the value is handed back, whatever the exit path is
            self.finalize()

    def finalize(self):
        """Finalize the job

        Only logs the end of the cleaning step.
        """
        # super(TXTDenoiser, self).store_file()
        self.logger.debug("::: Text cleaning (END) :::")


1			"""Package to clean TXT files
2
3			.. Authors:
4			Philippe Dessauw
5			[email protected]
6
7			.. Sponsor:
8			Alden Dima
9			[email protected]
10			Information Systems Group
11			Software and Systems Division
12			Information Technology Laboratory
13			National Institute of Standards and Technology
14			http://www.nist.gov/itl/ssd/is
15			"""
16			import codecs
17			from os.path import join, isfile, splitext, basename
18			from os import listdir
19			from denoiser import Denoiser
20			from pipeline.command import Command
21
22
23			class TXTDenoiser(Command):
24			"""Command to clean TXT files
25			"""
26
27			def __init__(self, filename, logger, config):
28			super(TXTDenoiser, self).__init__(filename, logger, config)
29			self.denoiser = Denoiser(config)
30
31			self.logger.debug("Denoiser initialized")
32
33			def execute(self):
34			"""Execute the command
35			"""
36			try:
37			self.logger.debug("::: Text cleaning :::")
38			# super(TXTDenoiser, self).get_file()
39
40			txt_dir = join(self.unzipped, "txt")
41			txt_files = [join(txt_dir, f) for f in listdir(txt_dir) if isfile(join(txt_dir, f)) and f.endswith(".txt")]
42
43			if len(txt_files) != 1:
44			self.logger.error("Incorrect number of text files")
45			self.finalize()
46			return -1
47
48			text_data = self.denoiser.cleanse(txt_files[0], False)
49
50			# Writing classified lines
51			base_filename = splitext(basename(txt_files[0]))[0]
52			clean_filename = join(txt_dir, base_filename+".clean.txt")
53			garbage_filename = join(txt_dir, base_filename+".grbge.txt")
54			unclassified_filename = join(txt_dir, base_filename+".unclss.txt")
55
56			with codecs.open(clean_filename, "wb", encoding="utf-8") as clean_file:
57			for line in text_data.get_clean_lines():
58			clean_file.write(line+"\n")
59
60			with codecs.open(garbage_filename, "wb", encoding="utf-8") as garbage_file:
61			for line in text_data.get_garbage_lines():
62			garbage_file.write(line+"\n")
63
64			if len(text_data.get_unclassified_lines()) > 0:
65			with codecs.open(unclassified_filename, "wb", encoding="utf-8") as unclassified_file:
66			for line in text_data.get_unclassified_lines():
67			unclassified_file.write(line+"\n")
68			except Exception, e:
69			print e
70
71			self.logger.error("Cleaner has stopped unexpectedly: "+e.message)
72			self.finalize()
73			return -2
74
75			self.finalize()
76			return 0
77
78			def finalize(self):
79			"""Finalize the job
80			"""
81			# super(TXTDenoiser, self).store_file()
82			self.logger.debug("::: Text cleaning (END) :::")
83

usnistgov / ocr-pipeline

TXTDenoiser.__init__() A last analyzed 2017-09-28 14:20 UTC

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like

TXTDenoiser.__init__() A
last analyzed 2017-09-28 14:20 UTC