TXTDenoiser.__init__()   A
last analyzed

Complexity

Conditions 1

Size

Total Lines 5

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 1
Metric Value
cc 1
c 1
b 0
f 1
dl 0
loc 5
rs 9.4285
"""Package to clean TXT files

.. Authors:
    Philippe Dessauw
    [email protected]

.. Sponsor:
    Alden Dima
    [email protected]
    Information Systems Group
    Software and Systems Division
    Information Technology Laboratory
    National Institute of Standards and Technology
    http://www.nist.gov/itl/ssd/is
"""
16
import codecs
17
from os.path import join, isfile, splitext, basename
18
from os import listdir
19
from denoiser import Denoiser
20
from pipeline.command import Command
21
22
23
class TXTDenoiser(Command):
    """Command to clean TXT files

    Runs a ``Denoiser`` over the single ``.txt`` file found in the ``txt``
    sub-directory of ``self.unzipped`` and writes the classified lines into
    sibling files in that same directory.
    """

    def __init__(self, filename, logger, config):
        """Initialize the command and build its denoiser

        Parameters:
            filename: Forwarded unchanged to ``Command.__init__``.
            logger: Forwarded unchanged to ``Command.__init__``.
            config: Forwarded to ``Command.__init__`` and also used to
                construct the ``Denoiser``.
        """
        super(TXTDenoiser, self).__init__(filename, logger, config)
        self.denoiser = Denoiser(config)

        # NOTE(review): self.logger is presumably set by Command.__init__;
        # it is not assigned anywhere in this class.
        self.logger.debug("Denoiser initialized")

    @staticmethod
    def _write_lines(path, lines):
        """Write every item of ``lines`` to ``path`` as UTF-8, one per line

        The file is always created (and truncated), even if ``lines`` is
        empty.
        """
        with codecs.open(path, "wb", encoding="utf-8") as out_file:
            for line in lines:
                out_file.write(line + "\n")

    def execute(self):
        """Execute the command

        Returns:
            int: ``0`` on success, ``-1`` when the ``txt`` directory does not
            contain exactly one ``.txt`` file, ``-2`` when an exception is
            raised while cleaning or writing. ``finalize()`` is called once
            on every path.
        """
        try:
            self.logger.debug("::: Text cleaning :::")
            # super(TXTDenoiser, self).get_file()

            # NOTE(review): self.unzipped is presumably provided by Command;
            # it is not assigned anywhere in this class.
            txt_dir = join(self.unzipped, "txt")

            # NOTE(review): the output files written below also end in ".txt"
            # and live in txt_dir, so running this command a second time on
            # the same directory finds more than one candidate and returns -1.
            txt_files = [
                join(txt_dir, f)
                for f in listdir(txt_dir)
                if isfile(join(txt_dir, f)) and f.endswith(".txt")
            ]

            if len(txt_files) != 1:
                self.logger.error("Incorrect number of text files")
                return -1

            source_file = txt_files[0]
            # NOTE(review): meaning of the second positional argument (False)
            # is defined by Denoiser.cleanse -- confirm against that API.
            text_data = self.denoiser.cleanse(source_file, False)

            # Writing classified lines
            base_filename = splitext(basename(source_file))[0]
            clean_filename = join(txt_dir, base_filename + ".clean.txt")
            garbage_filename = join(txt_dir, base_filename + ".grbge.txt")
            unclassified_filename = join(txt_dir, base_filename + ".unclss.txt")

            self._write_lines(clean_filename, text_data.get_clean_lines())
            self._write_lines(garbage_filename, text_data.get_garbage_lines())

            # The unclassified file is only created when there is something
            # to put in it; fetch the lines once instead of twice.
            unclassified_lines = text_data.get_unclassified_lines()
            if len(unclassified_lines) > 0:
                self._write_lines(unclassified_filename, unclassified_lines)

            return 0
        except Exception as e:
            # str(e) rather than e.message: ``message`` is deprecated and is
            # empty for multi-argument exceptions such as the OSError raised
            # by listdir(), which would hide the actual cause from the log.
            self.logger.error("Cleaner has stopped unexpectedly: " + str(e))
            return -2
        finally:
            # Single exit hook for the success path and both error paths.
            self.finalize()

    def finalize(self):
        """Finalize the job

        Currently only logs the end-of-step marker.
        """
        # super(TXTDenoiser, self).store_file()
        self.logger.debug("::: Text cleaning (END) :::")
83