Completed
Branch master (e214b7)
by Philippe
36s
created

src.pipeline.commands.PDFConverter   A

Complexity

Total Complexity 9

Size/Duplication

Total Lines 70
Duplicated Lines 0 %
Metric Value
dl 0
loc 70
rs 10
wmc 9

3 Methods

Rating   Name   Duplication   Size   Complexity  
A PDFConverter.__init__() 0 10 1
A PDFConverter.finalize() 0 5 1
D PDFConverter.execute() 0 49 8
1
"""Package to convert PDF to PNG
2
3
.. Authors:
4
    Philippe Dessauw
5
    [email protected]
6
7
.. Sponsor:
8
    Alden Dima
9
    [email protected]
10
    Information Systems Group
11
    Software and Systems Division
12
    Information Technology Laboratory
13
    National Institute of Standards and Technology
14
    http://www.nist.gov/itl/ssd/is
15
"""
16
from os import listdir
17
from os.path import join, isfile, dirname, splitext, basename
18
import PyPDF2
19
import PythonMagick
20
from pipeline.command import Command
21
22
23
class PDFConverter(Command):
    """Command converting each page of a PDF file into its own PNG image.

    The rendering parameters are read from ``config["command"]`` using the
    keys ``density``, ``depth`` and ``quality``.
    """

    def __init__(self, filename, logger, config):
        """Initialise the command and read the rendering parameters.

        Args:
            filename: Forwarded unchanged to ``Command.__init__``.
            logger: Forwarded unchanged to ``Command.__init__``; used here
                through ``self.logger``.
            config: Mapping holding a ``"command"`` entry with the
                ``"density"``, ``"depth"`` and ``"quality"`` values.

        Raises:
            KeyError: If one of the expected configuration keys is missing.
        """
        super(PDFConverter, self).__init__(filename, logger, config)

        command_config = config["command"]
        self.density = command_config["density"]
        self.depth = command_config["depth"]
        self.quality = command_config["quality"]

        self.logger.debug(
            "PDF converter {density: %s; depth: %s; quality: %s}"
            % (str(self.density), str(self.depth), str(self.quality))
        )

    def execute(self):
        """Convert every page of the single PDF found in ``self.unzipped``.

        Exactly one file ending in ``.pdf`` is expected directly inside
        ``self.unzipped``. Page ``p`` (0-based) is written to
        ``<PDF directory>/png/<PDF name without extension>-<p>.png``.
        ``finalize()`` is called before returning, whatever the outcome.

        Returns:
            int: 0 when every page has been converted; 1 when the number of
            PDF files is not exactly one, or when reading or writing a page
            raised an exception.
        """
        self.logger.debug(":::    PDF conversion    :::")
        super(PDFConverter, self).get_file()

        # NOTE(review): self.unzipped is not assigned in this class;
        # presumably it is provided by Command / get_file() -- confirm.
        # The directory is listed once so that the logged listing is the
        # same one used to pick the PDF.
        entries = listdir(self.unzipped)
        self.logger.debug(str(entries))
        pdf_list = [join(self.unzipped, f) for f in entries
                    if f.endswith(".pdf") and isfile(join(self.unzipped, f))]

        if len(pdf_list) != 1:
            self.logger.error("Incorrect number of PDF file in " + self.unzipped
                              + " (" + str(len(pdf_list)) + " found, 1 expected)")
            self.finalize()
            return 1

        filename = str(pdf_list[0])
        with open(filename, "rb") as pdf:
            pdf_page_nb = PyPDF2.PdfFileReader(pdf).getNumPages()

        # Loop-invariant parts of the output path are computed only once.
        png_dirname = join(dirname(filename), "png")
        png_stem = splitext(basename(filename))[0]

        self.logger.debug(str(pdf_page_nb) + " page(s) detected")
        for p in range(pdf_page_nb):
            # "<file>[<index>]" asks the image library for a single page.
            pdf_page_file = filename + '[' + str(p) + ']'
            png_page_file = join(png_dirname, png_stem + '-' + str(p) + '.png')

            try:
                self._convert_page(pdf_page_file, png_page_file)
            except Exception as e:
                # str(e) rather than e.message: the latter is deprecated
                # and is empty for exceptions built with several arguments.
                self.logger.fatal("An exception has been caugth: " + str(e))
                self.finalize()
                return 1

        self.finalize()
        return 0

    def _convert_page(self, pdf_page_file, png_page_file):
        """Render one PDF page to a PNG file with the configured settings.

        Args:
            pdf_page_file: PDF path followed by the ``[page index]`` selector.
            png_page_file: Path of the PNG file to write.
        """
        img = PythonMagick.Image()
        # The rendering parameters are set before reading the page.
        img.density(str(self.density))
        img.depth(self.depth)
        img.quality(self.quality)

        self.logger.debug("Reading " + pdf_page_file + "...")
        img.read(pdf_page_file)

        self.logger.debug("Writing " + png_page_file + "...")
        img.write(png_page_file)

    def finalize(self):
        """Store the processed file through ``Command`` and log the end."""
        super(PDFConverter, self).store_file()
        self.logger.debug("::: PDF conversion (END) :::")
93