1
|
|
|
"""Package to convert PDF to PNG |
2
|
|
|
|
3
|
|
|
.. Authors: |
4
|
|
|
Philippe Dessauw |
5
|
|
|
[email protected] |
6
|
|
|
|
7
|
|
|
.. Sponsor: |
8
|
|
|
Alden Dima |
9
|
|
|
[email protected] |
10
|
|
|
Information Systems Group |
11
|
|
|
Software and Systems Division |
12
|
|
|
Information Technology Laboratory |
13
|
|
|
National Institute of Standards and Technology |
14
|
|
|
http://www.nist.gov/itl/ssd/is |
15
|
|
|
""" |
16
|
|
|
from os import listdir |
17
|
|
|
from os.path import join, isfile, dirname, splitext, basename |
18
|
|
|
import PyPDF2 |
19
|
|
|
import PythonMagick |
20
|
|
|
from pipeline.command import Command |
21
|
|
|
|
22
|
|
|
|
23
|
|
|
class PDFConverter(Command):
    """Command converting every page of a single PDF file into PNG images.

    The conversion settings (density, depth, quality) are read from the
    ``"command"`` section of the configuration and applied to each
    ``PythonMagick.Image`` before a page is read.
    """

    def __init__(self, filename, logger, config):
        """Initialize the command and read the conversion settings.

        Parameters:
            filename: Forwarded unchanged to the ``Command`` constructor.
            logger: Forwarded unchanged to the ``Command`` constructor.
            config: Mapping forwarded to the ``Command`` constructor; it must
                provide ``config["command"]["density"]``,
                ``config["command"]["depth"]`` and
                ``config["command"]["quality"]``.

        Raises:
            KeyError: If one of the expected configuration keys is missing.
        """
        super(PDFConverter, self).__init__(filename, logger, config)

        command_config = config["command"]
        self.density = command_config["density"]
        self.depth = command_config["depth"]
        self.quality = command_config["quality"]

        # NOTE(review): self.logger is presumably set by Command.__init__
        # from the `logger` argument -- confirm against the base class.
        self.logger.debug("PDF converter {density: " + str(self.density)
                          + "; depth: " + str(self.depth)
                          + "; quality: " + str(self.quality) + "}")

    def execute(self):
        """Execute the command.

        Looks for exactly one ``.pdf`` file directly inside
        ``self.unzipped``, counts its pages with PyPDF2 and writes one PNG
        per page to ``<pdf directory>/png/<pdf base name>-<page index>.png``
        (page indices start at 0). ``finalize()`` is called on every return
        path of this method.

        Returns:
            0 when every page has been converted, 1 when the number of PDF
            files found is not exactly one or when converting a page raised
            an exception.
        """
        self.logger.debug("::: PDF conversion :::")
        super(PDFConverter, self).get_file()

        # NOTE(review): self.unzipped is presumably populated by get_file();
        # it is not assigned anywhere in this class -- confirm in Command.
        # The directory is listed once so that the logged listing and the
        # filtered list below are guaranteed to describe the same content.
        entries = listdir(self.unzipped)
        self.logger.debug(str(entries))
        pdf_list = [join(self.unzipped, f) for f in entries
                    if isfile(join(self.unzipped, f)) and f.endswith(".pdf")]

        if len(pdf_list) != 1:
            self.logger.error("Incorrect number of PDF file in " + self.unzipped
                              + " (" + str(len(pdf_list)) + " found, 1 expected)")
            self.finalize()
            return 1

        filename = str(pdf_list[0])
        # Errors raised while opening or parsing the PDF are deliberately
        # not caught here: they propagate to the caller, as they always did.
        with open(filename, "rb") as pdf:
            pdf_page_nb = PyPDF2.PdfFileReader(pdf).getNumPages()

        # Loop invariants: output directory and common prefix of PNG names.
        # NOTE(review): the "png" directory is not created here; it is
        # assumed to exist already -- confirm against the pipeline setup.
        png_dirname = join(dirname(filename), "png")
        png_prefix = splitext(basename(filename))[0]

        self.logger.debug(str(pdf_page_nb) + " page(s) detected")
        for p in range(pdf_page_nb):
            # "<file>[<index>]" asks ImageMagick to read a single page.
            pdf_page_file = filename + '[' + str(p) + ']'
            png_page_file = join(png_dirname,
                                 png_prefix + '-' + str(p) + '.png')

            try:
                self._convert_page(pdf_page_file, png_page_file)
            except Exception as e:
                # str(e) instead of e.message: the `message` attribute is
                # deprecated since Python 2.6 and missing on Python 3, where
                # reading it would raise inside this handler and skip
                # finalize().
                self.logger.fatal("An exception has been caugth: " + str(e))
                self.finalize()
                return 1

        self.finalize()
        return 0

    def _convert_page(self, pdf_page_file, png_page_file):
        """Read one PDF page with PythonMagick and write it as an image.

        Parameters:
            pdf_page_file: Page specification handed to ``Image.read``
                (PDF path followed by ``[<page index>]``).
            png_page_file: Destination path handed to ``Image.write``.

        Any exception raised by PythonMagick is left to the caller.
        """
        img = PythonMagick.Image()
        # Settings are applied before read(), in the same order as before.
        img.density(str(self.density))
        img.depth(self.depth)
        img.quality(self.quality)

        self.logger.debug("Reading " + pdf_page_file + "...")
        img.read(pdf_page_file)

        self.logger.debug("Writing " + png_page_file + "...")
        img.write(png_page_file)

    def finalize(self):
        """Finalize the job.

        Calls ``Command.store_file()`` and logs the end of the conversion.
        """
        super(PDFConverter, self).store_file()
        self.logger.debug("::: PDF conversion (END) :::")
93
|
|
|
|