Completed
Push — master ( a97f12...558115 )
by Daisuke
22s
created

AbstractGenerator._remove_parentheses()   A

Complexity

Conditions 4

Size

Total Lines 13

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 4
c 1
b 0
f 0
dl 0
loc 13
rs 9.2
# -*- coding: utf-8 -*-
"""
Generate abstract document (docx) file from table (xlsx)
by nebula

Dependency: pandas, xlrd, python-docx, pillow
"""
from PIL import Image
9
import pandas as pd
10
import docx
11
import math
12
import re
13
import os
14
15
16
class AbstractGenerator:
17
    def __init__(self, image_dir='', template_type='aini2016'):
18
        self.records = None
19
        self.image_dir = image_dir
20
        self.template_type = template_type
21
        self.exreg4author = re.compile(r'^([^\)]+)((?:\(.+\))*)$')
22
        self.exreg4affiliation = re.compile(r'^((?:\(.+\))*)(.+)$')
23
        self.exreg4super = re.compile(r'(\(\w+\))')
24
        # self.exreg4italic = re.compile(r'(\<i\>\w+\</i\>)')
25
        self.exreg4italic = re.compile(r'(\<i\>.*?\</i\>)')
26
        self.exreg4sup = re.compile(r'(\<sup\>.*?\</sup\>)')
27
        self.exreg4sub = re.compile(r'(\<sub\>.*?\</sub\>)')
28
        self.exreg4tags = re.compile(r'(\<.*?\>.*?\</.*?\>)')
29
        self.exreg4tag_strip = re.compile(r'<[^>]*?>')
30
        self.preferredImageMaxWidth = 14  # cm
31
        self.preferredImageMaxHeight = 8.5  # cm
32
        self.preferredImageDpi = 72
33
34
    def _insert_image(self, filename, image_filename):
        """Add a picture to every ``[[FIGURE]]`` paragraph and save in place.

        The document at *filename* is opened, each paragraph whose text
        contains ``[[FIGURE]]`` receives a new run holding a line break, the
        picture (300pt wide) and another line break, and the document is
        written back to the same path. The placeholder text itself is left
        untouched.

        :param filename: path of the docx file to modify.
        :param image_filename: path of the picture to insert.
        """
        doc = docx.Document(filename)

        for paragraph in doc.paragraphs:
            if '[[FIGURE]]' not in paragraph.text:
                continue
            run = paragraph.add_run()
            # A run cannot contain a paragraph (Run has no add_paragraph());
            # a line break is the run-level way to set the picture apart.
            run.add_break()
            run.add_picture(image_filename, width=docx.shared.Pt(300))
            run.add_break()

        doc.save(filename)
46
47
    @staticmethod
48
    def _empty(text):
49
        if isinstance(text, float) and math.isnan(text):
50
            return True
51
        return text.strip() == ''
52
53
    def _to_array(self, text, delim):
54
        if self._empty(text):
55
            return []
56
        items = text.split(delim)
57
        return [item for item in items if item.strip()]
58
59
    @staticmethod
60
    def _remove_parentheses(text):
61
        exreg = re.compile(r'\((\w+)\)')
62
        nums = exreg.split(text)
63
        num = ''
64
        for n in nums:
65
            n = n.strip()
66
            if n == '':
67
                continue
68
            if num != '':
69
                num += ', '
70
            num += n
71
        return num
72
73
    @staticmethod
74
    def _get_image_size(pixel, dpi):
75
        return pixel / dpi * 2.54
76
77
    def _get_preferred_image_size(self, fpath):
        """Compute a print size for the image that fits the preferred box.

        The physical size is derived from the pixel size and the resolution
        recorded in the file (falling back to ``self.preferredImageDpi``),
        then scaled down proportionally so that it exceeds neither
        ``self.preferredImageMaxWidth`` nor ``self.preferredImageMaxHeight``.

        :param fpath: path of the image file.
        :return: ``(width, height)`` as ``docx.shared.Cm`` lengths.
        """
        fallback = self.preferredImageDpi

        # The context manager closes the file even if reading metadata fails.
        with Image.open(fpath) as img:
            px_width, px_height = img.size
            info = img.info
            dpi = info.get('dpi', (fallback, fallback))
            # JFIF density is dots per inch only when the unit field is 1;
            # unit 0 is a bare aspect ratio and unit 2 is dots per centimetre,
            # so in those cases it must not replace the resolution.
            if 'jfif_density' in info and info.get('jfif_unit', 1) == 1:
                dpi = info['jfif_density']

        # A recorded density of zero would divide by zero; use the default.
        dpi_x = dpi[0] if dpi[0] > 0 else fallback
        dpi_y = dpi[1] if dpi[1] > 0 else fallback

        width = self._get_image_size(px_width, dpi_x)
        height = self._get_image_size(px_height, dpi_y)

        # Clamp the width first, then the height, keeping the aspect ratio.
        if width > self.preferredImageMaxWidth:
            height = height * self.preferredImageMaxWidth / width
            width = self.preferredImageMaxWidth
        if height > self.preferredImageMaxHeight:
            width = width * self.preferredImageMaxHeight / height
            height = self.preferredImageMaxHeight

        return docx.shared.Cm(width), docx.shared.Cm(height)
95
96
    def _apply_it_sup_sub(self, doc, body, debug=False):
        """Add *body* as a new paragraph, styling ``<i>``/``<sup>``/``<sub>``.

        *body* is split on tag pairs; every fragment becomes its own run whose
        italic, superscript and subscript flags reflect which tag pattern the
        fragment starts with.

        :param doc: document the paragraph is appended to.
        :param body: text that may contain inline markup.
        :param debug: when true the tags are kept in the run text instead of
            being stripped.
        :return: the paragraph that was added.
        """
        paragraph = doc.add_paragraph()

        for fragment in self.exreg4tags.split(body):
            text = fragment if debug else self.exreg4tag_strip.sub('', fragment)
            run = paragraph.add_run(text)
            run.italic = bool(self.exreg4italic.match(fragment))
            run.font.superscript = bool(self.exreg4sup.match(fragment))
            run.font.subscript = bool(self.exreg4sub.match(fragment))

        return paragraph
122
123
    def read_xlsx(self, filename):
        """Load the table from *filename* into ``self.records``.

        :param filename: path of the xlsx workbook to read.
        """
        print('Reading: %s' % filename)
        # Close the workbook handle once it has been parsed instead of
        # leaving the file open for the lifetime of the process.
        with pd.ExcelFile(filename) as exls:
            self.records = exls.parse()
127
128
    def write_docx(self, filename, template=None):
        """Write every loaded record to *filename*, one page section each.

        The first record uses the document's existing first section; each
        later record starts a new-page section. Every section is set to A4
        portrait (210mm x 297mm) with 20mm top/left/right and 15mm bottom
        margins before the record is rendered with the writer selected by
        ``self.template_type``.

        :param filename: path of the docx file to create.
        :param template: optional docx file used as the starting document.
        """
        print('Writing: %s' % filename)

        doc = docx.Document() if template is None else docx.Document(template)

        for position, index in enumerate(self.records.index):
            if position == 0:
                section = doc.sections[0]
            else:
                section = doc.add_section(docx.enum.section.WD_SECTION.NEW_PAGE)

            # Page geometry.
            section.orientation = docx.enum.section.WD_ORIENT.PORTRAIT
            section.page_height = docx.shared.Mm(297)
            section.page_width = docx.shared.Mm(210)
            section.top_margin = docx.shared.Mm(20)
            section.right_margin = docx.shared.Mm(20)
            section.left_margin = docx.shared.Mm(20)
            section.bottom_margin = docx.shared.Mm(15)

            record = self.records.loc[index]
            if self.template_type == 'aini2016':
                self._write_doc_aini2016(doc, record)
            else:
                self._write_doc_jscpb2016(doc, record)

        doc.save(filename)
156
157
    def _write_doc_jscpb2016(self, doc, record):
        """Append one abstract in the JSCPB 2016 layout to *doc*.

        Reads the ``title``, ``authors``, ``affiliations``, ``abstract`` and
        ``keywords`` attributes of *record*.

        :param doc: document being built.
        :param record: one row of the loaded table.
        """
        print(record.title)

        # Title: 12pt bold.
        # NOTE(review): debug=True keeps the markup tags in the run text here
        # and for the abstract below; confirm that this is intended.
        title_par = self._apply_it_sup_sub(doc, record.title, debug=True)
        for run in title_par.runs:
            run.font.size = docx.shared.Pt(12)
            run.bold = True

        # Authors: the split keeps the "(marker)" tokens at the odd positions,
        # and those are rendered as superscript.
        authors_par = doc.add_paragraph()
        for position, piece in enumerate(self.exreg4super.split(record.authors)):
            run = authors_par.add_run(piece)
            if position % 2 == 1:
                run.font.superscript = True

        # Affiliations: 9pt italic.
        affiliation_par = doc.add_paragraph(record.affiliations)
        affiliation_run = affiliation_par.runs[0]
        affiliation_run.font.size = docx.shared.Pt(9)
        affiliation_run.italic = True

        # Abstract body.
        self._apply_it_sup_sub(doc, record.abstract, debug=True)

        # Keywords: plain label followed by the italic keyword list.
        keywords_par = doc.add_paragraph('Keywords: ')
        keywords_par.add_run(record.keywords).italic = True
189
190
    def _write_doc_aini2016(self, doc, record):
        """Append one abstract in the AINI 2016 layout to *doc*.

        Reads these columns of *record*: ``Title``, ``Name``,
        ``Affiliation``, ``e-mail``, ``DOI``, ``Abstract``,
        ``Figure file Name``, ``Figure comment``, ``References``,
        ``Acknowledgement``, ``Funding`` and ``Program No. Long``.

        Several sections adjust the spacing of "the last paragraph written so
        far" after their loop; when a section is empty that is the last
        paragraph of the preceding section, which is why ``p`` is carried
        across sections.

        :param doc: document being built.
        :param record: one row of the loaded table.
        """
        print('"%s"' % record['Title'])

        pt = docx.shared.Pt
        align = docx.enum.text.WD_ALIGN_PARAGRAPH

        font = doc.styles['Normal'].font
        font.size = pt(10)
        font.name = 'Times New Roman'

        # Program Number
        # p = doc.add_paragraph()
        # p.paragraph_format.line_spacing = docx.shared.Pt(12)
        # p.paragraph_format.space_after = docx.shared.Pt(5)
        # r = p.add_run(record['Program No.'].strip())

        # Title
        p = doc.add_paragraph()
        p.alignment = align.CENTER
        p.paragraph_format.space_before = pt(25)
        p.paragraph_format.space_after = pt(14)
        r = p.add_run(record['Title'].strip())
        r.font.size = pt(12)
        r.bold = True
        r.italic = True

        # Authors: bold names (spaces made non-breaking) followed by their
        # affiliation markers in superscript.
        p = doc.add_paragraph()
        p.alignment = align.CENTER
        p.paragraph_format.line_spacing = pt(12)
        p.paragraph_format.space_after = pt(12)
        authors = self._to_array(record['Name'], '\n')
        author_names = []  # plain names, reused for the citation line
        for position, author in enumerate(authors):
            m = self.exreg4author.match(author)
            if position:
                p.add_run(', ').bold = True
            author_name = m.group(1).strip()
            author_names.append(author_name)
            num = self._remove_parentheses(m.group(2).strip())
            p.add_run(author_name.replace(' ', '\u00A0')).bold = True
            if num != '':
                r = p.add_run('\u00A0' + num)
                r.bold = True
                r.font.superscript = True
        p.add_run('\n')

        # Affiliation: continues in the authors paragraph, markers first.
        affiliations = self._to_array(record['Affiliation'], '\n')
        for position, affiliation in enumerate(affiliations):
            m = self.exreg4affiliation.match(affiliation)
            if position:
                p.add_run(', ')
            num = self._remove_parentheses(m.group(1).strip())
            name = m.group(2).strip()
            if num != '':
                r = p.add_run(num + '\u00A0')
                r.font.superscript = True
            p.add_run(name)
        p.add_run('\n' + record['e-mail'])

        # DOI
        p = doc.add_paragraph('DOI:' + record['DOI'].strip())
        p.alignment = align.CENTER
        p.paragraph_format.space_after = pt(12)

        # Abstract Body: every paragraph after the first gets an indent.
        for position, item in enumerate(self._to_array(record['Abstract'], '\n')):
            p = doc.add_paragraph(item)
            p.alignment = align.JUSTIFY
            p.paragraph_format.line_spacing = pt(11)
            p.paragraph_format.space_after = pt(2)
            if position:
                p.paragraph_format.first_line_indent = pt(12)
        p.paragraph_format.space_after = pt(12)

        # Figure
        if not self._empty(record['Figure file Name']):

            # Figure File Name
            img_fpath = os.path.join(self.image_dir, record['Figure file Name'])
            size = self._get_preferred_image_size(img_fpath)
            doc.add_picture(img_fpath, width=size[0])  # , height=size[1])
            p = doc.paragraphs[-1]
            p.alignment = align.CENTER

            # Figure Comment
            last = self._add_labelled_paragraphs(
                doc, 'Figure: ', self._to_array(record['Figure comment'], '\n'))
            if last is not None:
                p = last

        p.paragraph_format.space_after = pt(14)

        # References: a bold heading paragraph, then one paragraph per entry.
        references = self._to_array(record['References'], '\n')
        if references:
            p = doc.add_paragraph()
            p.paragraph_format.line_spacing = pt(11)
            p.paragraph_format.space_after = pt(0)
            p.add_run('References:').bold = True
        for item in references:
            p = doc.add_paragraph()
            p.alignment = align.JUSTIFY
            p.paragraph_format.line_spacing = pt(10)
            p.paragraph_format.space_after = pt(0)
            p.add_run(item)
        p.paragraph_format.space_after = pt(10)

        # Acknowledgement (label spelling corrected from 'Ackknowledgement').
        last = self._add_labelled_paragraphs(
            doc, 'Acknowledgement: ', self._to_array(record['Acknowledgement'], '\n'))
        if last is not None:
            p = last
        p.paragraph_format.space_after = pt(10)

        # Funding
        last = self._add_labelled_paragraphs(
            doc, 'Funding: ', self._to_array(record['Funding'], '\n'))
        if last is not None:
            p = last
        p.paragraph_format.space_after = pt(10)

        # Citation
        p = doc.add_paragraph()
        p.paragraph_format.line_spacing = pt(10)
        p.alignment = align.JUSTIFY
        p.add_run('Citation: ').bold = True
        p.add_run(', '.join(author_names) + ' (2016). '
                  + record['Title'].strip().replace('\n', ' ') + '. ')
        p.add_run('Advances in Neuroinformatics IV. ').italic = True
        p.add_run(
            'AINI 2016 and INCF Nodes Workshop Abstract: '
            + record['Program No. Long'].strip()
            + '. DOI:' + record['DOI'].strip())

    @staticmethod
    def _add_labelled_paragraphs(doc, label, items):
        """Add one justified paragraph per item, the first led by a bold label.

        :param doc: document being built.
        :param label: text placed in bold at the start of the first paragraph.
        :param items: strings to write, one paragraph each.
        :return: the last paragraph added, or None when *items* is empty.
        """
        last = None
        for position, item in enumerate(items):
            last = doc.add_paragraph()
            last.paragraph_format.line_spacing = docx.shared.Pt(10)
            last.paragraph_format.space_after = docx.shared.Pt(0)
            last.alignment = docx.enum.text.WD_ALIGN_PARAGRAPH.JUSTIFY
            if position == 0:
                last.add_run(label).bold = True
            last.add_run(item)
        return last
357
358
359
if __name__ == '__main__':
360
    img_dir = './image'
361
    input_xlsx = 'input.xlsx'
362
    output_docx = 'output.docx'
363
    template_docx = './template/aini2016.docx'
364