Completed
Push — master ( 4b6cac...a97f12 )
by Daisuke
23s
created

AbstractGenerator   C

Complexity

Total Complexity 57

Size/Duplication

Total Lines 338
Duplicated Lines 0 %

Importance

Changes 3
Bugs 0 Features 0
Metric Value
c 3
b 0
f 0
dl 0
loc 338
rs 5.1724
wmc 57

12 Methods

Rating   Name   Duplication   Size   Complexity  
A _empty() 0 4 3
A __init__() 0 16 1
A _insert_image() 0 12 3
B write_docx() 0 28 5
A _getImageSize() 0 2 1
B _apply_it_sup_sub() 0 26 6
A read_xlsx() 0 4 1
B _getPreferredImageSize() 0 18 5
A _toArray() 0 5 4
F _write_doc_aini2016() 0 167 20
B _write_doc_jscpb2016() 0 32 4
A _removeParentheses() 0 12 4

How to fix   Complexity   

Complex Class

Complex classes like AbstractGenerator often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
# -*- coding: utf-8 -*-
2
"""
3
Generate an abstract document (docx) file from a table (xlsx)
4
by nebula
5
6
Dependency: pandas, xlrd, python-docx, pillow
7
"""
8
from PIL import Image
9
import pandas as pd
10
import docx
11
import math
12
import re
13
import os
14
15
16
class AbstractGenerator:
    """Build abstract documents (docx) from records read out of a spreadsheet.

    Typical use is ``read_xlsx()`` followed by ``write_docx()``.  Every
    record is written into its own 210 x 297 mm portrait section.
    ``template_type`` selects the layout: ``'aini2016'`` uses the AINI 2016
    layout, any other value uses the jscpb2016 layout.
    """

    # Matches one parenthesised marker such as "(1)" and captures its content.
    _exreg4marker = re.compile(r'\((\w+)\)')

    def __init__(self, image_dir='', template_type='aini2016'):
        """Store the settings and compile the patterns used while writing.

        :param image_dir: directory that figure file names are resolved against
        :param template_type: layout name, see the class description
        """
        self.records = None  # filled by read_xlsx()
        self.image_dir = image_dir
        self.template_type = template_type
        # "Name(1)(2)" -> name part and trailing parenthesised markers.
        self.exreg4author = re.compile(r'^([^\)]+)((?:\(.+\))*)$')
        # "(1)(2)Institute" -> leading parenthesised markers and name part.
        self.exreg4affiliation = re.compile(r'^((?:\(.+\))*)(.+)$')
        self.exreg4super = re.compile(r'(\(\w+\))')
        # self.exreg4italic = re.compile(r'(\<i\>\w+\</i\>)')
        self.exreg4italic = re.compile(r'(\<i\>.*?\</i\>)')
        self.exreg4sup = re.compile(r'(\<sup\>.*?\</sup\>)')
        self.exreg4sub = re.compile(r'(\<sub\>.*?\</sub\>)')
        self.exreg4tags = re.compile(r'(\<.*?\>.*?\</.*?\>)')
        self.exreg4tag_strip = re.compile(r'<[^>]*?>')
        self.preferredImageMaxWidth = 14  # cm
        self.preferredImageMaxHeight = 8.5  # cm
        self.preferredImageDpi = 72

    def _insert_image(self, filename, image_filename):
        """Append a picture to every paragraph containing ``[[FIGURE]]``.

        The document is loaded from ``filename`` and saved back to the same
        path.  The placeholder text itself is left in place.
        """
        doc = docx.Document(filename)

        for paragraph in doc.paragraphs:
            if '[[FIGURE]]' in paragraph.text:
                # paragraph.text = ''
                run = paragraph.add_run()
                # A run cannot contain paragraphs (Run has no add_paragraph),
                # so the picture is set off with line breaks instead.
                run.add_break()
                run.add_picture(image_filename, width=docx.shared.Pt(300))
                run.add_break()

        doc.save(filename)

    def _empty(self, text):
        """Return True when a cell value carries no text.

        ``None``, a float NaN and blank/whitespace-only text count as empty.
        Other non-string values are judged by their ``str()`` form, so a
        numeric cell is not empty.
        """
        if text is None:
            return True
        if isinstance(text, float) and math.isnan(text):
            return True
        return str(text).strip() == ''

    def _toArray(self, text, delim):
        """Split a cell value on ``delim`` and drop blank items.

        Returns an empty list for an empty value (see ``_empty``).  Items are
        returned unstripped; non-string values are converted with ``str()``
        before splitting.
        """
        if self._empty(text):
            return []
        return [item for item in str(text).split(delim) if item.strip()]

    def _removeParentheses(self, text):
        """Turn ``"(1)(2)"`` into ``"1, 2"``.

        The text is split on parenthesised markers; every non-blank piece
        (marker contents and any text between them) is stripped and the
        pieces are joined with ``", "``.
        """
        pieces = (piece.strip() for piece in self._exreg4marker.split(text))
        return ', '.join(piece for piece in pieces if piece)

    def _getImageSize(self, pixel, dpi):
        """Convert a pixel count at ``dpi`` dots per inch to centimetres."""
        return pixel / dpi * 2.54

    def _getPreferredImageSize(self, fpath):
        """Return the (width, height) to use for the image at ``fpath``.

        The physical size is derived from the pixel size and the resolution
        recorded in the file ('jfif_density' wins over 'dpi'); when neither is
        present, or a recorded value is not positive, ``preferredImageDpi`` is
        used.  The size is then shrunk, keeping the aspect ratio, to fit
        within ``preferredImageMaxWidth`` x ``preferredImageMaxHeight`` cm.

        :returns: tuple of two ``docx.shared.Cm`` lengths
        """
        fallback = self.preferredImageDpi
        # The context manager closes the file even if reading metadata fails.
        with Image.open(fpath) as img:
            pixel_width, pixel_height = img.size[0], img.size[1]
            density = (fallback, fallback)
            for key in ('dpi', 'jfif_density'):
                if key in img.info:
                    density = img.info[key]

        # A recorded resolution of 0 would otherwise divide by zero.
        dpi_x = density[0] if density[0] > 0 else fallback
        dpi_y = density[1] if density[1] > 0 else fallback

        width = self._getImageSize(pixel_width, dpi_x)
        height = self._getImageSize(pixel_height, dpi_y)
        if width > self.preferredImageMaxWidth:
            height = height * self.preferredImageMaxWidth / width
            width = self.preferredImageMaxWidth
        if height > self.preferredImageMaxHeight:
            width = width * self.preferredImageMaxHeight / height
            height = self.preferredImageMaxHeight
        return docx.shared.Cm(width), docx.shared.Cm(height)

    def _apply_it_sup_sub(self, doc, body, debug=False):
        """Add ``body`` as a paragraph, honouring <i>, <sup> and <sub> tags.

        The text is split into tagged and untagged fragments; each fragment
        becomes one run whose italic/superscript/subscript flags reflect the
        tag it starts with.  With ``debug`` set the tags are kept in the run
        text, otherwise they are stripped.

        :returns: the paragraph that was added
        """
        p = doc.add_paragraph()

        for fragment in self.exreg4tags.split(body):
            if debug:
                run = p.add_run(fragment)
            else:
                run = p.add_run(self.exreg4tag_strip.sub('', fragment))

            run.italic = bool(self.exreg4italic.match(fragment))
            run.font.superscript = bool(self.exreg4sup.match(fragment))
            run.font.subscript = bool(self.exreg4sub.match(fragment))

        return p

    def read_xlsx(self, filename):
        """Parse the workbook at ``filename`` into ``self.records``."""
        print('Reading: %s' % filename)
        # Close the workbook as soon as it has been parsed.
        with pd.ExcelFile(filename) as workbook:
            self.records = workbook.parse()

    def write_docx(self, filename, template=None):
        """Write all loaded records to ``filename``.

        :param filename: path of the docx file to create
        :param template: optional docx file to start from
        :raises RuntimeError: if no records were loaded with ``read_xlsx()``
        """
        if self.records is None:
            raise RuntimeError('No records loaded; call read_xlsx() first.')

        print('Writing: %s' % filename)

        if template is not None:
            doc = docx.Document(template)
        else:
            doc = docx.Document()

        if self.template_type == 'aini2016':
            write_record = self._write_doc_aini2016
        else:
            write_record = self._write_doc_jscpb2016

        for position, index in enumerate(self.records.index):
            # The first record reuses the section the document starts with;
            # every later record starts a new page section.
            if position == 0:
                section = doc.sections[0]
            else:
                section = doc.add_section(docx.enum.section.WD_SECTION.NEW_PAGE)
            section.orientation = docx.enum.section.WD_ORIENT.PORTRAIT
            section.page_height = docx.shared.Mm(297)
            section.page_width = docx.shared.Mm(210)
            section.top_margin = docx.shared.Mm(20)
            section.right_margin = docx.shared.Mm(20)
            section.left_margin = docx.shared.Mm(20)
            section.bottom_margin = docx.shared.Mm(15)
            write_record(doc, self.records.loc[index])

        doc.save(filename)

    def _write_doc_jscpb2016(self, doc, record):
        """Append one record to ``doc`` using the jscpb2016 layout.

        Reads the ``title``, ``authors``, ``affiliations``, ``abstract`` and
        ``keywords`` attributes of ``record``.
        """
        # Title
        # NOTE(review): debug=True keeps the markup tags visible in the output
        # text -- confirm this is intended and not a leftover.
        p = self._apply_it_sup_sub(doc, record.title, debug=True)
        for run in p.runs:
            run.font.size = docx.shared.Pt(12)
            run.bold = True

        # Authors: the split keeps the "(n)" markers at the odd positions,
        # and those are written as superscript.
        p = doc.add_paragraph()
        for position, piece in enumerate(self.exreg4super.split(record.authors)):
            run = p.add_run(piece)
            if position % 2:
                run.font.superscript = True

        # Affiliations (an empty text produces no run to format)
        p = doc.add_paragraph(record.affiliations)
        for run in p.runs:
            run.font.size = docx.shared.Pt(9)
            run.italic = True

        # Abstract Body
        self._apply_it_sup_sub(doc, record.abstract, debug=True)

        # keywords
        p = doc.add_paragraph('Keywords: ')
        p.add_run(record.keywords).italic = True

    def _split_author(self, author):
        """Split ``"Name(1)(2)"`` into ``("Name", "1, 2")``.

        Text that does not fit the author pattern is returned unchanged
        (stripped) with no markers.
        """
        m = self.exreg4author.match(author)
        if m is None:
            return author.strip(), ''
        return m.group(1).strip(), self._removeParentheses(m.group(2).strip())

    def _split_affiliation(self, affiliation):
        """Split ``"(1)Institute"`` into ``("1", "Institute")``."""
        m = self.exreg4affiliation.match(affiliation)
        if m is None:
            return '', affiliation.strip()
        return self._removeParentheses(m.group(1).strip()), m.group(2).strip()

    def _add_labeled_paragraphs(self, doc, label, text, last):
        """Add one justified paragraph per line of ``text``.

        The first paragraph starts with ``label`` in bold.  Returns the last
        paragraph added, or ``last`` unchanged when ``text`` has no lines.
        """
        for position, item in enumerate(self._toArray(text, '\n')):
            last = doc.add_paragraph()
            last.paragraph_format.line_spacing = docx.shared.Pt(10)
            last.paragraph_format.space_after = docx.shared.Pt(0)
            last.alignment = docx.enum.text.WD_ALIGN_PARAGRAPH.JUSTIFY
            if position == 0:
                last.add_run(label).bold = True
            last.add_run(item)
        return last

    def _write_doc_aini2016(self, doc, record):
        """Append one record to ``doc`` using the AINI 2016 layout.

        Reads the 'Title', 'Name', 'Affiliation', 'e-mail', 'DOI',
        'Abstract', 'Figure file Name', 'Figure comment', 'References',
        'Acknowledgement', 'Funding' and 'Program No. Long' fields of
        ``record``.

        The spacing that closes each section is applied to the most recently
        added paragraph, so an empty section adjusts the paragraph before it.
        """
        print('"%s"' % record['Title'])

        point = docx.shared.Pt
        center = docx.enum.text.WD_ALIGN_PARAGRAPH.CENTER
        justify = docx.enum.text.WD_ALIGN_PARAGRAPH.JUSTIFY

        font = doc.styles['Normal'].font
        font.size = point(10)
        font.name = 'Times New Roman'

        # Title
        p = doc.add_paragraph()
        p.alignment = center
        p.paragraph_format.space_before = point(25)
        p.paragraph_format.space_after = point(14)
        r = p.add_run(record['Title'].strip())
        r.font.size = point(12)
        r.bold = True
        r.italic = True

        # Authors
        p = doc.add_paragraph()
        p.alignment = center
        p.paragraph_format.line_spacing = point(12)
        p.paragraph_format.space_after = point(12)
        authors = [self._split_author(author)
                   for author in self._toArray(record['Name'], '\n')]
        for position, (name, marks) in enumerate(authors):
            if position:
                p.add_run(', ').bold = True
            # Non-breaking spaces keep each name on a single line.
            p.add_run(name.replace(' ', '\u00A0')).bold = True
            if marks != '':
                r = p.add_run('\u00A0' + marks)
                r.bold = True
                r.font.superscript = True
        p.add_run('\n')

        # Affiliation (continues in the authors paragraph)
        affiliations = self._toArray(record['Affiliation'], '\n')
        for position, affiliation in enumerate(affiliations):
            marks, name = self._split_affiliation(affiliation)
            if position:
                p.add_run(', ')
            if marks != '':
                r = p.add_run(marks + '\u00A0')
                r.font.superscript = True
            p.add_run(name)
        p.add_run('\n' + record['e-mail'])

        # DOI
        p = doc.add_paragraph('DOI:' + record['DOI'].strip())
        p.alignment = center
        p.paragraph_format.space_after = point(12)

        # Abstract Body: every paragraph after the first is indented.
        for position, item in enumerate(self._toArray(record['Abstract'], '\n')):
            p = doc.add_paragraph(item)
            p.alignment = justify
            p.paragraph_format.line_spacing = point(11)
            p.paragraph_format.space_after = point(2)
            if position:
                p.paragraph_format.first_line_indent = point(12)
        p.paragraph_format.space_after = point(12)

        # Figure
        figure_name = record['Figure file Name']
        if not self._empty(figure_name):
            # Figure File Name
            img_fpath = os.path.join(self.image_dir, figure_name)
            size = self._getPreferredImageSize(img_fpath)
            doc.add_picture(img_fpath, width=size[0])  # , height=size[1])
            p = doc.paragraphs[-1]
            p.alignment = center

            # Figure Comment
            p = self._add_labeled_paragraphs(
                doc, 'Figure: ', record['Figure comment'], p)

        p.paragraph_format.space_after = point(14)

        # References
        for position, item in enumerate(self._toArray(record['References'], '\n')):
            if position == 0:
                heading = doc.add_paragraph()
                heading.paragraph_format.line_spacing = point(11)
                heading.paragraph_format.space_after = point(0)
                heading.add_run('References:').bold = True
            p = doc.add_paragraph()
            p.alignment = justify
            p.paragraph_format.line_spacing = point(10)
            p.paragraph_format.space_after = point(0)
            p.add_run(item)
        p.paragraph_format.space_after = point(10)

        # Acknowledgement
        p = self._add_labeled_paragraphs(
            doc, 'Acknowledgement: ', record['Acknowledgement'], p)
        p.paragraph_format.space_after = point(10)

        # Funding
        p = self._add_labeled_paragraphs(doc, 'Funding: ', record['Funding'], p)
        p.paragraph_format.space_after = point(10)

        # Citation
        p = doc.add_paragraph()
        p.paragraph_format.line_spacing = point(10)
        p.alignment = justify
        p.add_run('Citation: ').bold = True
        author_names = ', '.join(name for name, _marks in authors)
        p.add_run(author_names + ' (2016). '
                  + record['Title'].strip().replace('\n', ' ') + '. ')
        p.add_run('Advances in Neuroinformatics IV. ').italic = True
        p.add_run(
            'AINI 2016 and INCF Nodes Workshop Abstract: '
            + record['Program No. Long'].strip()
            + '. DOI:' + record['DOI'].strip())
354
355
356
if __name__ == '__main__':
357
    img_dir = './image'
358
    input_xlsx = 'input.xlsx'
359
    output_docx = 'output.docx'
360
    template_docx = './template/aini2016.docx'
361