ocrd.workspace_bagger.WorkspaceBagger.bag()   B
last analyzed

Complexity

Conditions 6

Size

Total Lines 68
Code Lines 33

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 33
dl 0
loc 68
rs 8.1546
c 0
b 0
f 0
cc 6
nop 11

How to fix   Long Method    Many Parameters   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

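Commonly applied refactorings include Extract Method: move a coherent fragment of the method body into a new, well-named method and call it from the original.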

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

There are several approaches to avoid long parameter lists: Replace Parameter with Method Call, Preserve Whole Object, and Introduce Parameter Object.

1
from datetime import datetime
2
from os import makedirs, walk
3
from os.path import join, isdir, basename as os_path_basename, exists, relpath
4
from pathlib import Path
5
from shutil import make_archive, rmtree, copyfile, move, copytree
6
from tempfile import mkdtemp, TemporaryDirectory
7
import re
8
import tempfile
9
import sys
10
from bagit import (
11
    Bag,
12
    make_manifests,
13
    _load_tag_file, _make_tag_file, _make_tagmanifest_file,  # pylint: disable=no-name-in-module
14
)
15
16
from ocrd_utils import (
17
    pushd_popd,
18
    getLogger,
19
    MIME_TO_EXT,
20
    unzip_file_to_dir,
21
    DEFAULT_METS_BASENAME,
22
    MIMETYPE_PAGE,
23
    VERSION,
24
    dist_version,
25
)
26
from ocrd_validators.constants import BAGIT_TXT, TMP_BAGIT_PREFIX, OCRD_BAGIT_PROFILE_URL
27
from ocrd_modelfactory import page_from_file
28
from ocrd_models.ocrd_page import to_xml
29
30
from .workspace import Workspace
31
32
# Module-level side effect: sets the default directory of the ``tempfile`` module
# to /tmp, so mkdtemp()/TemporaryDirectory() calls without an explicit ``dir``
# create their directories there.
tempfile.tempdir = '/tmp'  # TODO hard-coded
33
34
# Path of a backup directory below /tmp, named with the shared temporary BagIt
# prefix. NOTE(review): not referenced by any code visible in this listing.
BACKUPDIR = join('/tmp', TMP_BAGIT_PREFIX + 'backup')
35
36
37
class WorkspaceBagger():
38
    """
39
    Serialize/De-serialize from OCRD-ZIP to workspace and back.
40
    """
41
42
    def __init__(self, resolver, strict=False):
43
        self.resolver = resolver
44
        self.strict = strict
45
46
    def _serialize_bag(self, workspace, bagdir, dest, skip_zip):
47
        if skip_zip:
48
            move(bagdir, dest)
49
        else:
50
            make_archive(dest.replace('.zip', ''), 'zip', bagdir)
51
52
            # Remove temporary bagdir
53
            rmtree(bagdir)
54
55
    def _log_or_raise(self, msg):
        """
        Report a problem according to the strictness of this bagger.

        Arguments:
            msg (string): message to raise as an ``Exception`` when
                ``self.strict`` is set, or to log at INFO level otherwise
        """
        logger = getLogger('ocrd.workspace_bagger')
        if not self.strict:
            logger.info(msg)
            return
        raise Exception(msg)
61
62
    def _bag_mets_files(
        self,
        workspace,
        bagdir,
        ocrd_mets,
        processes,
        include_fileGrp=None,
        exclude_fileGrp=None,
    ):
        """
        Copy the files referenced by the METS into the bag and create the payload manifests.

        Every selected file is downloaded to ``<bagdir>/data/<fileGrp>/<basename>``
        and its ``local_filename`` is rewritten to that bag-relative path. The
        METS is then saved to ``<bagdir>/data/<ocrd_mets>``, PAGE files whose
        ``Page/@imageFilename`` pointed to a relocated file are updated, and
        finally the SHA-512 manifests are generated.

        Arguments:
            workspace (ocrd.Workspace): workspace being bagged; its METS is modified in memory
            bagdir (string): temporary bag directory, must already contain ``data``
            ocrd_mets (string): basename of the METS file inside ``data``
            processes (integer): number of parallel processes for checksumming
            include_fileGrp: passed on to ``mets.find_files``
            exclude_fileGrp: passed on to ``mets.find_files``

        Returns:
            tuple: ``(total_bytes, total_files)`` as returned by ``make_manifests``
        """
        log = getLogger('ocrd.workspace_bagger')
        mets = workspace.mets
        # Maps the location of each file before bagging to its bag-relative path
        changed_local_filenames = {}

        # TODO allow filtering by fileGrp@USE and such

        with pushd_popd(workspace.directory):
            for f in mets.find_files(include_fileGrp=include_fileGrp, exclude_fileGrp=exclude_fileGrp):
                log.info("Bagging OcrdFile %s", f)

                file_grp_dir = Path(bagdir, 'data', f.fileGrp)
                file_grp_dir.mkdir(exist_ok=True)

                # Prefer the local copy, fall back to the URL
                old_location = f.local_filename if f.local_filename else f.url
                basename = f.basename if f.basename else f"{f.ID}{MIME_TO_EXT.get(f.mimetype, '.xml')}"
                _relpath = join(f.fileGrp, basename)
                self.resolver.download_to_directory(file_grp_dir, old_location, basename=basename)
                changed_local_filenames[str(old_location)] = _relpath
                f.local_filename = _relpath

            # save mets.xml
            with open(join(bagdir, 'data', ocrd_mets), 'wb') as mets_out:
                mets_out.write(workspace.mets.to_xml())

        # Walk through bagged workspace and fix the PAGE
        # Page/@imageFilename and
        # AlternativeImage/@filename
        bag_workspace = Workspace(self.resolver, directory=join(bagdir, 'data'), mets_basename=ocrd_mets)
        with pushd_popd(bag_workspace.directory):
            for page_file in bag_workspace.mets.find_files(mimetype=MIMETYPE_PAGE):
                pcgts = page_from_file(page_file)
                page = pcgts.get_Page()
                # Single lookup instead of scanning all renames: an already
                # replaced value can no longer be matched (and rewritten again)
                # by a later entry of the mapping.
                if page.imageFilename not in changed_local_filenames:
                    continue
                page.imageFilename = changed_local_filenames[page.imageFilename]
                # TODO replace AlternativeImage, recursively...
                # Write explicitly as UTF-8 rather than in the locale's default
                # encoding, which may be unable to represent the text content.
                # NOTE(review): assumes to_xml() declares UTF-8 in its XML header -- confirm
                with open(page_file.local_filename, 'w', encoding='utf-8') as out:
                    out.write(to_xml(pcgts))

        with pushd_popd(bagdir):
            total_bytes, total_files = make_manifests('data', processes, algorithms=['sha512'])
        log.info("New vs. old: %s", changed_local_filenames)
        return total_bytes, total_files
119
120
    def _set_bag_info(self, bag, total_bytes, total_files, ocrd_identifier, ocrd_base_version_checksum,
                      ocrd_mets=DEFAULT_METS_BASENAME):
        """
        Fill in the ``bag-info.txt`` metadata of a bag.

        Arguments:
            bag (bagit.Bag): bag whose ``info`` mapping is updated
            total_bytes: byte count for the ``Payload-Oxum`` entry
            total_files: file count for the ``Payload-Oxum`` entry
            ocrd_identifier (string): value of ``Ocrd-Identifier``
            ocrd_base_version_checksum (string): value of ``Ocrd-Base-Version-Checksum``,
                omitted if empty
            ocrd_mets (string): value of ``Ocrd-Mets``, omitted if it is the default basename
        """
        software_agent = 'ocrd/core %s (bagit.py %s, bagit_profile %s) [cmdline: "%s"]' % (
            VERSION,  # TODO
            dist_version('ocrd-fork-bagit'),
            dist_version('ocrd-fork-bagit_profile'),
            ' '.join(sys.argv))

        # Collect the entries in the order they are to be inserted
        entries = [
            ('BagIt-Profile-Identifier', OCRD_BAGIT_PROFILE_URL),
            ('Bag-Software-Agent', software_agent),
            ('Ocrd-Identifier', ocrd_identifier),
        ]
        if ocrd_base_version_checksum:
            entries.append(('Ocrd-Base-Version-Checksum', ocrd_base_version_checksum))
        entries.append(('Bagging-Date', str(datetime.now())))
        entries.append(('Payload-Oxum', '%s.%s' % (total_bytes, total_files)))
        if ocrd_mets != DEFAULT_METS_BASENAME:
            entries.append(('Ocrd-Mets', ocrd_mets))

        for key, value in entries:
            bag.info[key] = value
136
137
    def bag(self,
            workspace,
            ocrd_identifier,
            dest=None,
            ocrd_mets=DEFAULT_METS_BASENAME,
            ocrd_base_version_checksum=None,
            processes=1,
            skip_zip=False,
            tag_files=None,
            include_fileGrp=None,
            exclude_fileGrp=None,
    ):
        """
        Bag a workspace

        See https://ocr-d.github.io/ocrd_zip#packing-a-workspace-as-ocrd-zip

        Arguments:
            workspace (ocrd.Workspace): workspace to bag
            ocrd_identifier (string): Ocrd-Identifier in bag-info.txt
            dest (string): Path of the generated OCRD-ZIP. Defaults to the workspace
                directory with ``.ocrd.zip`` (or ``.ocrd`` if ``skip_zip``) appended.
            ocrd_mets (string): Ocrd-Mets in bag-info.txt
            ocrd_base_version_checksum (string): Ocrd-Base-Version-Checksum in bag-info.txt
            processes (integer): Number of parallel processes checksumming
            skip_zip (boolean): Whether to leave directory unzipped
            tag_files (list<string>): Path names of additional tag files to be bagged at the root of the bag
            include_fileGrp: restricts the files to bag, passed on to ``mets.find_files``
            exclude_fileGrp: restricts the files to bag, passed on to ``mets.find_files``

        Returns:
            string: ``dest``, the location the bag was written to
        """
        if tag_files is None:
            tag_files = []

        if dest is None:
            if not skip_zip:
                dest = '%s.ocrd.zip' % workspace.directory
            else:
                dest = '%s.ocrd' % workspace.directory

        log = getLogger('ocrd.workspace_bagger')

        # create bagdir
        bagdir = mkdtemp(prefix=TMP_BAGIT_PREFIX)
        try:
            log.info("Bagging %s to %s (temp dir %s)", workspace.directory, dest, bagdir)

            # create data dir
            makedirs(join(bagdir, 'data'))

            # create bagit.txt
            with open(join(bagdir, 'bagit.txt'), 'wb') as f:
                f.write(BAGIT_TXT.encode('utf-8'))

            # create manifests
            total_bytes, total_files = self._bag_mets_files(workspace, bagdir, ocrd_mets, processes,
                                                            include_fileGrp, exclude_fileGrp)

            # create bag-info.txt
            bag = Bag(bagdir)
            self._set_bag_info(bag, total_bytes, total_files, ocrd_identifier, ocrd_base_version_checksum,
                               ocrd_mets=ocrd_mets)

            for tag_file in tag_files:
                copyfile(tag_file, join(bagdir, os_path_basename(tag_file)))

            # save bag
            bag.save()

            # ZIP it
            self._serialize_bag(workspace, bagdir, dest, skip_zip)
        finally:
            # After successful serialization the bagdir has been moved or
            # removed already. If it is still around, an earlier step failed
            # and the temporary directory must not be left behind.
            if isdir(bagdir):
                rmtree(bagdir, ignore_errors=True)

        log.info('Created bag at %s', dest)
        return dest
205
206
    def spill(self, src, dest):
        """
        Spill a workspace, i.e. unpack it and turn it into a workspace.

        See https://ocr-d.github.io/ocrd_zip#unpacking-ocrd-zip-to-a-workspace

        Arguments:
            src (string): Path to OCRD-ZIP
            dest (string): Path to directory to unpack data folder to. If it is an
                existing directory, a new subdirectory named after ``src`` is used.

        Returns:
            ocrd.Workspace: workspace on the unpacked ``data`` folder

        Raises:
            Exception: if ``dest`` exists but is not a directory, or if the
                subdirectory derived from ``src`` already exists
        """
        log = getLogger('ocrd.workspace_bagger')

        if exists(dest) and not isdir(dest):
            raise Exception("Not a directory: %s" % dest)

        # If dest is an existing directory, try to derive its name from src
        if isdir(dest):
            workspace_name = re.sub(r'(\.ocrd)?\.zip$', '', os_path_basename(src))
            new_dest = join(dest, workspace_name)
            if exists(new_dest):
                raise Exception("Directory exists: %s" % new_dest)
            dest = new_dest

        log.info("Spilling %s to %s", src, dest)

        # The context manager drops the temporary directory even if unzipping,
        # reading bag-info.txt or copying fails
        with TemporaryDirectory(prefix=TMP_BAGIT_PREFIX) as bagdir:
            unzip_file_to_dir(src, bagdir)
            bag_info = _load_tag_file(join(bagdir, "bag-info.txt"))

            datadir = join(bagdir, 'data')
            for root, _, files in walk(datadir):
                if not files:
                    continue
                destdir = join(dest, relpath(root, datadir))
                makedirs(destdir, exist_ok=True)
                for f in files:
                    srcfile = join(root, f)
                    destfile = join(destdir, f)
                    log.debug("Copy %s -> %s", srcfile, destfile)
                    copyfile(srcfile, destfile)

            # TODO copy allowed tag files if present

            # TODO validate bagit

        # Create workspace
        mets_basename = bag_info.get("Ocrd-Mets", DEFAULT_METS_BASENAME)
        workspace = Workspace(self.resolver, directory=dest, mets_basename=mets_basename)

        # TODO validate workspace

        return workspace
260
261
    def validate(self, bag):
        """
        Validate conformance with BagIt and OCR-D bagit profile.

        Placeholder: nothing is checked yet, ``bag`` is ignored and ``None``
        is returned.

        See:
            - https://ocr-d.github.io/ocrd_zip
            - https://ocr-d.github.io/bagit-profile.json
            - https://ocr-d.github.io/bagit-profile.yml
        """
        return None
271
272
    def recreate_checksums(self, src, dest=None, overwrite=False):
        """
        (Re)creates the files containing the checksums of a bag

        This function uses bagit.py to create new files: manifest-sha512.txt and
        tagmanifest-sha512.txt for the bag. Also 'Payload-Oxum' in bag-info.txt will be set to the
        appropriate value.

        Arguments:
            src (string):    Path to Bag. May be a zipped or unzipped bagit
            dest (string):   Path to where the result should be stored. Not needed if overwrite is
                             set
            overwrite(bool): Replace bag with newly created bag

        Raises:
            Exception: if both or neither of ``dest`` and ``overwrite`` are given,
                or if ``src`` does not exist
            FileNotFoundError: if the bag has no ``data`` directory
        """
        if overwrite and dest:
            raise Exception("Setting 'dest' and 'overwrite' is a contradiction")
        if not overwrite and not dest:
            raise Exception("For checksum recreation 'dest' must be provided")
        src_path = Path(src)
        if not src_path.exists():
            raise Exception("Path to bag not existing")
        # A regular file is taken to be a zipped bag, a directory an unzipped one
        is_zipped = src_path.is_file()

        with TemporaryDirectory() as tempdir:
            if is_zipped:
                unzip_file_to_dir(src, tempdir)
                path_to_bag = Path(tempdir)
                if not path_to_bag.joinpath("data").exists():
                    raise FileNotFoundError("data directory of bag not found")
            else:
                path_to_bag = src_path if overwrite else Path(dest)
                if not src_path.joinpath("data").exists():
                    raise FileNotFoundError(f"data directory of bag not found at {src}")
                if not overwrite:
                    path_to_bag.mkdir(parents=True, exist_ok=True)
                    copytree(src, dest, dirs_exist_ok=True)

            with pushd_popd(path_to_bag):
                n_bytes, n_files = make_manifests("data", 1, ["sha512"])

                bag_infos = _load_tag_file("bag-info.txt")
                bag_infos["Payload-Oxum"] = f"{n_bytes}.{n_files}"
                _make_tag_file("bag-info.txt", bag_infos)
                _make_tagmanifest_file("sha512", ".")

            if is_zipped:
                name = src_path.name
                if name.endswith(".zip"):
                    name = name[:-4]
                # Build the archive in a scratch directory of its own. Creating
                # it under a relative name would write "<name>.zip" into the
                # current working directory, which clobbers (and then moves
                # away) the source bag whenever src lives in the CWD and
                # overwrite is not set.
                with TemporaryDirectory() as archive_dir:
                    zip_path = make_archive(join(archive_dir, name), "zip", path_to_bag)
                    move(zip_path, src if overwrite else dest)
323