ocrd.workspace_bagger (overall rating: C)

Complexity
    Total Complexity: 53

Size/Duplication
    Total Lines: 323
    Duplicated Lines: 0%

Importance
    Changes: 0

Metric  Value
wmc     53
eloc    196
dl      0
loc     323
rs      6.96
c       0
b       0
f       0

9 Methods

Rating  Name                                    Duplication  Size  Complexity
F       WorkspaceBagger.recreate_checksums()    0            51    16
B       WorkspaceBagger.bag()                   0            68    6
A       WorkspaceBagger.validate()              0            10    1
A       WorkspaceBagger._log_or_raise()         0            6     2
A       WorkspaceBagger._serialize_bag()        0            8     2
B       WorkspaceBagger.spill()                 0            54    8
F       WorkspaceBagger._bag_mets_files()       0            57    14
A       WorkspaceBagger._set_bag_info()         0            16    3
A       WorkspaceBagger.__init__()              0            3     1

How to fix: Complexity

Complex classes like ocrd.workspace_bagger often do a lot of different things. To break such a class down, we need to identify a cohesive component within the class. A common approach to finding such a component is to look for fields and methods that share the same prefixes or suffixes.

Once you have determined which fields and methods belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often faster to apply.
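
For illustration, here is a minimal sketch of what an Extract Class step could look like for this class: the two F-rated methods (_bag_mets_files() and recreate_checksums()) both revolve around manifest/checksum handling, so that logic is a natural candidate to pull out. The ChecksumManager name and its interface are hypothetical and not part of the actual ocrd code; the bagit helpers it wraps are the ones the module below already imports.

from bagit import make_manifests, _load_tag_file, _make_tag_file, _make_tagmanifest_file  # pylint: disable=no-name-in-module
from ocrd_utils import pushd_popd


class ChecksumManager:
    """Hypothetical extracted component: manifest/checksum handling for a bag directory."""

    def __init__(self, algorithm='sha512'):
        self.algorithm = algorithm

    def make_payload_manifests(self, bagdir, processes=1):
        # Checksum everything under <bagdir>/data and write manifest-<algorithm>.txt
        with pushd_popd(bagdir):
            return make_manifests('data', processes, algorithms=[self.algorithm])

    def update_payload_oxum(self, bagdir, n_bytes, n_files):
        # Rewrite bag-info.txt and the tag manifest after the payload changed
        with pushd_popd(bagdir):
            bag_infos = _load_tag_file('bag-info.txt')
            bag_infos['Payload-Oxum'] = f'{n_bytes}.{n_files}'
            _make_tag_file('bag-info.txt', bag_infos)
            _make_tagmanifest_file(self.algorithm, '.')

WorkspaceBagger would then delegate to such a component instead of calling the bagit internals directly, which shrinks _bag_mets_files() and recreate_checksums() and gives the checksum logic a single home.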

from datetime import datetime
from os import makedirs, walk
from os.path import join, isdir, basename as os_path_basename, exists, relpath
from pathlib import Path
from shutil import make_archive, rmtree, copyfile, move, copytree
from tempfile import mkdtemp, TemporaryDirectory
import re
import tempfile
import sys
from bagit import (
    Bag,
    make_manifests,
    _load_tag_file, _make_tag_file, _make_tagmanifest_file,  # pylint: disable=no-name-in-module
)

from ocrd_utils import (
    pushd_popd,
    getLogger,
    MIME_TO_EXT,
    unzip_file_to_dir,
    DEFAULT_METS_BASENAME,
    MIMETYPE_PAGE,
    VERSION,
    dist_version,
)
from ocrd_validators.constants import BAGIT_TXT, TMP_BAGIT_PREFIX, OCRD_BAGIT_PROFILE_URL
from ocrd_modelfactory import page_from_file
from ocrd_models.ocrd_page import to_xml

from .workspace import Workspace

tempfile.tempdir = '/tmp'  # TODO hard-coded

BACKUPDIR = join('/tmp', TMP_BAGIT_PREFIX + 'backup')


class WorkspaceBagger():
    """
    Serialize/De-serialize from OCRD-ZIP to workspace and back.
    """

    def __init__(self, resolver, strict=False):
        self.resolver = resolver
        self.strict = strict

    def _serialize_bag(self, workspace, bagdir, dest, skip_zip):
        if skip_zip:
            move(bagdir, dest)
        else:
            make_archive(dest.replace('.zip', ''), 'zip', bagdir)

            # Remove temporary bagdir
            rmtree(bagdir)

    def _log_or_raise(self, msg):
        log = getLogger('ocrd.workspace_bagger')
        if self.strict:
            raise Exception(msg)
        else:
            log.info(msg)

    def _bag_mets_files(
        self,
        workspace,
        bagdir,
        ocrd_mets,
        processes,
        include_fileGrp=None,
        exclude_fileGrp=None,
    ):
        mets = workspace.mets
        changed_local_filenames = {}

        log = getLogger('ocrd.workspace_bagger')
        # TODO allow filtering by fileGrp@USE and such

        with pushd_popd(workspace.directory):
            # remember each file's original local_filename/url before rewriting it
            for f in mets.find_files(include_fileGrp=include_fileGrp, exclude_fileGrp=exclude_fileGrp):
                log.info("Bagging OcrdFile %s", f)

                file_grp_dir = Path(bagdir, 'data', f.fileGrp)
                if not file_grp_dir.is_dir():
                    file_grp_dir.mkdir()

                attr = 'local_filename' if f.local_filename else 'url'
                basename = f.basename if f.basename else f"{f.ID}{MIME_TO_EXT.get(f.mimetype, '.xml')}"
                _relpath = join(f.fileGrp, basename)
                self.resolver.download_to_directory(file_grp_dir, getattr(f, attr), basename=basename)
                changed_local_filenames[str(getattr(f, attr))] = _relpath
                f.local_filename = _relpath

            # save mets.xml
            with open(join(bagdir, 'data', ocrd_mets), 'wb') as f:
                f.write(workspace.mets.to_xml())

        # Walk through bagged workspace and fix the PAGE
        # Page/@imageFilename and
        # AlternativeImage/@filename
        bag_workspace = Workspace(self.resolver, directory=join(bagdir, 'data'), mets_basename=ocrd_mets)
        with pushd_popd(bag_workspace.directory):
            for page_file in bag_workspace.mets.find_files(mimetype=MIMETYPE_PAGE):
                pcgts = page_from_file(page_file)
                changed = False
                for old, new in changed_local_filenames.items():
                    if pcgts.get_Page().imageFilename == old:
                        pcgts.get_Page().imageFilename = new
                        changed = True
                    # TODO replace AlternativeImage, recursively...
                if changed:
                    with open(page_file.local_filename, 'w') as out:
                        out.write(to_xml(pcgts))
                    #  log.info("Replace %s -> %s in %s" % (old, new, page_file))

            with pushd_popd(bagdir):
                total_bytes, total_files = make_manifests('data', processes, algorithms=['sha512'])
            log.info("New vs. old: %s" % changed_local_filenames)
        return total_bytes, total_files

    def _set_bag_info(self, bag, total_bytes, total_files, ocrd_identifier, ocrd_base_version_checksum,
                      ocrd_mets=DEFAULT_METS_BASENAME):
        bag.info['BagIt-Profile-Identifier'] = OCRD_BAGIT_PROFILE_URL
        bag.info['Bag-Software-Agent'] = 'ocrd/core %s (bagit.py %s, bagit_profile %s) [cmdline: "%s"]' % (
            VERSION,  # TODO
            dist_version('ocrd-fork-bagit'),
            dist_version('ocrd-fork-bagit_profile'),
            ' '.join(sys.argv))

        bag.info['Ocrd-Identifier'] = ocrd_identifier
        if ocrd_base_version_checksum:
            bag.info['Ocrd-Base-Version-Checksum'] = ocrd_base_version_checksum
        bag.info['Bagging-Date'] = str(datetime.now())
        bag.info['Payload-Oxum'] = '%s.%s' % (total_bytes, total_files)
        if ocrd_mets != DEFAULT_METS_BASENAME:
            bag.info['Ocrd-Mets'] = ocrd_mets

    def bag(self,
            workspace,
            ocrd_identifier,
            dest=None,
            ocrd_mets=DEFAULT_METS_BASENAME,
            ocrd_base_version_checksum=None,
            processes=1,
            skip_zip=False,
            tag_files=None,
            include_fileGrp=None,
            exclude_fileGrp=None,
    ):
        """
        Bag a workspace

        See https://ocr-d.github.com/ocrd_zip#packing-a-workspace-as-ocrd-zip

        Arguments:
            workspace (ocrd.Workspace): workspace to bag
            ocrd_identifier (string): Ocrd-Identifier in bag-info.txt
            dest (string): Path of the generated OCRD-ZIP.
            ocrd_mets (string): Ocrd-Mets in bag-info.txt
            ocrd_base_version_checksum (string): Ocrd-Base-Version-Checksum in bag-info.txt
            processes (integer): Number of parallel processes checksumming
            skip_zip (boolean): Whether to leave directory unzipped
            tag_files (list<string>): Path names of additional tag files to be bagged at the root of the bag
        """
        if tag_files is None:
            tag_files = []

        # create bagdir
        bagdir = mkdtemp(prefix=TMP_BAGIT_PREFIX)

        if dest is None:
            if not skip_zip:
                dest = '%s.ocrd.zip' % workspace.directory
            else:
                dest = '%s.ocrd' % workspace.directory

        log = getLogger('ocrd.workspace_bagger')
        log.info("Bagging %s to %s (temp dir %s)", workspace.directory, dest, bagdir)

        # create data dir
        makedirs(join(bagdir, 'data'))

        # create bagit.txt
        with open(join(bagdir, 'bagit.txt'), 'wb') as f:
            f.write(BAGIT_TXT.encode('utf-8'))

        # create manifests
        total_bytes, total_files = self._bag_mets_files(workspace, bagdir, ocrd_mets, processes,
                                                        include_fileGrp, exclude_fileGrp)

        # create bag-info.txt
        bag = Bag(bagdir)
        self._set_bag_info(bag, total_bytes, total_files, ocrd_identifier, ocrd_base_version_checksum, ocrd_mets=ocrd_mets)

        for tag_file in tag_files:
            copyfile(tag_file, join(bagdir, os_path_basename(tag_file)))

        # save bag
        bag.save()

        # ZIP it
        self._serialize_bag(workspace, bagdir, dest, skip_zip)

        log.info('Created bag at %s', dest)
        return dest

    def spill(self, src, dest):
        """
        Spill an OCRD-ZIP, i.e. unpack it and turn it back into a workspace.

        See https://ocr-d.github.com/ocrd_zip#unpacking-ocrd-zip-to-a-workspace

        Arguments:
            src (string): Path to OCRD-ZIP
            dest (string): Path to directory to unpack data folder to
        """
        log = getLogger('ocrd.workspace_bagger')

        if exists(dest) and not isdir(dest):
            raise Exception("Not a directory: %s" % dest)

        # If dest is an existing directory, try to derive its name from src
        if isdir(dest):
            workspace_name = re.sub(r'(\.ocrd)?\.zip$', '', os_path_basename(src))
            new_dest = join(dest, workspace_name)
            if exists(new_dest):
                raise Exception("Directory exists: %s" % new_dest)
            dest = new_dest

        log.info("Spilling %s to %s", src, dest)

        bagdir = mkdtemp(prefix=TMP_BAGIT_PREFIX)
        unzip_file_to_dir(src, bagdir)
        bag_info = _load_tag_file(join(bagdir, "bag-info.txt"))

        datadir = join(bagdir, 'data')
        for root, _, files in walk(datadir):
            for f in files:
                srcfile = join(root, f)
                destdir = join(dest, relpath(root, datadir))
                destfile = join(destdir, f)
                if not exists(destdir):
                    makedirs(destdir)
                log.debug("Copy %s -> %s", srcfile, destfile)
                copyfile(srcfile, destfile)

        # TODO copy allowed tag files if present

        # TODO validate bagit

        # Drop tempdir
        rmtree(bagdir)

        # Create workspace
        mets_basename = bag_info.get("Ocrd-Mets", DEFAULT_METS_BASENAME)
        workspace = Workspace(self.resolver, directory=dest, mets_basename=mets_basename)

        # TODO validate workspace

        return workspace

    def validate(self, bag):
        """
        Validate conformance with BagIt and OCR-D bagit profile.

        See:
            - https://ocr-d.github.io/ocrd_zip
            - https://ocr-d.github.io/bagit-profile.json
            - https://ocr-d.github.io/bagit-profile.yml
        """
        pass

    def recreate_checksums(self, src, dest=None, overwrite=False):
        """
        (Re)creates the files containing the checksums of a bag

        This function uses bagit.py to create new files: manifest-sha512.txt and
        tagmanifest-sha512.txt for the bag. Also 'Payload-Oxum' in bag-info.txt will be set to the
        appropriate value.

        Arguments:
            src (string):    Path to bag. May be a zipped or unzipped bag
            dest (string):   Path to where the result should be stored. Not needed if overwrite is
                             set
            overwrite (bool): Replace bag with newly created bag
        """
        if overwrite and dest:
            raise Exception("Setting 'dest' and 'overwrite' is a contradiction")
        if not overwrite and not dest:
            raise Exception("For checksum recreation 'dest' must be provided")
        src_path = Path(src)
        if not src_path.exists():
            raise Exception("Path to bag not existing")
        is_zipped = src_path.is_file()

        with TemporaryDirectory() as tempdir:
            if is_zipped:
                unzip_file_to_dir(src, tempdir)
                path_to_bag = Path(tempdir)
                if not path_to_bag.joinpath("data").exists():
                    raise FileNotFoundError("data directory of bag not found")
            else:
                path_to_bag = src_path if overwrite else Path(dest)
                if not src_path.joinpath("data").exists():
                    raise FileNotFoundError(f"data directory of bag not found at {src}")
                if not overwrite:
                    path_to_bag.mkdir(parents=True, exist_ok=True)
                    copytree(src, dest, dirs_exist_ok=True)

            with pushd_popd(path_to_bag):
                n_bytes, n_files = make_manifests("data", 1, ["sha512"])

                bag_infos = _load_tag_file("bag-info.txt")
                bag_infos["Payload-Oxum"] = f"{n_bytes}.{n_files}"
                _make_tag_file("bag-info.txt", bag_infos)
                _make_tagmanifest_file("sha512", ".")

            if is_zipped:
                name = src_path.name
                if name.endswith(".zip"):
                    name = name[:-4]
                zip_path = make_archive(name, "zip", path_to_bag)
                move(zip_path, src if overwrite else dest)
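
For orientation, a minimal usage sketch of the two public entry points, bag() and spill(), based on the signatures above; the paths and the Ocrd-Identifier are made up for illustration.

from ocrd import Resolver
from ocrd.workspace_bagger import WorkspaceBagger

resolver = Resolver()
bagger = WorkspaceBagger(resolver)

# Load an existing workspace (hypothetical path) and pack it into an OCRD-ZIP
workspace = resolver.workspace_from_url('/path/to/workspace/mets.xml')
bagger.bag(workspace, ocrd_identifier='example.org/my-workspace',
           dest='/tmp/my-workspace.ocrd.zip')

# Unpack the OCRD-ZIP back into a plain workspace directory
unpacked = bagger.spill('/tmp/my-workspace.ocrd.zip', '/tmp/unpacked')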
323