|
1
|
|
|
from datetime import datetime |
|
2
|
|
|
from os import makedirs, walk |
|
3
|
|
|
from os.path import join, isdir, basename as os_path_basename, exists, relpath |
|
4
|
|
|
from pathlib import Path |
|
5
|
|
|
from shutil import make_archive, rmtree, copyfile, move, copytree |
|
6
|
|
|
from tempfile import mkdtemp, TemporaryDirectory |
|
7
|
|
|
import re |
|
8
|
|
|
import tempfile |
|
9
|
|
|
import sys |
|
10
|
|
|
from bagit import ( |
|
11
|
|
|
Bag, |
|
12
|
|
|
make_manifests, |
|
13
|
|
|
_load_tag_file, _make_tag_file, _make_tagmanifest_file, # pylint: disable=no-name-in-module |
|
14
|
|
|
) |
|
15
|
|
|
|
|
16
|
|
|
from ocrd_utils import ( |
|
17
|
|
|
pushd_popd, |
|
18
|
|
|
getLogger, |
|
19
|
|
|
MIME_TO_EXT, |
|
20
|
|
|
unzip_file_to_dir, |
|
21
|
|
|
DEFAULT_METS_BASENAME, |
|
22
|
|
|
MIMETYPE_PAGE, |
|
23
|
|
|
VERSION, |
|
24
|
|
|
dist_version, |
|
25
|
|
|
) |
|
26
|
|
|
from ocrd_validators.constants import BAGIT_TXT, TMP_BAGIT_PREFIX, OCRD_BAGIT_PROFILE_URL |
|
27
|
|
|
from ocrd_modelfactory import page_from_file |
|
28
|
|
|
from ocrd_models.ocrd_page import to_xml |
|
29
|
|
|
|
|
30
|
|
|
from .workspace import Workspace |
|
31
|
|
|
|
|
32
|
|
|
# TODO hard-coded
# Single place for the temp root used by both settings below.
_TMP_ROOT = '/tmp'

# Make every tempfile.* call in this process create its entries below the temp root.
tempfile.tempdir = _TMP_ROOT

# Backup location derived from the same temp root and the bagit temp prefix.
BACKUPDIR = join(_TMP_ROOT, TMP_BAGIT_PREFIX + 'backup')
|
35
|
|
|
|
|
36
|
|
|
|
|
37
|
|
|
class WorkspaceBagger(): |
|
38
|
|
|
""" |
|
39
|
|
|
Serialize/De-serialize from OCRD-ZIP to workspace and back. |
|
40
|
|
|
""" |
|
41
|
|
|
|
|
42
|
|
|
def __init__(self, resolver, strict=False): |
|
43
|
|
|
self.resolver = resolver |
|
44
|
|
|
self.strict = strict |
|
45
|
|
|
|
|
46
|
|
|
def _serialize_bag(self, workspace, bagdir, dest, skip_zip): |
|
47
|
|
|
if skip_zip: |
|
48
|
|
|
move(bagdir, dest) |
|
49
|
|
|
else: |
|
50
|
|
|
make_archive(dest.replace('.zip', ''), 'zip', bagdir) |
|
51
|
|
|
|
|
52
|
|
|
# Remove temporary bagdir |
|
53
|
|
|
rmtree(bagdir) |
|
54
|
|
|
|
|
55
|
|
|
def _log_or_raise(self, msg):
    """
    Report a problem according to ``self.strict``.

    Raises ``Exception(msg)`` in strict mode; otherwise logs ``msg`` at
    INFO level on the 'ocrd.workspace_bagger' logger.
    """
    logger = getLogger('ocrd.workspace_bagger')
    if not self.strict:
        logger.info(msg)
        return
    raise Exception(msg)
|
61
|
|
|
|
|
62
|
|
|
def _bag_mets_files(
    self,
    workspace,
    bagdir,
    ocrd_mets,
    processes,
    include_fileGrp=None,
    exclude_fileGrp=None,
):
    """
    Copy the files listed in the workspace's METS into ``<bagdir>/data``,
    write the METS there and create the payload manifests.

    Each selected file is fetched through ``self.resolver`` into
    ``data/<fileGrp>/`` and its ``local_filename`` is set to the path
    relative to ``data``. Afterwards every PAGE file in the bagged workspace
    whose ``Page/@imageFilename`` equals one of the former locations is
    rewritten to point at the bagged copy.

    Arguments:
        workspace: workspace whose ``mets`` and ``directory`` are bagged
        bagdir (string): bag root; ``<bagdir>/data`` receives the payload
        ocrd_mets (string): basename of the METS file written into ``data``
        processes (integer): passed on to ``make_manifests``
        include_fileGrp: passed on to ``mets.find_files``
        exclude_fileGrp: passed on to ``mets.find_files``

    Returns:
        tuple: ``(total_bytes, total_files)`` as returned by ``make_manifests``
    """
    mets = workspace.mets
    # former location (local_filename or url) -> path relative to data/
    changed_local_filenames = {}

    log = getLogger('ocrd.workspace_bagger')
    # TODO allow filtering by fileGrp@USE and such

    with pushd_popd(workspace.directory):
        for ocrd_file in mets.find_files(include_fileGrp=include_fileGrp, exclude_fileGrp=exclude_fileGrp):
            log.info("Bagging OcrdFile %s", ocrd_file)

            file_grp_dir = Path(bagdir, 'data', ocrd_file.fileGrp)
            file_grp_dir.mkdir(parents=True, exist_ok=True)

            # Prefer the local copy; fall back to the URL if there is none.
            attr = 'local_filename' if ocrd_file.local_filename else 'url'
            source = getattr(ocrd_file, attr)
            if ocrd_file.basename:
                basename = ocrd_file.basename
            else:
                basename = f"{ocrd_file.ID}{MIME_TO_EXT.get(ocrd_file.mimetype, '.xml')}"
            _relpath = join(ocrd_file.fileGrp, basename)
            self.resolver.download_to_directory(file_grp_dir, source, basename=basename)
            changed_local_filenames[str(source)] = _relpath
            ocrd_file.local_filename = _relpath

        # save mets.xml
        with open(join(bagdir, 'data', ocrd_mets), 'wb') as mets_out:
            mets_out.write(workspace.mets.to_xml())

    # Walk through bagged workspace and fix the PAGE
    # Page/@imageFilename and
    # AlternativeImage/@filename
    bag_workspace = Workspace(self.resolver, directory=join(bagdir, 'data'), mets_basename=ocrd_mets)
    with pushd_popd(bag_workspace.directory):
        for page_file in bag_workspace.mets.find_files(mimetype=MIMETYPE_PAGE):
            pcgts = page_from_file(page_file)
            page = pcgts.get_Page()
            # One direct lookup per page: the former location maps to exactly
            # one bagged path. Scanning all entries and re-comparing after
            # each replacement could chain through a second mapping,
            # depending on iteration order.
            new_image_filename = changed_local_filenames.get(page.imageFilename)
            # TODO replace AlternativeImage, recursively...
            if new_image_filename is None:
                continue
            page.imageFilename = new_image_filename
            # Explicit encoding so the output does not depend on the locale default.
            with open(page_file.local_filename, 'w', encoding='utf-8') as out:
                out.write(to_xml(pcgts))

    with pushd_popd(bagdir):
        total_bytes, total_files = make_manifests('data', processes, algorithms=['sha512'])
        log.info("New vs. old: %s", changed_local_filenames)
    return total_bytes, total_files
|
119
|
|
|
|
|
120
|
|
|
def _set_bag_info(self, bag, total_bytes, total_files, ocrd_identifier, ocrd_base_version_checksum,
                  ocrd_mets=DEFAULT_METS_BASENAME):
    """
    Populate ``bag.info`` with the entries describing this bag.

    Always sets BagIt-Profile-Identifier, Bag-Software-Agent, Ocrd-Identifier,
    Bagging-Date and Payload-Oxum. Ocrd-Base-Version-Checksum is only set
    when a truthy value is given, Ocrd-Mets only when ``ocrd_mets`` differs
    from ``DEFAULT_METS_BASENAME``.
    """
    bag.info['BagIt-Profile-Identifier'] = OCRD_BAGIT_PROFILE_URL

    # Record the producing software versions and the command line.
    core_version = VERSION  # TODO
    bagit_version = dist_version('ocrd-fork-bagit')
    profile_version = dist_version('ocrd-fork-bagit_profile')
    cmdline = ' '.join(sys.argv)
    agent_parts = (core_version, bagit_version, profile_version, cmdline)
    bag.info['Bag-Software-Agent'] = 'ocrd/core %s (bagit.py %s, bagit_profile %s) [cmdline: "%s"]' % agent_parts

    bag.info['Ocrd-Identifier'] = ocrd_identifier
    if ocrd_base_version_checksum:
        bag.info['Ocrd-Base-Version-Checksum'] = ocrd_base_version_checksum

    now = datetime.now()
    bag.info['Bagging-Date'] = str(now)
    # Payload-Oxum is "<octet count>.<stream count>"
    bag.info['Payload-Oxum'] = '%s.%s' % (total_bytes, total_files)

    uses_default_mets = ocrd_mets == DEFAULT_METS_BASENAME
    if not uses_default_mets:
        bag.info['Ocrd-Mets'] = ocrd_mets
|
136
|
|
|
|
|
137
|
|
|
def bag(self,
        workspace,
        ocrd_identifier,
        dest=None,
        ocrd_mets=DEFAULT_METS_BASENAME,
        ocrd_base_version_checksum=None,
        processes=1,
        skip_zip=False,
        tag_files=None,
        include_fileGrp=None,
        exclude_fileGrp=None,
        ):
    """
    Bag a workspace

    See https://ocr-d.github.io/ocrd_zip#packing-a-workspace-as-ocrd-zip

    Arguments:
        workspace (ocrd.Workspace): workspace to bag
        ocrd_identifier (string): Ocrd-Identifier in bag-info.txt
        dest (string): Path of the generated OCRD-ZIP. Defaults to the
            workspace directory plus ``.ocrd.zip`` (``.ocrd`` if ``skip_zip``).
        ocrd_mets (string): Ocrd-Mets in bag-info.txt
        ocrd_base_version_checksum (string): Ocrd-Base-Version-Checksum in bag-info.txt
        processes (integer): Number of parallel processes checksumming
        skip_zip (boolean): Whether to leave directory unzipped
        tag_files (list<string>): Path names of additional tag files to be bagged at the root of the bag
        include_fileGrp: passed on to the METS file search to select files to bag
        exclude_fileGrp: passed on to the METS file search to leave files out

    Returns:
        string: ``dest``, the path the bag was written to
    """
    if tag_files is None:
        tag_files = []

    log = getLogger('ocrd.workspace_bagger')

    # create bagdir
    bagdir = mkdtemp(prefix=TMP_BAGIT_PREFIX)

    try:
        if dest is None:
            if not skip_zip:
                dest = '%s.ocrd.zip' % workspace.directory
            else:
                dest = '%s.ocrd' % workspace.directory

        log.info("Bagging %s to %s (temp dir %s)", workspace.directory, dest, bagdir)

        # create data dir
        makedirs(join(bagdir, 'data'))

        # create bagit.txt
        with open(join(bagdir, 'bagit.txt'), 'wb') as f:
            f.write(BAGIT_TXT.encode('utf-8'))

        # create manifests
        total_bytes, total_files = self._bag_mets_files(workspace, bagdir, ocrd_mets, processes,
                                                        include_fileGrp, exclude_fileGrp)

        # create bag-info.txt
        bag = Bag(bagdir)
        self._set_bag_info(bag, total_bytes, total_files, ocrd_identifier, ocrd_base_version_checksum,
                           ocrd_mets=ocrd_mets)

        for tag_file in tag_files:
            copyfile(tag_file, join(bagdir, os_path_basename(tag_file)))

        # save bag
        bag.save()

        # ZIP it
        self._serialize_bag(workspace, bagdir, dest, skip_zip)
    except Exception:
        # mkdtemp leaves cleanup to the caller: do not leak the half-built
        # bag when any step above fails.
        rmtree(bagdir, ignore_errors=True)
        raise

    log.info('Created bag at %s', dest)
    return dest
|
205
|
|
|
|
|
206
|
|
|
def spill(self, src, dest):
    """
    Spill a workspace, i.e. unpack it and turn it into a workspace.

    See https://ocr-d.github.io/ocrd_zip#unpacking-ocrd-zip-to-a-workspace

    Arguments:
        src (string): Path to OCRD-ZIP
        dest (string): Path to directory to unpack data folder to. If it is
            an existing directory, the workspace is created in a new
            sub-directory named after ``src`` without its ``.zip`` /
            ``.ocrd.zip`` suffix.

    Returns:
        Workspace: workspace on the unpacked ``data`` folder, using the METS
        basename from the bag's ``Ocrd-Mets`` entry (or the default)

    Raises:
        Exception: if ``dest`` exists but is not a directory, or if the
            sub-directory derived from ``src`` already exists
    """
    log = getLogger('ocrd.workspace_bagger')

    if exists(dest) and not isdir(dest):
        raise Exception("Not a directory: %s" % dest)

    # If dest is an existing directory, try to derive its name from src
    if isdir(dest):
        workspace_name = re.sub(r'(\.ocrd)?\.zip$', '', os_path_basename(src))
        new_dest = join(dest, workspace_name)
        if exists(new_dest):
            raise Exception("Directory exists: %s" % new_dest)
        dest = new_dest

    log.info("Spilling %s to %s", src, dest)

    bagdir = mkdtemp(prefix=TMP_BAGIT_PREFIX)
    try:
        unzip_file_to_dir(src, bagdir)
        bag_info = _load_tag_file(join(bagdir, "bag-info.txt"))

        datadir = join(bagdir, 'data')
        for root, _, files in walk(datadir):
            if not files:
                # Directories are only created when they receive a file.
                continue
            destdir = join(dest, relpath(root, datadir))
            makedirs(destdir, exist_ok=True)
            for name in files:
                srcfile = join(root, name)
                destfile = join(destdir, name)
                log.debug("Copy %s -> %s", srcfile, destfile)
                copyfile(srcfile, destfile)

        # TODO copy allowed tag files if present

        # TODO validate bagit
    finally:
        # Drop tempdir, also when unpacking or copying failed
        rmtree(bagdir, ignore_errors=True)

    # Create workspace
    mets_basename = bag_info.get("Ocrd-Mets", DEFAULT_METS_BASENAME)
    workspace = Workspace(self.resolver, directory=dest, mets_basename=mets_basename)

    # TODO validate workspace

    return workspace
|
260
|
|
|
|
|
261
|
|
|
def validate(self, bag):
    """
    Validate conformance with BagIt and OCR-D bagit profile.

    Placeholder: the body is empty, so nothing is checked and ``None`` is
    returned for any ``bag``.

    See:
        - https://ocr-d.github.io/ocrd_zip
        - https://ocr-d.github.io/bagit-profile.json
        - https://ocr-d.github.io/bagit-profile.yml
    """
    return None
|
271
|
|
|
|
|
272
|
|
|
def recreate_checksums(self, src, dest=None, overwrite=False):
    """
    (Re)creates the files containing the checksums of a bag

    This function uses bagit to create new files: manifest-sha512.txt and
    tagmanifest-sha512.txt for the bag. Also 'Payload-Oxum' in bag-info.txt will be set to the
    appropriate value.

    Arguments:
        src (string): Path to Bag. May be a zipped or unzipped bagit
        dest (string): Path to where the result should be stored. Not needed if overwrite is
            set
        overwrite (bool): Replace bag with newly created bag

    Raises:
        Exception: if both ``dest`` and ``overwrite`` are set, if neither is
            set, or if ``src`` does not exist
        FileNotFoundError: if the bag has no ``data`` directory
    """
    if overwrite and dest:
        raise Exception("Setting 'dest' and 'overwrite' is a contradiction")
    if not overwrite and not dest:
        raise Exception("For checksum recreation 'dest' must be provided")
    src_path = Path(src)
    if not src_path.exists():
        raise Exception("Path to bag not existing")
    # A regular file is taken to be a zipped bag, anything else a bag directory.
    is_zipped = src_path.is_file()

    with TemporaryDirectory() as tempdir:
        if is_zipped:
            unzip_file_to_dir(src, tempdir)
            path_to_bag = Path(tempdir)
            if not path_to_bag.joinpath("data").exists():
                raise FileNotFoundError("data directory of bag not found")
        else:
            path_to_bag = src_path if overwrite else Path(dest)
            if not src_path.joinpath("data").exists():
                raise FileNotFoundError(f"data directory of bag not found at {src}")
            if not overwrite:
                # Work on a copy so the source bag stays untouched.
                path_to_bag.mkdir(parents=True, exist_ok=True)
                copytree(src, dest, dirs_exist_ok=True)

        with pushd_popd(path_to_bag):
            n_bytes, n_files = make_manifests("data", 1, ["sha512"])

            bag_infos = _load_tag_file("bag-info.txt")
            bag_infos["Payload-Oxum"] = f"{n_bytes}.{n_files}"
            _make_tag_file("bag-info.txt", bag_infos)
            _make_tagmanifest_file("sha512", ".")

        if is_zipped:
            name = src_path.name
            if name.endswith(".zip"):
                name = name[:-4]
            # Build the archive in its own scratch directory. A bare relative
            # base name would create it in the current working directory,
            # which may be read-only or may already hold a file of that name
            # (possibly ``src`` itself) that would be overwritten.
            with TemporaryDirectory() as archive_dir:
                zip_path = make_archive(str(Path(archive_dir, name)), "zip", path_to_bag)
                move(zip_path, src if overwrite else dest)
|
323
|
|
|
|