1
|
|
|
from datetime import datetime |
2
|
|
|
from os import makedirs, walk |
3
|
|
|
from os.path import join, isdir, basename as os_path_basename, exists, relpath |
4
|
|
|
from pathlib import Path |
5
|
|
|
from shutil import make_archive, rmtree, copyfile, move, copytree |
6
|
|
|
from tempfile import mkdtemp, TemporaryDirectory |
7
|
|
|
import re |
8
|
|
|
import tempfile |
9
|
|
|
import sys |
10
|
|
|
from bagit import ( |
11
|
|
|
Bag, |
12
|
|
|
make_manifests, |
13
|
|
|
_load_tag_file, _make_tag_file, _make_tagmanifest_file, # pylint: disable=no-name-in-module |
14
|
|
|
) |
15
|
|
|
|
16
|
|
|
from ocrd_utils import ( |
17
|
|
|
pushd_popd, |
18
|
|
|
getLogger, |
19
|
|
|
MIME_TO_EXT, |
20
|
|
|
unzip_file_to_dir, |
21
|
|
|
DEFAULT_METS_BASENAME, |
22
|
|
|
MIMETYPE_PAGE, |
23
|
|
|
VERSION, |
24
|
|
|
dist_version, |
25
|
|
|
) |
26
|
|
|
from ocrd_validators.constants import BAGIT_TXT, TMP_BAGIT_PREFIX, OCRD_BAGIT_PROFILE_URL |
27
|
|
|
from ocrd_modelfactory import page_from_file |
28
|
|
|
from ocrd_models.ocrd_page import to_xml |
29
|
|
|
|
30
|
|
|
from .workspace import Workspace |
31
|
|
|
|
32
|
|
|
# TODO hard-coded: pins the default directory used by the tempfile module to /tmp
tempfile.tempdir = '/tmp'

# NOTE(review): presumably the location for bag backups; not referenced in the visible code
BACKUPDIR = join('/tmp', f'{TMP_BAGIT_PREFIX}backup')
35
|
|
|
|
36
|
|
|
|
37
|
|
|
class WorkspaceBagger(): |
38
|
|
|
""" |
39
|
|
|
Serialize/De-serialize from OCRD-ZIP to workspace and back. |
40
|
|
|
""" |
41
|
|
|
|
42
|
|
|
def __init__(self, resolver, strict=False): |
43
|
|
|
self.resolver = resolver |
44
|
|
|
self.strict = strict |
45
|
|
|
|
46
|
|
|
def _serialize_bag(self, workspace, bagdir, dest, skip_zip): |
47
|
|
|
if skip_zip: |
48
|
|
|
move(bagdir, dest) |
49
|
|
|
else: |
50
|
|
|
make_archive(dest.replace('.zip', ''), 'zip', bagdir) |
51
|
|
|
|
52
|
|
|
# Remove temporary bagdir |
53
|
|
|
rmtree(bagdir) |
54
|
|
|
|
55
|
|
|
def _log_or_raise(self, msg): |
56
|
|
|
log = getLogger('ocrd.workspace_bagger') |
57
|
|
|
if self.strict: |
58
|
|
|
raise Exception(msg) |
59
|
|
|
else: |
60
|
|
|
log.info(msg) |
61
|
|
|
|
62
|
|
|
def _bag_mets_files(
    self,
    workspace,
    bagdir,
    ocrd_mets,
    processes,
    include_fileGrp=None,
    exclude_fileGrp=None,
):
    """
    Copy the files referenced by the workspace METS into the bag payload,
    write the METS next to them and create the payload manifests.

    Note that the ``local_filename`` of every bagged file is rewritten on the
    in-memory METS of ``workspace``.

    Arguments:
        workspace (ocrd.Workspace): workspace whose files are bagged
        bagdir (string): root directory of the bag under construction
        ocrd_mets (string): basename of the METS file written to ``<bagdir>/data``
        processes (integer): number of processes passed on to ``make_manifests``
        include_fileGrp: passed to ``mets.find_files`` to select files
        exclude_fileGrp: passed to ``mets.find_files`` to deselect files

    Returns:
        tuple: ``(total_bytes, total_files)`` as returned by ``make_manifests``
    """
    mets = workspace.mets
    # Maps the previous location (local_filename or url) to the path inside data/
    changed_local_filenames = {}

    log = getLogger('ocrd.workspace_bagger')
    # TODO allow filtering by fileGrp@USE and such

    with pushd_popd(workspace.directory):
        # local_filenames of the files before changing
        for f in mets.find_files(include_fileGrp=include_fileGrp, exclude_fileGrp=exclude_fileGrp):
            log.info("Bagging OcrdFile %s", f)

            file_grp_dir = Path(bagdir, 'data', f.fileGrp)
            file_grp_dir.mkdir(parents=True, exist_ok=True)

            attr = 'local_filename' if f.local_filename else 'url'
            basename = f.basename if f.basename else f"{f.ID}{MIME_TO_EXT.get(f.mimetype, '.xml')}"
            _relpath = join(f.fileGrp, basename)
            self.resolver.download_to_directory(file_grp_dir, getattr(f, attr), basename=basename)
            changed_local_filenames[str(getattr(f, attr))] = _relpath
            f.local_filename = _relpath

    # save mets.xml
    with open(join(bagdir, 'data', ocrd_mets), 'wb') as mets_out:
        mets_out.write(workspace.mets.to_xml())

    # Walk through bagged workspace and fix the PAGE
    # Page/@imageFilename and
    # AlternativeImage/@filename
    bag_workspace = Workspace(self.resolver, directory=join(bagdir, 'data'), mets_basename=ocrd_mets)
    with pushd_popd(bag_workspace.directory):
        for page_file in bag_workspace.mets.find_files(mimetype=MIMETYPE_PAGE):
            pcgts = page_from_file(page_file)
            page = pcgts.get_Page()
            # One dict lookup instead of scanning every renamed file; this also
            # guarantees a filename is remapped at most once.
            if page.imageFilename in changed_local_filenames:
                page.imageFilename = changed_local_filenames[page.imageFilename]
                # TODO replace AlternativeImage, recursively...
                # Write explicitly as UTF-8 rather than in the locale's encoding
                with open(page_file.local_filename, 'w', encoding='utf-8') as out:
                    out.write(to_xml(pcgts))

    with pushd_popd(bagdir):
        total_bytes, total_files = make_manifests('data', processes, algorithms=['sha512'])
        log.info("New vs. old: %s", changed_local_filenames)
    return total_bytes, total_files
119
|
|
|
|
120
|
|
|
def _set_bag_info(self, bag, total_bytes, total_files, ocrd_identifier, ocrd_base_version_checksum,
                  ocrd_mets=DEFAULT_METS_BASENAME):
    """
    Fill the ``bag-info.txt`` metadata of a bag.

    Arguments:
        bag: object whose ``info`` mapping receives the entries
        total_bytes: payload size, first half of ``Payload-Oxum``
        total_files: payload file count, second half of ``Payload-Oxum``
        ocrd_identifier (string): value for ``Ocrd-Identifier``
        ocrd_base_version_checksum (string): value for ``Ocrd-Base-Version-Checksum``;
            the entry is omitted when this is falsy
        ocrd_mets (string): value for ``Ocrd-Mets``; only written when it differs
            from ``DEFAULT_METS_BASENAME``
    """
    info = bag.info
    info['BagIt-Profile-Identifier'] = OCRD_BAGIT_PROFILE_URL

    # Software versions plus the invoking command line
    agent_fields = (
        VERSION,  # TODO
        dist_version('ocrd-fork-bagit'),
        dist_version('ocrd-fork-bagit_profile'),
        ' '.join(sys.argv),
    )
    info['Bag-Software-Agent'] = 'ocrd/core %s (bagit.py %s, bagit_profile %s) [cmdline: "%s"]' % agent_fields

    info['Ocrd-Identifier'] = ocrd_identifier
    if ocrd_base_version_checksum:
        info['Ocrd-Base-Version-Checksum'] = ocrd_base_version_checksum
    info['Bagging-Date'] = str(datetime.now())
    info['Payload-Oxum'] = '%s.%s' % (total_bytes, total_files)
    if ocrd_mets != DEFAULT_METS_BASENAME:
        info['Ocrd-Mets'] = ocrd_mets
136
|
|
|
|
137
|
|
|
def bag(self,
        workspace,
        ocrd_identifier,
        dest=None,
        ocrd_mets=DEFAULT_METS_BASENAME,
        ocrd_base_version_checksum=None,
        processes=1,
        skip_zip=False,
        tag_files=None,
        include_fileGrp=None,
        exclude_fileGrp=None,
       ):
    """
    Bag a workspace

    See https://ocr-d.github.com/ocrd_zip#packing-a-workspace-as-ocrd-zip

    Arguments:
        workspace (ocrd.Workspace): workspace to bag
        ocrd_identifier (string): Ocrd-Identifier in bag-info.txt
        dest (string): Path of the generated OCRD-ZIP. Defaults to the workspace
            directory with ``.ocrd.zip`` appended (``.ocrd`` if ``skip_zip``).
        ocrd_mets (string): Ocrd-Mets in bag-info.txt
        ocrd_base_version_checksum (string): Ocrd-Base-Version-Checksum in bag-info.txt
        processes (integer): Number of parallel processes checksumming
        skip_zip (boolean): Whether to leave directory unzipped
        tag_files (list<string>): Path names of additional tag files to be bagged at the root of the bag
        include_fileGrp: file groups to include, forwarded to ``_bag_mets_files``
        exclude_fileGrp: file groups to exclude, forwarded to ``_bag_mets_files``

    Returns:
        string: ``dest``, the path the bag was written to
    """
    if tag_files is None:
        tag_files = []

    if dest is None:
        if not skip_zip:
            dest = '%s.ocrd.zip' % workspace.directory
        else:
            dest = '%s.ocrd' % workspace.directory

    log = getLogger('ocrd.workspace_bagger')

    # create bagdir
    bagdir = mkdtemp(prefix=TMP_BAGIT_PREFIX)
    log.info("Bagging %s to %s (temp dir %s)", workspace.directory, dest, bagdir)

    try:
        # create data dir
        makedirs(join(bagdir, 'data'))

        # create bagit.txt
        with open(join(bagdir, 'bagit.txt'), 'wb') as f:
            f.write(BAGIT_TXT.encode('utf-8'))

        # create manifests
        total_bytes, total_files = self._bag_mets_files(workspace, bagdir, ocrd_mets, processes,
                                                        include_fileGrp, exclude_fileGrp)

        # create bag-info.txt
        bag = Bag(bagdir)
        self._set_bag_info(bag, total_bytes, total_files, ocrd_identifier, ocrd_base_version_checksum,
                           ocrd_mets=ocrd_mets)

        for tag_file in tag_files:
            copyfile(tag_file, join(bagdir, os_path_basename(tag_file)))

        # save bag
        bag.save()

        # ZIP it
        self._serialize_bag(workspace, bagdir, dest, skip_zip)
    except BaseException:
        # Do not leave the temporary bag directory behind when bagging fails
        # half-way. It may already be gone (moved/removed), hence ignore_errors.
        rmtree(bagdir, ignore_errors=True)
        raise

    log.info('Created bag at %s', dest)
    return dest
205
|
|
|
|
206
|
|
|
def spill(self, src, dest):
    """
    Spill a workspace, i.e. unpack it and turn it into a workspace.

    See https://ocr-d.github.com/ocrd_zip#unpacking-ocrd-zip-to-a-workspace

    Arguments:
        src (string): Path to OCRD-ZIP
        dest (string): Path to directory to unpack data folder to. If it is an
            existing directory, the workspace is created in a subdirectory
            named after ``src`` (without ``.ocrd.zip``/``.zip``).

    Returns:
        Workspace: workspace rooted at the unpacked data folder

    Raises:
        Exception: if ``dest`` exists but is not a directory, or if the
            derived subdirectory already exists
    """
    log = getLogger('ocrd.workspace_bagger')

    if exists(dest) and not isdir(dest):
        raise Exception("Not a directory: %s" % dest)

    # If dest is an existing directory, try to derive its name from src
    if isdir(dest):
        workspace_name = re.sub(r'(\.ocrd)?\.zip$', '', os_path_basename(src))
        new_dest = join(dest, workspace_name)
        if exists(new_dest):
            raise Exception("Directory exists: %s" % new_dest)
        dest = new_dest

    log.info("Spilling %s to %s", src, dest)

    bagdir = mkdtemp(prefix=TMP_BAGIT_PREFIX)
    try:
        unzip_file_to_dir(src, bagdir)
        bag_info = _load_tag_file(join(bagdir, "bag-info.txt"))

        datadir = join(bagdir, 'data')
        for root, _, files in walk(datadir):
            if not files:
                # Only directories that actually hold files are recreated
                continue
            destdir = join(dest, relpath(root, datadir))
            makedirs(destdir, exist_ok=True)
            for f in files:
                srcfile = join(root, f)
                destfile = join(destdir, f)
                log.debug("Copy %s -> %s", srcfile, destfile)
                copyfile(srcfile, destfile)

        # TODO copy allowed tag files if present

        # TODO validate bagit
    finally:
        # Drop tempdir, also when unpacking or copying failed
        rmtree(bagdir)

    # Create workspace
    mets_basename = bag_info.get("Ocrd-Mets", DEFAULT_METS_BASENAME)
    workspace = Workspace(self.resolver, directory=dest, mets_basename=mets_basename)

    # TODO validate workspace

    return workspace
260
|
|
|
|
261
|
|
|
def validate(self, bag):
    """
    Validate conformance with BagIt and OCR-D bagit profile.

    Placeholder: performs no checks yet, ignores ``bag`` and returns ``None``.

    See:
        - https://ocr-d.github.io/ocrd_zip
        - https://ocr-d.github.io/bagit-profile.json
        - https://ocr-d.github.io/bagit-profile.yml
    """
271
|
|
|
|
272
|
|
|
def recreate_checksums(self, src, dest=None, overwrite=False):
    """
    (Re)creates the files containing the checksums of a bag

    This function uses bagit.py to create new files: manifest-sha512.txt and
    tagmanifest-sha512.txt for the bag. Also 'Payload-Oxum' in bag-info.txt will be set to the
    appropriate value.

    Arguments:
        src (string): Path to Bag. May be a zipped or unzipped bagit
        dest (string): Path to where the result should be stored. Not needed if overwrite is
            set
        overwrite (bool): Replace bag with newly created bag

    Raises:
        Exception: if both or neither of ``dest`` and ``overwrite`` are given,
            or if ``src`` does not exist
        FileNotFoundError: if the bag has no ``data`` directory
    """
    if overwrite and dest:
        raise Exception("Setting 'dest' and 'overwrite' is a contradiction")
    if not overwrite and not dest:
        raise Exception("For checksum recreation 'dest' must be provided")
    src_path = Path(src)
    if not src_path.exists():
        raise Exception("Path to bag not existing")
    # A regular file is treated as a zipped bag, anything else as a bag directory
    is_zipped = src_path.is_file()

    with TemporaryDirectory() as tempdir:
        if is_zipped:
            unzip_file_to_dir(src, tempdir)
            path_to_bag = Path(tempdir)
            if not path_to_bag.joinpath("data").exists():
                raise FileNotFoundError("data directory of bag not found")
        else:
            path_to_bag = src_path if overwrite else Path(dest)
            if not src_path.joinpath("data").exists():
                raise FileNotFoundError(f"data directory of bag not found at {src}")
            if not overwrite:
                # Work on a copy so the source bag stays untouched
                path_to_bag.mkdir(parents=True, exist_ok=True)
                copytree(src, dest, dirs_exist_ok=True)

        with pushd_popd(path_to_bag):
            n_bytes, n_files = make_manifests("data", 1, ["sha512"])

            bag_infos = _load_tag_file("bag-info.txt")
            bag_infos["Payload-Oxum"] = f"{n_bytes}.{n_files}"
            _make_tag_file("bag-info.txt", bag_infos)
            _make_tagmanifest_file("sha512", ".")

        if is_zipped:
            name = src_path.name
            if name.endswith(".zip"):
                name = name[:-4]
            # Build the archive in a scratch directory of its own rather than
            # under a relative name in the caller's working directory, where it
            # could clobber an unrelated file or fail on a read-only cwd.
            with TemporaryDirectory() as scratch:
                zip_path = make_archive(join(scratch, name), "zip", path_to_bag)
                move(zip_path, src if overwrite else dest)
323
|
|
|
|