Passed
Push — master ( 407c02...ff81c6 )
by Konstantin
02:41
created

ocrd.cli.workspace.WorkspaceCtx.workspace()   A

Complexity

Conditions 1

Size

Total Lines 7
Code Lines 7

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 7
dl 0
loc 7
rs 10
c 0
b 0
f 0
cc 1
nop 1
1
"""
2
OCR-D CLI: workspace management
3
4
.. click:: ocrd.cli.workspace:workspace_cli
5
    :prog: ocrd workspace
6
    :nested: full
7
"""
8
import os
9
from os import getcwd, rmdir, unlink
10
from os.path import dirname, relpath, normpath, exists, join, isabs, isdir
11
from pathlib import Path
12
from json import loads, dumps
13
import sys
14
from glob import glob   # XXX pathlib.Path.glob does not support absolute globs
15
import re
16
import time
17
import numpy as np
18
19
import click
20
21
from ocrd import Resolver, Workspace, WorkspaceValidator, WorkspaceBackupManager
22
from ocrd.mets_server import OcrdMetsServer
23
from ocrd_utils import getLogger, initLogging, pushd_popd, EXT_TO_MIME, safe_filename, parse_json_string_or_file, partition_list, DEFAULT_METS_BASENAME
24
from ocrd.decorators import mets_find_options
25
from . import command_with_replaced_help
26
from ocrd_models.constants import METS_PAGE_DIV_ATTRIBUTE
27
28
29
class WorkspaceCtx:
    """
    Shared state for the ``ocrd workspace`` subcommands.

    Keeps a resolver, the resolved workspace location arguments and the
    backup flag, and builds workspace objects from them on request.
    """

    def __init__(self, directory, mets_url, mets_basename=DEFAULT_METS_BASENAME, mets_server_url=None, automatic_backup=False):
        """
        Resolve and store the workspace location arguments.

        Logs a deprecation warning whenever ``mets_basename`` is truthy.
        """
        self.log = getLogger('ocrd.cli.workspace')
        if mets_basename:
            self.log.warning(DeprecationWarning('--mets-basename is deprecated. Use --mets/--directory instead.'))
        self.resolver = Resolver()
        # the resolver hands back the four location values in this order
        resolved = self.resolver.resolve_mets_arguments(directory, mets_url, mets_basename, mets_server_url)
        (self.directory, self.mets_url, self.mets_basename, self.mets_server_url) = resolved
        self.automatic_backup = automatic_backup

    def workspace(self):
        """Build a new Workspace from the stored settings."""
        settings = dict(
            directory=self.directory,
            mets_basename=self.mets_basename,
            automatic_backup=self.automatic_backup,
            mets_server_url=self.mets_server_url,
        )
        return Workspace(self.resolver, **settings)

    def backup_manager(self):
        """Build a WorkspaceBackupManager around a freshly built workspace."""
        current = self.workspace()
        return WorkspaceBackupManager(current)
50
51
52
# Decorator handing the WorkspaceCtx object of the click context to a subcommand as first argument
pass_workspace = click.make_pass_decorator(WorkspaceCtx)
53
54
# ----------------------------------------------------------------------
55
# ocrd workspace
56
# ----------------------------------------------------------------------
57
58
@click.group("workspace")
59
@click.option('-d', '--directory', envvar='WORKSPACE_DIR', type=click.Path(file_okay=False), metavar='WORKSPACE_DIR', help='Changes the workspace folder location [default: METS_URL directory or .]"')
60
@click.option('-M', '--mets-basename', default=None, help='METS file basename. Deprecated, use --mets/--directory')
61
@click.option('-m', '--mets', default=None, help='The path/URL of the METS file [default: WORKSPACE_DIR/mets.xml]', metavar="METS_URL")
62
@click.option('-U', '--mets-server-url', 'mets_server_url', help="TCP host URI or UDS path of METS server")
63
@click.option('--backup', default=False, help="Backup mets.xml whenever it is saved.", is_flag=True)
64
@click.pass_context
65
def workspace_cli(ctx, directory, mets, mets_basename, mets_server_url, backup):
66
    """
67
    Managing workspaces
68
69
    A workspace comprises a METS file and a directory as point of reference.
70
71
    Operates on the file system directly or via a METS server 
72
    (already running via some prior `server start` subcommand).
73
    """
74
    initLogging()
75
    ctx.obj = WorkspaceCtx(
76
        directory,
77
        mets_url=mets,
78
        mets_basename=mets_basename,
79
        mets_server_url=mets_server_url,
80
        automatic_backup=backup
81
    )
82
83
# ----------------------------------------------------------------------
84
# ocrd workspace validate
85
# ----------------------------------------------------------------------
86
87
@workspace_cli.command('validate', cls=command_with_replaced_help(
    (r' \[METS_URL\]', ''))) # XXX deprecated argument
@pass_workspace
@click.option('-a', '--download', is_flag=True, help="Download all files")
@click.option('-s', '--skip', help="Tests to skip", default=[], multiple=True, type=click.Choice(
    ['imagefilename', 'dimension', 'pixel_density', 'page', 'url', 'page_xsd', 'mets_fileid_page_pcgtsid',
     'mets_unique_identifier', 'mets_file_group_names', 'mets_files', 'mets_xsd']))
@click.option('--page-textequiv-consistency', '--page-strictness', help="How strict to check PAGE multi-level textequiv consistency", type=click.Choice(['strict', 'lax', 'fix', 'off']), default='strict')
@click.option('--page-coordinate-consistency', help="How fierce to check PAGE multi-level coordinate consistency", type=click.Choice(['poly', 'baseline', 'both', 'off']), default='poly')
@click.argument('mets_url', default=None, required=False)
def workspace_validate(ctx, mets_url, download, skip, page_textequiv_consistency, page_coordinate_consistency):
    """
    Validate a workspace

    METS_URL can be a URL, an absolute path or a path relative to $PWD.
    If not given, use --mets accordingly.

    Check that the METS and its referenced file contents
    abide by the OCR-D specifications.
    """
    LOG = getLogger('ocrd.cli.workspace.validate')
    if mets_url:
        # The positional argument is deprecated; point users at the
        # equivalent invocation of *this* subcommand.
        LOG.warning(DeprecationWarning("Use 'ocrd workspace --mets METS validate' instead of argument 'METS_URL' ('%s')" % mets_url))
    else:
        mets_url = ctx.mets_url
    report = WorkspaceValidator.validate(
        ctx.resolver,
        mets_url,
        src_dir=ctx.directory,
        skip=skip,
        download=download,
        page_strictness=page_textequiv_consistency,
        page_coordinate_consistency=page_coordinate_consistency
    )
    print(report.to_xml())
    # A failed validation is signalled to the shell via exit status 128
    if not report.is_valid:
        sys.exit(128)
124
125
# ----------------------------------------------------------------------
126
# ocrd workspace clone
127
# ----------------------------------------------------------------------
128
129
@workspace_cli.command('clone', cls=command_with_replaced_help(
    (r' \[WORKSPACE_DIR\]', ''))) # XXX deprecated argument
@click.option('-f', '--clobber-mets', help="Overwrite existing METS file", default=False, is_flag=True)
@click.option('-a', '--download', is_flag=True, help="Download all selected files and add local path references in METS file afterwards")
@click.argument('mets_url')
@mets_find_options
# XXX deprecated
@click.argument('workspace_dir', default=None, required=False)
@pass_workspace
def workspace_clone(ctx, clobber_mets, download, file_grp, file_id, page_id, mimetype, include_fileGrp, exclude_fileGrp, mets_url, workspace_dir):
    """
    Create a workspace from METS_URL and return the directory

    METS_URL can be a URL, an absolute path or a path relative to $PWD.
    METS_URL can also be an OAI-PMH GetRecord URL wrapping a METS file.

    Additional options pertain to the selection of files / fileGrps / pages
    to be downloaded, if --download is used.
    """
    logger = getLogger('ocrd.cli.workspace.clone')
    if workspace_dir:
        # deprecated positional argument still overrides --directory
        deprecation = DeprecationWarning("Use 'ocrd workspace --directory DIR clone' instead of argument 'WORKSPACE_DIR' ('%s')" % workspace_dir)
        logger.warning(deprecation)
        ctx.directory = workspace_dir

    assert not ctx.mets_server_url, \
        f"clone cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"

    # target location, overwrite/download switches and file selection filters
    clone_options = {
        'dst_dir': ctx.directory,
        'mets_basename': ctx.mets_basename,
        'clobber_mets': clobber_mets,
        'download': download,
        'fileGrp': file_grp,
        'ID': file_id,
        'pageId': page_id,
        'mimetype': mimetype,
        'include_fileGrp': include_fileGrp,
        'exclude_fileGrp': exclude_fileGrp,
    }
    cloned = ctx.resolver.workspace_from_url(mets_url, **clone_options)
    cloned.save_mets()
    # the resulting directory is the command's output
    print(cloned.directory)
170
171
# ----------------------------------------------------------------------
172
# ocrd workspace init
173
# ----------------------------------------------------------------------
174
175
@workspace_cli.command('init', cls=command_with_replaced_help(
    (r' \[DIRECTORY\]', ''))) # XXX deprecated argument
@click.option('-f', '--clobber-mets', help="Clobber mets.xml if it exists", is_flag=True, default=False)
# XXX deprecated
@click.argument('directory', default=None, required=False)
@pass_workspace
def workspace_init(ctx, clobber_mets, directory):
    """
    Create a workspace with an empty METS file in DIRECTORY or CWD.

    """
    logger = getLogger('ocrd.cli.workspace.init')
    if directory:
        # deprecated positional argument still overrides --directory
        deprecation = DeprecationWarning("Use 'ocrd workspace --directory DIR init' instead of argument 'DIRECTORY' ('%s')" % directory)
        logger.warning(deprecation)
        ctx.directory = directory
    assert not ctx.mets_server_url, \
        f"init cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
    init_options = {
        'directory': ctx.directory,
        'mets_basename': ctx.mets_basename,
        'clobber_mets': clobber_mets,
    }
    created = ctx.resolver.workspace_from_nothing(**init_options)
    created.save_mets()
    # the resulting directory is the command's output
    print(created.directory)
199
200
# ----------------------------------------------------------------------
201
# ocrd workspace add
202
# ----------------------------------------------------------------------
203
204
@workspace_cli.command('add')
@click.option('-G', '--file-grp', help="fileGrp USE", required=True, metavar='FILE_GRP')
@click.option('-i', '--file-id', help="ID for the file", required=True, metavar='FILE_ID')
@click.option('-m', '--mimetype', help="Media type of the file. Guessed from extension if not provided", required=False, metavar='TYPE')
@click.option('-g', '--page-id', help="ID of the physical page", metavar='PAGE_ID')
@click.option('-C', '--check-file-exists', help="Whether to ensure FNAME exists", is_flag=True, default=False)
@click.option('--ignore', help="Do not check whether a file with the same ID already exists in the METS.", default=False, is_flag=True)
@click.option('--force', help="If file with ID already exists, replace it. No effect if --ignore is set.", default=False, is_flag=True)
@click.argument('fname', required=True)
@pass_workspace
def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, ignore, check_file_exists, force, fname):
    """
    Add a file or http(s) URL FNAME to METS in a workspace.
    If FNAME is not an http(s) URL and is not a workspace-local existing file, try to copy to workspace.
    """
    workspace = ctx.workspace()

    log = getLogger('ocrd.cli.workspace.add')
    if not mimetype:
        suffix = Path(fname).suffix
        try:
            mimetype = EXT_TO_MIME[suffix]
            log.info("Guessed mimetype to be %s", mimetype)
        except KeyError:
            # Only reported, not fatal: the file is still added with an unset mimetype
            log.error("Cannot guess mimetype from extension '%s' for '%s'. Set --mimetype explicitly", suffix, fname)

    log.debug("Adding '%s'", fname)
    local_filename = None
    if not fname.startswith(('http://', 'https://')):
        if not fname.startswith(ctx.directory):
            if not isabs(fname) and exists(join(ctx.directory, fname)):
                # relative path that already exists inside the workspace
                fname = join(ctx.directory, fname)
            else:
                log.debug("File '%s' is not in workspace, copying", fname)
                try:
                    fname = ctx.resolver.download_to_directory(ctx.directory, fname, subdir=file_grp)
                except FileNotFoundError:
                    # A missing source is tolerated unless --check-file-exists was given
                    if check_file_exists:
                        log.error("File '%s' does not exist, halt execution!", fname)
                        sys.exit(1)
        if check_file_exists and not exists(fname):
            log.error("File '%s' does not exist, halt execution!", fname)
            sys.exit(1)
        if fname.startswith(ctx.directory):
            # paths below the workspace directory are made relative to it
            fname = relpath(fname, ctx.directory)
        local_filename = fname

    if not page_id:
        log.warning("You did not provide '--page-id/-g', so the file you added is not linked to a specific page.")
    kwargs = {
        'file_id': file_id,
        'mimetype': mimetype,
        'page_id': page_id,
        'force': force,
        'ignore': ignore,
        'local_filename': local_filename,
        'url': fname
    }
    workspace.add_file(file_grp, **kwargs)
    workspace.save_mets()
263
264
# ----------------------------------------------------------------------
265
# ocrd workspace bulk-add
266
# ----------------------------------------------------------------------
267
268
# pylint: disable=broad-except
269
@workspace_cli.command('bulk-add')
@click.option('-r', '--regex', help="Regular expression matching the FILE_GLOB filesystem paths to define named captures usable in the other parameters", required=True)
@click.option('-m', '--mimetype', help="Media type of the file. If not provided, guess from filename", required=False)
@click.option('-g', '--page-id', help="physical page ID of the file", required=False)
@click.option('-i', '--file-id', help="ID of the file. If not provided, derive from fileGrp and filename", required=False)
@click.option('-u', '--url', help="Remote URL of the file", required=False)
@click.option('-l', '--local-filename', help="Local filesystem path in the workspace directory (copied from source file if different)", required=False)
@click.option('-G', '--file-grp', help="File group USE of the file", required=True)
@click.option('-n', '--dry-run', help="Don't actually do anything to the METS or filesystem, just preview", default=False, is_flag=True)
@click.option('-S', '--source-path', 'src_path_option', help="File path to copy from (if different from FILE_GLOB values)", required=False)
@click.option('-I', '--ignore', help="Disable checking for existing file entries (faster)", default=False, is_flag=True)
@click.option('-f', '--force', help="Replace existing file entries with the same ID (no effect when --ignore is set, too)", default=False, is_flag=True)
@click.option('-s', '--skip', help="Skip files not matching --regex (instead of failing)", default=False, is_flag=True)
@click.argument('file_glob', nargs=-1, required=True)
@pass_workspace
def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_filename, file_grp, dry_run, file_glob, src_path_option, ignore, force, skip):
    """
    Add files in bulk to an OCR-D workspace.

    FILE_GLOB can either be a shell glob expression to match file names,
    or a list of expressions or '-', in which case expressions are read from STDIN.

    After globbing, --regex is matched against each expression resulting from FILE_GLOB, and can
    define named groups reusable in the --page-id, --file-id, --mimetype, --url, --source-path and
    --file-grp options, e.g. by referencing the group name 'grp' from the regex as '{{ grp }}'.

    If the FILE_GLOB expressions do not denote the file names themselves
    (but arbitrary strings for --regex matching), then use --source-path to set
    the actual file paths to use. (This could involve fixed strings or group references.)

    \b
    Examples:
        ocrd workspace bulk-add \\
                --regex '(?P<fileGrp>[^/]+)/page_(?P<pageid>.*)\\.[^.]+' \\
                --page-id 'PHYS_{{ pageid }}' \\
                --file-grp "{{ fileGrp }}" \\
                path/to/files/*/*.*
        \b
        echo "path/to/src/file.xml SEG/page_p0001.xml" \\
        | ocrd workspace bulk-add \\
                --regex '(?P<src>.*?) (?P<fileGrp>.+?)/page_(?P<pageid>.*)\\.(?P<ext>[^\\.]*)' \\
                --file-id 'FILE_{{ fileGrp }}_{{ pageid }}' \\
                --page-id 'PHYS_{{ pageid }}' \\
                --file-grp "{{ fileGrp }}" \\
                --local-filename '{{ fileGrp }}/FILE_{{ pageid }}.{{ ext }}' \\
                -

        \b
        { echo PHYS_0001 BIN FILE_0001_BIN.IMG-wolf BIN/FILE_0001_BIN.IMG-wolf.png; \\
          echo PHYS_0001 BIN FILE_0001_BIN BIN/FILE_0001_BIN.xml; \\
          echo PHYS_0002 BIN FILE_0002_BIN.IMG-wolf BIN/FILE_0002_BIN.IMG-wolf.png; \\
          echo PHYS_0002 BIN FILE_0002_BIN BIN/FILE_0002_BIN.xml; \\
        } | ocrd workspace bulk-add -r '(?P<pageid>.*) (?P<filegrp>.*) (?P<fileid>.*) (?P<local_filename>.*)' \\
          -G '{{ filegrp }}' -g '{{ pageid }}' -i '{{ fileid }}' -S '{{ local_filename }}' -
    """
    log = getLogger('ocrd.cli.workspace.bulk-add') # pylint: disable=redefined-outer-name
    workspace = ctx.workspace()

    def expand(template, groups):
        # Substitute every '{{ name }}' placeholder with the text captured by the regex group 'name'
        for group_name in groups:
            template = template.replace('{{ %s }}' % group_name, groups[group_name])
        return template

    try:
        pat = re.compile(regex)
    except re.error as e:
        log.error("Invalid regex: %s" % e)
        sys.exit(1)

    # Collect the expressions to process: lines from STDIN for a lone '-',
    # otherwise the glob expansion of each argument (or the argument itself
    # if the glob matches nothing).
    file_paths = []
    from_stdin = file_glob == ('-',)
    if from_stdin:
        file_paths += [Path(x.strip('\n')) for x in sys.stdin.readlines()]
    else:
        for fglob in file_glob:
            expanded = glob(fglob)
            if not expanded:
                file_paths += [Path(fglob)]
            else:
                file_paths += [Path(x) for x in expanded]

    for i, file_path in enumerate(file_paths):
        log.info("[%4d/%d] %s" % (i + 1, len(file_paths), file_path))

        # match regex
        m = pat.match(str(file_path))
        if not m:
            if skip:
                continue
            log.error("File '%s' not matched by regex: '%s'" % (file_path, regex))
            sys.exit(1)
        group_dict = m.groupdict()

        # set up file info
        file_dict = {'local_filename': local_filename, 'url': url, 'mimetype': mimetype, 'file_id': file_id, 'page_id': page_id, 'file_grp': file_grp}

        # Flag to track whether 'local_filename' should be 'src'
        local_filename_is_src = False

        # expand templates
        for param_name in file_dict:
            if not file_dict[param_name]:
                if param_name == 'local_filename':
                    local_filename_is_src = True
                    continue
                elif param_name in ['mimetype', 'file_id']:
                    # auto-filled below once the other
                    # replacements have happened
                    continue
                elif param_name == 'url':
                    # Remote URL is not required
                    continue
                raise ValueError(f"OcrdFile attribute '{param_name}' unset ({file_dict})")
            file_dict[param_name] = expand(file_dict[param_name], group_dict)

        # Where to copy from
        if src_path_option:
            srcpath = Path(expand(src_path_option, group_dict))
        else:
            srcpath = file_path

        # derive the file ID from the source filename if --file-id was not explicitly set
        if not file_id:
            file_dict['file_id'] = safe_filename('%s_%s' % (file_dict['file_grp'], srcpath.stem))
        if not mimetype:
            try:
                file_dict['mimetype'] = EXT_TO_MIME[srcpath.suffix]
            except KeyError:
                # Only reported, not fatal: the entry keeps an unset mimetype
                log.error("Cannot guess MIME type from extension '%s' for '%s'. Set --mimetype explicitly" % (srcpath.suffix, srcpath))

        # copy files if src != url
        if local_filename_is_src:
            file_dict['local_filename'] = srcpath
        else:
            destpath = Path(workspace.directory, file_dict['local_filename'])
            if srcpath != destpath and not destpath.exists():
                log.info("cp '%s' '%s'", srcpath, destpath)
                if not dry_run:
                    # --local-filename may point several directory levels deep
                    destpath.parent.mkdir(parents=True, exist_ok=True)
                    destpath.write_bytes(srcpath.read_bytes())

        # Add to workspace (or not)
        fileGrp = file_dict.pop('file_grp')
        if dry_run:
            log.info('workspace.add_file(%s)' % file_dict)
        else:
            workspace.add_file(fileGrp, ignore=ignore, force=force, **file_dict) # pylint: disable=redundant-keyword-arg

    # save changes to disk; a dry run must leave the METS file untouched
    if not dry_run:
        workspace.save_mets()
420
421
422
# ----------------------------------------------------------------------
423
# ocrd workspace find
424
# ----------------------------------------------------------------------
425
426
@workspace_cli.command('find')
@mets_find_options
@click.option('-k', '--output-field', help="Output field. Repeat for multiple fields, will be joined with tab",
              default=['local_filename'],
              show_default=True,
              multiple=True,
              type=click.Choice([
                  'url',
                  'mimetype',
                  'page_id',
                  'pageId',
                  'file_id',
                  'ID',
                  'file_grp',
                  'fileGrp',
                  'basename',
                  'basename_without_extension',
                  'local_filename',
              ]))
@click.option('--download', is_flag=True, help="Download found files to workspace and change location in METS file")
@click.option('--undo-download', is_flag=True, help="Remove all downloaded files from the METS and workspace")
@click.option('--keep-files', is_flag=True, help="Do not remove downloaded files from the workspace with --undo-download")
@click.option('--wait', type=int, default=0, help="Wait this many seconds between download requests")
@pass_workspace
def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, include_fileGrp, exclude_fileGrp, download, undo_download, keep_files, wait):
    """
    Find files.

    (If any ``FILTER`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    # Accept snake_case field names as aliases of the camelCase attribute names
    snake_to_camel = {"file_id": "ID", "page_id": "pageId", "file_grp": "fileGrp"}
    output_field = [snake_to_camel.get(x, x) for x in output_field]
    modified_mets = False
    ret = []
    workspace = ctx.workspace()
    with pushd_popd(workspace.directory):
        for f in workspace.find_files(
                file_id=file_id,
                file_grp=file_grp,
                mimetype=mimetype,
                page_id=page_id,
                include_fileGrp=include_fileGrp,
                exclude_fileGrp=exclude_fileGrp,
            ):
            if download and not f.local_filename:
                workspace.download_file(f)
                modified_mets = True
                if wait:
                    time.sleep(wait)
            if undo_download and f.url and f.local_filename:
                # Remember the path before clearing the reference, otherwise
                # there is nothing left to log or delete afterwards.
                downloaded_path = f.local_filename
                f.local_filename = None
                modified_mets = True
                if not keep_files:
                    ctx.log.debug("rm %s [cwd=%s]", downloaded_path, workspace.directory)
                    unlink(downloaded_path)
            # For 'pageId' the file ID is stored as a placeholder and swapped
            # for the actual page ID in one batch lookup below.
            # NOTE(review): str(None) is 'None', so the `or ''` fallback never
            # triggers for unset attributes; left as-is to keep the output stable.
            ret_entry = [f.ID if field == 'pageId' else str(getattr(f, field)) or '' for field in output_field]
            ret.append(ret_entry)
    if modified_mets:
        workspace.save_mets()
    if 'pageId' in output_field:
        idx = output_field.index('pageId')
        fileIds = [fields[idx] for fields in ret]
        pages = workspace.mets.get_physical_pages(for_fileIds=fileIds)
        for fields, page in zip(ret, pages):
            fields[idx] = page or ''
    for fields in ret:
        print('\t'.join(fields))
494
495
# ----------------------------------------------------------------------
496
# ocrd workspace remove
497
# ----------------------------------------------------------------------
498
499
@workspace_cli.command('remove')
@click.option('-k', '--keep-file', help="Do not delete file from file system", default=False, is_flag=True)
@click.option('-f', '--force', help="Continue even if mets:file or file on file system does not exist", default=False, is_flag=True)
@click.argument('ID', nargs=-1)
@pass_workspace
def workspace_remove_file(ctx, id, force, keep_file):  # pylint: disable=redefined-builtin
    """
    Delete files (given by their ID attribute ``ID``).

    (If any ``ID`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    assert not ctx.mets_server_url, \
        f"remove cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
    target = ctx.workspace()
    removal_options = {'force': force, 'keep_file': keep_file}
    # one removal per ID argument, persisted with a single save at the end
    for file_ref in id:
        target.remove_file(file_ref, **removal_options)
    target.save_mets()
517
518
519
# ----------------------------------------------------------------------
520
# ocrd workspace rename-group
521
# ----------------------------------------------------------------------
522
523
@workspace_cli.command('rename-group')
@click.argument('OLD', nargs=1)
@click.argument('NEW', nargs=1)
@pass_workspace
def rename_group(ctx, old, new):
    """
    Rename fileGrp (USE attribute ``OLD`` to ``NEW``).
    """
    assert not ctx.mets_server_url, \
        f"rename-group cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
    workspace = ctx.workspace()
    # first argument is the existing name, second the replacement
    workspace.rename_file_group(old, new)
    workspace.save_mets()
536
537
# ----------------------------------------------------------------------
538
# ocrd workspace remove-group
539
# ----------------------------------------------------------------------
540
541
@workspace_cli.command('remove-group')
@click.option('-r', '--recursive', help="Delete any files in the group before the group itself", default=False, is_flag=True)
@click.option('-f', '--force', help="Continue removing even if group or containing files not found in METS", default=False, is_flag=True)
@click.option('-k', '--keep-files', help="Do not delete files from file system", default=False, is_flag=True)
@click.argument('GROUP', nargs=-1)
@pass_workspace
def remove_group(ctx, group, recursive, force, keep_files):
    """
    Delete fileGrps (given by their USE attribute ``GROUP``).

    (If any ``GROUP`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    assert not ctx.mets_server_url, \
        f"remove-group cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
    target = ctx.workspace()
    removal_options = {'recursive': recursive, 'force': force, 'keep_files': keep_files}
    # one removal per GROUP argument, persisted with a single save at the end
    for group_name in group:
        target.remove_file_group(group_name, **removal_options)
    target.save_mets()
560
561
# ----------------------------------------------------------------------
562
# ocrd workspace prune-files
563
# ----------------------------------------------------------------------
564
565
@workspace_cli.command('prune-files')
@click.option('-G', '--file-grp', help="fileGrp USE", metavar='FILTER')
@click.option('-m', '--mimetype', help="Media type to look for", metavar='FILTER')
@click.option('-g', '--page-id', help="Page ID", metavar='FILTER')
@click.option('-i', '--file-id', help="ID", metavar='FILTER')
@pass_workspace
def prune_files(ctx, file_grp, mimetype, page_id, file_id):
    """
    Removes mets:files that point to non-existing local files

    (If any ``FILTER`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    assert not ctx.mets_server_url, \
        f"prune-files cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
    workspace = ctx.workspace()
    # Local filenames are checked from inside the workspace directory
    with pushd_popd(workspace.directory):
        for f in workspace.find_files(
            file_id=file_id,
            file_grp=file_grp,
            mimetype=mimetype,
            page_id=page_id,
        ):
            try:
                if not f.local_filename or not exists(f.local_filename):
                    workspace.mets.remove_file(f.ID)
            except Exception as e:
                # Log with traceback for context, then propagate unchanged
                ctx.log.exception("Error removing %s: %s", f, e)
                raise
        workspace.save_mets()
595
596
# ----------------------------------------------------------------------
597
# ocrd workspace clean
598
# ----------------------------------------------------------------------
599
600
@workspace_cli.command('clean')
@click.option('-n', '--dry-run', help="Don't actually do anything to the filesystem, just preview", default=False, is_flag=True)
@click.option('-d', '--directories', help="Remove untracked directories in addition to untracked files", default=False, is_flag=True)
@click.argument('path_glob', nargs=-1, required=False)
@pass_workspace
def clean(ctx, dry_run, directories, path_glob):
    """
    Removes files and directories from the workspace that are not
    referenced by any mets:files.

    PATH_GLOB can be a shell glob expression to match file names,
    directory names (recursively), or plain paths. All paths are
    resolved w.r.t. the workspace.

    If no PATH_GLOB are specified, then all files and directories
    may match.
    """
    workspace = ctx.workspace()
    # Everything referenced by the METS, plus the METS file itself, must survive
    allowed_files = {normpath(f.local_filename) for f in workspace.find_files(local_only=True)}
    allowed_files.add(relpath(workspace.mets_target, start=workspace.directory))
    # Protect every ancestor directory of a tracked file, not just its direct
    # parent: none of them can be empty, so rmdir on them would fail.
    allowed_dirs = set()
    for tracked in allowed_files:
        parent = dirname(tracked)
        while parent and parent not in allowed_dirs:
            allowed_dirs.add(parent)
            parent = dirname(parent)
    with pushd_popd(workspace.directory):
        if path_glob:
            paths = []
            for expression in path_glob:
                if isabs(expression):
                    # relative to the workspace directory, which is the CWD here
                    expression = relpath(expression)
                # an expression without any match is kept as a plain path
                paths += glob(expression, recursive=True) or [expression]
        else:
            paths = glob('**', recursive=True)
        file_paths = [path for path in paths if not isdir(path)]
        for path in file_paths:
            if normpath(path) in allowed_files:
                continue
            if dry_run:
                ctx.log.info('unlink(%s)' % path)
            else:
                unlink(path)
        if not directories:
            return
        dir_paths = [path for path in paths if isdir(path)]
        # deepest directories first, so children are removed before their parents
        for path in sorted(dir_paths, key=lambda p: p.count('/'), reverse=True):
            if normpath(path) in allowed_dirs:
                continue
            if dry_run:
                ctx.log.info('rmdir(%s)' % path)
            else:
                rmdir(path)
648
649
# ----------------------------------------------------------------------
650
# ocrd workspace list-group
651
# ----------------------------------------------------------------------
652
653
@workspace_cli.command('list-group')
@pass_workspace
def list_groups(ctx):
    """
    List fileGrp USE attributes
    """
    # Emit the METS file group names, one per output line.
    file_groups = ctx.workspace().mets.file_groups
    print("\n".join(file_groups))
661
662
# ----------------------------------------------------------------------
663
# ocrd workspace list-page
664
# ----------------------------------------------------------------------
665
666
@workspace_cli.command('list-page')
@click.option(
    '-k', '--output-field',
    help="Output field. Repeat for multiple fields, will be joined with tab",
    default=['ID'],
    show_default=True,
    multiple=True,
    type=click.Choice(METS_PAGE_DIV_ATTRIBUTE.names()),
)
@click.option(
    '-f', '--output-format',
    help="Output format",
    type=click.Choice(['one-per-line', 'comma-separated', 'json']),
    default='one-per-line',
)
@click.option(
    '-D', '--chunk-number',
    help="Partition the return value into n roughly equally sized chunks",
    default=1,
    type=int,
)
@click.option(
    '-C', '--chunk-index',
    help="Output the nth chunk of results, -1 for all of them.",
    default=None,
    type=int,
)
@click.option(
    '-r', '--page-id-range',
    help="Restrict the pages to those matching the provided range, based on the @ID attribute. Separate start/end with ..",
)
@click.option(
    '-R', '--numeric-range',
    help="Restrict the pages to those in the range, in numerical document order. Separate start/end with ..",
)
@pass_workspace
def list_pages(ctx, output_field, output_format, chunk_number, chunk_index, page_id_range, numeric_range):
    """
    List physical page IDs

    (If any ``FILTER`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    workspace = ctx.workspace()

    # Each row is the list of requested attribute values for one page.
    if page_id_range or list(output_field) != ['ID']:
        # Non-default fields or an ID range need the page divs themselves;
        # attributes missing on a div are reported as the string 'None'.
        rows = [
            [page_div.get(field, 'None') for field in output_field]
            for page_div in workspace.mets.get_physical_pages(for_pageIds=page_id_range, return_divs=True)
        ]
    else:
        # Default case: only the page IDs are wanted.
        rows = [[page_id] for page_id in workspace.mets.physical_pages]

    if numeric_range:
        # START..END is 1-based and inclusive on both ends.
        first, last = (int(bound) for bound in numeric_range.split('..'))
        rows = rows[first - 1:last]

    chunks = partition_list(rows, chunk_number, chunk_index)

    if output_format == 'json':
        output = [dumps(chunks)]
    elif output_format in ('one-per-line', 'comma-separated'):
        # Both text formats tab-join the fields of a row; they differ only in
        # how the rows of one chunk are separated.
        row_separator = '\n' if output_format == 'one-per-line' else ','
        output = [
            row_separator.join("\t".join(row) for row in chunk)
            for chunk in chunks
        ]
    else:
        output = []
    print('\n'.join(output))
717
718
# ----------------------------------------------------------------------
719
# ocrd workspace get-id
720
# ----------------------------------------------------------------------
721
722
@workspace_cli.command('get-id')
@pass_workspace
def get_id(ctx):
    """
    Get METS id if any
    """
    identifier = ctx.workspace().mets.unique_identifier
    # Stay silent when the METS carries no (non-empty) identifier.
    if not identifier:
        return
    print(identifier)
732
733
# ----------------------------------------------------------------------
734
# ocrd workspace set-id
735
# ----------------------------------------------------------------------
736
737
@workspace_cli.command('set-id')
@click.argument('ID')
@pass_workspace
def set_id(ctx, id):   # pylint: disable=redefined-builtin
    """
    Set METS ID.

    If one of the supported identifier mechanisms is used, will set this identifier.

    Otherwise will create a new <mods:identifier type="purl">{{ ID }}</mods:identifier>.
    """
    target = ctx.workspace()
    # Assign the identifier on the in-memory METS, then persist it.
    target.mets.unique_identifier = id
    target.save_mets()
751
752
@workspace_cli.command('update-page')
@click.option('--set', 'attr_value_pairs', help=f"set mets:div ATTR to VALUE. possible keys: {METS_PAGE_DIV_ATTRIBUTE.names()}", metavar="ATTR VALUE", nargs=2, multiple=True)
@click.option('--order', help="DEPRECATED - use --set ATTR VALUE", metavar='ORDER')
@click.option('--orderlabel', help="DEPRECATED - use --set ATTR VALUE", metavar='ORDERLABEL')
@click.option('--contentids', help="DEPRECATED - use --set ATTR VALUE", metavar='CONTENTIDS')
@click.argument('PAGE_ID')
@pass_workspace
def update_page(ctx, attr_value_pairs, order, orderlabel, contentids, page_id):
    """
    Update the @ID, @ORDER, @ORDERLABEL, @LABEL or @CONTENTIDS attributes of the mets:div with @ID=PAGE_ID

    Prints the error and exits with status 1 if the update fails or if a
    METS Server URL is in use.
    """
    # Start from the generic --set ATTR VALUE pairs ...
    update_kwargs = dict(attr_value_pairs)
    # ... then let the deprecated single-attribute options override them.
    deprecated_options = (
        ('ORDER', order),
        ('ORDERLABEL', orderlabel),
        ('CONTENTIDS', contentids),
    )
    for attr, value in deprecated_options:
        if value:
            update_kwargs[attr] = value
    try:
        # Explicit check instead of `assert`, which is stripped under
        # `python -O` and would then let the update run against a METS Server.
        if ctx.mets_server_url:
            raise RuntimeError(
                f"update-page cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}")
        workspace = ctx.workspace()
        workspace.mets.update_physical_page_attributes(page_id, **update_kwargs)
        workspace.save_mets()
    except Exception as err:  # pylint: disable=broad-except
        # CLI boundary: report any failure and signal it via the exit status.
        print(f"Error: {err}")
        sys.exit(1)
779
780
# ----------------------------------------------------------------------
781
# ocrd workspace merge
782
# ----------------------------------------------------------------------
783
784
def _handle_json_option(ctx, param, value):
785
    return parse_json_string_or_file(value) if value else None
786
787
@workspace_cli.command('merge')
@click.argument('METS_PATH')
@click.option('--overwrite/--no-overwrite', is_flag=True, default=False, help="Overwrite on-disk file in case of file name conflicts with data from METS_PATH")
@click.option('--force/--no-force', is_flag=True, default=False, help="Overwrite mets:file from --mets with mets:file from METS_PATH if IDs clash")
@click.option('--copy-files/--no-copy-files', is_flag=True, help="Copy files as well", default=True, show_default=True)
@click.option('--fileGrp-mapping', help="JSON object mapping src to dest fileGrp", callback=_handle_json_option)
@click.option('--fileId-mapping', help="JSON object mapping src to dest file ID", callback=_handle_json_option)
@click.option('--pageId-mapping', help="JSON object mapping src to dest page ID", callback=_handle_json_option)
@mets_find_options
@pass_workspace
def merge(ctx, overwrite, force, copy_files, filegrp_mapping, fileid_mapping, pageid_mapping, file_grp, file_id, page_id, mimetype, include_fileGrp, exclude_fileGrp, mets_path):   # pylint: disable=redefined-builtin
    """
    Merges this workspace with the workspace that contains ``METS_PATH``

    Pass a JSON string or file to ``--fileGrp-mapping``, ``--fileId-mapping`` or ``--pageId-mapping``
    in order to rename all fileGrp, file ID or page ID values, respectively.

    The ``--file-id``, ``--page-id``, ``--mimetype`` and ``--file-grp`` options have
    the same semantics as in ``ocrd workspace find``, see ``ocrd workspace find --help``
    for an explanation.
    """
    mets_path = Path(mets_path)
    # --fileGrp-mapping has already passed through the _handle_json_option
    # callback, like the other two mappings (which are forwarded as-is below).
    # json.loads() only accepts str/bytes/bytearray and raises TypeError on an
    # already-decoded object, so decode only if the value is still raw text.
    # NOTE(review): presumably the callback always returns the decoded object,
    # which would make this branch a no-op - confirm and drop it if so.
    if isinstance(filegrp_mapping, (str, bytes, bytearray)):
        filegrp_mapping = loads(filegrp_mapping)
    assert not ctx.mets_server_url, \
        f"merge cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
    workspace = ctx.workspace()
    # The workspace to merge from is addressed by the directory and file name
    # of METS_PATH.
    other_workspace = Workspace(ctx.resolver, directory=str(mets_path.parent), mets_basename=str(mets_path.name))
    workspace.merge(
        other_workspace,
        force=force,
        overwrite=overwrite,
        copy_files=copy_files,
        fileGrp_mapping=filegrp_mapping,
        fileId_mapping=fileid_mapping,
        pageId_mapping=pageid_mapping,
        file_grp=file_grp,
        file_id=file_id,
        page_id=page_id,
        mimetype=mimetype,
        include_fileGrp=include_fileGrp,
        exclude_fileGrp=exclude_fileGrp,
    )
    workspace.save_mets()
831
832
# ----------------------------------------------------------------------
833
# ocrd workspace backup
834
# ----------------------------------------------------------------------
835
836
@workspace_cli.group('backup')
@pass_workspace
def workspace_backup_cli(ctx): # pylint: disable=unused-argument
    """
    Backing and restoring workspaces - dev edition
    """
    # Group-level guard for the `backup` subcommands: refuse to proceed when
    # a METS Server URL is configured.
    mets_server_url = ctx.mets_server_url
    assert not mets_server_url, "Workspace backups currently not interoperable with METS Server"
843
844
@workspace_backup_cli.command('add')
@pass_workspace
def workspace_backup_add(ctx):
    """
    Create a new backup
    """
    # Delegate to the backup manager obtained from the CLI context.
    ctx.backup_manager().add()
852
853
@workspace_backup_cli.command('list')
@pass_workspace
def workspace_backup_list(ctx):
    """
    List backups
    """
    # Print each entry the backup manager reports, one per line.
    for backup in ctx.backup_manager().list():
        print(backup)
862
863
@workspace_backup_cli.command('restore')
@click.option('-f', '--choose-first', help="Restore first matching version if more than one", is_flag=True)
@click.argument('bak') #, type=click.Path(dir_okay=False, readable=True, resolve_path=True))
@pass_workspace
def workspace_backup_restore(ctx, choose_first, bak):
    """
    Restore backup BAK
    """
    # BAK and the --choose-first flag are handed to the backup manager as-is.
    ctx.backup_manager().restore(bak, choose_first)
873
874
@workspace_backup_cli.command('undo')
@pass_workspace
def workspace_backup_undo(ctx):
    """
    Restore the last backup
    """
    # Delegate to the backup manager obtained from the CLI context.
    ctx.backup_manager().undo()
882
883
884
# ----------------------------------------------------------------------
885
# ocrd workspace server
886
# ----------------------------------------------------------------------
887
888
@workspace_cli.group('server')
@pass_workspace
def workspace_serve_cli(ctx): # pylint: disable=unused-argument
    """Control a METS server for this workspace"""
    # Group-level guard for the `server` subcommands: a METS Server URL
    # must have been supplied.
    mets_server_url = ctx.mets_server_url
    assert mets_server_url, "For METS server commands, you must provide '-U/--mets-server-url'"
893
894
@workspace_serve_cli.command('stop')
@pass_workspace
def workspace_serve_stop(ctx): # pylint: disable=unused-argument
    """Stop the METS server (saving changes to disk)"""
    # Forward the request through the workspace's METS object.
    ctx.workspace().mets.stop()
900
901
@workspace_serve_cli.command('reload')
@pass_workspace
def workspace_serve_reload(ctx): # pylint: disable=unused-argument
    """Reload the METS server from disk"""
    # Forward the request through the workspace's METS object.
    ctx.workspace().mets.reload()
907
908
@workspace_serve_cli.command('save')
@pass_workspace
def workspace_serve_save(ctx): # pylint: disable=unused-argument
    """Save the METS changes to disk"""
    # Forward the request through the workspace's METS object.
    ctx.workspace().mets.save()
914
915
@workspace_serve_cli.command('start')
@pass_workspace
def workspace_serve_start(ctx): # pylint: disable=unused-argument
    """
    Start a METS server

    (For TCP backend, pass a network interface to bind to as the '-U/--mets-server-url' parameter.)
    """
    # The served workspace is constructed directly from the context's
    # directory and METS basename rather than via ctx.workspace().
    served_workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)
    server = OcrdMetsServer(workspace=served_workspace, url=ctx.mets_server_url)
    server.startup()
927