Passed
Pull Request — master (#1134)
by Konstantin
02:27
created

ocrd.cli.workspace.merge()   A

Complexity

Conditions 2

Size

Total Lines 40
Code Lines 29

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 29
dl 0
loc 40
rs 9.184
c 0
b 0
f 0
cc 2
nop 12

How to fix   Many Parameters   

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

There are several approaches to avoid long parameter lists:

1
"""
2
OCR-D CLI: workspace management
3
4
.. click:: ocrd.cli.workspace:workspace_cli
5
    :prog: ocrd workspace
6
    :nested: full
7
"""
8
import os
9
from os import getcwd
10
from os.path import relpath, exists, join, isabs
11
from pathlib import Path
12
from json import loads
13
import sys
14
from glob import glob   # XXX pathlib.Path.glob does not support absolute globs
15
import re
16
import time
17
18
import click
19
20
from ocrd import Resolver, Workspace, WorkspaceValidator, WorkspaceBackupManager
21
from ocrd.mets_server import OcrdMetsServer
22
from ocrd_utils import getLogger, initLogging, pushd_popd, EXT_TO_MIME, safe_filename, parse_json_string_or_file
23
from ocrd.decorators import mets_find_options
24
from . import command_with_replaced_help
25
26
27
class WorkspaceCtx:
    """
    State shared between the ``ocrd workspace`` group and its subcommands.

    Stores a logger, a ``Resolver``, the workspace directory / METS location /
    METS basename / METS server URL as returned by
    ``Resolver.resolve_mets_arguments``, and the automatic-backup flag.
    """

    def __init__(self, directory, mets_url, mets_basename, mets_server_url, automatic_backup):
        self.log = getLogger('ocrd.cli.workspace')
        if mets_basename:
            self.log.warning(DeprecationWarning('--mets-basename is deprecated. Use --mets/--directory instead.'))
        self.resolver = Resolver()
        # Let the resolver reconcile the (possibly partial) location arguments
        resolved = self.resolver.resolve_mets_arguments(directory, mets_url, mets_basename, mets_server_url)
        self.directory, self.mets_url, self.mets_basename, self.mets_server_url = resolved
        self.automatic_backup = automatic_backup
37
38
39
# Decorator handing the WorkspaceCtx (stored as ``ctx.obj`` by ``workspace_cli``)
# to a subcommand as its first argument.
pass_workspace = click.make_pass_decorator(WorkspaceCtx)
40
41
# ----------------------------------------------------------------------
42
# ocrd workspace
43
# ----------------------------------------------------------------------
44
45
@click.group("workspace")
@click.option('-d', '--directory', envvar='WORKSPACE_DIR', type=click.Path(file_okay=False), metavar='WORKSPACE_DIR', help='Changes the workspace folder location [default: METS_URL directory or .]')
@click.option('-M', '--mets-basename', default=None, help='METS file basename. Deprecated, use --mets/--directory')
@click.option('-m', '--mets', default=None, help='The path/URL of the METS file [default: WORKSPACE_DIR/mets.xml]', metavar="METS_URL")
@click.option('-U', '--mets-server-url', 'mets_server_url', help="TCP host of METS server")
@click.option('--backup', default=False, help="Backup mets.xml whenever it is saved.", is_flag=True)
@click.pass_context
def workspace_cli(ctx, directory, mets, mets_basename, mets_server_url, backup):
    """
    Managing workspaces

    A workspace comprises a METS file and a directory as point of reference.

    Operates on the file system directly or via a METS server
    (already running via some prior `server start` subcommand).
    """
    initLogging()
    # Everything the subcommands need travels on the click context object
    ctx.obj = WorkspaceCtx(
        directory,
        mets_url=mets,
        mets_basename=mets_basename,
        mets_server_url=mets_server_url,
        automatic_backup=backup
    )
69
70
# ----------------------------------------------------------------------
71
# ocrd workspace validate
72
# ----------------------------------------------------------------------
73
74
@workspace_cli.command('validate', cls=command_with_replaced_help(
    (r' \[METS_URL\]', ''))) # XXX deprecated argument
@pass_workspace
@click.option('-a', '--download', is_flag=True, help="Download all files")
@click.option('-s', '--skip', help="Tests to skip", default=[], multiple=True, type=click.Choice(
    ['imagefilename', 'dimension', 'pixel_density', 'page', 'url', 'page_xsd', 'mets_fileid_page_pcgtsid',
     'mets_unique_identifier', 'mets_file_group_names', 'mets_files', 'mets_xsd']))
@click.option('--page-textequiv-consistency', '--page-strictness', help="How strict to check PAGE multi-level textequiv consistency", type=click.Choice(['strict', 'lax', 'fix', 'off']), default='strict')
@click.option('--page-coordinate-consistency', help="How fierce to check PAGE multi-level coordinate consistency", type=click.Choice(['poly', 'baseline', 'both', 'off']), default='poly')
@click.argument('mets_url', default=None, required=False)
def workspace_validate(ctx, mets_url, download, skip, page_textequiv_consistency, page_coordinate_consistency):
    """
    Validate a workspace

    METS_URL can be a URL, an absolute path or a path relative to $PWD.
    If not given, use --mets accordingly.

    Check that the METS and its referenced file contents
    abide by the OCR-D specifications.
    """
    LOG = getLogger('ocrd.cli.workspace.validate')
    if mets_url:
        # The positional METS_URL is deprecated; point users at the equivalent
        # invocation of *this* subcommand (the message used to say 'init').
        LOG.warning(DeprecationWarning("Use 'ocrd workspace --mets METS validate' instead of argument 'METS_URL' ('%s')" % mets_url))
    else:
        mets_url = ctx.mets_url
    report = WorkspaceValidator.validate(
        ctx.resolver,
        mets_url,
        src_dir=ctx.directory,
        skip=skip,
        download=download,
        page_strictness=page_textequiv_consistency,
        page_coordinate_consistency=page_coordinate_consistency
    )
    print(report.to_xml())
    # A failed validation is signalled to the shell with exit status 128
    if not report.is_valid:
        sys.exit(128)
111
112
# ----------------------------------------------------------------------
113
# ocrd workspace clone
114
# ----------------------------------------------------------------------
115
116
@workspace_cli.command('clone', cls=command_with_replaced_help(
    (r' \[WORKSPACE_DIR\]', ''))) # XXX deprecated argument
@click.option('-f', '--clobber-mets', help="Overwrite existing METS file", default=False, is_flag=True)
@click.option('-a', '--download', is_flag=True, help="Download all files and change location in METS file after cloning")
@click.argument('mets_url')
# XXX deprecated
@click.argument('workspace_dir', default=None, required=False)
@pass_workspace
def workspace_clone(ctx, clobber_mets, download, mets_url, workspace_dir):
    """
    Create a workspace from METS_URL and return the directory

    METS_URL can be a URL, an absolute path or a path relative to $PWD.
    If METS_URL is not provided, use --mets accordingly.
    METS_URL can also be an OAI-PMH GetRecord URL wrapping a METS file.
    """
    clone_log = getLogger('ocrd.cli.workspace.clone')
    if workspace_dir:
        # Deprecated positional WORKSPACE_DIR still takes precedence over --directory
        deprecation = DeprecationWarning("Use 'ocrd workspace --directory DIR clone' instead of argument 'WORKSPACE_DIR' ('%s')" % workspace_dir)
        clone_log.warning(deprecation)
        ctx.directory = workspace_dir

    cloned = ctx.resolver.workspace_from_url(
        mets_url,
        dst_dir=ctx.directory,
        mets_basename=ctx.mets_basename,
        clobber_mets=clobber_mets,
        download=download,
    )
    cloned.save_mets()
    # The resulting workspace directory is the command's output
    print(cloned.directory)
146
147
# ----------------------------------------------------------------------
148
# ocrd workspace init
149
# ----------------------------------------------------------------------
150
151
@workspace_cli.command('init', cls=command_with_replaced_help(
    (r' \[DIRECTORY\]', ''))) # XXX deprecated argument
@click.option('-f', '--clobber-mets', help="Clobber mets.xml if it exists", is_flag=True, default=False)
# XXX deprecated
@click.argument('directory', default=None, required=False)
@pass_workspace
def workspace_init(ctx, clobber_mets, directory):
    """
    Create a workspace with an empty METS file in --directory.

    """
    init_log = getLogger('ocrd.cli.workspace.init')
    if directory:
        # Deprecated positional DIRECTORY still takes precedence over --directory
        deprecation = DeprecationWarning("Use 'ocrd workspace --directory DIR init' instead of argument 'DIRECTORY' ('%s')" % directory)
        init_log.warning(deprecation)
        ctx.directory = directory
    fresh = ctx.resolver.workspace_from_nothing(
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        clobber_mets=clobber_mets
    )
    fresh.save_mets()
    # The resulting workspace directory is the command's output
    print(fresh.directory)
173
174
# ----------------------------------------------------------------------
175
# ocrd workspace add
176
# ----------------------------------------------------------------------
177
178
@workspace_cli.command('add')
@click.option('-G', '--file-grp', help="fileGrp USE", required=True, metavar='FILE_GRP')
@click.option('-i', '--file-id', help="ID for the file", required=True, metavar='FILE_ID')
@click.option('-m', '--mimetype', help="Media type of the file. Guessed from extension if not provided", required=False, metavar='TYPE')
@click.option('-g', '--page-id', help="ID of the physical page", metavar='PAGE_ID')
@click.option('-C', '--check-file-exists', help="Whether to ensure FNAME exists", is_flag=True, default=False)
@click.option('--ignore', help="Do not check whether file exists.", default=False, is_flag=True)
@click.option('--force', help="If file with ID already exists, replace it. No effect if --ignore is set.", default=False, is_flag=True)
@click.argument('fname', required=True)
@pass_workspace
def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, ignore, check_file_exists, force, fname):
    """
    Add a file or http(s) URL FNAME to METS in a workspace.
    If FNAME is not an http(s) URL and is not a workspace-local existing file, try to copy to workspace.
    """
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
        mets_server_url=ctx.mets_server_url,
    )
    log = getLogger('ocrd.cli.workspace.add')
    ws_dir = ctx.directory

    if not mimetype:
        # Fall back to the extension table; a miss is reported but not fatal
        suffix = Path(fname).suffix
        try:
            mimetype = EXT_TO_MIME[suffix]
        except KeyError:
            log.error("Cannot guess mimetype from extension '%s' for '%s'. Set --mimetype explicitly", suffix, fname)
        else:
            log.info("Guessed mimetype to be %s", mimetype)

    log.debug("Adding '%s'", fname)
    local_filename = None
    is_remote = fname.startswith(('http://', 'https://'))
    if not is_remote:
        if not fname.startswith(ws_dir):
            in_workspace = join(ws_dir, fname)
            if not isabs(fname) and exists(in_workspace):
                # relative path that already resolves inside the workspace
                fname = in_workspace
            else:
                log.debug("File '%s' is not in workspace, copying", fname)
                try:
                    fname = ctx.resolver.download_to_directory(ws_dir, fname, subdir=file_grp)
                except FileNotFoundError:
                    # a missing source is only fatal when -C/--check-file-exists was given
                    if check_file_exists:
                        log.error("File '%s' does not exist, halt execution!", fname)
                        sys.exit(1)
        if check_file_exists and not exists(fname):
            log.error("File '%s' does not exist, halt execution!", fname)
            sys.exit(1)
        if fname.startswith(ws_dir):
            # store workspace-relative paths
            fname = relpath(fname, ws_dir)
        local_filename = fname

    if not page_id:
        log.warning("You did not provide '--page-id/-g', so the file you added is not linked to a specific page.")
    workspace.add_file(
        file_grp,
        file_id=file_id,
        mimetype=mimetype,
        page_id=page_id,
        force=force,
        ignore=ignore,
        local_filename=local_filename,
        url=fname,
    )
    workspace.save_mets()
243
244
# ----------------------------------------------------------------------
245
# ocrd workspace bulk-add
246
# ----------------------------------------------------------------------
247
248
# pylint: disable=broad-except
249
def _expand_bulk_add_template(template, group_dict):
    """Replace every ``{{ name }}`` placeholder in ``template`` by the regex group called ``name``."""
    for group_name, group_value in group_dict.items():
        template = template.replace('{{ %s }}' % group_name, group_value)
    return template


@workspace_cli.command('bulk-add')
@click.option('-r', '--regex', help="Regular expression matching the FILE_GLOB filesystem paths to define named captures usable in the other parameters", required=True)
@click.option('-m', '--mimetype', help="Media type of the file. If not provided, guess from filename", required=False)
@click.option('-g', '--page-id', help="physical page ID of the file", required=False)
@click.option('-i', '--file-id', help="ID of the file. If not provided, derive from fileGrp and filename", required=False)
@click.option('-u', '--url', help="Remote URL of the file", required=False)
@click.option('-l', '--local-filename', help="Local filesystem path in the workspace directory (copied from source file if different)", required=False)
@click.option('-G', '--file-grp', help="File group USE of the file", required=True)
@click.option('-n', '--dry-run', help="Don't actually do anything to the METS or filesystem, just preview", default=False, is_flag=True)
@click.option('-S', '--source-path', 'src_path_option', help="File path to copy from (if different from FILE_GLOB values)", required=False)
@click.option('-I', '--ignore', help="Disable checking for existing file entries (faster)", default=False, is_flag=True)
@click.option('-f', '--force', help="Replace existing file entries with the same ID (no effect when --ignore is set, too)", default=False, is_flag=True)
@click.option('-s', '--skip', help="Skip files not matching --regex (instead of failing)", default=False, is_flag=True)
@click.argument('file_glob', nargs=-1, required=True)
@pass_workspace
def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_filename, file_grp, dry_run, file_glob, src_path_option, ignore, force, skip):
    """
    Add files in bulk to an OCR-D workspace.

    FILE_GLOB can either be a shell glob expression to match file names,
    or a list of expressions or '-', in which case expressions are read from STDIN.

    After globbing, --regex is matched against each expression resulting from FILE_GLOB, and can
    define named groups reusable in the --page-id, --file-id, --mimetype, --url, --source-path and
    --file-grp options, e.g. by referencing the group name 'grp' from the regex as '{{ grp }}'.

    If the FILE_GLOB expressions do not denote the file names themselves
    (but arbitrary strings for --regex matching), then use --source-path to set
    the actual file paths to use. (This could involve fixed strings or group references.)

    \b
    Examples:
        ocrd workspace bulk-add \\
                --regex '(?P<fileGrp>[^/]+)/page_(?P<pageid>.*)\\.[^.]+' \\
                --page-id 'PHYS_{{ pageid }}' \\
                --file-grp "{{ fileGrp }}" \\
                path/to/files/*/*.*
        \b
        echo "path/to/src/file.xml SEG/page_p0001.xml" \\
        | ocrd workspace bulk-add \\
                --regex '(?P<src>.*?) (?P<fileGrp>.+?)/page_(?P<pageid>.*)\\.(?P<ext>[^\\.]*)' \\
                --file-id 'FILE_{{ fileGrp }}_{{ pageid }}' \\
                --page-id 'PHYS_{{ pageid }}' \\
                --file-grp "{{ fileGrp }}" \\
                --local-filename '{{ fileGrp }}/FILE_{{ pageid }}.{{ ext }}' \\
                -

        \b
        { echo PHYS_0001 BIN FILE_0001_BIN.IMG-wolf BIN/FILE_0001_BIN.IMG-wolf.png; \\
          echo PHYS_0001 BIN FILE_0001_BIN BIN/FILE_0001_BIN.xml; \\
          echo PHYS_0002 BIN FILE_0002_BIN.IMG-wolf BIN/FILE_0002_BIN.IMG-wolf.png; \\
          echo PHYS_0002 BIN FILE_0002_BIN BIN/FILE_0002_BIN.xml; \\
        } | ocrd workspace bulk-add -r '(?P<pageid>.*) (?P<filegrp>.*) (?P<fileid>.*) (?P<local_filename>.*)' \\
          -G '{{ filegrp }}' -g '{{ pageid }}' -i '{{ fileid }}' -S '{{ local_filename }}' -
    """
    log = getLogger('ocrd.cli.workspace.bulk-add') # pylint: disable=redefined-outer-name
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
        mets_server_url=ctx.mets_server_url,
    )

    try:
        pat = re.compile(regex)
    except Exception as e:
        log.error("Invalid regex: %s" % e)
        sys.exit(1)

    # Collect the strings to match: lines from STDIN for '-', otherwise the
    # glob expansions (a glob without any hit is kept as a literal string)
    file_paths = []
    from_stdin = file_glob == ('-',)
    if from_stdin:
        file_paths += [Path(x.strip('\n')) for x in sys.stdin.readlines()]
    else:
        for fglob in file_glob:
            expanded = glob(fglob)
            if not expanded:
                file_paths += [Path(fglob)]
            else:
                file_paths += [Path(x) for x in expanded]

    total = len(file_paths)
    for i, file_path in enumerate(file_paths):
        log.info("[%4d/%d] %s" % (i + 1, total, file_path))

        # match regex
        m = pat.match(str(file_path))
        if not m:
            if skip:
                continue
            log.error("File '%s' not matched by regex: '%s'" % (file_path, regex))
            sys.exit(1)
        group_dict = m.groupdict()

        # set up file info
        file_dict = {'local_filename': local_filename, 'url': url, 'mimetype': mimetype, 'file_id': file_id, 'page_id': page_id, 'file_grp': file_grp}

        # Flag to track whether 'local_filename' should be 'src'
        local_filename_is_src = False

        # expand templates
        for param_name in file_dict:
            if not file_dict[param_name]:
                if param_name == 'local_filename':
                    local_filename_is_src = True
                    continue
                if param_name in ('mimetype', 'file_id', 'url'):
                    # mimetype and file_id are auto-filled below once the other
                    # replacements have happened; a remote URL is not required
                    continue
                raise ValueError(f"OcrdFile attribute '{param_name}' unset ({file_dict})")
            file_dict[param_name] = _expand_bulk_add_template(file_dict[param_name], group_dict)

        # Where to copy from
        if src_path_option:
            srcpath = Path(_expand_bulk_add_template(src_path_option, group_dict))
        else:
            srcpath = file_path

        # derive --file-id from filename if --file-id not explicitly set
        if not file_id:
            file_dict['file_id'] = safe_filename('%s_%s' % (file_dict['file_grp'], srcpath.stem))
        if not mimetype:
            try:
                file_dict['mimetype'] = EXT_TO_MIME[srcpath.suffix]
            except KeyError:
                log.error("Cannot guess MIME type from extension '%s' for '%s'. Set --mimetype explicitly" % (srcpath.suffix, srcpath))

        # copy files if src != url
        if local_filename_is_src:
            file_dict['local_filename'] = srcpath
        else:
            destpath = Path(workspace.directory, file_dict['local_filename'])
            if srcpath != destpath and not destpath.exists():
                log.info("cp '%s' '%s'", srcpath, destpath)
                if not dry_run:
                    # --local-filename may point several levels below the
                    # workspace directory, so create all missing parents
                    destpath.parent.mkdir(parents=True, exist_ok=True)
                    destpath.write_bytes(srcpath.read_bytes())

        # Add to workspace (or not)
        fileGrp = file_dict.pop('file_grp')
        if dry_run:
            log.info('workspace.add_file(%s)' % file_dict)
        else:
            workspace.add_file(fileGrp, ignore=ignore, force=force, **file_dict)

    # save changes to disk -- a dry run must leave the METS untouched
    if not dry_run:
        workspace.save_mets()
406
407
408
# ----------------------------------------------------------------------
409
# ocrd workspace find
410
# ----------------------------------------------------------------------
411
412
@workspace_cli.command('find')
@mets_find_options
@click.option('-k', '--output-field', help="Output field. Repeat for multiple fields, will be joined with tab",
        default=['local_filename'],
        multiple=True,
        type=click.Choice([
            'url',
            'mimetype',
            'page_id',
            'pageId',
            'file_id',
            'ID',
            'file_grp',
            'fileGrp',
            'basename',
            'basename_without_extension',
            'local_filename',
        ]))
@click.option('--download', is_flag=True, help="Download found files to workspace and change location in METS file ")
@click.option('--undo-download', is_flag=True, help="Remove all downloaded files from the METS")
@click.option('--wait', type=int, default=0, help="Wait this many seconds between download requests")
@pass_workspace
def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, download, undo_download, wait):
    """
    Find files.

    (If any ``FILTER`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    snake_to_camel = {"file_id": "ID", "page_id": "pageId", "file_grp": "fileGrp"}
    output_field = [snake_to_camel.get(x, x) for x in output_field]
    # Every column that has to be resolved to a physical page ID afterwards
    page_columns = [col for col, field in enumerate(output_field) if field == 'pageId']
    modified_mets = False
    rows = []       # one list of output columns per matching file
    page_rows = []  # the rows whose pageId columns still hold the file ID
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        mets_server_url=ctx.mets_server_url,
    )
    for f in workspace.find_files(
            file_id=file_id,
            file_grp=file_grp,
            mimetype=mimetype,
            page_id=page_id,
        ):
        row = []
        for field in output_field:
            if field == 'pageId':
                # placeholder: the file ID, swapped for the page ID in one batch below
                row.append(f.ID)
            else:
                value = getattr(f, field)
                # unset attributes are printed as an empty column, not as 'None'
                row.append('' if value is None else str(value))
        resolve_page = True
        if download and not f.local_filename:
            workspace.download_file(f)
            modified_mets = True
            if wait:
                time.sleep(wait)
        if undo_download and f.local_filename:
            # the row collapses to a single message, so it has no pageId column anymore
            row = [f'Removed local_filename {f.local_filename}']
            f.local_filename = None
            modified_mets = True
            resolve_page = False
        rows.append(row)
        if resolve_page:
            page_rows.append(row)
    if modified_mets:
        workspace.save_mets()
    if page_columns and page_rows:
        first_col = page_columns[0]
        file_ids = [row[first_col] for row in page_rows]
        # NOTE(review): relies on get_physical_pages returning one entry per
        # requested file ID, in the same order -- as the zip below always assumed
        pages = workspace.mets.get_physical_pages(for_fileIds=file_ids)
        for row, page in zip(page_rows, pages):
            for col in page_columns:
                row[col] = page or ''
    for row in rows:
        print('\t'.join(row))
478
479
# ----------------------------------------------------------------------
480
# ocrd workspace remove
481
# ----------------------------------------------------------------------
482
483
@workspace_cli.command('remove')
@click.option('-k', '--keep-file', help="Do not delete file from file system", default=False, is_flag=True)
@click.option('-f', '--force', help="Continue even if mets:file or file on file system does not exist", default=False, is_flag=True)
@click.argument('ID', nargs=-1)
@pass_workspace
def workspace_remove_file(ctx, id, force, keep_file):  # pylint: disable=redefined-builtin
    """
    Delete files (given by their ID attribute ``ID``).

    (If any ``ID`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    # Remove each requested entry, then write the METS once at the end
    for file_id in id:
        workspace.remove_file(file_id, force=force, keep_file=keep_file)
    workspace.save_mets()
499
500
501
# ----------------------------------------------------------------------
502
# ocrd workspace rename-group
503
# ----------------------------------------------------------------------
504
505
@workspace_cli.command('rename-group')
@click.argument('OLD', nargs=1)
@click.argument('NEW', nargs=1)
@pass_workspace
def rename_group(ctx, old, new):
    """
    Rename fileGrp (USE attribute ``OLD`` to ``NEW``).
    """
    # This command saves the METS, so honour --backup like the other modifying commands
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    workspace.rename_file_group(old, new)
    workspace.save_mets()
516
517
# ----------------------------------------------------------------------
518
# ocrd workspace remove-group
519
# ----------------------------------------------------------------------
520
521
@workspace_cli.command('remove-group')
@click.option('-r', '--recursive', help="Delete any files in the group before the group itself", default=False, is_flag=True)
@click.option('-f', '--force', help="Continue removing even if group or containing files not found in METS", default=False, is_flag=True)
@click.option('-k', '--keep-files', help="Do not delete files from file system", default=False, is_flag=True)
@click.argument('GROUP', nargs=-1)
@pass_workspace
def remove_group(ctx, group, recursive, force, keep_files):
    """
    Delete fileGrps (given by their USE attribute ``GROUP``).

    (If any ``GROUP`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    # This command saves the METS, so honour --backup like the other modifying commands
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    for file_group in group:
        workspace.remove_file_group(file_group, recursive=recursive, force=force, keep_files=keep_files)
    workspace.save_mets()
538
539
# ----------------------------------------------------------------------
540
# ocrd workspace prune-files
541
# ----------------------------------------------------------------------
542
543
@workspace_cli.command('prune-files')
@click.option('-G', '--file-grp', help="fileGrp USE", metavar='FILTER')
@click.option('-m', '--mimetype', help="Media type to look for", metavar='FILTER')
@click.option('-g', '--page-id', help="Page ID", metavar='FILTER')
@click.option('-i', '--file-id', help="ID", metavar='FILTER')
@pass_workspace
def prune_files(ctx, file_grp, mimetype, page_id, file_id):
    """
    Removes mets:files that point to non-existing local files

    (If any ``FILTER`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
    # Existence is checked from inside the workspace directory, so that
    # relative local filenames are tested against the workspace
    with pushd_popd(workspace.directory):
        for f in workspace.find_files(
            file_id=file_id,
            file_grp=file_grp,
            mimetype=mimetype,
            page_id=page_id,
        ):
            try:
                if not f.local_filename or not exists(f.local_filename):
                    workspace.mets.remove_file(f.ID)
            except Exception as e:
                # '%s', not '%f': the argument is a file object, not a float
                ctx.log.exception("Error removing %s: %s", f, e)
                raise
        workspace.save_mets()
571
572
# ----------------------------------------------------------------------
573
# ocrd workspace list-group
574
# ----------------------------------------------------------------------
575
576
@workspace_cli.command('list-group')
@pass_workspace
def list_groups(ctx):
    """
    List fileGrp USE attributes
    """
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
    )
    # one fileGrp per output line
    group_names = workspace.mets.file_groups
    print("\n".join(group_names))
584
585
# ----------------------------------------------------------------------
586
# ocrd workspace list-page
587
# ----------------------------------------------------------------------
588
589
@workspace_cli.command('list-page')
@pass_workspace
def list_pages(ctx):
    """
    List physical page IDs
    """
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
    )
    # one physical page ID per output line
    page_ids = workspace.mets.physical_pages
    print("\n".join(page_ids))
597
598
# ----------------------------------------------------------------------
599
# ocrd workspace get-id
600
# ----------------------------------------------------------------------
601
602
@workspace_cli.command('get-id')
@pass_workspace
def get_id(ctx):
    """
    Get METS id if any
    """
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
    )
    identifier = workspace.mets.unique_identifier
    # print nothing at all when the METS carries no identifier
    if not identifier:
        return
    print(identifier)
612
613
# ----------------------------------------------------------------------
614
# ocrd workspace set-id
615
# ----------------------------------------------------------------------
616
617
@workspace_cli.command('set-id')
@click.argument('ID')
@pass_workspace
def set_id(ctx, id):   # pylint: disable=redefined-builtin
    """
    Set METS ID.

    If one of the supported identifier mechanisms is used, will set this identifier.

    Otherwise will create a new <mods:identifier type="purl">{{ ID }}</mods:identifier>.
    """
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    # assign through the METS property, then persist
    workspace.mets.unique_identifier = id
    workspace.save_mets()
631
632
@workspace_cli.command('update-page')
@click.option('--order', help="@ORDER attribute for this mets:div", metavar='ORDER')
@click.option('--orderlabel', help="@ORDERLABEL attribute for this mets:div", metavar='ORDERLABEL')
@click.option('--contentids', help="@CONTENTIDS attribute for this mets:div", metavar='CONTENTIDS')
@click.argument('PAGE_ID')
@pass_workspace
def update_page(ctx, order, orderlabel, contentids, page_id):
    """
    Update the @ORDER, @ORDERLABEL or @CONTENTIDS attributes of the mets:div with @ID=PAGE_ID
    """
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    # All three options are forwarded as given (None when the option is omitted)
    workspace.mets.update_physical_page_attributes(page_id, order=order, orderlabel=orderlabel, contentids=contentids)
    workspace.save_mets()
645
646
# ----------------------------------------------------------------------
647
# ocrd workspace merge
648
# ----------------------------------------------------------------------
649
650
def _handle_json_option(ctx, param, value):
651
    return parse_json_string_or_file(value) if value else None
652
653
@workspace_cli.command('merge')
@click.argument('METS_PATH')
@click.option('--overwrite/--no-overwrite', is_flag=True, default=False, help="Overwrite on-disk file in case of file name conflicts with data from METS_PATH")
@click.option('--force/--no-force', is_flag=True, default=False, help="Overwrite mets:file from --mets with mets:file from METS_PATH if IDs clash")
@click.option('--copy-files/--no-copy-files', is_flag=True, help="Copy files as well", default=True, show_default=True)
@click.option('--fileGrp-mapping', help="JSON object mapping src to dest fileGrp", callback=_handle_json_option)
@click.option('--fileId-mapping', help="JSON object mapping src to dest file ID", callback=_handle_json_option)
@click.option('--pageId-mapping', help="JSON object mapping src to dest page ID", callback=_handle_json_option)
@mets_find_options
@pass_workspace
def merge(ctx, overwrite, force, copy_files, filegrp_mapping, fileid_mapping, pageid_mapping, file_grp, file_id, page_id, mimetype, mets_path):   # pylint: disable=redefined-builtin
    """
    Merges this workspace with the workspace that contains ``METS_PATH``

    Pass a JSON string or file to ``--fileGrp-mapping``, ``--fileId-mapping`` or ``--pageId-mapping``
    in order to rename all fileGrp, file ID or page ID values, respectively.

    The ``--file-id``, ``--page-id``, ``--mimetype`` and ``--file-grp`` options have
    the same semantics as in ``ocrd workspace find``, see ``ocrd workspace find --help``
    for an explanation.
    """
    mets_path = Path(mets_path)
    # All three mapping options pass through the ``_handle_json_option`` callback
    # before reaching this function, so ``filegrp_mapping`` is presumably already
    # a parsed object (TODO confirm against ``parse_json_string_or_file``).
    # Calling ``loads`` on a non-string raises TypeError, so only decode when a
    # raw JSON string is still present. This matches how the fileId and pageId
    # mappings are passed through untouched.
    if filegrp_mapping and isinstance(filegrp_mapping, (str, bytes, bytearray)):
        filegrp_mapping = loads(filegrp_mapping)
    # Destination: the workspace described by the CLI context.
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
    # Source: the workspace whose METS file is METS_PATH (its parent directory and file name).
    other_workspace = Workspace(ctx.resolver, directory=str(mets_path.parent), mets_basename=str(mets_path.name))
    workspace.merge(
        other_workspace,
        force=force,
        overwrite=overwrite,
        copy_files=copy_files,
        fileGrp_mapping=filegrp_mapping,
        fileId_mapping=fileid_mapping,
        pageId_mapping=pageid_mapping,
        file_grp=file_grp,
        file_id=file_id,
        page_id=page_id,
        mimetype=mimetype
    )
    # Persist the merged METS of the destination workspace.
    workspace.save_mets()
693
694
# ----------------------------------------------------------------------
695
# ocrd workspace backup
696
# ----------------------------------------------------------------------
697
698
# Command group only: the subcommands below attach themselves via
# ``@workspace_backup_cli.command(...)``; the group callback itself does nothing.
@workspace_cli.group('backup')
@click.pass_context
def workspace_backup_cli(ctx): # pylint: disable=unused-argument
    """
    Backing and restoring workspaces - dev edition
    """
704
705
@workspace_backup_cli.command('add')
@pass_workspace
def workspace_backup_add(ctx):
    """
    Create a new backup
    """
    # Open the workspace described by the CLI context.
    ws = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    # Delegate to the backup manager for that workspace.
    manager = WorkspaceBackupManager(ws)
    manager.add()
713
714
@workspace_backup_cli.command('list')
@pass_workspace
def workspace_backup_list(ctx):
    """
    List backups
    """
    # Open the workspace described by the CLI context.
    ws = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    manager = WorkspaceBackupManager(ws)
    # Print one entry per line, in the order the manager reports them.
    for backup in manager.list():
        print(backup)
723
724
@workspace_backup_cli.command('restore')
@click.option('-f', '--choose-first', help="Restore first matching version if more than one", is_flag=True)
@click.argument('bak') #, type=click.Path(dir_okay=False, readable=True, resolve_path=True))
@pass_workspace
def workspace_backup_restore(ctx, choose_first, bak):
    """
    Restore backup BAK
    """
    # Open the workspace described by the CLI context.
    ws = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    manager = WorkspaceBackupManager(ws)
    # Forward the backup name and the -f/--choose-first flag to the manager.
    manager.restore(bak, choose_first)
734
735
@workspace_backup_cli.command('undo')
@pass_workspace
def workspace_backup_undo(ctx):
    """
    Restore the last backup
    """
    # Open the workspace described by the CLI context.
    ws = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    # Delegate to the backup manager for that workspace.
    manager = WorkspaceBackupManager(ws)
    manager.undo()
743
744
745
# ----------------------------------------------------------------------
746
# ocrd workspace server
747
# ----------------------------------------------------------------------
748
749
@workspace_cli.group('server')
@pass_workspace
def workspace_serve_cli(ctx): # pylint: disable=unused-argument
    """Control a METS server for this workspace"""
    # Checked explicitly rather than with an ``assert`` statement, because
    # asserts are stripped under ``python -O`` and the subcommands would then
    # run without a server URL. The exception type and message are kept as before.
    if not ctx.mets_server_url:
        raise AssertionError("For METS server commands, you must provide '-U/--mets-server-url'")
754
755
@workspace_serve_cli.command('stop')
@pass_workspace
def workspace_serve_stop(ctx): # pylint: disable=unused-argument
    """Stop the METS server"""
    # Open the workspace with the METS server URL from the CLI context.
    ws = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, mets_server_url=ctx.mets_server_url)
    # Ask the workspace's METS object to stop.
    mets = ws.mets
    mets.stop()
766
767
@workspace_serve_cli.command('start')
@pass_workspace
def workspace_serve_start(ctx): # pylint: disable=unused-argument
    """
    Start a METS server

    (For TCP backend, pass a network interface to bind to as the '-U/--mets-server-url' parameter.)
    """
    # The served workspace is opened without a mets_server_url of its own.
    served_workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
    )
    # The URL from the CLI context is handed to the server instead.
    server = OcrdMetsServer(workspace=served_workspace, url=ctx.mets_server_url)
    server.startup()
779