Passed
Pull Request — master (#1235)
by
unknown
04:05
created

ocrd.cli.workspace.workspace_init()   A

Complexity

Conditions 2

Size

Total Lines 22
Code Lines 16

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 16
dl 0
loc 22
rs 9.6
c 0
b 0
f 0
cc 2
nop 3
1
"""
2
OCR-D CLI: workspace management
3
4
.. click:: ocrd.cli.workspace:workspace_cli
5
    :prog: ocrd workspace
6
    :nested: full
7
"""
8
import os
9
from os import getcwd, unlink
10
from os.path import relpath, exists, join, isabs
11
from pathlib import Path
12
from json import loads, dumps
13
import sys
14
from glob import glob   # XXX pathlib.Path.glob does not support absolute globs
15
import re
16
import time
17
import numpy as np
18
19
import click
20
21
from ocrd import Resolver, Workspace, WorkspaceValidator, WorkspaceBackupManager
22
from ocrd.mets_server import OcrdMetsServer
23
from ocrd_utils import getLogger, initLogging, pushd_popd, EXT_TO_MIME, safe_filename, parse_json_string_or_file, partition_list, DEFAULT_METS_BASENAME
24
from ocrd.decorators import mets_find_options
25
from . import command_with_replaced_help
26
from ocrd_models.constants import METS_PAGE_DIV_ATTRIBUTE
27
28
29
class WorkspaceCtx():
    """
    State shared between the ``ocrd workspace`` group and its subcommands.

    Holds a logger, a ``Resolver``, the values returned by
    ``Resolver.resolve_mets_arguments`` (directory, METS URL, METS basename,
    METS server URL) and the automatic-backup flag.
    """

    def __init__(self, directory, mets_url, mets_basename=DEFAULT_METS_BASENAME, mets_server_url=None, automatic_backup=False):
        self.log = getLogger('ocrd.cli.workspace')
        if mets_basename:
            # any truthy basename (including the default value) triggers the deprecation notice
            deprecation = DeprecationWarning('--mets-basename is deprecated. Use --mets/--directory instead.')
            self.log.warning(deprecation)
        self.resolver = Resolver()
        resolved = self.resolver.resolve_mets_arguments(directory, mets_url, mets_basename, mets_server_url)
        (self.directory, self.mets_url, self.mets_basename, self.mets_server_url) = resolved
        self.automatic_backup = automatic_backup
39
40
41
# Decorator for subcommands: hands the WorkspaceCtx object that click finds
# in the current context to the decorated function as its first argument.
pass_workspace = click.make_pass_decorator(WorkspaceCtx)
42
43
# ----------------------------------------------------------------------
44
# ocrd workspace
45
# ----------------------------------------------------------------------
46
47
@click.group("workspace")
@click.option('-d', '--directory', envvar='WORKSPACE_DIR', type=click.Path(file_okay=False), metavar='WORKSPACE_DIR', help='Changes the workspace folder location [default: METS_URL directory or .]')
@click.option('-M', '--mets-basename', default=None, help='METS file basename. Deprecated, use --mets/--directory')
@click.option('-m', '--mets', default=None, help='The path/URL of the METS file [default: WORKSPACE_DIR/mets.xml]', metavar="METS_URL")
@click.option('-U', '--mets-server-url', 'mets_server_url', help="TCP host of METS server")
@click.option('--backup', default=False, help="Backup mets.xml whenever it is saved.", is_flag=True)
@click.pass_context
def workspace_cli(ctx, directory, mets, mets_basename, mets_server_url, backup):
    """
    Managing workspaces

    A workspace comprises a METS file and a directory as point of reference.

    Operates on the file system directly or via a METS server 
    (already running via some prior `server start` subcommand).
    """
    # The docstring above is the user-facing --help text of the command group.
    initLogging()
    # Store the resolved options on the click context, so that subcommands
    # decorated with ``pass_workspace`` receive them as a WorkspaceCtx.
    ctx.obj = WorkspaceCtx(
        directory,
        mets_url=mets,
        mets_basename=mets_basename,
        mets_server_url=mets_server_url,
        automatic_backup=backup,
    )
71
72
# ----------------------------------------------------------------------
73
# ocrd workspace validate
74
# ----------------------------------------------------------------------
75
76
@workspace_cli.command('validate', cls=command_with_replaced_help(
    (r' \[METS_URL\]', ''))) # XXX deprecated argument
@pass_workspace
@click.option('-a', '--download', is_flag=True, help="Download all files")
@click.option('-s', '--skip', help="Tests to skip", default=[], multiple=True, type=click.Choice(
    ['imagefilename', 'dimension', 'pixel_density', 'page', 'url', 'page_xsd', 'mets_fileid_page_pcgtsid',
     'mets_unique_identifier', 'mets_file_group_names', 'mets_files', 'mets_xsd']))
@click.option('--page-textequiv-consistency', '--page-strictness', help="How strict to check PAGE multi-level textequiv consistency", type=click.Choice(['strict', 'lax', 'fix', 'off']), default='strict')
@click.option('--page-coordinate-consistency', help="How fierce to check PAGE multi-level coordinate consistency", type=click.Choice(['poly', 'baseline', 'both', 'off']), default='poly')
@click.argument('mets_url', default=None, required=False)
def workspace_validate(ctx, mets_url, download, skip, page_textequiv_consistency, page_coordinate_consistency):
    """
    Validate a workspace

    METS_URL can be a URL, an absolute path or a path relative to $PWD.
    If not given, use --mets accordingly.

    Check that the METS and its referenced file contents
    abide by the OCR-D specifications.
    """
    # The docstring above is the user-facing --help text; the deprecated
    # positional METS_URL is stripped from the usage line by the command class.
    LOG = getLogger('ocrd.cli.workspace.validate')
    if mets_url:
        # Deprecated positional argument still wins over --mets, but point
        # the user at the equivalent invocation of *this* subcommand.
        LOG.warning(DeprecationWarning("Use 'ocrd workspace --mets METS validate' instead of argument 'METS_URL' ('%s')" % mets_url))
    else:
        mets_url = ctx.mets_url
    report = WorkspaceValidator.validate(
        ctx.resolver,
        mets_url,
        src_dir=ctx.directory,
        skip=skip,
        download=download,
        page_strictness=page_textequiv_consistency,
        page_coordinate_consistency=page_coordinate_consistency
    )
    print(report.to_xml())
    if not report.is_valid:
        # non-zero exit status signals an invalid workspace to the caller
        sys.exit(128)
113
114
# ----------------------------------------------------------------------
115
# ocrd workspace clone
116
# ----------------------------------------------------------------------
117
118
@workspace_cli.command(
    'clone',
    cls=command_with_replaced_help((r' \[WORKSPACE_DIR\]', '')),  # XXX deprecated argument
)
@click.option('-f', '--clobber-mets', help="Overwrite existing METS file", default=False, is_flag=True)
@click.option('-a', '--download', is_flag=True, help="Download all files and change location in METS file after cloning")
@click.argument('mets_url')
@mets_find_options
# XXX deprecated
@click.argument('workspace_dir', default=None, required=False)
@pass_workspace
def workspace_clone(ctx, clobber_mets, download, file_grp, file_id, page_id, mimetype, include_fileGrp, exclude_fileGrp, mets_url, workspace_dir):
    """
    Create a workspace from METS_URL and return the directory

    METS_URL can be a URL, an absolute path or a path relative to $PWD.
    If METS_URL is not provided, use --mets accordingly.
    METS_URL can also be an OAI-PMH GetRecord URL wrapping a METS file.
    """
    # The docstring above doubles as the --help text of the subcommand.
    log = getLogger('ocrd.cli.workspace.clone')
    if workspace_dir:
        # the deprecated positional argument overrides --directory
        notice = "Use 'ocrd workspace --directory DIR clone' instead of argument 'WORKSPACE_DIR' ('%s')" % workspace_dir
        log.warning(DeprecationWarning(notice))
        ctx.directory = workspace_dir

    # NOTE(review): file_grp is accepted from the find options but is not
    # forwarded to the resolver here - confirm whether that is intended.
    clone_options = {
        'dst_dir': ctx.directory,
        'mets_basename': ctx.mets_basename,
        'clobber_mets': clobber_mets,
        'download': download,
        'ID': file_id,
        'pageId': page_id,
        'mimetype': mimetype,
        'include_fileGrp': include_fileGrp,
        'exclude_fileGrp': exclude_fileGrp,
    }
    cloned = ctx.resolver.workspace_from_url(mets_url, **clone_options)
    cloned.save_mets()
    # the resulting workspace directory is the command's output
    print(cloned.directory)
154
155
# ----------------------------------------------------------------------
156
# ocrd workspace init
157
# ----------------------------------------------------------------------
158
159
@workspace_cli.command(
    'init',
    cls=command_with_replaced_help((r' \[DIRECTORY\]', '')),  # XXX deprecated argument
)
@click.option('-f', '--clobber-mets', help="Clobber mets.xml if it exists", is_flag=True, default=False)
# XXX deprecated
@click.argument('directory', default=None, required=False)
@pass_workspace
def workspace_init(ctx, clobber_mets, directory):
    """
    Create a workspace with an empty METS file in --directory.

    """
    # The docstring above doubles as the --help text of the subcommand.
    log = getLogger('ocrd.cli.workspace.init')
    if directory:
        # the deprecated positional argument overrides --directory
        notice = "Use 'ocrd workspace --directory DIR init' instead of argument 'DIRECTORY' ('%s')" % directory
        log.warning(DeprecationWarning(notice))
        ctx.directory = directory
    fresh = ctx.resolver.workspace_from_nothing(
        directory=ctx.directory, mets_basename=ctx.mets_basename, clobber_mets=clobber_mets)
    fresh.save_mets()
    # the workspace directory is the command's output
    print(fresh.directory)
181
182
# ----------------------------------------------------------------------
183
# ocrd workspace add
184
# ----------------------------------------------------------------------
185
186
@workspace_cli.command('add')
@click.option('-G', '--file-grp', help="fileGrp USE", required=True, metavar='FILE_GRP')
@click.option('-i', '--file-id', help="ID for the file", required=True, metavar='FILE_ID')
@click.option('-m', '--mimetype', help="Media type of the file. Guessed from extension if not provided", required=False, metavar='TYPE')
@click.option('-g', '--page-id', help="ID of the physical page", metavar='PAGE_ID')
@click.option('-C', '--check-file-exists', help="Whether to ensure FNAME exists", is_flag=True, default=False)
@click.option('--ignore', help="Do not check whether file exists.", default=False, is_flag=True)
@click.option('--force', help="If file with ID already exists, replace it. No effect if --ignore is set.", default=False, is_flag=True)
@click.argument('fname', required=True)
@pass_workspace
def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, ignore, check_file_exists, force, fname):
    """
    Add a file or http(s) URL FNAME to METS in a workspace.
    If FNAME is not an http(s) URL and is not a workspace-local existing file, try to copy to workspace.
    """
    # The docstring above doubles as the --help text of the subcommand.
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
        mets_server_url=ctx.mets_server_url,
    )
    log = getLogger('ocrd.cli.workspace.add')

    if not mimetype:
        extension = Path(fname).suffix
        try:
            mimetype = EXT_TO_MIME[extension]
            log.info("Guessed mimetype to be %s" % mimetype)
        except KeyError:
            # only reported; processing goes on with mimetype still unset
            log.error("Cannot guess mimetype from extension '%s' for '%s'. Set --mimetype explicitly" % (extension, fname))

    log.debug("Adding '%s'", fname)
    is_remote = fname.startswith(('http://', 'https://'))
    local_filename = None
    if not is_remote:
        if not fname.startswith(ctx.directory):
            candidate = join(ctx.directory, fname)
            if not isabs(fname) and exists(candidate):
                # relative path that already exists below the workspace directory
                fname = candidate
            else:
                log.debug("File '%s' is not in workspace, copying", fname)
                try:
                    fname = ctx.resolver.download_to_directory(ctx.directory, fname, subdir=file_grp)
                except FileNotFoundError:
                    # a missing source is fatal only when --check-file-exists is set
                    if check_file_exists:
                        log.error("File '%s' does not exist, halt execution!" % fname)
                        sys.exit(1)
        if check_file_exists and not exists(fname):
            log.error("File '%s' does not exist, halt execution!" % fname)
            sys.exit(1)
        if fname.startswith(ctx.directory):
            # paths inside the workspace are recorded relative to it
            fname = relpath(fname, ctx.directory)
        local_filename = fname

    if not page_id:
        log.warning("You did not provide '--page-id/-g', so the file you added is not linked to a specific page.")
    workspace.add_file(
        file_grp,
        file_id=file_id,
        mimetype=mimetype,
        page_id=page_id,
        force=force,
        ignore=ignore,
        local_filename=local_filename,
        url=fname,
    )
    workspace.save_mets()
251
252
# ----------------------------------------------------------------------
253
# ocrd workspace bulk-add
254
# ----------------------------------------------------------------------
255
256
# pylint: disable=broad-except
257
def _expand_template(template, group_dict):
    """Replace every ``{{ name }}`` placeholder in template by the regex group value of that name."""
    for group_name in group_dict:
        template = template.replace('{{ %s }}' % group_name, group_dict[group_name])
    return template


@workspace_cli.command('bulk-add')
@click.option('-r', '--regex', help="Regular expression matching the FILE_GLOB filesystem paths to define named captures usable in the other parameters", required=True)
@click.option('-m', '--mimetype', help="Media type of the file. If not provided, guess from filename", required=False)
@click.option('-g', '--page-id', help="physical page ID of the file", required=False)
@click.option('-i', '--file-id', help="ID of the file. If not provided, derive from fileGrp and filename", required=False)
@click.option('-u', '--url', help="Remote URL of the file", required=False)
@click.option('-l', '--local-filename', help="Local filesystem path in the workspace directory (copied from source file if different)", required=False)
@click.option('-G', '--file-grp', help="File group USE of the file", required=True)
@click.option('-n', '--dry-run', help="Don't actually do anything to the METS or filesystem, just preview", default=False, is_flag=True)
@click.option('-S', '--source-path', 'src_path_option', help="File path to copy from (if different from FILE_GLOB values)", required=False)
@click.option('-I', '--ignore', help="Disable checking for existing file entries (faster)", default=False, is_flag=True)
@click.option('-f', '--force', help="Replace existing file entries with the same ID (no effect when --ignore is set, too)", default=False, is_flag=True)
@click.option('-s', '--skip', help="Skip files not matching --regex (instead of failing)", default=False, is_flag=True)
@click.argument('file_glob', nargs=-1, required=True)
@pass_workspace
def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_filename, file_grp, dry_run, file_glob, src_path_option, ignore, force, skip):
    """
    Add files in bulk to an OCR-D workspace.

    FILE_GLOB can either be a shell glob expression to match file names,
    or a list of expressions or '-', in which case expressions are read from STDIN.

    After globbing, --regex is matched against each expression resulting from FILE_GLOB, and can
    define named groups reusable in the --page-id, --file-id, --mimetype, --url, --source-path and
    --file-grp options, e.g. by referencing the group name 'grp' from the regex as '{{ grp }}'.

    If the FILE_GLOB expressions do not denote the file names themselves
    (but arbitrary strings for --regex matching), then use --source-path to set
    the actual file paths to use. (This could involve fixed strings or group references.)

    \b
    Examples:
        ocrd workspace bulk-add \\
                --regex '(?P<fileGrp>[^/]+)/page_(?P<pageid>.*)\\.[^.]+' \\
                --page-id 'PHYS_{{ pageid }}' \\
                --file-grp "{{ fileGrp }}" \\
                path/to/files/*/*.*
        \b
        echo "path/to/src/file.xml SEG/page_p0001.xml" \\
        | ocrd workspace bulk-add \\
                --regex '(?P<src>.*?) (?P<fileGrp>.+?)/page_(?P<pageid>.*)\\.(?P<ext>[^\\.]*)' \\
                --file-id 'FILE_{{ fileGrp }}_{{ pageid }}' \\
                --page-id 'PHYS_{{ pageid }}' \\
                --file-grp "{{ fileGrp }}" \\
                --local-filename '{{ fileGrp }}/FILE_{{ pageid }}.{{ ext }}' \\
                -

        \b
        { echo PHYS_0001 BIN FILE_0001_BIN.IMG-wolf BIN/FILE_0001_BIN.IMG-wolf.png; \\
          echo PHYS_0001 BIN FILE_0001_BIN BIN/FILE_0001_BIN.xml; \\
          echo PHYS_0002 BIN FILE_0002_BIN.IMG-wolf BIN/FILE_0002_BIN.IMG-wolf.png; \\
          echo PHYS_0002 BIN FILE_0002_BIN BIN/FILE_0002_BIN.xml; \\
        } | ocrd workspace bulk-add -r '(?P<pageid>.*) (?P<filegrp>.*) (?P<fileid>.*) (?P<local_filename>.*)' \\
          -G '{{ filegrp }}' -g '{{ pageid }}' -i '{{ fileid }}' -S '{{ local_filename }}' -
    """
    # The docstring above doubles as the --help text of the subcommand.
    log = getLogger('ocrd.cli.workspace.bulk-add') # pylint: disable=redefined-outer-name
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
        mets_server_url=ctx.mets_server_url,
    )

    try:
        pat = re.compile(regex)
    except Exception as e:
        log.error("Invalid regex: %s" % e)
        sys.exit(1)

    # Collect the expressions to match: lines from STDIN for a lone '-',
    # otherwise each argument glob-expanded (kept verbatim if nothing matches).
    file_paths = []
    from_stdin = file_glob == ('-',)
    if from_stdin:
        file_paths += [Path(x.strip('\n')) for x in sys.stdin.readlines()]
    else:
        for fglob in file_glob:
            expanded = glob(fglob)
            if not expanded:
                file_paths += [Path(fglob)]
            else:
                file_paths += [Path(x) for x in expanded]

    # parameters that may stay empty during template expansion
    # (mimetype and file_id are auto-filled below, a remote URL is not required)
    optional_params = ('mimetype', 'file_id', 'url')

    for i, file_path in enumerate(file_paths):
        log.info("[%4d/%d] %s" % (i + 1, len(file_paths), file_path))

        # match regex
        m = pat.match(str(file_path))
        if not m:
            if skip:
                continue
            log.error("File '%s' not matched by regex: '%s'" % (file_path, regex))
            sys.exit(1)
        group_dict = m.groupdict()

        # set up file info
        file_dict = {'local_filename': local_filename, 'url': url, 'mimetype': mimetype, 'file_id': file_id, 'page_id': page_id, 'file_grp': file_grp}

        # Flag to track whether 'local_filename' should be 'src'
        local_filename_is_src = False

        # expand templates
        for param_name, template in file_dict.items():
            if not template:
                if param_name == 'local_filename':
                    local_filename_is_src = True
                elif param_name not in optional_params:
                    raise ValueError(f"OcrdFile attribute '{param_name}' unset ({file_dict})")
                continue
            file_dict[param_name] = _expand_template(template, group_dict)

        # Where to copy from
        if src_path_option:
            srcpath = Path(_expand_template(src_path_option, group_dict))
        else:
            srcpath = file_path

        # derive --file-id from the source filename if --file-id is not explicitly set
        if not file_id:
            file_dict['file_id'] = safe_filename('%s_%s' % (file_dict['file_grp'], srcpath.stem))
        if not mimetype:
            try:
                file_dict['mimetype'] = EXT_TO_MIME[srcpath.suffix]
            except KeyError:
                log.error("Cannot guess MIME type from extension '%s' for '%s'. Set --mimetype explicitly" % (srcpath.suffix, srcpath))

        # copy files if src != url
        if local_filename_is_src:
            file_dict['local_filename'] = srcpath
        else:
            destpath = Path(workspace.directory, file_dict['local_filename'])
            if srcpath != destpath and not destpath.exists():
                log.info("cp '%s' '%s'", srcpath, destpath)
                if not dry_run:
                    # the destination may be nested several levels deep,
                    # so create all missing ancestors, not just the parent
                    destpath.parent.mkdir(parents=True, exist_ok=True)
                    destpath.write_bytes(srcpath.read_bytes())

        # Add to workspace (or not)
        fileGrp = file_dict.pop('file_grp')
        if dry_run:
            log.info('workspace.add_file(%s)' % file_dict)
        else:
            workspace.add_file(fileGrp, ignore=ignore, force=force, **file_dict)

    # save changes to disk (a dry run must not touch the METS)
    if not dry_run:
        workspace.save_mets()
414
415
416
# ----------------------------------------------------------------------
417
# ocrd workspace find
418
# ----------------------------------------------------------------------
419
420
@workspace_cli.command('find')
@mets_find_options
@click.option('-k', '--output-field', help="Output field. Repeat for multiple fields, will be joined with tab",
              default=['local_filename'],
              show_default=True,
              multiple=True,
              type=click.Choice([
                  'url',
                  'mimetype',
                  'page_id',
                  'pageId',
                  'file_id',
                  'ID',
                  'file_grp',
                  'fileGrp',
                  'basename',
                  'basename_without_extension',
                  'local_filename',
              ]))
@click.option('--download', is_flag=True, help="Download found files to workspace and change location in METS file")
@click.option('--undo-download', is_flag=True, help="Remove all downloaded files from the METS and workspace")
@click.option('--keep-files', is_flag=True, help="Do not remove downloaded files from the workspace with --undo-download")
@click.option('--wait', type=int, default=0, help="Wait this many seconds between download requests")
@pass_workspace
def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, include_fileGrp, exclude_fileGrp, download, undo_download, keep_files, wait):
    """
    Find files.

    (If any ``FILTER`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    # The docstring above doubles as the --help text of the subcommand.
    # Normalise the snake_case aliases of the output fields to attribute names.
    snake_to_camel = {"file_id": "ID", "page_id": "pageId", "file_grp": "fileGrp"}
    output_field = [snake_to_camel.get(x, x) for x in output_field]
    modified_mets = False
    ret = []
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        mets_server_url=ctx.mets_server_url,
    )
    with pushd_popd(workspace.directory):
        for f in workspace.find_files(
                file_id=file_id,
                file_grp=file_grp,
                mimetype=mimetype,
                page_id=page_id,
                include_fileGrp=include_fileGrp,
                exclude_fileGrp=exclude_fileGrp,
            ):
            # 'pageId' columns temporarily hold the file ID; they are
            # replaced by the physical page in one batch lookup below.
            ret_entry = [f.ID if field == 'pageId' else str(getattr(f, field)) or '' for field in output_field]
            if download and not f.local_filename:
                workspace.download_file(f)
                modified_mets = True
                if wait:
                    time.sleep(wait)
            if undo_download and f.url and f.local_filename:
                # remember the path before clearing it on the file entry,
                # otherwise there is nothing left to log or unlink
                downloaded_path = f.local_filename
                ret_entry = [f'Removed local_filename {downloaded_path}']
                f.local_filename = None
                modified_mets = True
                if not keep_files:
                    ctx.log.debug("rm %s [cwd=%s]", downloaded_path, workspace.directory)
                    unlink(downloaded_path)
            ret.append(ret_entry)
    if modified_mets:
        workspace.save_mets()
    if 'pageId' in output_field:
        idx = output_field.index('pageId')
        fileIds = [fields[idx] for fields in ret]
        pages = workspace.mets.get_physical_pages(for_fileIds=fileIds)
        for fields, page in zip(ret, pages):
            fields[idx] = page or ''
    for fields in ret:
        print('\t'.join(fields))
494
495
# ----------------------------------------------------------------------
496
# ocrd workspace remove
497
# ----------------------------------------------------------------------
498
499
@workspace_cli.command('remove')
@click.option('-k', '--keep-file', help="Do not delete file from file system", default=False, is_flag=True)
@click.option('-f', '--force', help="Continue even if mets:file or file on file system does not exist", default=False, is_flag=True)
@click.argument('ID', nargs=-1)
@pass_workspace
def workspace_remove_file(ctx, id, force, keep_file):  # pylint: disable=redefined-builtin
    """
    Delete files (given by their ID attribute ``ID``).

    (If any ``ID`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    # The docstring above doubles as the --help text of the subcommand.
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    # ``id`` is the tuple of all ID arguments given on the command line
    for file_id in id:
        workspace.remove_file(file_id, force=force, keep_file=keep_file)
    workspace.save_mets()
515
516
517
# ----------------------------------------------------------------------
518
# ocrd workspace rename-group
519
# ----------------------------------------------------------------------
520
521
@workspace_cli.command('rename-group')
@click.argument('OLD', nargs=1)
@click.argument('NEW', nargs=1)
@pass_workspace
def rename_group(ctx, old, new):
    """
    Rename fileGrp (USE attribute ``OLD`` to ``NEW``).
    """
    # The docstring above doubles as the --help text of the subcommand.
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)
    workspace.rename_file_group(old, new)
    workspace.save_mets()
532
533
# ----------------------------------------------------------------------
534
# ocrd workspace remove-group
535
# ----------------------------------------------------------------------
536
537
@workspace_cli.command('remove-group')
@click.option('-r', '--recursive', help="Delete any files in the group before the group itself", default=False, is_flag=True)
@click.option('-f', '--force', help="Continue removing even if group or containing files not found in METS", default=False, is_flag=True)
@click.option('-k', '--keep-files', help="Do not delete files from file system", default=False, is_flag=True)
@click.argument('GROUP', nargs=-1)
@pass_workspace
def remove_group(ctx, group, recursive, force, keep_files):
    """
    Delete fileGrps (given by their USE attribute ``GROUP``).

    (If any ``GROUP`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    # The docstring above doubles as the --help text of the subcommand.
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
    )
    removal_options = {'recursive': recursive, 'force': force, 'keep_files': keep_files}
    # ``group`` is the tuple of all GROUP arguments given on the command line
    for group_use in group:
        workspace.remove_file_group(group_use, **removal_options)
    workspace.save_mets()
554
555
# ----------------------------------------------------------------------
556
# ocrd workspace prune-files
557
# ----------------------------------------------------------------------
558
559
@workspace_cli.command('prune-files')
@click.option('-G', '--file-grp', help="fileGrp USE", metavar='FILTER')
@click.option('-m', '--mimetype', help="Media type to look for", metavar='FILTER')
@click.option('-g', '--page-id', help="Page ID", metavar='FILTER')
@click.option('-i', '--file-id', help="ID", metavar='FILTER')
@pass_workspace
def prune_files(ctx, file_grp, mimetype, page_id, file_id):
    """
    Removes mets:files that point to non-existing local files

    (If any ``FILTER`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    # The docstring above doubles as the --help text of the subcommand.
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
    # work from inside the workspace directory while checking local_filename paths
    with pushd_popd(workspace.directory):
        for f in workspace.find_files(
            file_id=file_id,
            file_grp=file_grp,
            mimetype=mimetype,
            page_id=page_id,
        ):
            try:
                if not f.local_filename or not exists(f.local_filename):
                    workspace.mets.remove_file(f.ID)
            except Exception as e:
                # %s, not %f: the file object is not a float, and a bad
                # format spec would make the logging call itself fail
                ctx.log.exception("Error removing %s: %s", f, e)
                raise
        workspace.save_mets()
587
588
# ----------------------------------------------------------------------
589
# ocrd workspace list-group
590
# ----------------------------------------------------------------------
591
592
@workspace_cli.command('list-group')
@pass_workspace
def list_groups(ctx):
    """
    List fileGrp USE attributes
    """
    # The docstring above doubles as the --help text of the subcommand.
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
    )
    # one file group per output line
    group_names = workspace.mets.file_groups
    print("\n".join(group_names))
600
601
# ----------------------------------------------------------------------
602
# ocrd workspace list-page
603
# ----------------------------------------------------------------------
604
605
@workspace_cli.command('list-page')
@click.option('-k', '--output-field', help="Output field. Repeat for multiple fields, will be joined with tab",
              default=['ID'],
              show_default=True,
              multiple=True,
              type=click.Choice(METS_PAGE_DIV_ATTRIBUTE.names()))
@click.option('-f', '--output-format', help="Output format", type=click.Choice(['one-per-line', 'comma-separated', 'json']), default='one-per-line')
@click.option('-D', '--chunk-number', help="Partition the return value into n roughly equally sized chunks", default=1, type=int)
@click.option('-C', '--chunk-index', help="Output the nth chunk of results, -1 for all of them.", default=None, type=int)
@click.option('-r', '--page-id-range', help="Restrict the pages to those matching the provided range, based on the @ID attribute. Separate start/end with ..")
@click.option('-R', '--numeric-range', help="Restrict the pages to those in the range, in numerical document order. Separate start/end with ..")
@pass_workspace
def list_pages(ctx, output_field, output_format, chunk_number, chunk_index, page_id_range, numeric_range):
    """
    List physical page IDs

    (If any ``FILTER`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    # The docstring above doubles as the --help text of the subcommand.
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)

    # restrict the file search by page ID range only when IDs are requested
    find_kwargs = {}
    if page_id_range and 'ID' in output_field:
        find_kwargs['pageId'] = page_id_range
    page_ids = sorted({x.pageId for x in workspace.mets.find_files(**find_kwargs) if x.pageId})

    # NOTE(review): click hands over ``multiple=True`` options as a tuple, so
    # this comparison against a list may never be true - confirm.
    if output_field == ['ID']:
        ret = [[page_id] for page_id in page_ids]
    else:
        page_divs = workspace.mets.get_physical_pages(for_pageIds=','.join(page_ids), return_divs=True)
        # one row per page, one column per requested attribute
        ret = [[page_div.get(k, 'None') for k in output_field] for page_div in page_divs]

    if numeric_range:
        # START..END is 1-based and inclusive
        start, end = map(int, numeric_range.split('..'))
        ret = ret[start-1:end]

    chunks = partition_list(ret, chunk_number, chunk_index)
    if output_format == 'json':
        lines = [dumps(chunks)]
    elif output_format in ('one-per-line', 'comma-separated'):
        # fields of a row are tab-joined; rows of a chunk are joined by
        # newline or comma; every chunk becomes one output "line"
        row_separator = '\n' if output_format == 'one-per-line' else ','
        lines = [row_separator.join("\t".join(entry) for entry in chunk) for chunk in chunks]
    else:
        lines = []
    print('\n'.join(lines))
660
661
# ----------------------------------------------------------------------
662
# ocrd workspace get-id
663
# ----------------------------------------------------------------------
664
665
@workspace_cli.command('get-id')
@pass_workspace
def get_id(ctx):
    """
    Get METS id if any
    """
    mets = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename).mets
    identifier = mets.unique_identifier
    # Print nothing at all when the METS has no (or an empty) identifier.
    if not identifier:
        return
    print(identifier)
675
676
# ----------------------------------------------------------------------
677
# ocrd workspace set-id
678
# ----------------------------------------------------------------------
679
680
@workspace_cli.command('set-id')
@click.argument('ID')
@pass_workspace
def set_id(ctx, id):   # pylint: disable=redefined-builtin
    """
    Set METS ID.

    If one of the supported identifier mechanisms is used, will set this identifier.

    Otherwise will create a new <mods:identifier type="purl">{{ ID }}</mods:identifier>.
    """
    # The parameter has to be called ``id`` because click derives it from the 'ID' argument.
    target = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    mets = target.mets
    mets.unique_identifier = id
    # Persist the modified METS.
    target.save_mets()
694
695
@workspace_cli.command('update-page')
@click.option('--set', 'attr_value_pairs', help=f"set mets:div ATTR to VALUE. possible keys: {METS_PAGE_DIV_ATTRIBUTE.names()}", metavar="ATTR VALUE", nargs=2, multiple=True)
@click.option('--order', help="DEPRECATED - use --set ATTR VALUE", metavar='ORDER')
@click.option('--orderlabel', help="DEPRECATED - use --set ATTR VALUE", metavar='ORDERLABEL')
@click.option('--contentids', help="DEPRECATED - use --set ATTR VALUE", metavar='CONTENTIDS')
@click.argument('PAGE_ID')
@pass_workspace
def update_page(ctx, attr_value_pairs, order, orderlabel, contentids, page_id):
    """
    Update the @ID, @ORDER, @ORDERLABEL, @LABEL or @CONTENTIDS attributes of the mets:div with @ID=PAGE_ID
    """
    # Every --set contributes one (ATTR, VALUE) pair; if an ATTR is repeated,
    # the last VALUE wins.
    update_kwargs = dict(attr_value_pairs)
    # The deprecated single-attribute options are applied afterwards and thus
    # override a --set for the same attribute.
    if order:
        update_kwargs['ORDER'] = order
    if orderlabel:
        update_kwargs['ORDERLABEL'] = orderlabel
    if contentids:
        update_kwargs['CONTENTIDS'] = contentids
    try:
        workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
        workspace.mets.update_physical_page_attributes(page_id, **update_kwargs)
        workspace.save_mets()
    except Exception as err:
        # CLI boundary: report the failure and exit non-zero instead of
        # showing a traceback.
        print(f"Error: {err}")
        sys.exit(1)
720
721
# ----------------------------------------------------------------------
722
# ocrd workspace merge
723
# ----------------------------------------------------------------------
724
725
def _handle_json_option(ctx, param, value):
726
    return parse_json_string_or_file(value) if value else None
727
728
@workspace_cli.command('merge')
@click.argument('METS_PATH')
@click.option('--overwrite/--no-overwrite', is_flag=True, default=False, help="Overwrite on-disk file in case of file name conflicts with data from METS_PATH")
@click.option('--force/--no-force', is_flag=True, default=False, help="Overwrite mets:file from --mets with mets:file from METS_PATH if IDs clash")
@click.option('--copy-files/--no-copy-files', is_flag=True, help="Copy files as well", default=True, show_default=True)
@click.option('--fileGrp-mapping', help="JSON object mapping src to dest fileGrp", callback=_handle_json_option)
@click.option('--fileId-mapping', help="JSON object mapping src to dest file ID", callback=_handle_json_option)
@click.option('--pageId-mapping', help="JSON object mapping src to dest page ID", callback=_handle_json_option)
@mets_find_options
@pass_workspace
def merge(ctx, overwrite, force, copy_files, filegrp_mapping, fileid_mapping, pageid_mapping, file_grp, file_id, page_id, mimetype, include_fileGrp, exclude_fileGrp, mets_path):   # pylint: disable=redefined-builtin
    """
    Merges this workspace with the workspace that contains ``METS_PATH``

    Pass a JSON string or file to ``--fileGrp-mapping``, ``--fileId-mapping`` or ``--pageId-mapping``
    in order to rename all fileGrp, file ID or page ID values, respectively.

    The ``--file-id``, ``--page-id``, ``--mimetype`` and ``--file-grp`` options have
    the same semantics as in ``ocrd workspace find``, see ``ocrd workspace find --help``
    for an explanation.
    """
    mets_path = Path(mets_path)
    # All three mapping options go through the _handle_json_option callback,
    # and the other two are forwarded without a second parse - so the value is
    # presumably already a parsed object here. Only decode it if it is still
    # raw JSON text; json.loads() raises TypeError on anything else.
    if filegrp_mapping and isinstance(filegrp_mapping, (str, bytes, bytearray)):
        filegrp_mapping = loads(filegrp_mapping)
    # Destination: the workspace selected on the command line (gets modified).
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
    # Source: the workspace that METS_PATH belongs to.
    other_workspace = Workspace(ctx.resolver, directory=str(mets_path.parent), mets_basename=str(mets_path.name))
    workspace.merge(
        other_workspace,
        force=force,
        overwrite=overwrite,
        copy_files=copy_files,
        fileGrp_mapping=filegrp_mapping,
        fileId_mapping=fileid_mapping,
        pageId_mapping=pageid_mapping,
        file_grp=file_grp,
        file_id=file_id,
        page_id=page_id,
        mimetype=mimetype,
        include_fileGrp=include_fileGrp,
        exclude_fileGrp=exclude_fileGrp,
    )
    workspace.save_mets()
770
771
# ----------------------------------------------------------------------
772
# ocrd workspace backup
773
# ----------------------------------------------------------------------
774
775
# Command group only: it performs no work itself and exists so that the
# backup sub-commands can be attached to it.
@workspace_cli.group('backup')
@click.pass_context
def workspace_backup_cli(ctx):  # pylint: disable=unused-argument
    """
    Backing and restoring workspaces - dev edition
    """
781
782
@workspace_backup_cli.command('add')
@pass_workspace
def workspace_backup_add(ctx):
    """
    Create a new backup
    """
    # Build the workspace first, then wrap it in a backup manager.
    target = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    WorkspaceBackupManager(target).add()
790
791
@workspace_backup_cli.command('list')
@pass_workspace
def workspace_backup_list(ctx):
    """
    List backups
    """
    target = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    manager = WorkspaceBackupManager(target)
    # One backup per output line; nothing is printed when there are none.
    for entry in manager.list():
        print(entry)
800
801
@workspace_backup_cli.command('restore')
@click.option('-f', '--choose-first', help="Restore first matching version if more than one", is_flag=True)
@click.argument('bak') #, type=click.Path(dir_okay=False, readable=True, resolve_path=True))
@pass_workspace
def workspace_backup_restore(ctx, choose_first, bak):
    """
    Restore backup BAK
    """
    target = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    manager = WorkspaceBackupManager(target)
    # BAK and the --choose-first flag are passed through unchanged.
    manager.restore(bak, choose_first)
811
812
@workspace_backup_cli.command('undo')
@pass_workspace
def workspace_backup_undo(ctx):
    """
    Restore the last backup
    """
    target = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    # Delegate the actual rollback to the backup manager.
    WorkspaceBackupManager(target).undo()
820
821
822
# ----------------------------------------------------------------------
823
# ocrd workspace server
824
# ----------------------------------------------------------------------
825
826
@workspace_cli.group('server')
@pass_workspace
def workspace_serve_cli(ctx): # pylint: disable=unused-argument
    """Control a METS server for this workspace"""
    # Validate explicitly rather than with ``assert``: assertions are stripped
    # under ``python -O``, and a click usage error gives the user a clean
    # message instead of a traceback.
    if not ctx.mets_server_url:
        raise click.UsageError("For METS server commands, you must provide '-U/--mets-server-url'")
831
832
@workspace_serve_cli.command('stop')
@pass_workspace
def workspace_serve_stop(ctx): # pylint: disable=unused-argument
    """Stop the METS server"""
    # Open the workspace with the METS server URL from the context, then call
    # stop() on its METS object.
    served = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, mets_server_url=ctx.mets_server_url)
    mets = served.mets
    mets.stop()
843
844
@workspace_serve_cli.command('start')
@pass_workspace
def workspace_serve_start(ctx): # pylint: disable=unused-argument
    """
    Start a METS server

    (For TCP backend, pass a network interface to bind to as the '-U/--mets-server-url' parameter.)
    """
    # The served workspace is opened without a METS server URL; the URL from
    # the context is given to the server itself.
    served = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)
    server = OcrdMetsServer(workspace=served, url=ctx.mets_server_url)
    server.startup()
856