Passed
Pull Request — master (#1236)
by
unknown
02:58
created

ocrd.cli.workspace.merge()   A

Complexity

Conditions 2

Size

Total Lines 42
Code Lines 31

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 31
dl 0
loc 42
rs 9.1359
c 0
b 0
f 0
cc 2
nop 14

How to fix   Many Parameters   

Many Parameters

Methods with many parameters are not only hard to understand; their parameter lists also tend to become inconsistent as soon as you need more or different data.

There are several approaches to avoid long parameter lists:

1
"""
2
OCR-D CLI: workspace management
3
4
.. click:: ocrd.cli.workspace:workspace_cli
5
    :prog: ocrd workspace
6
    :nested: full
7
"""
8
import os
9
from os import getcwd, rmdir, unlink
10
from os.path import dirname, relpath, normpath, exists, join, isabs, isdir
11
from pathlib import Path
12
from json import loads, dumps
13
import sys
14
from glob import glob   # XXX pathlib.Path.glob does not support absolute globs
15
import re
16
import time
17
import numpy as np
18
19
import click
20
21
from ocrd import Resolver, Workspace, WorkspaceValidator, WorkspaceBackupManager
22
from ocrd.mets_server import OcrdMetsServer
23
from ocrd_utils import getLogger, initLogging, pushd_popd, EXT_TO_MIME, safe_filename, parse_json_string_or_file, partition_list, DEFAULT_METS_BASENAME
24
from ocrd.decorators import mets_find_options
25
from . import command_with_replaced_help
26
from ocrd_models.constants import METS_PAGE_DIV_ATTRIBUTE
27
28
29
class WorkspaceCtx():
    """
    Shared state handed from the ``workspace`` group to its subcommands.

    Holds a logger, a ``Resolver``, the resolved workspace location
    (``directory``, ``mets_url``, ``mets_basename``, ``mets_server_url``)
    and the ``automatic_backup`` flag.
    """

    def __init__(self, directory, mets_url, mets_basename=DEFAULT_METS_BASENAME, mets_server_url=None, automatic_backup=False):
        self.log = getLogger('ocrd.cli.workspace')
        if mets_basename:
            # Any truthy basename triggers the deprecation notice.
            deprecation = DeprecationWarning('--mets-basename is deprecated. Use --mets/--directory instead.')
            self.log.warning(deprecation)
        self.resolver = Resolver()
        # The resolver reconciles the four location arguments and returns
        # them in the same order.
        resolved = self.resolver.resolve_mets_arguments(directory, mets_url, mets_basename, mets_server_url)
        self.directory, self.mets_url, self.mets_basename, self.mets_server_url = resolved
        self.automatic_backup = automatic_backup
39
40
41
# Decorator for subcommands: hands the WorkspaceCtx object found on the click
# context (set by the ``workspace`` group callback) to the command as its
# first argument.
pass_workspace = click.make_pass_decorator(WorkspaceCtx)
42
43
# ----------------------------------------------------------------------
44
# ocrd workspace
45
# ----------------------------------------------------------------------
46
47
@click.group("workspace")
@click.option('-d', '--directory', envvar='WORKSPACE_DIR', type=click.Path(file_okay=False), metavar='WORKSPACE_DIR', help='Changes the workspace folder location [default: METS_URL directory or .]')
@click.option('-M', '--mets-basename', default=None, help='METS file basename. Deprecated, use --mets/--directory')
@click.option('-m', '--mets', default=None, help='The path/URL of the METS file [default: WORKSPACE_DIR/mets.xml]', metavar="METS_URL")
@click.option('-U', '--mets-server-url', 'mets_server_url', help="TCP host of METS server")
@click.option('--backup', default=False, help="Backup mets.xml whenever it is saved.", is_flag=True)
@click.pass_context
def workspace_cli(ctx, directory, mets, mets_basename, mets_server_url, backup):
    """
    Managing workspaces

    A workspace comprises a METS file and a directory as point of reference.

    Operates on the file system directly or via a METS server
    (already running via some prior `server start` subcommand).
    """
    # NOTE: the docstring above is rendered by click as the --help text.
    initLogging()
    # Everything the subcommands need is bundled into one context object,
    # which they receive through the ``pass_workspace`` decorator.
    ctx.obj = WorkspaceCtx(
        directory,
        mets_url=mets,
        mets_basename=mets_basename,
        mets_server_url=mets_server_url,
        automatic_backup=backup
    )
71
72
# ----------------------------------------------------------------------
73
# ocrd workspace validate
74
# ----------------------------------------------------------------------
75
76
@workspace_cli.command('validate', cls=command_with_replaced_help(
    (r' \[METS_URL\]', ''))) # XXX deprecated argument
@pass_workspace
@click.option('-a', '--download', is_flag=True, help="Download all files")
@click.option('-s', '--skip', help="Tests to skip", default=[], multiple=True, type=click.Choice(
    ['imagefilename', 'dimension', 'pixel_density', 'page', 'url', 'page_xsd', 'mets_fileid_page_pcgtsid',
     'mets_unique_identifier', 'mets_file_group_names', 'mets_files', 'mets_xsd']))
@click.option('--page-textequiv-consistency', '--page-strictness', help="How strict to check PAGE multi-level textequiv consistency", type=click.Choice(['strict', 'lax', 'fix', 'off']), default='strict')
@click.option('--page-coordinate-consistency', help="How fierce to check PAGE multi-level coordinate consistency", type=click.Choice(['poly', 'baseline', 'both', 'off']), default='poly')
@click.argument('mets_url', default=None, required=False)
def workspace_validate(ctx, mets_url, download, skip, page_textequiv_consistency, page_coordinate_consistency):
    """
    Validate a workspace

    METS_URL can be a URL, an absolute path or a path relative to $PWD.
    If not given, use --mets accordingly.

    Check that the METS and its referenced file contents
    abide by the OCR-D specifications.
    """
    # NOTE: the docstring above is rendered by click as the --help text.
    LOG = getLogger('ocrd.cli.workspace.validate')
    if mets_url:
        # The positional argument is deprecated; point users at the
        # equivalent invocation of *this* subcommand.
        LOG.warning(DeprecationWarning("Use 'ocrd workspace --mets METS validate' instead of argument 'METS_URL' ('%s')" % mets_url))
    else:
        mets_url = ctx.mets_url
    report = WorkspaceValidator.validate(
        ctx.resolver,
        mets_url,
        src_dir=ctx.directory,
        skip=skip,
        download=download,
        page_strictness=page_textequiv_consistency,
        page_coordinate_consistency=page_coordinate_consistency
    )
    # The report is always printed; the exit status signals invalidity.
    print(report.to_xml())
    if not report.is_valid:
        sys.exit(128)
113
114
# ----------------------------------------------------------------------
115
# ocrd workspace clone
116
# ----------------------------------------------------------------------
117
118
@workspace_cli.command('clone', cls=command_with_replaced_help(
    (r' \[WORKSPACE_DIR\]', ''))) # XXX deprecated argument
@click.option('-f', '--clobber-mets', help="Overwrite existing METS file", default=False, is_flag=True)
@click.option('-a', '--download', is_flag=True, help="Download all files and change location in METS file after cloning")
@click.argument('mets_url')
@mets_find_options
# XXX deprecated
@click.argument('workspace_dir', default=None, required=False)
@pass_workspace
def workspace_clone(ctx, clobber_mets, download, file_grp, file_id, page_id, mimetype, include_fileGrp, exclude_fileGrp, mets_url, workspace_dir):
    """
    Create a workspace from METS_URL and return the directory

    METS_URL can be a URL, an absolute path or a path relative to $PWD.
    If METS_URL is not provided, use --mets accordingly.
    METS_URL can also be an OAI-PMH GetRecord URL wrapping a METS file.
    """
    # NOTE: the docstring above is rendered by click as the --help text.
    clone_log = getLogger('ocrd.cli.workspace.clone')
    if workspace_dir:
        # Deprecated positional argument: it overrides --directory.
        deprecation = DeprecationWarning("Use 'ocrd workspace --directory DIR clone' instead of argument 'WORKSPACE_DIR' ('%s')" % workspace_dir)
        clone_log.warning(deprecation)
        ctx.directory = workspace_dir

    # NOTE(review): ``file_grp`` is accepted from the find options but is not
    # forwarded below -- confirm whether that is intentional.
    clone_options = {
        'dst_dir': ctx.directory,
        'mets_basename': ctx.mets_basename,
        'clobber_mets': clobber_mets,
        'download': download,
        'ID': file_id,
        'pageId': page_id,
        'mimetype': mimetype,
        'include_fileGrp': include_fileGrp,
        'exclude_fileGrp': exclude_fileGrp,
    }
    workspace = ctx.resolver.workspace_from_url(mets_url, **clone_options)
    workspace.save_mets()
    # The resulting directory is the command's output on stdout.
    print(workspace.directory)
154
155
# ----------------------------------------------------------------------
156
# ocrd workspace init
157
# ----------------------------------------------------------------------
158
159
@workspace_cli.command('init', cls=command_with_replaced_help(
    (r' \[DIRECTORY\]', ''))) # XXX deprecated argument
@click.option('-f', '--clobber-mets', help="Clobber mets.xml if it exists", is_flag=True, default=False)
# XXX deprecated
@click.argument('directory', default=None, required=False)
@pass_workspace
def workspace_init(ctx, clobber_mets, directory):
    """
    Create a workspace with an empty METS file in DIRECTORY or CWD.

    """
    # NOTE: the docstring above is rendered by click as the --help text.
    init_log = getLogger('ocrd.cli.workspace.init')
    if directory:
        # Deprecated positional argument: it overrides --directory.
        deprecation = DeprecationWarning("Use 'ocrd workspace --directory DIR init' instead of argument 'DIRECTORY' ('%s')" % directory)
        init_log.warning(deprecation)
        ctx.directory = directory
    init_options = {
        'directory': ctx.directory,
        'mets_basename': ctx.mets_basename,
        'clobber_mets': clobber_mets,
    }
    workspace = ctx.resolver.workspace_from_nothing(**init_options)
    workspace.save_mets()
    # The resulting directory is the command's output on stdout.
    print(workspace.directory)
181
182
# ----------------------------------------------------------------------
183
# ocrd workspace add
184
# ----------------------------------------------------------------------
185
186
@workspace_cli.command('add')
@click.option('-G', '--file-grp', help="fileGrp USE", required=True, metavar='FILE_GRP')
@click.option('-i', '--file-id', help="ID for the file", required=True, metavar='FILE_ID')
@click.option('-m', '--mimetype', help="Media type of the file. Guessed from extension if not provided", required=False, metavar='TYPE')
@click.option('-g', '--page-id', help="ID of the physical page", metavar='PAGE_ID')
@click.option('-C', '--check-file-exists', help="Whether to ensure FNAME exists", is_flag=True, default=False)
@click.option('--ignore', help="Do not check whether file exists.", default=False, is_flag=True)
@click.option('--force', help="If file with ID already exists, replace it. No effect if --ignore is set.", default=False, is_flag=True)
@click.argument('fname', required=True)
@pass_workspace
def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, ignore, check_file_exists, force, fname):
    """
    Add a file or http(s) URL FNAME to METS in a workspace.
    If FNAME is not an http(s) URL and is not a workspace-local existing file, try to copy to workspace.
    """
    # NOTE: the docstring above is rendered by click as the --help text.
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
        mets_server_url=ctx.mets_server_url,
    )
    log = getLogger('ocrd.cli.workspace.add')

    if not mimetype:
        suffix = Path(fname).suffix
        if suffix in EXT_TO_MIME:
            mimetype = EXT_TO_MIME[suffix]
            log.info("Guessed mimetype to be %s" % mimetype)
        else:
            # Only reported; execution continues with an unset mimetype.
            log.error("Cannot guess mimetype from extension '%s' for '%s'. Set --mimetype explicitly" % (suffix, fname))

    log.debug("Adding '%s'", fname)
    is_remote = fname.startswith('http://') or fname.startswith('https://')
    local_filename = None
    if not is_remote:
        if not fname.startswith(ctx.directory):
            if not isabs(fname) and exists(join(ctx.directory, fname)):
                # Relative path that already exists inside the workspace.
                fname = join(ctx.directory, fname)
            else:
                log.debug("File '%s' is not in workspace, copying", fname)
                try:
                    fname = ctx.resolver.download_to_directory(ctx.directory, fname, subdir=file_grp)
                except FileNotFoundError:
                    # A missing source is fatal only with --check-file-exists.
                    if check_file_exists:
                        log.error("File '%s' does not exist, halt execution!" % fname)
                        sys.exit(1)
        if check_file_exists and not exists(fname):
            log.error("File '%s' does not exist, halt execution!" % fname)
            sys.exit(1)
        if fname.startswith(ctx.directory):
            # Paths under the workspace are made relative to it.
            fname = relpath(fname, ctx.directory)
        local_filename = fname

    if not page_id:
        log.warning("You did not provide '--page-id/-g', so the file you added is not linked to a specific page.")
    workspace.add_file(
        file_grp,
        file_id=file_id,
        mimetype=mimetype,
        page_id=page_id,
        force=force,
        ignore=ignore,
        local_filename=local_filename,
        url=fname,
    )
    workspace.save_mets()
251
252
# ----------------------------------------------------------------------
253
# ocrd workspace bulk-add
254
# ----------------------------------------------------------------------
255
256
# pylint: disable=broad-except
257
@workspace_cli.command('bulk-add')
@click.option('-r', '--regex', help="Regular expression matching the FILE_GLOB filesystem paths to define named captures usable in the other parameters", required=True)
@click.option('-m', '--mimetype', help="Media type of the file. If not provided, guess from filename", required=False)
@click.option('-g', '--page-id', help="physical page ID of the file", required=False)
@click.option('-i', '--file-id', help="ID of the file. If not provided, derive from fileGrp and filename", required=False)
@click.option('-u', '--url', help="Remote URL of the file", required=False)
@click.option('-l', '--local-filename', help="Local filesystem path in the workspace directory (copied from source file if different)", required=False)
@click.option('-G', '--file-grp', help="File group USE of the file", required=True)
@click.option('-n', '--dry-run', help="Don't actually do anything to the METS or filesystem, just preview", default=False, is_flag=True)
@click.option('-S', '--source-path', 'src_path_option', help="File path to copy from (if different from FILE_GLOB values)", required=False)
@click.option('-I', '--ignore', help="Disable checking for existing file entries (faster)", default=False, is_flag=True)
@click.option('-f', '--force', help="Replace existing file entries with the same ID (no effect when --ignore is set, too)", default=False, is_flag=True)
@click.option('-s', '--skip', help="Skip files not matching --regex (instead of failing)", default=False, is_flag=True)
@click.argument('file_glob', nargs=-1, required=True)
@pass_workspace
def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_filename, file_grp, dry_run, file_glob, src_path_option, ignore, force, skip):
    """
    Add files in bulk to an OCR-D workspace.

    FILE_GLOB can either be a shell glob expression to match file names,
    or a list of expressions or '-', in which case expressions are read from STDIN.

    After globbing, --regex is matched against each expression resulting from FILE_GLOB, and can
    define named groups reusable in the --page-id, --file-id, --mimetype, --url, --source-path and
    --file-grp options, e.g. by referencing the group name 'grp' from the regex as '{{ grp }}'.

    If the FILE_GLOB expressions do not denote the file names themselves
    (but arbitrary strings for --regex matching), then use --source-path to set
    the actual file paths to use. (This could involve fixed strings or group references.)

    \b
    Examples:
        ocrd workspace bulk-add \\
                --regex '(?P<fileGrp>[^/]+)/page_(?P<pageid>.*)\\.[^.]+' \\
                --page-id 'PHYS_{{ pageid }}' \\
                --file-grp "{{ fileGrp }}" \\
                path/to/files/*/*.*
        \b
        echo "path/to/src/file.xml SEG/page_p0001.xml" \\
        | ocrd workspace bulk-add \\
                --regex '(?P<src>.*?) (?P<fileGrp>.+?)/page_(?P<pageid>.*)\\.(?P<ext>[^\\.]*)' \\
                --file-id 'FILE_{{ fileGrp }}_{{ pageid }}' \\
                --page-id 'PHYS_{{ pageid }}' \\
                --file-grp "{{ fileGrp }}" \\
                --local-filename '{{ fileGrp }}/FILE_{{ pageid }}.{{ ext }}' \\
                -

        \b
        { echo PHYS_0001 BIN FILE_0001_BIN.IMG-wolf BIN/FILE_0001_BIN.IMG-wolf.png; \\
          echo PHYS_0001 BIN FILE_0001_BIN BIN/FILE_0001_BIN.xml; \\
          echo PHYS_0002 BIN FILE_0002_BIN.IMG-wolf BIN/FILE_0002_BIN.IMG-wolf.png; \\
          echo PHYS_0002 BIN FILE_0002_BIN BIN/FILE_0002_BIN.xml; \\
        } | ocrd workspace bulk-add -r '(?P<pageid>.*) (?P<filegrp>.*) (?P<fileid>.*) (?P<local_filename>.*)' \\
          -G '{{ filegrp }}' -g '{{ pageid }}' -i '{{ fileid }}' -S '{{ local_filename }}' -
    """
    # NOTE: the docstring above is rendered by click as the --help text; the
    # ``\b`` markers keep click from re-wrapping the example paragraphs.
    log = getLogger('ocrd.cli.workspace.bulk-add') # pylint: disable=redefined-outer-name
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
        mets_server_url=ctx.mets_server_url,
    )

    try:
        pat = re.compile(regex)
    except Exception as e:
        log.error("Invalid regex: %s" % e)
        sys.exit(1)

    # Collect the expressions to process: one per STDIN line when FILE_GLOB is
    # exactly '-', otherwise the glob expansion of every argument (an argument
    # that matches nothing is kept verbatim).
    file_paths = []
    from_stdin = file_glob == ('-',)
    if from_stdin:
        file_paths += [Path(x.strip('\n')) for x in sys.stdin.readlines()]
    else:
        for fglob in file_glob:
            expanded = glob(fglob)
            if not expanded:
                file_paths += [Path(fglob)]
            else:
                file_paths += [Path(x) for x in expanded]

    for i, file_path in enumerate(file_paths):
        log.info("[%4d/%d] %s" % (i + 1, len(file_paths), file_path))

        # match regex
        m = pat.match(str(file_path))
        if not m:
            if skip:
                continue
            log.error("File '%s' not matched by regex: '%s'" % (file_path, regex))
            sys.exit(1)
        # Optional groups that did not take part in the match expand to the
        # empty string (the default of None would make str.replace raise).
        group_dict = m.groupdict(default='')

        # set up file info
        file_dict = {'local_filename': local_filename, 'url': url, 'mimetype': mimetype, 'file_id': file_id, 'page_id': page_id, 'file_grp': file_grp}

        # Flag to track whether 'local_filename' should be 'src'
        local_filename_is_src = False

        # expand templates
        for param_name in file_dict:
            if not file_dict[param_name]:
                if param_name == 'local_filename':
                    local_filename_is_src = True
                    continue
                if param_name in ('mimetype', 'file_id', 'url'):
                    # mimetype and file_id are auto-filled below once the
                    # other replacements have happened; a remote URL is
                    # not required
                    continue
                raise ValueError(f"OcrdFile attribute '{param_name}' unset ({file_dict})")
            for group_name in group_dict:
                file_dict[param_name] = file_dict[param_name].replace('{{ %s }}' % group_name, group_dict[group_name])

        # Where to copy from
        if src_path_option:
            src_path = src_path_option
            for group_name in group_dict:
                src_path = src_path.replace('{{ %s }}' % group_name, group_dict[group_name])
            srcpath = Path(src_path)
        else:
            srcpath = file_path

        # derive --file-id from the source filename if not explicitly set
        if not file_id:
            file_dict['file_id'] = safe_filename('%s_%s' % (file_dict['file_grp'], srcpath.stem))
        if not mimetype:
            try:
                file_dict['mimetype'] = EXT_TO_MIME[srcpath.suffix]
            except KeyError:
                log.error("Cannot guess MIME type from extension '%s' for '%s'. Set --mimetype explicitly" % (srcpath.suffix, srcpath))

        # copy files if src != url
        if local_filename_is_src:
            file_dict['local_filename'] = srcpath
        else:
            destpath = Path(workspace.directory, file_dict['local_filename'])
            if srcpath != destpath and not destpath.exists():
                log.info("cp '%s' '%s'", srcpath, destpath)
                if not dry_run:
                    # The destination may be nested several levels deep
                    # (e.g. 'GRP/sub/file.xml'), so create missing ancestors too.
                    destpath.parent.mkdir(parents=True, exist_ok=True)
                    destpath.write_bytes(srcpath.read_bytes())

        # Add to workspace (or not)
        fileGrp = file_dict.pop('file_grp')
        if dry_run:
            log.info('workspace.add_file(%s)' % file_dict)
        else:
            workspace.add_file(fileGrp, ignore=ignore, force=force, **file_dict)

    # save changes to disk
    workspace.save_mets()
414
415
416
# ----------------------------------------------------------------------
417
# ocrd workspace find
418
# ----------------------------------------------------------------------
419
420
@workspace_cli.command('find')
@mets_find_options
@click.option('-k', '--output-field', help="Output field. Repeat for multiple fields, will be joined with tab",
              default=['local_filename'],
              show_default=True,
              multiple=True,
              type=click.Choice([
                  'url',
                  'mimetype',
                  'page_id',
                  'pageId',
                  'file_id',
                  'ID',
                  'file_grp',
                  'fileGrp',
                  'basename',
                  'basename_without_extension',
                  'local_filename',
              ]))
@click.option('--download', is_flag=True, help="Download found files to workspace and change location in METS file")
@click.option('--undo-download', is_flag=True, help="Remove all downloaded files from the METS and workspace")
@click.option('--keep-files', is_flag=True, help="Do not remove downloaded files from the workspace with --undo-download")
@click.option('--wait', type=int, default=0, help="Wait this many seconds between download requests")
@pass_workspace
def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, include_fileGrp, exclude_fileGrp, download, undo_download, keep_files, wait):
    """
    Find files.

    (If any ``FILTER`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    # NOTE: the docstring above is rendered by click as the --help text.
    # Normalise the snake_case field names to the attribute names used below.
    snake_to_camel = {"file_id": "ID", "page_id": "pageId", "file_grp": "fileGrp"}
    output_field = [snake_to_camel.get(x, x) for x in output_field]
    modified_mets = False
    ret = []
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        mets_server_url=ctx.mets_server_url,
    )
    with pushd_popd(workspace.directory):
        for f in workspace.find_files(
                file_id=file_id,
                file_grp=file_grp,
                mimetype=mimetype,
                page_id=page_id,
                include_fileGrp=include_fileGrp,
                exclude_fileGrp=exclude_fileGrp,
            ):
            # For 'pageId' the file ID is stored as a placeholder; it is
            # swapped for the physical page in one batch lookup further down.
            ret_entry = [f.ID if field == 'pageId' else str(getattr(f, field)) or '' for field in output_field]
            if download and not f.local_filename:
                workspace.download_file(f)
                modified_mets = True
                if wait:
                    time.sleep(wait)
            if undo_download and f.url and f.local_filename:
                # Remember the path before clearing the attribute: it is still
                # needed for the log message and for deleting the file.
                local_path = f.local_filename
                ret_entry = [f'Removed local_filename {local_path}']
                f.local_filename = None
                modified_mets = True
                if not keep_files:
                    ctx.log.debug("rm %s [cwd=%s]", local_path, workspace.directory)
                    unlink(local_path)
            ret.append(ret_entry)
    if modified_mets:
        workspace.save_mets()
    if 'pageId' in output_field:
        # NOTE(review): rows replaced by a single-column "Removed ..." message
        # no longer carry the file ID placeholder at ``idx`` -- confirm how
        # --undo-download is meant to combine with the pageId output field.
        idx = output_field.index('pageId')
        fileIds = [fields[idx] for fields in ret]
        pages = workspace.mets.get_physical_pages(for_fileIds=fileIds)
        for fields, page in zip(ret, pages):
            fields[idx] = page or ''
    for fields in ret:
        print('\t'.join(fields))
494
495
# ----------------------------------------------------------------------
496
# ocrd workspace remove
497
# ----------------------------------------------------------------------
498
499
@workspace_cli.command('remove')
@click.option('-k', '--keep-file', help="Do not delete file from file system", default=False, is_flag=True)
@click.option('-f', '--force', help="Continue even if mets:file or file on file system does not exist", default=False, is_flag=True)
@click.argument('ID', nargs=-1)
@pass_workspace
def workspace_remove_file(ctx, id, force, keep_file):  # pylint: disable=redefined-builtin
    """
    Delete files (given by their ID attribute ``ID``).

    (If any ``ID`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    # NOTE: the docstring above is rendered by click as the --help text.
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    # ``id`` holds every ID given on the command line (nargs=-1).
    for single_id in id:
        workspace.remove_file(single_id, force=force, keep_file=keep_file)
    # The METS is written once, after all removals.
    workspace.save_mets()
515
516
517
# ----------------------------------------------------------------------
518
# ocrd workspace rename-group
519
# ----------------------------------------------------------------------
520
521
@workspace_cli.command('rename-group')
@click.argument('OLD', nargs=1)
@click.argument('NEW', nargs=1)
@pass_workspace
def rename_group(ctx, old, new):
    """
    Rename fileGrp (USE attribute ``OLD`` to ``NEW``).
    """
    # NOTE: the docstring above is rendered by click as the --help text.
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)
    # First positional argument is the existing name, second the new one.
    workspace.rename_file_group(old, new)
    workspace.save_mets()
532
533
# ----------------------------------------------------------------------
534
# ocrd workspace remove-group
535
# ----------------------------------------------------------------------
536
537
@workspace_cli.command('remove-group')
@click.option('-r', '--recursive', help="Delete any files in the group before the group itself", default=False, is_flag=True)
@click.option('-f', '--force', help="Continue removing even if group or containing files not found in METS", default=False, is_flag=True)
@click.option('-k', '--keep-files', help="Do not delete files from file system", default=False, is_flag=True)
@click.argument('GROUP', nargs=-1)
@pass_workspace
def remove_group(ctx, group, recursive, force, keep_files):
    """
    Delete fileGrps (given by their USE attribute ``GROUP``).

    (If any ``GROUP`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    # NOTE: the docstring above is rendered by click as the --help text.
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
    )
    removal_options = {'recursive': recursive, 'force': force, 'keep_files': keep_files}
    # ``group`` holds every GROUP given on the command line (nargs=-1).
    for group_name in group:
        workspace.remove_file_group(group_name, **removal_options)
    # The METS is written once, after all removals.
    workspace.save_mets()
554
555
# ----------------------------------------------------------------------
556
# ocrd workspace prune-files
557
# ----------------------------------------------------------------------
558
559
@workspace_cli.command('prune-files')
@click.option('-G', '--file-grp', help="fileGrp USE", metavar='FILTER')
@click.option('-m', '--mimetype', help="Media type to look for", metavar='FILTER')
@click.option('-g', '--page-id', help="Page ID", metavar='FILTER')
@click.option('-i', '--file-id', help="ID", metavar='FILTER')
@pass_workspace
def prune_files(ctx, file_grp, mimetype, page_id, file_id):
    """
    Removes mets:files that point to non-existing local files

    (If any ``FILTER`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    # NOTE: the docstring above is rendered by click as the --help text.
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
    # Existence is checked from inside the workspace directory.
    with pushd_popd(workspace.directory):
        for f in workspace.find_files(
            file_id=file_id,
            file_grp=file_grp,
            mimetype=mimetype,
            page_id=page_id,
        ):
            try:
                if not f.local_filename or not exists(f.local_filename):
                    workspace.mets.remove_file(f.ID)
            except Exception as e:
                # '%s' (not '%f'): ``f`` is a file object, not a float, so a
                # float placeholder would break the log record formatting.
                ctx.log.exception("Error removing %s: %s", f, e)
                # Bare raise re-raises with the original traceback intact.
                raise
        workspace.save_mets()
587
588
# ----------------------------------------------------------------------
589
# ocrd workspace clean
590
# ----------------------------------------------------------------------
591
592
@workspace_cli.command('clean')
@click.option('-n', '--dry-run', help="Don't actually do anything to the filesystem, just preview", default=False, is_flag=True)
@click.option('-d', '--directories', help="Remove untracked directories in addition to untracked files", default=False, is_flag=True)
@click.argument('path_glob', nargs=-1, required=False)
@pass_workspace
def clean(ctx, dry_run, directories, path_glob):
    """
    Removes files and directories from the workspace that are not
    referenced by any mets:files.

    PATH_GLOB can be a shell glob expression to match file names,
    directory names (recursively), or plain paths. All paths are
    resolved w.r.t. the workspace.

    If no PATH_GLOB are specified, then all files and directories
    may match.
    """
    # NOTE: the docstring above is rendered by click as the --help text.
    log = getLogger('ocrd.cli.workspace.clean')
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
    # Tracked paths: every local file known to the METS plus the METS itself.
    allowed_files = {normpath(f.local_filename) for f in workspace.find_files(local_only=True)}
    allowed_files.add(relpath(workspace.mets_target, start=workspace.directory))
    # Protect every ancestor directory of a tracked file, not only its
    # immediate parent: rmdir on an ancestor that still holds tracked content
    # would fail with "directory not empty".
    allowed_dirs = set()
    for tracked in allowed_files:
        parent = dirname(tracked)
        allowed_dirs.add(parent)
        while parent and parent != dirname(parent):
            parent = dirname(parent)
            allowed_dirs.add(parent)
    with pushd_popd(workspace.directory):
        if path_glob:
            paths = []
            for expression in path_glob:
                if isabs(expression):
                    # Globs are evaluated relative to the workspace directory.
                    expression = relpath(expression)
                # An expression that matches nothing is kept as a plain path.
                paths += glob(expression, recursive=True) or [expression]
        else:
            paths = glob('**', recursive=True)
        file_paths = [path for path in paths if not isdir(path)]
        for path in file_paths:
            if normpath(path) in allowed_files:
                continue
            if dry_run:
                log.info('unlink(%s)' % path)
            else:
                unlink(path)
        if not directories:
            return
        dir_paths = [path for path in paths if isdir(path)]
        # Deepest directories first, so children are removed before parents.
        for path in sorted(dir_paths, key=lambda p: p.count('/'), reverse=True):
            if normpath(path) in allowed_dirs:
                continue
            if dry_run:
                log.info('rmdir(%s)' % path)
            else:
                rmdir(path)
641
642
# ----------------------------------------------------------------------
643
# ocrd workspace list-group
644
# ----------------------------------------------------------------------
645
646
@workspace_cli.command('list-group')
@pass_workspace
def list_groups(ctx):
    """
    List fileGrp USE attributes
    """
    # Open the workspace described by the CLI context; nothing is saved here.
    ws = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
    )
    # Emit one fileGrp name per output line.
    group_names = ws.mets.file_groups
    print("\n".join(group_names))
654
655
# ----------------------------------------------------------------------
# ocrd workspace list-page
# ----------------------------------------------------------------------
658
659
@workspace_cli.command('list-page')
@click.option(
    '-k', '--output-field',
    help="Output field. Repeat for multiple fields, will be joined with tab",
    default=['ID'],
    show_default=True,
    multiple=True,
    type=click.Choice(METS_PAGE_DIV_ATTRIBUTE.names()),
)
@click.option(
    '-f', '--output-format',
    help="Output format",
    type=click.Choice(['one-per-line', 'comma-separated', 'json']),
    default='one-per-line',
)
@click.option(
    '-D', '--chunk-number',
    help="Partition the return value into n roughly equally sized chunks",
    default=1,
    type=int,
)
@click.option(
    '-C', '--chunk-index',
    help="Output the nth chunk of results, -1 for all of them.",
    default=None,
    type=int,
)
@click.option(
    '-r', '--page-id-range',
    help="Restrict the pages to those matching the provided range, based on the @ID attribute. Separate start/end with ..",
)
@click.option(
    '-R', '--numeric-range',
    help="Restrict the pages to those in the range, in numerical document order. Separate start/end with ..",
)
@pass_workspace
def list_pages(ctx, output_field, output_format, chunk_number, chunk_index, page_id_range, numeric_range):
    """
    List physical page IDs

    (If any ``FILTER`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    ws = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)

    # The page ID range is only forwarded to the file search when the ID
    # column is among the requested output fields.
    file_filter = {}
    if page_id_range and 'ID' in output_field:
        file_filter['pageId'] = page_id_range
    page_ids = sorted({f.pageId for f in ws.mets.find_files(**file_filter) if f.pageId})

    # Build one row (list of column values) per page.
    # NOTE(review): click hands over `multiple=True` options as a tuple, so
    # this comparison against a list presumably never holds when invoked from
    # the command line and the generic branch is taken instead -- confirm.
    if output_field == ['ID']:
        rows = [[page_id] for page_id in page_ids]
    else:
        page_divs = ws.mets.get_physical_pages(for_pageIds=','.join(page_ids), return_divs=True)
        # Attributes missing on a page div are rendered as the string 'None'.
        rows = [[div.get(field, 'None') for field in output_field] for div in page_divs]

    if numeric_range:
        # START..END is 1-based and inclusive, hence the shifted slice start.
        first, last = (int(bound) for bound in numeric_range.split('..'))
        rows = rows[first - 1:last]

    chunks = partition_list(rows, chunk_number, chunk_index)

    if output_format == 'json':
        rendered = [dumps(chunks)]
    else:
        # Both plain-text formats join the columns of a row with a tab and
        # differ only in how the rows of one chunk are joined.
        row_separator = {'one-per-line': '\n', 'comma-separated': ','}.get(output_format)
        if row_separator is None:
            rendered = []
        else:
            rendered = [row_separator.join("\t".join(row) for row in chunk) for chunk in chunks]
    # Chunks are always separated by a newline.
    print('\n'.join(rendered))
714
715
# ----------------------------------------------------------------------
# ocrd workspace get-id
# ----------------------------------------------------------------------
718
719
@workspace_cli.command('get-id')
@pass_workspace
def get_id(ctx):
    """
    Get METS id if any
    """
    ws = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
    )
    identifier = ws.mets.unique_identifier
    # Stay silent when the METS carries no (or an empty) identifier.
    if not identifier:
        return
    print(identifier)
729
730
# ----------------------------------------------------------------------
# ocrd workspace set-id
# ----------------------------------------------------------------------
733
734
@workspace_cli.command('set-id')
@click.argument('ID')
@pass_workspace
def set_id(ctx, id):   # pylint: disable=redefined-builtin
    """
    Set METS ID.

    If one of the supported identifier mechanisms is used, will set this identifier.

    Otherwise will create a new <mods:identifier type="purl">{{ ID }}</mods:identifier>.
    """
    # Opened with the context's backup setting because the METS is written back.
    ws = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    mets = ws.mets
    mets.unique_identifier = id
    # Persist the modified METS.
    ws.save_mets()
748
749
@workspace_cli.command('update-page')
@click.option('--set', 'attr_value_pairs', help=f"set mets:div ATTR to VALUE. possible keys: {METS_PAGE_DIV_ATTRIBUTE.names()}", metavar="ATTR VALUE", nargs=2, multiple=True)
@click.option('--order', help="DEPRECATED - use --set ATTR VALUE", metavar='ORDER')
@click.option('--orderlabel', help="DEPRECATED - use --set ATTR VALUE", metavar='ORDERLABEL')
@click.option('--contentids', help="DEPRECATED - use --set ATTR VALUE", metavar='CONTENTIDS')
@click.argument('PAGE_ID')
@pass_workspace
def update_page(ctx, attr_value_pairs, order, orderlabel, contentids, page_id):
    """
    Update the @ID, @ORDER, @ORDERLABEL, @LABEL or @CONTENTIDS attributes of the mets:div with @ID=PAGE_ID
    """
    # Each --set occurrence contributes one (ATTR, VALUE) pair; a later pair
    # for the same ATTR replaces an earlier one.
    update_kwargs = dict(attr_value_pairs)
    # The deprecated single-attribute options are applied last, so they take
    # precedence over a --set for the same attribute.
    if order:
        update_kwargs['ORDER'] = order
    if orderlabel:
        update_kwargs['ORDERLABEL'] = orderlabel
    if contentids:
        update_kwargs['CONTENTIDS'] = contentids
    try:
        workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
        workspace.mets.update_physical_page_attributes(page_id, **update_kwargs)
        workspace.save_mets()
    except Exception as err:
        # CLI boundary: report any failure as a one-line message and signal
        # it through a non-zero exit status instead of a traceback.
        print(f"Error: {err}")
        sys.exit(1)
774
775
# ----------------------------------------------------------------------
# ocrd workspace merge
# ----------------------------------------------------------------------
778
779
def _handle_json_option(ctx, param, value):
780
    return parse_json_string_or_file(value) if value else None
781
782
@workspace_cli.command('merge')
@click.argument('METS_PATH')
@click.option('--overwrite/--no-overwrite', is_flag=True, default=False, help="Overwrite on-disk file in case of file name conflicts with data from METS_PATH")
@click.option('--force/--no-force', is_flag=True, default=False, help="Overwrite mets:file from --mets with mets:file from METS_PATH if IDs clash")
@click.option('--copy-files/--no-copy-files', is_flag=True, help="Copy files as well", default=True, show_default=True)
@click.option('--fileGrp-mapping', help="JSON object mapping src to dest fileGrp", callback=_handle_json_option)
@click.option('--fileId-mapping', help="JSON object mapping src to dest file ID", callback=_handle_json_option)
@click.option('--pageId-mapping', help="JSON object mapping src to dest page ID", callback=_handle_json_option)
@mets_find_options
@pass_workspace
def merge(ctx, overwrite, force, copy_files, filegrp_mapping, fileid_mapping, pageid_mapping, file_grp, file_id, page_id, mimetype, include_fileGrp, exclude_fileGrp, mets_path):   # pylint: disable=redefined-builtin
    """
    Merges this workspace with the workspace that contains ``METS_PATH``

    Pass a JSON string or file to ``--fileGrp-mapping``, ``--fileId-mapping`` or ``--pageId-mapping``
    in order to rename all fileGrp, file ID or page ID values, respectively.

    The ``--file-id``, ``--page-id``, ``--mimetype`` and ``--file-grp`` options have
    the same semantics as in ``ocrd workspace find``, see ``ocrd workspace find --help``
    for an explanation.
    """
    mets_path = Path(mets_path)
    # All three mapping options go through the _handle_json_option callback,
    # and the file ID / page ID mappings are handed on as-is. Only decode the
    # fileGrp mapping here if it is still raw JSON text: json.loads() raises
    # TypeError for anything that is not str/bytes/bytearray, which made a
    # mapping already decoded by the callback unusable.
    if filegrp_mapping and isinstance(filegrp_mapping, (str, bytes, bytearray)):
        filegrp_mapping = loads(filegrp_mapping)
    # Destination: the workspace addressed by the CLI context.
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
    # Source: the workspace whose METS file is METS_PATH (parent directory
    # plus file name).
    other_workspace = Workspace(ctx.resolver, directory=str(mets_path.parent), mets_basename=str(mets_path.name))
    workspace.merge(
        other_workspace,
        force=force,
        overwrite=overwrite,
        copy_files=copy_files,
        fileGrp_mapping=filegrp_mapping,
        fileId_mapping=fileid_mapping,
        pageId_mapping=pageid_mapping,
        file_grp=file_grp,
        file_id=file_id,
        page_id=page_id,
        mimetype=mimetype,
        include_fileGrp=include_fileGrp,
        exclude_fileGrp=exclude_fileGrp,
    )
    # Persist the merged METS of the destination workspace.
    workspace.save_mets()
824
825
# ----------------------------------------------------------------------
# ocrd workspace backup
# ----------------------------------------------------------------------
828
829
@workspace_cli.group('backup')
@click.pass_context
def workspace_backup_cli(ctx): # pylint: disable=unused-argument
    """
    Backing and restoring workspaces - dev edition
    """
    # Intentionally empty: this function only declares the `backup` command
    # group; its sub-commands are attached via @workspace_backup_cli.command.
835
836
@workspace_backup_cli.command('add')
@pass_workspace
def workspace_backup_add(ctx):
    """
    Create a new backup
    """
    ws = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    # The backup manager operates on the workspace opened above.
    manager = WorkspaceBackupManager(ws)
    manager.add()
844
845
@workspace_backup_cli.command('list')
@pass_workspace
def workspace_backup_list(ctx):
    """
    List backups
    """
    ws = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    manager = WorkspaceBackupManager(ws)
    # One backup entry per output line.
    for entry in manager.list():
        print(entry)
854
855
@workspace_backup_cli.command('restore')
@click.option('-f', '--choose-first', help="Restore first matching version if more than one", is_flag=True)
@click.argument('bak') #, type=click.Path(dir_okay=False, readable=True, resolve_path=True))
@pass_workspace
def workspace_backup_restore(ctx, choose_first, bak):
    """
    Restore backup BAK
    """
    ws = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    manager = WorkspaceBackupManager(ws)
    # BAK and the --choose-first flag are handed to the manager unchanged.
    manager.restore(bak, choose_first)
865
866
@workspace_backup_cli.command('undo')
@pass_workspace
def workspace_backup_undo(ctx):
    """
    Restore the last backup
    """
    ws = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    manager = WorkspaceBackupManager(ws)
    manager.undo()
874
875
876
# ----------------------------------------------------------------------
# ocrd workspace server
# ----------------------------------------------------------------------
879
880
@workspace_cli.group('server')
@pass_workspace
def workspace_serve_cli(ctx): # pylint: disable=unused-argument
    """Control a METS server for this workspace"""
    # Raise explicitly rather than via `assert`: assert statements are removed
    # when Python runs with -O, which would silently skip this validation.
    # AssertionError is kept so the failure type stays the same.
    if not ctx.mets_server_url:
        raise AssertionError("For METS server commands, you must provide '-U/--mets-server-url'")
885
886
@workspace_serve_cli.command('stop')
@pass_workspace
def workspace_serve_stop(ctx): # pylint: disable=unused-argument
    """Stop the METS server"""
    # The workspace is opened with the server URL from the CLI context, and
    # stop() is then called on its METS object.
    ws = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, mets_server_url=ctx.mets_server_url)
    mets = ws.mets
    mets.stop()
897
898
@workspace_serve_cli.command('start')
@pass_workspace
def workspace_serve_start(ctx): # pylint: disable=unused-argument
    """
    Start a METS server

    (For TCP backend, pass a network interface to bind to as the '-U/--mets-server-url' parameter.)
    """
    # The workspace itself is opened without a server URL; the URL is only
    # given to the server object.
    ws = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)
    server = OcrdMetsServer(workspace=ws, url=ctx.mets_server_url)
    server.startup()
910