ocrd.cli.workspace.workspace_remove_file()   A
last analyzed

Complexity

Conditions 2

Size

Total Lines 18
Code Lines 12

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 12
dl 0
loc 18
rs 9.8
c 0
b 0
f 0
cc 2
nop 4
1
"""
2
OCR-D CLI: workspace management
3
4
.. click:: ocrd.cli.workspace:workspace_cli
5
    :prog: ocrd workspace
6
    :nested: full
7
"""
8
import os
9
from os import rmdir, unlink
10
from os.path import dirname, relpath, normpath, exists, join, isabs, isdir
11
from pathlib import Path
12
from json import loads, dumps
13
import sys
14
from glob import glob   # XXX pathlib.Path.glob does not support absolute globs
15
import re
16
import time
17
18
import click
19
20
from ocrd import Resolver, Workspace, WorkspaceValidator, WorkspaceBackupManager
21
from ocrd.mets_server import OcrdMetsServer
22
from ocrd_utils import getLogger, initLogging, pushd_popd, EXT_TO_MIME, safe_filename, parse_json_string_or_file, partition_list, DEFAULT_METS_BASENAME
23
from ocrd.decorators import mets_find_options
24
from . import command_with_replaced_help
25
from ocrd_models.constants import METS_PAGE_DIV_ATTRIBUTE
26
27
28
class WorkspaceCtx():
    """
    State shared by the ``ocrd workspace`` subcommands.

    Stores the resolver, the resolved directory / METS location, the
    METS server address and the backup flag, and creates ``Workspace``
    objects from these settings on request.
    """

    def __init__(self, directory, mets_url, mets_basename=DEFAULT_METS_BASENAME, mets_server_url=None, automatic_backup=False):
        """
        Resolve the METS-related arguments and store the results.

        A truthy ``mets_basename`` is reported as deprecated in the log.
        """
        self.log = getLogger('ocrd.cli.workspace')
        if mets_basename:
            deprecation = DeprecationWarning('--mets-basename is deprecated. Use --mets/--directory instead.')
            self.log.warning(deprecation)
        self.resolver = Resolver()
        resolved = self.resolver.resolve_mets_arguments(directory, mets_url, mets_basename, mets_server_url)
        self.directory, self.mets_url, self.mets_basename, self.mets_server_url = resolved
        self.automatic_backup = automatic_backup

    def workspace(self):
        """Create a new ``Workspace`` from the stored settings."""
        settings = {
            'directory': self.directory,
            'mets_basename': self.mets_basename,
            'automatic_backup': self.automatic_backup,
            'mets_server_url': self.mets_server_url,
        }
        return Workspace(self.resolver, **settings)

    def backup_manager(self):
        """Create a ``WorkspaceBackupManager`` wrapping a freshly created workspace."""
        current_workspace = self.workspace()
        return WorkspaceBackupManager(current_workspace)


# Decorator built by click for handing the WorkspaceCtx object to a command
pass_workspace = click.make_pass_decorator(WorkspaceCtx)
52
53
# ----------------------------------------------------------------------
54
# ocrd workspace
55
# ----------------------------------------------------------------------
56
57
@click.group("workspace")
@click.option('-d', '--directory', envvar='WORKSPACE_DIR', type=click.Path(file_okay=False), metavar='WORKSPACE_DIR', help='Changes the workspace folder location [default: METS_URL directory or .]"')
@click.option('-M', '--mets-basename', default=None, help='METS file basename. Deprecated, use --mets/--directory')
@click.option('-m', '--mets', default=None, help='The path/URL of the METS file [default: WORKSPACE_DIR/mets.xml]', metavar="METS_URL")
@click.option('-U', '--mets-server-url', 'mets_server_url', help="TCP host URI or UDS path of METS server")
@click.option('--backup', default=False, help="Backup mets.xml whenever it is saved.", is_flag=True)
@click.pass_context
def workspace_cli(ctx, directory, mets, mets_basename, mets_server_url, backup):
    """
    Managing workspaces

    A workspace comprises a METS file and a directory as point of reference.

    Operates on the file system directly or via a METS server 
    (already running via some prior `server start` subcommand).
    """
    # Set up logging, then publish the group-level options to the
    # subcommands as a WorkspaceCtx stored on the click context.
    initLogging()
    shared_options = {
        'mets_url': mets,
        'mets_basename': mets_basename,
        'mets_server_url': mets_server_url,
        'automatic_backup': backup,
    }
    ctx.obj = WorkspaceCtx(directory, **shared_options)
81
82
# ----------------------------------------------------------------------
83
# ocrd workspace validate
84
# ----------------------------------------------------------------------
85
86
@workspace_cli.command('validate', cls=command_with_replaced_help(
    (r' \[METS_URL\]', ''))) # XXX deprecated argument
@pass_workspace
@click.option('-a', '--download', is_flag=True, help="Download all files")
@click.option('-s', '--skip', help="Tests to skip", default=[], multiple=True, type=click.Choice(
    ['imagefilename', 'alternativeimage_filename', 'alternativeimage_comments', 'dimension', 'pixel_density', 'page', 'page_xsd',
     'url', 'mets_fileid_page_pcgtsid', 'mets_unique_identifier', 'mets_files', 'mets_xsd']))
@click.option('--page-textequiv-consistency', '--page-strictness', help="How strict to check PAGE multi-level textequiv consistency", type=click.Choice(['strict', 'lax', 'fix', 'off']), default='strict')
@click.option('--page-coordinate-consistency', help="How fierce to check PAGE multi-level coordinate consistency", type=click.Choice(['poly', 'baseline', 'both', 'off']), default='poly')
@click.argument('mets_url', default=None, required=False)
def workspace_validate(ctx, mets_url, download, skip, page_textequiv_consistency, page_coordinate_consistency):
    """
    Validate a workspace

    METS_URL can be a URL, an absolute path or a path relative to $PWD.
    If not given, use --mets accordingly.

    Check that the METS and its referenced file contents
    abide by the OCR-D specifications.
    """
    LOG = getLogger('ocrd.cli.workspace.validate')
    if mets_url:
        # The positional METS_URL is deprecated; point users at the
        # equivalent invocation of *this* subcommand.
        LOG.warning(DeprecationWarning("Use 'ocrd workspace --mets METS validate' instead of argument 'METS_URL' ('%s')" % mets_url))
    else:
        # Fall back to the METS location resolved from the group options
        mets_url = ctx.mets_url
    report = WorkspaceValidator.validate(
        ctx.resolver,
        mets_url,
        src_dir=ctx.directory,
        skip=skip,
        download=download,
        page_strictness=page_textequiv_consistency,
        page_coordinate_consistency=page_coordinate_consistency
    )
    # The report is always printed; an invalid report additionally
    # terminates the process with exit status 128.
    print(report.to_xml())
    if not report.is_valid:
        sys.exit(128)
123
124
# ----------------------------------------------------------------------
125
# ocrd workspace clone
126
# ----------------------------------------------------------------------
127
128
@workspace_cli.command('clone', cls=command_with_replaced_help(
    (r' \[WORKSPACE_DIR\]', ''))) # XXX deprecated argument
@click.option('-f', '--clobber-mets', help="Overwrite existing METS file", default=False, is_flag=True)
@click.option('-a', '--download', is_flag=True, help="Download all selected files and add local path references in METS file afterwards")
@click.argument('mets_url')
@mets_find_options
# XXX deprecated
@click.argument('workspace_dir', default=None, required=False)
@pass_workspace
def workspace_clone(ctx, clobber_mets, download, file_grp, file_id, page_id, mimetype, include_fileGrp, exclude_fileGrp, mets_url, workspace_dir):
    """
    Create a workspace from METS_URL and return the directory

    METS_URL can be a URL, an absolute path or a path relative to $PWD.
    METS_URL can also be an OAI-PMH GetRecord URL wrapping a METS file.

    Additional options pertain to the selection of files / fileGrps / pages
    to be downloaded, if --download is used.
    """
    LOG = getLogger('ocrd.cli.workspace.clone')
    if workspace_dir:
        # Deprecated positional argument overrides the group-level directory
        LOG.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR clone' instead of argument 'WORKSPACE_DIR' ('%s')" % workspace_dir))
        ctx.directory = workspace_dir

    assert not ctx.mets_server_url, \
        f"clone cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"

    # File selection filters, passed through to the resolver unchanged
    file_selection = {
        'fileGrp': file_grp,
        'ID': file_id,
        'pageId': page_id,
        'mimetype': mimetype,
        'include_fileGrp': include_fileGrp,
        'exclude_fileGrp': exclude_fileGrp,
    }
    cloned = ctx.resolver.workspace_from_url(
        mets_url,
        dst_dir=ctx.directory,
        mets_basename=ctx.mets_basename,
        clobber_mets=clobber_mets,
        download=download,
        **file_selection
    )
    cloned.save_mets()
    print(cloned.directory)
169
170
# ----------------------------------------------------------------------
171
# ocrd workspace init
172
# ----------------------------------------------------------------------
173
174
@workspace_cli.command('init', cls=command_with_replaced_help(
    (r' \[DIRECTORY\]', ''))) # XXX deprecated argument
@click.option('-f', '--clobber-mets', help="Clobber mets.xml if it exists", is_flag=True, default=False)
# XXX deprecated
@click.argument('directory', default=None, required=False)
@pass_workspace
def workspace_init(ctx, clobber_mets, directory):
    """
    Create a workspace with an empty METS file in DIRECTORY or CWD.

    """
    LOG = getLogger('ocrd.cli.workspace.init')
    if directory:
        # Deprecated positional argument overrides the group-level directory
        LOG.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR init' instead of argument 'DIRECTORY' ('%s')" % directory))
        ctx.directory = directory
    assert not ctx.mets_server_url, \
        f"init cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
    creation_options = {
        'directory': ctx.directory,
        'mets_basename': ctx.mets_basename,
        'clobber_mets': clobber_mets,
    }
    created = ctx.resolver.workspace_from_nothing(**creation_options)
    created.save_mets()
    print(created.directory)
198
199
# ----------------------------------------------------------------------
200
# ocrd workspace add
201
# ----------------------------------------------------------------------
202
203
@workspace_cli.command('add')
@click.option('-G', '--file-grp', help="fileGrp USE", required=True, metavar='FILE_GRP')
@click.option('-i', '--file-id', help="ID for the file", required=True, metavar='FILE_ID')
@click.option('-m', '--mimetype', help="Media type of the file. Guessed from extension if not provided", required=False, metavar='TYPE')
@click.option('-g', '--page-id', help="ID of the physical page", metavar='PAGE_ID')
@click.option('-C', '--check-file-exists', help="Whether to ensure FNAME exists", is_flag=True, default=False)
@click.option('--ignore', help="Do not check whether file exists.", default=False, is_flag=True)
@click.option('--force', help="If file with ID already exists, replace it. No effect if --ignore is set.", default=False, is_flag=True)
@click.argument('fname', required=True)
@pass_workspace
def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, ignore, check_file_exists, force, fname):
    """
    Add a file or http(s) URL FNAME to METS in a workspace.
    If FNAME is not an http(s) URL and is not a workspace-local existing file, try to copy to workspace.
    """
    workspace = ctx.workspace()
    log = getLogger('ocrd.cli.workspace.add')

    # Without an explicit media type, look one up by file extension.
    # A failed lookup is only logged; processing continues.
    if not mimetype:
        extension = Path(fname).suffix
        try:
            mimetype = EXT_TO_MIME[extension]
        except KeyError:
            log.error("Cannot guess mimetype from extension '%s' for '%s'. Set --mimetype explicitly" % (extension, fname))
        else:
            log.info("Guessed mimetype to be %s" % mimetype)

    log.debug("Adding '%s'", fname)
    local_filename = None
    is_remote = fname.startswith(('http://', 'https://'))
    if not is_remote:
        if not fname.startswith(ctx.directory):
            if not isabs(fname) and exists(join(ctx.directory, fname)):
                # Relative path that already exists below the workspace directory
                fname = join(ctx.directory, fname)
            else:
                log.debug("File '%s' is not in workspace, copying", fname)
                try:
                    fname = ctx.resolver.download_to_directory(ctx.directory, fname, subdir=file_grp)
                except FileNotFoundError:
                    # A missing source is fatal only when -C was requested
                    if check_file_exists:
                        log.error("File '%s' does not exist, halt execution!" % fname)
                        sys.exit(1)
        file_is_missing = check_file_exists and not exists(fname)
        if file_is_missing:
            log.error("File '%s' does not exist, halt execution!" % fname)
            sys.exit(1)
        # Paths starting with the workspace directory are stored relative to it
        if fname.startswith(ctx.directory):
            fname = relpath(fname, ctx.directory)
        local_filename = fname

    if not page_id:
        log.warning("You did not provide '--page-id/-g', so the file you added is not linked to a specific page.")
    workspace.add_file(
        file_grp,
        file_id=file_id,
        mimetype=mimetype,
        page_id=page_id,
        force=force,
        ignore=ignore,
        local_filename=local_filename,
        url=fname,
    )
    workspace.save_mets()
262
263
# ----------------------------------------------------------------------
264
# ocrd workspace bulk-add
265
# ----------------------------------------------------------------------
266
267
# pylint: disable=broad-except
268
@workspace_cli.command('bulk-add')
@click.option('-r', '--regex', help="Regular expression matching the FILE_GLOB filesystem paths to define named captures usable in the other parameters", required=True)
@click.option('-m', '--mimetype', help="Media type of the file. If not provided, guess from filename", required=False)
@click.option('-g', '--page-id', help="physical page ID of the file", required=False)
@click.option('-i', '--file-id', help="ID of the file. If not provided, derive from fileGrp and filename", required=False)
@click.option('-u', '--url', help="Remote URL of the file", required=False)
@click.option('-l', '--local-filename', help="Local filesystem path in the workspace directory (copied from source file if different)", required=False)
@click.option('-G', '--file-grp', help="File group USE of the file", required=True)
@click.option('-n', '--dry-run', help="Don't actually do anything to the METS or filesystem, just preview", default=False, is_flag=True)
@click.option('-S', '--source-path', 'src_path_option', help="File path to copy from (if different from FILE_GLOB values)", required=False)
@click.option('-I', '--ignore', help="Disable checking for existing file entries (faster)", default=False, is_flag=True)
@click.option('-f', '--force', help="Replace existing file entries with the same ID (no effect when --ignore is set, too)", default=False, is_flag=True)
@click.option('-s', '--skip', help="Skip files not matching --regex (instead of failing)", default=False, is_flag=True)
@click.argument('file_glob', nargs=-1, required=True)
@pass_workspace
def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_filename, file_grp, dry_run, file_glob, src_path_option, ignore, force, skip):
    """
    Add files in bulk to an OCR-D workspace.

    FILE_GLOB can either be a shell glob expression to match file names,
    or a list of expressions or '-', in which case expressions are read from STDIN.

    After globbing, --regex is matched against each expression resulting from FILE_GLOB, and can
    define named groups reusable in the --page-id, --file-id, --mimetype, --url, --source-path and
    --file-grp options, e.g. by referencing the group name 'grp' from the regex as '{{ grp }}'.

    If the FILE_GLOB expressions do not denote the file names themselves
    (but arbitrary strings for --regex matching), then use --source-path to set
    the actual file paths to use. (This could involve fixed strings or group references.)

    \b
    Examples:
        ocrd workspace bulk-add \\
                --regex '(?P<fileGrp>[^/]+)/page_(?P<pageid>.*)\\.[^.]+' \\
                --page-id 'PHYS_{{ pageid }}' \\
                --file-grp "{{ fileGrp }}" \\
                path/to/files/*/*.*
        \b
        echo "path/to/src/file.xml SEG/page_p0001.xml" \\
        | ocrd workspace bulk-add \\
                --regex '(?P<src>.*?) (?P<fileGrp>.+?)/page_(?P<pageid>.*)\\.(?P<ext>[^\\.]*)' \\
                --file-id 'FILE_{{ fileGrp }}_{{ pageid }}' \\
                --page-id 'PHYS_{{ pageid }}' \\
                --file-grp "{{ fileGrp }}" \\
                --local-filename '{{ fileGrp }}/FILE_{{ pageid }}.{{ ext }}' \\
                -

        \b
        { echo PHYS_0001 BIN FILE_0001_BIN.IMG-wolf BIN/FILE_0001_BIN.IMG-wolf.png; \\
          echo PHYS_0001 BIN FILE_0001_BIN BIN/FILE_0001_BIN.xml; \\
          echo PHYS_0002 BIN FILE_0002_BIN.IMG-wolf BIN/FILE_0002_BIN.IMG-wolf.png; \\
          echo PHYS_0002 BIN FILE_0002_BIN BIN/FILE_0002_BIN.xml; \\
        } | ocrd workspace bulk-add -r '(?P<pageid>.*) (?P<filegrp>.*) (?P<fileid>.*) (?P<local_filename>.*)' \\
          -G '{{ filegrp }}' -g '{{ pageid }}' -i '{{ fileid }}' -S '{{ local_filename }}' -

    """
    log = getLogger('ocrd.cli.workspace.bulk-add') # pylint: disable=redefined-outer-name
    workspace = ctx.workspace()

    try:
        pat = re.compile(regex)
    except Exception as e:
        log.error("Invalid regex: %s" % e)
        sys.exit(1)

    # Collect the expressions to process: lines from STDIN when the only
    # argument is '-', otherwise the expansion of each glob (a glob that
    # matches nothing is kept as a literal expression).
    file_paths = []
    from_stdin = file_glob == ('-',)
    if from_stdin:
        file_paths += [Path(x.strip('\n')) for x in sys.stdin.readlines()]
    else:
        for fglob in file_glob:
            expanded = glob(fglob)
            if not expanded:
                file_paths += [Path(fglob)]
            else:
                file_paths += [Path(x) for x in expanded]

    for i, file_path in enumerate(file_paths):
        log.info("[%4d/%d] %s" % (i + 1, len(file_paths), file_path))

        # match regex
        m = pat.match(str(file_path))
        if not m:
            if skip:
                continue
            log.error("File '%s' not matched by regex: '%s'" % (file_path, regex))
            sys.exit(1)
        group_dict = m.groupdict()

        # set up file info
        file_dict = {'local_filename': local_filename, 'url': url, 'mimetype': mimetype, 'file_id': file_id, 'page_id': page_id, 'file_grp': file_grp}

        # Flag to track whether 'local_filename' should be 'src'
        local_filename_is_src = False

        # expand templates
        for param_name in file_dict:
            if not file_dict[param_name]:
                if param_name == 'local_filename':
                    local_filename_is_src = True
                    continue
                elif param_name in ['mimetype', 'file_id']:
                    # auto-filled below once the other
                    # replacements have happened
                    continue
                elif param_name == 'url':
                    # Remote URL is not required
                    continue
                raise ValueError(f"OcrdFile attribute '{param_name}' unset ({file_dict})")
            for group_name in group_dict:
                file_dict[param_name] = file_dict[param_name].replace('{{ %s }}' % group_name, group_dict[group_name])

        # Where to copy from
        if src_path_option:
            src_path = src_path_option
            for group_name in group_dict:
                src_path = src_path.replace('{{ %s }}' % group_name, group_dict[group_name])
            srcpath = Path(src_path)
        else:
            srcpath = file_path

        # derive --file-id from the source filename if --file-id was not explicitly set
        # (srcpath is file_path itself when no --source-path is given)
        if not file_id:
            file_dict['file_id'] = safe_filename('%s_%s' % (file_dict['file_grp'], srcpath.stem))
        if not mimetype:
            try:
                file_dict['mimetype'] = EXT_TO_MIME[srcpath.suffix]
            except KeyError:
                log.error("Cannot guess MIME type from extension '%s' for '%s'. Set --mimetype explicitly" % (srcpath.suffix, srcpath))

        # copy files if src != url
        if local_filename_is_src:
            file_dict['local_filename'] = srcpath
        else:
            destpath = Path(workspace.directory, file_dict['local_filename'])
            if srcpath != destpath and not destpath.exists():
                log.info("cp '%s' '%s'", srcpath, destpath)
                if not dry_run:
                    # The target may be nested several levels deep
                    # (e.g. 'GRP/sub/file.xml'), so create all missing parents.
                    destpath.parent.mkdir(parents=True, exist_ok=True)
                    destpath.write_bytes(srcpath.read_bytes())

        # Add to workspace (or not)
        fileGrp = file_dict.pop('file_grp')
        if dry_run:
            log.info('workspace.add_file(%s)' % file_dict)
        else:
            workspace.add_file(fileGrp, ignore=ignore, force=force, **file_dict) # pylint: disable=redundant-keyword-arg

    # save changes to disk -- a dry run must not touch the METS file
    if not dry_run:
        workspace.save_mets()
420
421
422
# ----------------------------------------------------------------------
423
# ocrd workspace find
424
# ----------------------------------------------------------------------
425
426
@workspace_cli.command('find')
@mets_find_options
@click.option('-k', '--output-field', help="Output field. Repeat for multiple fields, will be joined with tab",
              default=['local_filename'],
              show_default=True,
              multiple=True,
              type=click.Choice([
                  'url',
                  'mimetype',
                  'page_id',
                  'pageId',
                  'file_id',
                  'ID',
                  'file_grp',
                  'fileGrp',
                  'basename',
                  'basename_without_extension',
                  'local_filename',
              ]))
@click.option('--download', is_flag=True, help="Download found files to workspace and change location in METS file")
@click.option('--undo-download', is_flag=True, help="Remove all downloaded files from the METS and workspace")
@click.option('--keep-files', is_flag=True, help="Do not remove downloaded files from the workspace with --undo-download")
@click.option('--wait', type=int, default=0, help="Wait this many seconds between download requests")
@pass_workspace
def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, include_fileGrp, exclude_fileGrp, download, undo_download, keep_files, wait):
    """
    Find files.

    (If any ``FILTER`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    # Normalise the snake_case field aliases to the attribute names used below
    snake_to_camel = {"file_id": "ID", "page_id": "pageId", "file_grp": "fileGrp"}
    output_field = [snake_to_camel.get(x, x) for x in output_field]
    modified_mets = False
    ret = []
    workspace = ctx.workspace()
    with pushd_popd(workspace.directory):
        for f in workspace.find_files(
                file_id=file_id,
                file_grp=file_grp,
                mimetype=mimetype,
                page_id=page_id,
                include_fileGrp=include_fileGrp,
                exclude_fileGrp=exclude_fileGrp,
            ):
            if download and not f.local_filename:
                workspace.download_file(f)
                modified_mets = True
                if wait:
                    time.sleep(wait)
            if undo_download and f.url and f.local_filename:
                modified_mets = True
                if not keep_files:
                    ctx.log.debug("rm %s [cwd=%s]", f.local_filename, workspace.directory)
                    unlink(f.local_filename)
                f.local_filename = None
            ret_entry = []
            for field in output_field:
                if field == 'pageId':
                    # Placeholder: the file ID is swapped for the page ID
                    # after the loop, in one batched lookup.
                    ret_entry.append(f.ID)
                else:
                    # Unset attributes are printed as an empty column
                    # rather than the literal string 'None'.
                    value = getattr(f, field)
                    ret_entry.append('' if value is None else str(value))
            ret.append(ret_entry)
    if modified_mets:
        workspace.save_mets()
    if 'pageId' in output_field:
        idx = output_field.index('pageId')
        fileIds = [fields[idx] for fields in ret]
        pages = workspace.mets.get_physical_pages(for_fileIds=fileIds)
        for fields, page in zip(ret, pages):
            fields[idx] = page or ''
    for fields in ret:
        print('\t'.join(fields))
494
495
# ----------------------------------------------------------------------
496
# ocrd workspace remove
497
# ----------------------------------------------------------------------
498
499
@workspace_cli.command('remove')
@click.option('-k', '--keep-file', help="Do not delete file from file system", default=False, is_flag=True)
@click.option('-f', '--force', help="Continue even if mets:file or file on file system does not exist", default=False, is_flag=True)
@click.argument('ID', nargs=-1)
@pass_workspace
def workspace_remove_file(ctx, id, force, keep_file):  # pylint: disable=redefined-builtin
    """
    Delete files (given by their ID attribute ``ID``).

    (If any ``ID`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    assert not ctx.mets_server_url, \
        f"remove cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
    workspace = ctx.workspace()
    # The same flags apply to every ID given on the command line
    removal_options = {'force': force, 'keep_file': keep_file}
    for file_id in id:
        workspace.remove_file(file_id, **removal_options)
    workspace.save_mets()
517
518
519
# ----------------------------------------------------------------------
520
# ocrd workspace rename-group
521
# ----------------------------------------------------------------------
522
523
@workspace_cli.command('rename-group')
@click.argument('OLD', nargs=1)
@click.argument('NEW', nargs=1)
@pass_workspace
def rename_group(ctx, old, new):
    """
    Rename fileGrp (USE attribute ``OLD`` to ``NEW``).
    """
    assert not ctx.mets_server_url, \
        f"rename-group cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
    workspace = ctx.workspace()
    # First argument is the existing group name, second the name it gets
    workspace.rename_file_group(old, new)
    workspace.save_mets()
536
537
# ----------------------------------------------------------------------
538
# ocrd workspace remove-group
539
# ----------------------------------------------------------------------
540
541
@workspace_cli.command('remove-group')
@click.option('-r', '--recursive', help="Delete any files in the group before the group itself", default=False, is_flag=True)
@click.option('-f', '--force', help="Continue removing even if group or containing files not found in METS", default=False, is_flag=True)
@click.option('-k', '--keep-files', help="Do not delete files from file system", default=False, is_flag=True)
@click.argument('GROUP', nargs=-1)
@pass_workspace
def remove_group(ctx, group, recursive, force, keep_files):
    """
    Delete fileGrps (given by their USE attribute ``GROUP``).

    (If any ``GROUP`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    assert not ctx.mets_server_url, \
        f"remove-group cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
    workspace = ctx.workspace()
    # The same flags apply to every GROUP given on the command line
    removal_options = {'recursive': recursive, 'force': force, 'keep_files': keep_files}
    for group_name in group:
        workspace.remove_file_group(group_name, **removal_options)
    workspace.save_mets()
560
561
# ----------------------------------------------------------------------
562
# ocrd workspace prune-files
563
# ----------------------------------------------------------------------
564
565
@workspace_cli.command('prune-files')
@click.option('-G', '--file-grp', help="fileGrp USE", metavar='FILTER')
@click.option('-m', '--mimetype', help="Media type to look for", metavar='FILTER')
@click.option('-g', '--page-id', help="Page ID", metavar='FILTER')
@click.option('-i', '--file-id', help="ID", metavar='FILTER')
@pass_workspace
def prune_files(ctx, file_grp, mimetype, page_id, file_id):
    """
    Removes mets:files that point to non-existing local files

    (If any ``FILTER`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    assert not ctx.mets_server_url, \
        f"prune-files cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
    workspace = ctx.workspace()
    # Local filenames are checked relative to the workspace directory
    with pushd_popd(workspace.directory):
        for f in workspace.find_files(
            file_id=file_id,
            file_grp=file_grp,
            mimetype=mimetype,
            page_id=page_id,
        ):
            try:
                # Drop entries without a local file or whose local file is gone
                if not f.local_filename or not exists(f.local_filename):
                    workspace.mets.remove_file(f.ID)
            except Exception as e:
                # '%s' (not '%f'): f is a file object, not a float, and a
                # float placeholder would make the log message unformattable.
                ctx.log.exception("Error removing %s: %s", f, e)
                raise
        workspace.save_mets()
595
596
# ----------------------------------------------------------------------
597
# ocrd workspace clean
598
# ----------------------------------------------------------------------
599
600
@workspace_cli.command('clean')
@click.option('-n', '--dry-run', help="Don't actually do anything to the filesystem, just preview", default=False, is_flag=True)
@click.option('-d', '--directories', help="Remove untracked directories in addition to untracked files", default=False, is_flag=True)
@click.argument('path_glob', nargs=-1, required=False)
@pass_workspace
def clean(ctx, dry_run, directories, path_glob):
    """
    Removes files and directories from the workspace that are not
    referenced by any mets:files.

    PATH_GLOB can be a shell glob expression to match file names,
    directory names (recursively), or plain paths. All paths are
    resolved w.r.t. the workspace.

    If no PATH_GLOB are specified, then all files and directories
    may match.
    """
    workspace = ctx.workspace()
    # Paths that must survive: every locally referenced file plus the METS
    # file itself. Kept as sets because they are only used for membership
    # tests inside the loops below.
    allowed_files = {normpath(f.local_filename) for f in workspace.find_files(local_only=True)}
    allowed_files.add(relpath(workspace.mets_target, start=workspace.directory))
    allowed_dirs = {dirname(path) for path in allowed_files}
    with pushd_popd(workspace.directory):
        if len(path_glob):
            paths = []
            for expression in path_glob:
                if isabs(expression):
                    expression = relpath(expression)
                # An expression matching nothing is kept as a plain path
                paths += glob(expression, recursive=True) or [expression]
        else:
            paths = glob('**', recursive=True)
        file_paths = [path for path in paths if not isdir(path)]
        for path in file_paths:
            if normpath(path) in allowed_files:
                continue
            if dry_run:
                ctx.log.info('unlink(%s)' % path)
            else:
                unlink(path)
        if not directories:
            return
        dir_paths = [path for path in paths if isdir(path)]
        # Deepest directories first, so children are handled before parents
        for path in sorted(dir_paths, key=lambda p: p.count('/'), reverse=True):
            if normpath(path) in allowed_dirs:
                continue
            if dry_run:
                ctx.log.info('rmdir(%s)' % path)
            else:
                rmdir(path)
648
649
# ----------------------------------------------------------------------
650
# ocrd workspace list-group
651
# ----------------------------------------------------------------------
652
653
@workspace_cli.command('list-group')
@pass_workspace
def list_groups(ctx):
    """
    List fileGrp USE attributes, one per line
    """
    file_groups = ctx.workspace().mets.file_groups
    # A single print call: the groups are newline-joined, so an empty
    # list still emits one (empty) line.
    print("\n".join(file_groups))
661
662
# ----------------------------------------------------------------------
663
# ocrd workspace list-page
664
# ----------------------------------------------------------------------
665
666
@workspace_cli.command('list-page')
@click.option('-k', '--output-field', help="Output field. Repeat for multiple fields, will be joined with tab",
              default=['ID'],
              show_default=True,
              multiple=True,
              type=click.Choice(METS_PAGE_DIV_ATTRIBUTE.names()))
@click.option('-f', '--output-format', help="Output format", type=click.Choice(['one-per-line', 'comma-separated', 'json']), default='one-per-line')
@click.option('-D', '--chunk-number', help="Partition the return value into n roughly equally sized chunks", default=1, type=int)
@click.option('-C', '--chunk-index', help="Output the nth chunk of results, -1 for all of them.", default=None, type=int)
@click.option('-r', '--page-id-range', help="Restrict the pages to those matching the provided range, based on the @ID attribute. Separate start/end with ..")
@click.option('-R', '--numeric-range', help="Restrict the pages to those in the range, in numerical document order. Separate start/end with ..")
@pass_workspace
def list_pages(ctx, output_field, output_format, chunk_number, chunk_index, page_id_range, numeric_range):
    """
    List physical page IDs

    (If any ``FILTER`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    # NOTE(review): no option named FILTER is declared on this command; the
    # sentence above presumably refers to --page-id-range -- confirm.
    mets = ctx.workspace().mets

    # Build one row (list of strings) per page. The page divs are only
    # queried when a page filter or a non-default field selection is given;
    # otherwise the plain list of page IDs is enough.
    if page_id_range or list(output_field) != ['ID']:
        page_divs = mets.get_physical_pages(for_pageIds=page_id_range, return_divs=True)
        # Attributes missing on a div are rendered as the literal string 'None'.
        rows = [[page_div.get(field, 'None') for field in output_field]
                for page_div in page_divs]
    else:
        rows = [[page_id] for page_id in mets.physical_pages]

    if numeric_range:
        # START..END is 1-based and inclusive on both ends.
        first, last = (int(bound) for bound in numeric_range.split('..'))
        rows = rows[first - 1:last]

    # NOTE(review): partition_list is assumed to return a list of chunks,
    # each chunk being a list of rows -- confirm against ocrd_utils.
    chunks = partition_list(rows, chunk_number, chunk_index)

    if output_format == 'json':
        output = [dumps(chunks)]
    elif output_format in ('one-per-line', 'comma-separated'):
        # Fields of a row are always tab-joined; only the separator between
        # the rows of a chunk differs between the two textual formats.
        row_separator = '\n' if output_format == 'one-per-line' else ','
        output = [row_separator.join("\t".join(row) for row in chunk)
                  for chunk in chunks]
    else:
        output = []
    print('\n'.join(output))
717
718
# ----------------------------------------------------------------------
719
# ocrd workspace get-id
720
# ----------------------------------------------------------------------
721
722
@workspace_cli.command('get-id')
@pass_workspace
def get_id(ctx):
    """
    Print the METS unique identifier, if any
    """
    mets_id = ctx.workspace().mets.unique_identifier
    # Nothing is printed at all when no identifier is set.
    if not mets_id:
        return
    print(mets_id)
732
733
# ----------------------------------------------------------------------
734
# ocrd workspace set-id
735
# ----------------------------------------------------------------------
736
737
@workspace_cli.command('set-id')
@click.argument('ID')
@pass_workspace
def set_id(ctx, id):   # pylint: disable=redefined-builtin
    """
    Set METS ID.

    If one of the supported identifier mechanisms is used, will set this identifier.

    Otherwise will create a new <mods:identifier type="purl">{{ ID }}</mods:identifier>.
    """
    # The parameter must be called `id` because click derives it from the
    # 'ID' argument declared above.
    ws = ctx.workspace()
    ws.mets.unique_identifier = id
    # Persist the modified METS.
    ws.save_mets()
751
752
@workspace_cli.command('update-page')
@click.option('--set', 'attr_value_pairs', help=f"set mets:div ATTR to VALUE. possible keys: {METS_PAGE_DIV_ATTRIBUTE.names()}", metavar="ATTR VALUE", nargs=2, multiple=True)
@click.option('--order', help="DEPRECATED - use --set ATTR VALUE", metavar='ORDER')
@click.option('--orderlabel', help="DEPRECATED - use --set ATTR VALUE", metavar='ORDERLABEL')
@click.option('--contentids', help="DEPRECATED - use --set ATTR VALUE", metavar='CONTENTIDS')
@click.argument('PAGE_ID')
@pass_workspace
def update_page(ctx, attr_value_pairs, order, orderlabel, contentids, page_id):
    """
    Update the @ID, @ORDER, @ORDERLABEL, @LABEL or @CONTENTIDS attributes of the mets:div with @ID=PAGE_ID
    """
    # Every --set occurrence contributes one (ATTR, VALUE) pair.
    update_kwargs = dict(attr_value_pairs)
    # The deprecated single-attribute options are applied last, so they
    # override a --set given for the same attribute.
    if order:
        update_kwargs['ORDER'] = order
    if orderlabel:
        update_kwargs['ORDERLABEL'] = orderlabel
    if contentids:
        update_kwargs['CONTENTIDS'] = contentids
    try:
        # Explicit raise instead of `assert`, so the guard is still enforced
        # under `python -O`. It is reported through the handler below exactly
        # like any other failure ("Error: ..." and exit code 1).
        if ctx.mets_server_url:
            raise RuntimeError(
                f"update-page cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}")
        workspace = ctx.workspace()
        workspace.mets.update_physical_page_attributes(page_id, **update_kwargs)
        workspace.save_mets()
    except Exception as err:
        # Top-level CLI boundary: show the message without a traceback and
        # signal failure through the exit status.
        print(f"Error: {err}")
        sys.exit(1)
779
780
# ----------------------------------------------------------------------
781
# ocrd workspace merge
782
# ----------------------------------------------------------------------
783
784
def _handle_json_option(ctx, param, value):
785
    return parse_json_string_or_file(value) if value else None
786
787
@workspace_cli.command('merge')
@click.argument('METS_PATH')
@click.option('--overwrite/--no-overwrite', is_flag=True, default=False, help="Overwrite on-disk file in case of file name conflicts with data from METS_PATH")
@click.option('--force/--no-force', is_flag=True, default=False, help="Overwrite mets:file from --mets with mets:file from METS_PATH if IDs clash")
@click.option('--copy-files/--no-copy-files', is_flag=True, help="Copy files as well", default=True, show_default=True)
@click.option('--fileGrp-mapping', help="JSON object mapping src to dest fileGrp", callback=_handle_json_option)
@click.option('--fileId-mapping', help="JSON object mapping src to dest file ID", callback=_handle_json_option)
@click.option('--pageId-mapping', help="JSON object mapping src to dest page ID", callback=_handle_json_option)
@mets_find_options
@pass_workspace
def merge(ctx, overwrite, force, copy_files, filegrp_mapping, fileid_mapping, pageid_mapping, file_grp, file_id, page_id, mimetype, include_fileGrp, exclude_fileGrp, mets_path):   # pylint: disable=redefined-builtin
    """
    Merges this workspace with the workspace that contains ``METS_PATH``

    Pass a JSON string or file to ``--fileGrp-mapping``, ``--fileId-mapping`` or ``--pageId-mapping``
    in order to rename all fileGrp, file ID or page ID values, respectively.

    The ``--file-id``, ``--page-id``, ``--mimetype`` and ``--file-grp`` options have
    the same semantics as in ``ocrd workspace find``, see ``ocrd workspace find --help``
    for an explanation.
    """
    mets_path = Path(mets_path)
    # --fileGrp-mapping has already been through the _handle_json_option
    # callback, like the other two mapping options (which are passed on
    # untouched below). Presumably the callback yields an already-parsed
    # object, on which json.loads would raise TypeError -- so only decode
    # when the value is still a raw (non-empty) JSON string.
    if filegrp_mapping and isinstance(filegrp_mapping, str):
        filegrp_mapping = loads(filegrp_mapping)
    assert not ctx.mets_server_url, \
        f"merge cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
    workspace = ctx.workspace()
    # The workspace to merge from is the directory containing METS_PATH,
    # with the METS file name as its basename.
    other_workspace = Workspace(ctx.resolver, directory=str(mets_path.parent), mets_basename=str(mets_path.name))
    workspace.merge(
        other_workspace,
        force=force,
        overwrite=overwrite,
        copy_files=copy_files,
        fileGrp_mapping=filegrp_mapping,
        fileId_mapping=fileid_mapping,
        pageId_mapping=pageid_mapping,
        file_grp=file_grp,
        file_id=file_id,
        page_id=page_id,
        mimetype=mimetype,
        include_fileGrp=include_fileGrp,
        exclude_fileGrp=exclude_fileGrp,
    )
    # Persist the merged METS of this (the target) workspace.
    workspace.save_mets()
831
832
# ----------------------------------------------------------------------
833
# ocrd workspace backup
834
# ----------------------------------------------------------------------
835
836
@workspace_cli.group('backup')
@pass_workspace
def workspace_backup_cli(ctx): # pylint: disable=unused-argument
    """
    Backing and restoring workspaces - dev edition
    """
    # Guard for every `backup` subcommand: refuse to run when a METS server
    # URL was given.
    mets_server_url = ctx.mets_server_url
    assert not mets_server_url, "Workspace backups currently not interoperable with METS Server"
843
844
@workspace_backup_cli.command('add')
@pass_workspace
def workspace_backup_add(ctx):
    """
    Create a new backup
    """
    # Delegates entirely to the backup manager obtained from the context.
    ctx.backup_manager().add()
852
853
@workspace_backup_cli.command('list')
@pass_workspace
def workspace_backup_list(ctx):
    """
    List backups, one per line
    """
    backups = ctx.backup_manager().list()
    for backup in backups:
        print(backup)
862
863
@workspace_backup_cli.command('restore')
@click.option('-f', '--choose-first', help="Restore first matching version if more than one", is_flag=True)
@click.argument('bak') #, type=click.Path(dir_okay=False, readable=True, resolve_path=True))
@pass_workspace
def workspace_backup_restore(ctx, choose_first, bak):
    """
    Restore backup BAK
    """
    # BAK and the --choose-first flag are handed to the backup manager as-is.
    manager = ctx.backup_manager()
    manager.restore(bak, choose_first)
873
874
@workspace_backup_cli.command('undo')
@pass_workspace
def workspace_backup_undo(ctx):
    """
    Restore the last backup
    """
    # Delegates entirely to the backup manager obtained from the context.
    ctx.backup_manager().undo()
882
883
884
# ----------------------------------------------------------------------
885
# ocrd workspace server
886
# ----------------------------------------------------------------------
887
888
@workspace_cli.group('server')
@pass_workspace
def workspace_serve_cli(ctx): # pylint: disable=unused-argument
    """Control a METS server for this workspace"""
    # Guard for every `server` subcommand: a METS server URL is mandatory.
    mets_server_url = ctx.mets_server_url
    assert mets_server_url, "For METS server commands, you must provide '-U/--mets-server-url'"
893
894
@workspace_serve_cli.command('stop')
@pass_workspace
def workspace_serve_stop(ctx):
    """Stop the METS server (saving changes to disk)"""
    # The request is issued through the METS object of the context's workspace.
    mets = ctx.workspace().mets
    mets.stop()
900
901
@workspace_serve_cli.command('reload')
@pass_workspace
def workspace_serve_reload(ctx):
    """Reload the METS server from disk"""
    # The request is issued through the METS object of the context's workspace.
    mets = ctx.workspace().mets
    mets.reload()
907
908
@workspace_serve_cli.command('save')
@pass_workspace
def workspace_serve_save(ctx):
    """Save the METS changes to disk"""
    # The request is issued through the METS object of the context's workspace.
    mets = ctx.workspace().mets
    mets.save()
914
915
@workspace_serve_cli.command('start')
@pass_workspace
def workspace_serve_start(ctx):
    """
    Start a METS server

    (For TCP backend, pass a network interface to bind to as the '-U/--mets-server-url' parameter.)
    """
    # The served workspace is constructed directly from the context's
    # resolver, directory and METS basename rather than via ctx.workspace().
    served_workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)
    server = OcrdMetsServer(
        workspace=served_workspace,
        url=ctx.mets_server_url,
    )
    server.startup()
927