Passed
Pull Request — master (#1236)
by
unknown
03:16
created

ocrd.cli.workspace.workspace_init()   A

Complexity

Conditions 2

Size

Total Lines 22
Code Lines 16

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 16
dl 0
loc 22
rs 9.6
c 0
b 0
f 0
cc 2
nop 3
1
"""
2
OCR-D CLI: workspace management
3
4
.. click:: ocrd.cli.workspace:workspace_cli
5
    :prog: ocrd workspace
6
    :nested: full
7
"""
8
import os
9
from os import getcwd, rmdir, unlink
10
from os.path import dirname, relpath, normpath, exists, join, isabs, isdir
11
from pathlib import Path
12
from json import loads, dumps
13
import sys
14
from glob import glob   # XXX pathlib.Path.glob does not support absolute globs
15
import re
16
import time
17
import numpy as np
18
19
import click
20
21
from ocrd import Resolver, Workspace, WorkspaceValidator, WorkspaceBackupManager
22
from ocrd.mets_server import OcrdMetsServer
23
from ocrd_utils import getLogger, initLogging, pushd_popd, EXT_TO_MIME, safe_filename, parse_json_string_or_file, partition_list, DEFAULT_METS_BASENAME
24
from ocrd.decorators import mets_find_options
25
from . import command_with_replaced_help
26
from ocrd_models.constants import METS_PAGE_DIV_ATTRIBUTE
27
28
29
class WorkspaceCtx():
    """
    State shared between the ``ocrd workspace`` group and its subcommands.

    Holds the logger, the resolver, the resolved workspace directory / METS
    location / METS basename / METS server URL, and the backup flag.
    """

    def __init__(self, directory, mets_url, mets_basename=DEFAULT_METS_BASENAME, mets_server_url=None, automatic_backup=False):
        """
        Resolve the workspace location arguments and store them on the context.

        Args:
            directory: workspace directory as given on the command line (may be unset)
            mets_url: path or URL of the METS file (may be unset)
            mets_basename: deprecated METS basename; any truthy value triggers a warning
            mets_server_url: address of a METS server (may be unset)
            automatic_backup: whether subcommands should request METS backups on save
        """
        logger = getLogger('ocrd.cli.workspace')
        self.log = logger
        if mets_basename:
            deprecation = DeprecationWarning('--mets-basename is deprecated. Use --mets/--directory instead.')
            logger.warning(deprecation)
        resolver = Resolver()
        self.resolver = resolver
        # The resolver returns the four values in exactly this order.
        resolved = resolver.resolve_mets_arguments(directory, mets_url, mets_basename, mets_server_url)
        self.directory, self.mets_url, self.mets_basename, self.mets_server_url = resolved
        self.automatic_backup = automatic_backup
39
40
41
# Decorator created by click for handing the WorkspaceCtx object to the
# subcommands below (they receive it as their first parameter, ``ctx``).
pass_workspace = click.make_pass_decorator(WorkspaceCtx)
42
43
# ----------------------------------------------------------------------
44
# ocrd workspace
45
# ----------------------------------------------------------------------
46
47
@click.group("workspace")
@click.option('-d', '--directory', envvar='WORKSPACE_DIR', type=click.Path(file_okay=False), metavar='WORKSPACE_DIR', help='Changes the workspace folder location [default: METS_URL directory or .]')
@click.option('-M', '--mets-basename', default=None, help='METS file basename. Deprecated, use --mets/--directory')
@click.option('-m', '--mets', default=None, help='The path/URL of the METS file [default: WORKSPACE_DIR/mets.xml]', metavar="METS_URL")
@click.option('-U', '--mets-server-url', 'mets_server_url', help="TCP host of METS server")
@click.option('--backup', default=False, help="Backup mets.xml whenever it is saved.", is_flag=True)
@click.pass_context
def workspace_cli(ctx, directory, mets, mets_basename, mets_server_url, backup):
    """
    Managing workspaces

    A workspace comprises a METS file and a directory as point of reference.

    Operates on the file system directly or via a METS server
    (already running via some prior `server start` subcommand).
    """
    # The docstring above doubles as the command's --help text.
    initLogging()
    # Everything the subcommands need is bundled into one context object,
    # which they obtain through the ``pass_workspace`` decorator.
    ctx.obj = WorkspaceCtx(
        directory,
        mets_url=mets,
        mets_basename=mets_basename,
        mets_server_url=mets_server_url,
        automatic_backup=backup
    )
71
72
# ----------------------------------------------------------------------
73
# ocrd workspace validate
74
# ----------------------------------------------------------------------
75
76
@workspace_cli.command('validate', cls=command_with_replaced_help(
    (r' \[METS_URL\]', ''))) # XXX deprecated argument
@pass_workspace
@click.option('-a', '--download', is_flag=True, help="Download all files")
@click.option('-s', '--skip', help="Tests to skip", default=[], multiple=True, type=click.Choice(
    ['imagefilename', 'dimension', 'pixel_density', 'page', 'url', 'page_xsd', 'mets_fileid_page_pcgtsid',
     'mets_unique_identifier', 'mets_file_group_names', 'mets_files', 'mets_xsd']))
@click.option('--page-textequiv-consistency', '--page-strictness', help="How strict to check PAGE multi-level textequiv consistency", type=click.Choice(['strict', 'lax', 'fix', 'off']), default='strict')
@click.option('--page-coordinate-consistency', help="How fierce to check PAGE multi-level coordinate consistency", type=click.Choice(['poly', 'baseline', 'both', 'off']), default='poly')
@click.argument('mets_url', default=None, required=False)
def workspace_validate(ctx, mets_url, download, skip, page_textequiv_consistency, page_coordinate_consistency):
    """
    Validate a workspace

    METS_URL can be a URL, an absolute path or a path relative to $PWD.
    If not given, use --mets accordingly.

    Check that the METS and its referenced file contents
    abide by the OCR-D specifications.
    """
    LOG = getLogger('ocrd.cli.workspace.validate')
    if mets_url:
        # The positional METS_URL is deprecated; point users at the equivalent
        # invocation of *this* subcommand (the message used to say 'init').
        LOG.warning(DeprecationWarning("Use 'ocrd workspace --mets METS validate' instead of argument 'METS_URL' ('%s')" % mets_url))
    else:
        mets_url = ctx.mets_url
    report = WorkspaceValidator.validate(
        ctx.resolver,
        mets_url,
        src_dir=ctx.directory,
        skip=skip,
        download=download,
        page_strictness=page_textequiv_consistency,
        page_coordinate_consistency=page_coordinate_consistency
    )
    print(report.to_xml())
    # Exit status 128 signals an invalid workspace to the calling shell.
    if not report.is_valid:
        sys.exit(128)
113
114
# ----------------------------------------------------------------------
115
# ocrd workspace clone
116
# ----------------------------------------------------------------------
117
118
@workspace_cli.command('clone', cls=command_with_replaced_help(
    (r' \[WORKSPACE_DIR\]', ''))) # XXX deprecated argument
@click.option('-f', '--clobber-mets', help="Overwrite existing METS file", default=False, is_flag=True)
@click.option('-a', '--download', is_flag=True, help="Download all files and change location in METS file after cloning")
@click.argument('mets_url')
@mets_find_options
# XXX deprecated
@click.argument('workspace_dir', default=None, required=False)
@pass_workspace
def workspace_clone(ctx, clobber_mets, download, file_grp, file_id, page_id, mimetype, include_fileGrp, exclude_fileGrp, mets_url, workspace_dir):
    """
    Create a workspace from METS_URL and return the directory

    METS_URL can be a URL, an absolute path or a path relative to $PWD.
    If METS_URL is not provided, use --mets accordingly.
    METS_URL can also be an OAI-PMH GetRecord URL wrapping a METS file.
    """
    LOG = getLogger('ocrd.cli.workspace.clone')
    if workspace_dir:
        # Deprecated positional argument still overrides --directory.
        LOG.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR clone' instead of argument 'WORKSPACE_DIR' ('%s')" % workspace_dir))
        ctx.directory = workspace_dir

    workspace = ctx.resolver.workspace_from_url(
        mets_url,
        dst_dir=ctx.directory,
        mets_basename=ctx.mets_basename,
        clobber_mets=clobber_mets,
        download=download,
        # Forward the fileGrp filter as well; it was accepted on the command
        # line but previously dropped, unlike the ID/pageId/mimetype filters.
        # NOTE(review): keyword spelled like its camelCase siblings - confirm
        # against the resolver's accepted filter names.
        fileGrp=file_grp,
        ID=file_id,
        pageId=page_id,
        mimetype=mimetype,
        include_fileGrp=include_fileGrp,
        exclude_fileGrp=exclude_fileGrp,
    )
    workspace.save_mets()
    print(workspace.directory)
154
155
# ----------------------------------------------------------------------
156
# ocrd workspace init
157
# ----------------------------------------------------------------------
158
159
@workspace_cli.command('init', cls=command_with_replaced_help(
    (r' \[DIRECTORY\]', ''))) # XXX deprecated argument
@click.option('-f', '--clobber-mets', help="Clobber mets.xml if it exists", is_flag=True, default=False)
# XXX deprecated
@click.argument('directory', default=None, required=False)
@pass_workspace
def workspace_init(ctx, clobber_mets, directory):
    """
    Create a workspace with an empty METS file in DIRECTORY or CWD.

    """
    init_logger = getLogger('ocrd.cli.workspace.init')
    if directory:
        # The positional DIRECTORY is deprecated but still takes precedence.
        deprecation = DeprecationWarning("Use 'ocrd workspace --directory DIR init' instead of argument 'DIRECTORY' ('%s')" % directory)
        init_logger.warning(deprecation)
        ctx.directory = directory
    creation_args = {
        'directory': ctx.directory,
        'mets_basename': ctx.mets_basename,
        'clobber_mets': clobber_mets,
    }
    new_workspace = ctx.resolver.workspace_from_nothing(**creation_args)
    new_workspace.save_mets()
    # Report where the workspace ended up.
    print(new_workspace.directory)
181
182
# ----------------------------------------------------------------------
183
# ocrd workspace add
184
# ----------------------------------------------------------------------
185
186
@workspace_cli.command('add')
@click.option('-G', '--file-grp', help="fileGrp USE", required=True, metavar='FILE_GRP')
@click.option('-i', '--file-id', help="ID for the file", required=True, metavar='FILE_ID')
@click.option('-m', '--mimetype', help="Media type of the file. Guessed from extension if not provided", required=False, metavar='TYPE')
@click.option('-g', '--page-id', help="ID of the physical page", metavar='PAGE_ID')
@click.option('-C', '--check-file-exists', help="Whether to ensure FNAME exists", is_flag=True, default=False)
@click.option('--ignore', help="Do not check whether file exists.", default=False, is_flag=True)
@click.option('--force', help="If file with ID already exists, replace it. No effect if --ignore is set.", default=False, is_flag=True)
@click.argument('fname', required=True)
@pass_workspace
def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, ignore, check_file_exists, force, fname):
    """
    Add a file or http(s) URL FNAME to METS in a workspace.
    If FNAME is not an http(s) URL and is not a workspace-local existing file, try to copy to workspace.
    """
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
        mets_server_url=ctx.mets_server_url,
    )

    log = getLogger('ocrd.cli.workspace.add')

    # Fall back to the extension table when no media type was given; a miss
    # is only reported, the file is still added.
    if not mimetype:
        extension = Path(fname).suffix
        try:
            mimetype = EXT_TO_MIME[extension]
        except KeyError:
            log.error("Cannot guess mimetype from extension '%s' for '%s'. Set --mimetype explicitly" % (extension, fname))
        else:
            log.info("Guessed mimetype to be %s" % mimetype)

    log.debug("Adding '%s'", fname)
    workspace_dir = ctx.directory
    is_remote = fname.startswith(('http://', 'https://'))
    local_filename = None
    if not is_remote:
        if not fname.startswith(workspace_dir):
            candidate = join(workspace_dir, fname)
            if not isabs(fname) and exists(candidate):
                # Relative path that already exists inside the workspace.
                fname = candidate
            else:
                log.debug("File '%s' is not in workspace, copying", fname)
                try:
                    fname = ctx.resolver.download_to_directory(workspace_dir, fname, subdir=file_grp)
                except FileNotFoundError:
                    # Missing source is fatal only when -C was requested.
                    if check_file_exists:
                        log.error("File '%s' does not exist, halt execution!" % fname)
                        sys.exit(1)
        if check_file_exists and not exists(fname):
            log.error("File '%s' does not exist, halt execution!" % fname)
            sys.exit(1)
        if fname.startswith(workspace_dir):
            # Store workspace-relative paths.
            fname = relpath(fname, workspace_dir)
        local_filename = fname

    if not page_id:
        log.warning("You did not provide '--page-id/-g', so the file you added is not linked to a specific page.")
    workspace.add_file(
        file_grp,
        file_id=file_id,
        mimetype=mimetype,
        page_id=page_id,
        force=force,
        ignore=ignore,
        local_filename=local_filename,
        url=fname,
    )
    workspace.save_mets()
251
252
# ----------------------------------------------------------------------
253
# ocrd workspace bulk-add
254
# ----------------------------------------------------------------------
255
256
# pylint: disable=broad-except
257
@workspace_cli.command('bulk-add')
@click.option('-r', '--regex', help="Regular expression matching the FILE_GLOB filesystem paths to define named captures usable in the other parameters", required=True)
@click.option('-m', '--mimetype', help="Media type of the file. If not provided, guess from filename", required=False)
@click.option('-g', '--page-id', help="physical page ID of the file", required=False)
@click.option('-i', '--file-id', help="ID of the file. If not provided, derive from fileGrp and filename", required=False)
@click.option('-u', '--url', help="Remote URL of the file", required=False)
@click.option('-l', '--local-filename', help="Local filesystem path in the workspace directory (copied from source file if different)", required=False)
@click.option('-G', '--file-grp', help="File group USE of the file", required=True)
@click.option('-n', '--dry-run', help="Don't actually do anything to the METS or filesystem, just preview", default=False, is_flag=True)
@click.option('-S', '--source-path', 'src_path_option', help="File path to copy from (if different from FILE_GLOB values)", required=False)
@click.option('-I', '--ignore', help="Disable checking for existing file entries (faster)", default=False, is_flag=True)
@click.option('-f', '--force', help="Replace existing file entries with the same ID (no effect when --ignore is set, too)", default=False, is_flag=True)
@click.option('-s', '--skip', help="Skip files not matching --regex (instead of failing)", default=False, is_flag=True)
@click.argument('file_glob', nargs=-1, required=True)
@pass_workspace
def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_filename, file_grp, dry_run, file_glob, src_path_option, ignore, force, skip):
    """
    Add files in bulk to an OCR-D workspace.

    FILE_GLOB can either be a shell glob expression to match file names,
    or a list of expressions or '-', in which case expressions are read from STDIN.

    After globbing, --regex is matched against each expression resulting from FILE_GLOB, and can
    define named groups reusable in the --page-id, --file-id, --mimetype, --url, --source-path and
    --file-grp options, e.g. by referencing the group name 'grp' from the regex as '{{ grp }}'.

    If the FILE_GLOB expressions do not denote the file names themselves
    (but arbitrary strings for --regex matching), then use --source-path to set
    the actual file paths to use. (This could involve fixed strings or group references.)

    \b
    Examples:
        ocrd workspace bulk-add \\
                --regex '(?P<fileGrp>[^/]+)/page_(?P<pageid>.*)\\.[^.]+' \\
                --page-id 'PHYS_{{ pageid }}' \\
                --file-grp "{{ fileGrp }}" \\
                path/to/files/*/*.*
        \b
        echo "path/to/src/file.xml SEG/page_p0001.xml" \\
        | ocrd workspace bulk-add \\
                --regex '(?P<src>.*?) (?P<fileGrp>.+?)/page_(?P<pageid>.*)\\.(?P<ext>[^\\.]*)' \\
                --file-id 'FILE_{{ fileGrp }}_{{ pageid }}' \\
                --page-id 'PHYS_{{ pageid }}' \\
                --file-grp "{{ fileGrp }}" \\
                --local-filename '{{ fileGrp }}/FILE_{{ pageid }}.{{ ext }}' \\
                -

        \b
        { echo PHYS_0001 BIN FILE_0001_BIN.IMG-wolf BIN/FILE_0001_BIN.IMG-wolf.png; \\
          echo PHYS_0001 BIN FILE_0001_BIN BIN/FILE_0001_BIN.xml; \\
          echo PHYS_0002 BIN FILE_0002_BIN.IMG-wolf BIN/FILE_0002_BIN.IMG-wolf.png; \\
          echo PHYS_0002 BIN FILE_0002_BIN BIN/FILE_0002_BIN.xml; \\
        } | ocrd workspace bulk-add -r '(?P<pageid>.*) (?P<filegrp>.*) (?P<fileid>.*) (?P<local_filename>.*)' \\
          -G '{{ filegrp }}' -g '{{ pageid }}' -i '{{ fileid }}' -S '{{ local_filename }}' -
    """
    log = getLogger('ocrd.cli.workspace.bulk-add') # pylint: disable=redefined-outer-name
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
        mets_server_url=ctx.mets_server_url,
    )

    try:
        pat = re.compile(regex)
    except Exception as e:
        log.error("Invalid regex: %s" % e)
        sys.exit(1)

    # Collect the expressions to process: either lines from STDIN ('-') or
    # the glob expansion of each argument (kept verbatim if nothing matches).
    file_paths = []
    from_stdin = file_glob == ('-',)
    if from_stdin:
        file_paths += [Path(x.strip('\n')) for x in sys.stdin.readlines()]
    else:
        for fglob in file_glob:
            expanded = glob(fglob)
            if not expanded:
                file_paths += [Path(fglob)]
            else:
                file_paths += [Path(x) for x in expanded]

    for i, file_path in enumerate(file_paths):
        log.info("[%4d/%d] %s" % (i + 1, len(file_paths), file_path))

        # match regex
        m = pat.match(str(file_path))
        if not m:
            if skip:
                continue
            log.error("File '%s' not matched by regex: '%s'" % (file_path, regex))
            sys.exit(1)
        group_dict = m.groupdict()

        # set up file info
        file_dict = {'local_filename': local_filename, 'url': url, 'mimetype': mimetype, 'file_id': file_id, 'page_id': page_id, 'file_grp': file_grp}

        # Flag to track whether 'local_filename' should be 'src'
        local_filename_is_src = False

        # expand templates
        for param_name in file_dict:
            if not file_dict[param_name]:
                if param_name == 'local_filename':
                    local_filename_is_src = True
                    continue
                elif param_name in ['mimetype', 'file_id']:
                    # auto-filled below once the other
                    # replacements have happened
                    continue
                elif param_name == 'url':
                    # Remote URL is not required
                    continue
                raise ValueError(f"OcrdFile attribute '{param_name}' unset ({file_dict})")
            for group_name in group_dict:
                file_dict[param_name] = file_dict[param_name].replace('{{ %s }}' % group_name, group_dict[group_name])

        # Where to copy from
        if src_path_option:
            src_path = src_path_option
            for group_name in group_dict:
                src_path = src_path.replace('{{ %s }}' % group_name, group_dict[group_name])
            srcpath = Path(src_path)
        else:
            srcpath = file_path

        # derive --file-id from filename if --file-id not explicitly set
        if not file_id:
            # srcpath is file_path itself unless --source-path overrides it,
            # so its stem is the right basis in both cases.
            file_dict['file_id'] = safe_filename('%s_%s' % (file_dict['file_grp'], srcpath.stem))
        if not mimetype:
            try:
                file_dict['mimetype'] = EXT_TO_MIME[srcpath.suffix]
            except KeyError:
                log.error("Cannot guess MIME type from extension '%s' for '%s'. Set --mimetype explicitly" % (srcpath.suffix, srcpath))

        # copy files if src != url
        if local_filename_is_src:
            file_dict['local_filename'] = srcpath
        else:
            destpath = Path(workspace.directory, file_dict['local_filename'])
            if srcpath != destpath and not destpath.exists():
                log.info("cp '%s' '%s'", srcpath, destpath)
                if not dry_run:
                    # Create every missing level of the target directory, so
                    # nested --local-filename templates do not fail.
                    destpath.parent.mkdir(parents=True, exist_ok=True)
                    destpath.write_bytes(srcpath.read_bytes())

        # Add to workspace (or not)
        fileGrp = file_dict.pop('file_grp')
        if dry_run:
            log.info('workspace.add_file(%s)' % file_dict)
        else:
            workspace.add_file(fileGrp, ignore=ignore, force=force, **file_dict)

    # save changes to disk
    workspace.save_mets()
414
415
416
# ----------------------------------------------------------------------
417
# ocrd workspace find
418
# ----------------------------------------------------------------------
419
420
@workspace_cli.command('find')
@mets_find_options
@click.option('-k', '--output-field', help="Output field. Repeat for multiple fields, will be joined with tab",
              default=['local_filename'],
              show_default=True,
              multiple=True,
              type=click.Choice([
                  'url',
                  'mimetype',
                  'page_id',
                  'pageId',
                  'file_id',
                  'ID',
                  'file_grp',
                  'fileGrp',
                  'basename',
                  'basename_without_extension',
                  'local_filename',
              ]))
@click.option('--download', is_flag=True, help="Download found files to workspace and change location in METS file")
@click.option('--undo-download', is_flag=True, help="Remove all downloaded files from the METS and workspace")
@click.option('--keep-files', is_flag=True, help="Do not remove downloaded files from the workspace with --undo-download")
@click.option('--wait', type=int, default=0, help="Wait this many seconds between download requests")
@pass_workspace
def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, include_fileGrp, exclude_fileGrp, download, undo_download, keep_files, wait):
    """
    Find files.

    (If any ``FILTER`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    # Accept snake_case field names but work with the camelCase ones.
    snake_to_camel = {"file_id": "ID", "page_id": "pageId", "file_grp": "fileGrp"}
    output_field = [snake_to_camel.get(x, x) for x in output_field]
    modified_mets = False
    ret = []
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        mets_server_url=ctx.mets_server_url,
    )
    with pushd_popd(workspace.directory):
        for f in workspace.find_files(
                file_id=file_id,
                file_grp=file_grp,
                mimetype=mimetype,
                page_id=page_id,
                include_fileGrp=include_fileGrp,
                exclude_fileGrp=exclude_fileGrp,
            ):
            # For 'pageId' the file ID is stored as a placeholder; it is
            # swapped for the physical page in one batch lookup further down.
            ret_entry = [f.ID if field == 'pageId' else str(getattr(f, field)) or '' for field in output_field]
            if download and not f.local_filename:
                workspace.download_file(f)
                modified_mets = True
                if wait:
                    time.sleep(wait)
            if undo_download and f.url and f.local_filename:
                # Remember the path before clearing it on the file entry;
                # otherwise the log line and unlink() below would see None.
                removed_path = f.local_filename
                ret_entry = [f'Removed local_filename {removed_path}']
                f.local_filename = None
                modified_mets = True
                if not keep_files:
                    ctx.log.debug("rm %s [cwd=%s]", removed_path, workspace.directory)
                    unlink(removed_path)
            ret.append(ret_entry)
    if modified_mets:
        workspace.save_mets()
    if 'pageId' in output_field:
        idx = output_field.index('pageId')
        fileIds = [fields[idx] for fields in ret]
        pages = workspace.mets.get_physical_pages(for_fileIds=fileIds)
        for fields, page in zip(ret, pages):
            fields[idx] = page or ''
    for fields in ret:
        print('\t'.join(fields))
494
495
# ----------------------------------------------------------------------
496
# ocrd workspace remove
497
# ----------------------------------------------------------------------
498
499
@workspace_cli.command('remove')
@click.option('-k', '--keep-file', help="Do not delete file from file system", default=False, is_flag=True)
@click.option('-f', '--force', help="Continue even if mets:file or file on file system does not exist", default=False, is_flag=True)
@click.argument('ID', nargs=-1)
@pass_workspace
def workspace_remove_file(ctx, id, force, keep_file):  # pylint: disable=redefined-builtin
    """
    Delete files (given by their ID attribute ``ID``).

    (If any ``ID`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    workspace_args = {
        'directory': ctx.directory,
        'mets_basename': ctx.mets_basename,
        'automatic_backup': ctx.automatic_backup,
    }
    workspace = Workspace(ctx.resolver, **workspace_args)
    # ``id`` is the tuple of all ID arguments given on the command line.
    for single_id in id:
        workspace.remove_file(single_id, force=force, keep_file=keep_file)
    workspace.save_mets()
515
516
517
# ----------------------------------------------------------------------
518
# ocrd workspace rename-group
519
# ----------------------------------------------------------------------
520
521
@workspace_cli.command('rename-group')
@click.argument('OLD', nargs=1)
@click.argument('NEW', nargs=1)
@pass_workspace
def rename_group(ctx, old, new):
    """
    Rename fileGrp (USE attribute ``OLD`` to ``NEW``).
    """
    # The help text above now matches the call below: OLD is renamed to NEW.
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)
    workspace.rename_file_group(old, new)
    workspace.save_mets()
532
533
# ----------------------------------------------------------------------
534
# ocrd workspace remove-group
535
# ----------------------------------------------------------------------
536
537
@workspace_cli.command('remove-group')
@click.option('-r', '--recursive', help="Delete any files in the group before the group itself", default=False, is_flag=True)
@click.option('-f', '--force', help="Continue removing even if group or containing files not found in METS", default=False, is_flag=True)
@click.option('-k', '--keep-files', help="Do not delete files from file system", default=False, is_flag=True)
@click.argument('GROUP', nargs=-1)
@pass_workspace
def remove_group(ctx, group, recursive, force, keep_files):
    """
    Delete fileGrps (given by their USE attribute ``GROUP``).

    (If any ``GROUP`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
    )
    # The same removal options apply to every GROUP argument.
    removal_options = {'recursive': recursive, 'force': force, 'keep_files': keep_files}
    for group_use in group:
        workspace.remove_file_group(group_use, **removal_options)
    workspace.save_mets()
554
555
# ----------------------------------------------------------------------
556
# ocrd workspace prune-files
557
# ----------------------------------------------------------------------
558
559
@workspace_cli.command('prune-files')
@click.option('-G', '--file-grp', help="fileGrp USE", metavar='FILTER')
@click.option('-m', '--mimetype', help="Media type to look for", metavar='FILTER')
@click.option('-g', '--page-id', help="Page ID", metavar='FILTER')
@click.option('-i', '--file-id', help="ID", metavar='FILTER')
@pass_workspace
def prune_files(ctx, file_grp, mimetype, page_id, file_id):
    """
    Removes mets:files that point to non-existing local files

    (If any ``FILTER`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
    # Existence is checked from inside the workspace directory, so relative
    # local filenames resolve against it.
    with pushd_popd(workspace.directory):
        for f in workspace.find_files(
            file_id=file_id,
            file_grp=file_grp,
            mimetype=mimetype,
            page_id=page_id,
        ):
            try:
                if not f.local_filename or not exists(f.local_filename):
                    workspace.mets.remove_file(f.ID)
            except Exception as e:
                # '%s' rather than '%f': the argument is a file object, not a
                # float, and '%f' would make the log formatting itself fail.
                ctx.log.exception("Error removing %s: %s", f, e)
                raise
        workspace.save_mets()
587
588
# ----------------------------------------------------------------------
589
# ocrd workspace clean
590
# ----------------------------------------------------------------------
591
592
@workspace_cli.command('clean')
@click.option('-n', '--dry-run', help="Don't actually do anything to the filesystem, just preview", default=False, is_flag=True)
@click.argument('path_glob', nargs=-1, required=False)
@pass_workspace
def clean(ctx, dry_run, path_glob):
    """
    Removes files and directories from the workspace that are not
    referenced by any mets:files.

    PATH_GLOB can be a shell glob expression to match file names,
    directory names (recursively), or plain paths. All paths are
    resolved w.r.t. the workspace.

    If no PATH_GLOB are specified, then all files and directories
    may match.
    """
    log = getLogger('ocrd.cli.workspace.clean')
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
    # Files that must survive: everything referenced locally plus the METS itself.
    allowed_files = {normpath(f.local_filename) for f in workspace.find_files(local_only=True)}
    allowed_files.add(relpath(workspace.mets_target, start=workspace.directory))
    # Directories that must survive: every ancestor of a protected file, not
    # only its immediate parent - rmdir() on a non-empty ancestor would raise.
    allowed_dirs = set()
    for path in allowed_files:
        parent = dirname(path)
        while parent and parent not in allowed_dirs:
            allowed_dirs.add(parent)
            parent = dirname(parent)
    with pushd_popd(workspace.directory):
        if path_glob:
            paths = []
            for expression in path_glob:
                if isabs(expression):
                    # Globbing happens relative to the workspace directory.
                    expression = relpath(expression)
                paths += glob(expression, recursive=True) or [expression]
        else:
            paths = glob('**', recursive=True)
        file_paths = [path for path in paths if not isdir(path)]
        dir_paths = [path for path in paths if isdir(path)]
        for path in file_paths:
            if normpath(path) in allowed_files:
                continue
            if dry_run:
                log.info('unlink(%s)' % path)
            else:
                unlink(path)
        # Deepest directories first, so children are removed before parents.
        for path in sorted(dir_paths, key=lambda p: p.count('/'), reverse=True):
            if normpath(path) in allowed_dirs:
                continue
            if dry_run:
                log.info('rmdir(%s)' % path)
            else:
                rmdir(path)
638
639
# ----------------------------------------------------------------------
640
# ocrd workspace list-group
641
# ----------------------------------------------------------------------
642
643
@workspace_cli.command('list-group')
@pass_workspace
def list_groups(ctx):
    """
    List fileGrp USE attributes
    """
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
    )
    # One group name per output line.
    group_names = workspace.mets.file_groups
    print("\n".join(group_names))
651
652
# ----------------------------------------------------------------------
653
# ocrd workspace list-page
654
# ----------------------------------------------------------------------
655
656
@workspace_cli.command('list-page')
@click.option('-k', '--output-field', help="Output field. Repeat for multiple fields, will be joined with tab",
              default=['ID'],
              show_default=True,
              multiple=True,
              type=click.Choice(METS_PAGE_DIV_ATTRIBUTE.names()))
@click.option('-f', '--output-format', help="Output format", type=click.Choice(['one-per-line', 'comma-separated', 'json']), default='one-per-line')
@click.option('-D', '--chunk-number', help="Partition the return value into n roughly equally sized chunks", default=1, type=int)
@click.option('-C', '--chunk-index', help="Output the nth chunk of results, -1 for all of them.", default=None, type=int)
@click.option('-r', '--page-id-range', help="Restrict the pages to those matching the provided range, based on the @ID attribute. Separate start/end with ..")
@click.option('-R', '--numeric-range', help="Restrict the pages to those in the range, in numerical document order. Separate start/end with ..")
@pass_workspace
def list_pages(ctx, output_field, output_format, chunk_number, chunk_index, page_id_range, numeric_range):
    """
    List physical page IDs

    (If any ``FILTER`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    # NOTE: the docstring above doubles as the command's --help text.
    #
    # Flow: collect the page IDs referenced by the workspace's files, turn
    # each page into a row of the requested fields, optionally slice the rows
    # by a 1-based numeric range, partition them into chunks and print them
    # in the requested format.
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)

    find_kwargs = {}
    # The page ID range is only forwarded to the file search when 'ID' is
    # among the requested output fields.
    if page_id_range and 'ID' in output_field:
        find_kwargs['pageId'] = page_id_range
    page_ids = sorted({x.pageId for x in workspace.mets.find_files(**find_kwargs) if x.pageId})

    # NOTE(review): click usually passes `multiple=True` options as a tuple,
    # in which case this list comparison is never true and the generic branch
    # below is always taken -- confirm before relying on the shortcut.
    if output_field == ['ID']:
        ret = [[x] for x in page_ids]
    else:
        page_divs = workspace.mets.get_physical_pages(for_pageIds=','.join(page_ids), return_divs=True)
        # One row per page div; attributes the div lacks are shown as 'None'.
        ret = [[page_div.get(k, 'None') for k in output_field] for page_div in page_divs]

    if numeric_range:
        # Both a missing/extra '..' separator and non-integer bounds surface
        # as ValueError; report them as a usage error instead of a traceback.
        try:
            start, end = map(int, numeric_range.split('..'))
        except ValueError as err:
            raise click.BadParameter(
                f"expected START..END with integer bounds, got '{numeric_range}'",
                param_hint='--numeric-range') from err
        # The range is 1-based; a start below 1 would turn into a negative
        # slice index and silently select pages from the end of the list.
        if start < 1:
            raise click.BadParameter(
                f"range start must be 1 or greater, got {start}",
                param_hint='--numeric-range')
        ret = ret[start - 1:end]

    chunks = partition_list(ret, chunk_number, chunk_index)

    # Text formats differ only in how the entries of one chunk are joined;
    # fields within an entry are always tab-separated.
    entry_separators = {'one-per-line': '\n', 'comma-separated': ','}
    lines = []
    if output_format in entry_separators:
        entry_sep = entry_separators[output_format]
        lines = [entry_sep.join("\t".join(entry) for entry in chunk) for chunk in chunks]
    elif output_format == 'json':
        lines.append(dumps(chunks))
    print('\n'.join(lines))
711
712
# ----------------------------------------------------------------------
713
# ocrd workspace get-id
714
# ----------------------------------------------------------------------
715
716
@workspace_cli.command('get-id')
@pass_workspace
def get_id(ctx):
    """
    Get METS id if any
    """
    # NOTE: the docstring above doubles as the command's --help text.
    mets = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename).mets
    identifier = mets.unique_identifier
    # Stay silent when the METS reports no (or an empty) identifier.
    if not identifier:
        return
    print(identifier)
726
727
# ----------------------------------------------------------------------
728
# ocrd workspace set-id
729
# ----------------------------------------------------------------------
730
731
@workspace_cli.command('set-id')
@click.argument('ID')
@pass_workspace
def set_id(ctx, id):   # pylint: disable=redefined-builtin
    """
    Set METS ID.

    If one of the supported identifier mechanisms is used, will set this identifier.

    Otherwise will create a new <mods:identifier type="purl">{{ ID }}</mods:identifier>.
    """
    # NOTE: the docstring above doubles as the command's --help text.
    # The parameter name `id` mirrors the click argument 'ID' declared above,
    # which is why it shadows the builtin.
    workspace_kwargs = dict(
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    workspace = Workspace(ctx.resolver, **workspace_kwargs)
    # Assign the identifier on the METS, then persist the change.
    workspace.mets.unique_identifier = id
    workspace.save_mets()
745
746
@workspace_cli.command('update-page')
@click.option('--set', 'attr_value_pairs', help=f"set mets:div ATTR to VALUE. possible keys: {METS_PAGE_DIV_ATTRIBUTE.names()}", metavar="ATTR VALUE", nargs=2, multiple=True)
@click.option('--order', help="DEPRECATED - use --set ATTR VALUE", metavar='ORDER')
@click.option('--orderlabel', help="DEPRECATED - use --set ATTR VALUE", metavar='ORDERLABEL')
@click.option('--contentids', help="DEPRECATED - use --set ATTR VALUE", metavar='CONTENTIDS')
@click.argument('PAGE_ID')
@pass_workspace
def update_page(ctx, attr_value_pairs, order, orderlabel, contentids, page_id):
    """
    Update the @ID, @ORDER, @ORDERLABEL, @LABEL or @CONTENTIDS attributes of the mets:div with @ID=PAGE_ID
    """
    # NOTE: the docstring above doubles as the command's --help text.
    #
    # `attr_value_pairs` holds one (ATTR, VALUE) pair per `--set` occurrence;
    # if the same ATTR is given more than once, the last value wins.
    update_kwargs = dict(attr_value_pairs)
    # The deprecated single-attribute options are applied after `--set`, so
    # they override a `--set` value for the same attribute.
    if order:
        update_kwargs['ORDER'] = order
    if orderlabel:
        update_kwargs['ORDERLABEL'] = orderlabel
    if contentids:
        update_kwargs['CONTENTIDS'] = contentids
    try:
        workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
        workspace.mets.update_physical_page_attributes(page_id, **update_kwargs)
        workspace.save_mets()
    except Exception as err:
        # Top-level CLI boundary: report any failure as a one-line message
        # and exit with status 1 instead of showing a traceback.
        print(f"Error: {err}")
        sys.exit(1)
771
772
# ----------------------------------------------------------------------
773
# ocrd workspace merge
774
# ----------------------------------------------------------------------
775
776
def _handle_json_option(ctx, param, value):
777
    return parse_json_string_or_file(value) if value else None
778
779
@workspace_cli.command('merge')
@click.argument('METS_PATH')
@click.option('--overwrite/--no-overwrite', is_flag=True, default=False, help="Overwrite on-disk file in case of file name conflicts with data from METS_PATH")
@click.option('--force/--no-force', is_flag=True, default=False, help="Overwrite mets:file from --mets with mets:file from METS_PATH if IDs clash")
@click.option('--copy-files/--no-copy-files', is_flag=True, help="Copy files as well", default=True, show_default=True)
@click.option('--fileGrp-mapping', help="JSON object mapping src to dest fileGrp", callback=_handle_json_option)
@click.option('--fileId-mapping', help="JSON object mapping src to dest file ID", callback=_handle_json_option)
@click.option('--pageId-mapping', help="JSON object mapping src to dest page ID", callback=_handle_json_option)
@mets_find_options
@pass_workspace
def merge(ctx, overwrite, force, copy_files, filegrp_mapping, fileid_mapping, pageid_mapping, file_grp, file_id, page_id, mimetype, include_fileGrp, exclude_fileGrp, mets_path):   # pylint: disable=redefined-builtin
    """
    Merges this workspace with the workspace that contains ``METS_PATH``

    Pass a JSON string or file to ``--fileGrp-mapping``, ``--fileId-mapping`` or ``--pageId-mapping``
    in order to rename all fileGrp, file ID or page ID values, respectively.

    The ``--file-id``, ``--page-id``, ``--mimetype`` and ``--file-grp`` options have
    the same semantics as in ``ocrd workspace find``, see ``ocrd workspace find --help``
    for an explanation.
    """
    # NOTE: the docstring above doubles as the command's --help text.
    mets_path = Path(mets_path)
    # All three mapping options already pass through the `_handle_json_option`
    # callback, and the file ID / page ID mappings are forwarded as-is. Only
    # decode the fileGrp mapping again if it is still a raw JSON string;
    # calling `loads` on an already parsed object raises TypeError.
    if isinstance(filegrp_mapping, str) and filegrp_mapping:
        filegrp_mapping = loads(filegrp_mapping)
    # Target workspace: the one selected via the CLI context.
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
    # Source workspace: parent directory and file name of METS_PATH.
    other_workspace = Workspace(ctx.resolver, directory=str(mets_path.parent), mets_basename=str(mets_path.name))
    workspace.merge(
        other_workspace,
        force=force,
        overwrite=overwrite,
        copy_files=copy_files,
        fileGrp_mapping=filegrp_mapping,
        fileId_mapping=fileid_mapping,
        pageId_mapping=pageid_mapping,
        file_grp=file_grp,
        file_id=file_id,
        page_id=page_id,
        mimetype=mimetype,
        include_fileGrp=include_fileGrp,
        exclude_fileGrp=exclude_fileGrp,
    )
    workspace.save_mets()
821
822
# ----------------------------------------------------------------------
823
# ocrd workspace backup
824
# ----------------------------------------------------------------------
825
826
# Command group `ocrd workspace backup`; the sub-commands (add, list, restore,
# undo) are registered on it below. The function body is intentionally empty.
@workspace_cli.group('backup')
@click.pass_context
def workspace_backup_cli(ctx): # pylint: disable=unused-argument
    """
    Backing and restoring workspaces - dev edition
    """
832
833
@workspace_backup_cli.command('add')
@pass_workspace
def workspace_backup_add(ctx):
    """
    Create a new backup
    """
    # NOTE: the docstring above doubles as the command's --help text.
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    # Delegate to the backup manager wrapping this workspace.
    WorkspaceBackupManager(workspace).add()
841
842
@workspace_backup_cli.command('list')
@pass_workspace
def workspace_backup_list(ctx):
    """
    List backups
    """
    # NOTE: the docstring above doubles as the command's --help text.
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    # Print each entry reported by the backup manager on its own line.
    for backup in WorkspaceBackupManager(workspace).list():
        print(backup)
851
852
@workspace_backup_cli.command('restore')
@click.option('-f', '--choose-first', help="Restore first matching version if more than one", is_flag=True)
@click.argument('bak') #, type=click.Path(dir_okay=False, readable=True, resolve_path=True))
@pass_workspace
def workspace_backup_restore(ctx, choose_first, bak):
    """
    Restore backup BAK
    """
    # NOTE: the docstring above doubles as the command's --help text.
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    # Hand the backup selector and the "choose first" flag to the manager.
    manager = WorkspaceBackupManager(workspace)
    manager.restore(bak, choose_first)
862
863
@workspace_backup_cli.command('undo')
@pass_workspace
def workspace_backup_undo(ctx):
    """
    Restore the last backup
    """
    # NOTE: the docstring above doubles as the command's --help text.
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    # Delegate to the backup manager wrapping this workspace.
    WorkspaceBackupManager(workspace).undo()
871
872
873
# ----------------------------------------------------------------------
874
# ocrd workspace server
875
# ----------------------------------------------------------------------
876
877
@workspace_cli.group('server')
@pass_workspace
def workspace_serve_cli(ctx): # pylint: disable=unused-argument
    """Control a METS server for this workspace"""
    # NOTE: the docstring above doubles as the group's --help text.
    # Validate user input with an explicit check: an `assert` would be
    # stripped under `python -O`, letting the sub-commands run without a URL.
    # Raising a click usage error also yields a regular CLI error message
    # instead of a traceback.
    if not ctx.mets_server_url:
        raise click.UsageError("For METS server commands, you must provide '-U/--mets-server-url'")
882
883
@workspace_serve_cli.command('stop')
@pass_workspace
def workspace_serve_stop(ctx): # pylint: disable=unused-argument
    """Stop the METS server"""
    # NOTE: the docstring above doubles as the command's --help text.
    # Open the workspace with the configured METS server URL and call
    # stop() on its METS object.
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, mets_server_url=ctx.mets_server_url)
    mets = workspace.mets
    mets.stop()
894
895
@workspace_serve_cli.command('start')
@pass_workspace
def workspace_serve_start(ctx): # pylint: disable=unused-argument
    """
    Start a METS server

    (For TCP backend, pass a network interface to bind to as the '-U/--mets-server-url' parameter.)
    """
    # NOTE: the docstring above doubles as the command's --help text.
    # The workspace is opened without a server URL; the URL from the CLI
    # context is given to the server object instead.
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)
    server = OcrdMetsServer(workspace=workspace, url=ctx.mets_server_url)
    server.startup()
907