Passed
Pull Request — master (#1063)
by Konstantin
03:13
created

ocrd.cli.workspace.update_page()   B

Complexity

Conditions 5

Size

Total Lines 25
Code Lines 22

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 22
dl 0
loc 25
rs 8.8853
c 0
b 0
f 0
cc 5
nop 6
1
"""
2
OCR-D CLI: workspace management
3
4
.. click:: ocrd.cli.workspace:workspace_cli
5
    :prog: ocrd workspace
6
    :nested: full
7
"""
8
import os
9
from os import getcwd
10
from os.path import relpath, exists, join, isabs
11
from pathlib import Path
12
from json import loads, dumps
13
import sys
14
from glob import glob   # XXX pathlib.Path.glob does not support absolute globs
15
import re
16
import time
17
import numpy as np
18
19
import click
20
21
from ocrd import Resolver, Workspace, WorkspaceValidator, WorkspaceBackupManager
22
from ocrd.mets_server import OcrdMetsServer
23
from ocrd_utils import getLogger, initLogging, pushd_popd, EXT_TO_MIME, safe_filename, parse_json_string_or_file, partition_list, DEFAULT_METS_BASENAME
24
from ocrd.decorators import mets_find_options
25
from . import command_with_replaced_help
26
from ocrd_models.constants import METS_PAGE_DIV_ATTRIBUTE
27
28
29
class WorkspaceCtx():
    """
    State shared by all ``ocrd workspace`` subcommands.

    Holds the logger, the resolver, the resolved workspace directory / METS
    location / METS server URL and the automatic-backup flag.
    """

    def __init__(self, directory, mets_url, mets_basename=DEFAULT_METS_BASENAME, mets_server_url=None, automatic_backup=False):
        self.log = getLogger('ocrd.cli.workspace')
        if mets_basename:
            # Any truthy basename triggers the deprecation notice.
            deprecation = DeprecationWarning('--mets-basename is deprecated. Use --mets/--directory instead.')
            self.log.warning(deprecation)
        self.resolver = Resolver()
        # The resolver reconciles the (possibly partial) location arguments.
        resolved = self.resolver.resolve_mets_arguments(directory, mets_url, mets_basename, mets_server_url)
        self.directory, self.mets_url, self.mets_basename, self.mets_server_url = resolved
        self.automatic_backup = automatic_backup


# Decorator handing the WorkspaceCtx stored on the click context to a command.
pass_workspace = click.make_pass_decorator(WorkspaceCtx)
42
43
# ----------------------------------------------------------------------
44
# ocrd workspace
45
# ----------------------------------------------------------------------
46
47
@click.group("workspace")
@click.option('-d', '--directory', envvar='WORKSPACE_DIR', type=click.Path(file_okay=False), metavar='WORKSPACE_DIR', help='Changes the workspace folder location [default: METS_URL directory or .]')
@click.option('-M', '--mets-basename', default=None, help='METS file basename. Deprecated, use --mets/--directory')
@click.option('-m', '--mets', default=None, help='The path/URL of the METS file [default: WORKSPACE_DIR/mets.xml]', metavar="METS_URL")
@click.option('-U', '--mets-server-url', 'mets_server_url', help="TCP host of METS server")
@click.option('--backup', default=False, help="Backup mets.xml whenever it is saved.", is_flag=True)
@click.pass_context
def workspace_cli(ctx, directory, mets, mets_basename, mets_server_url, backup):
    """
    Managing workspaces

    A workspace comprises a METS file and a directory as point of reference.

    Operates on the file system directly or via a METS server 
    (already running via some prior `server start` subcommand).
    """
    # NOTE: the docstring above is the --help text rendered by click.
    initLogging()
    # Store the shared context; subcommands receive it via @pass_workspace.
    ctx.obj = WorkspaceCtx(
        directory,
        mets_url=mets,
        mets_basename=mets_basename,
        mets_server_url=mets_server_url,
        automatic_backup=backup
    )
71
72
# ----------------------------------------------------------------------
73
# ocrd workspace validate
74
# ----------------------------------------------------------------------
75
76
@workspace_cli.command('validate', cls=command_with_replaced_help(
    (r' \[METS_URL\]', ''))) # XXX deprecated argument
@pass_workspace
@click.option('-a', '--download', is_flag=True, help="Download all files")
@click.option('-s', '--skip', help="Tests to skip", default=[], multiple=True, type=click.Choice(
    ['imagefilename', 'dimension', 'pixel_density', 'page', 'url', 'page_xsd', 'mets_fileid_page_pcgtsid',
     'mets_unique_identifier', 'mets_file_group_names', 'mets_files', 'mets_xsd']))
@click.option('--page-textequiv-consistency', '--page-strictness', help="How strict to check PAGE multi-level textequiv consistency", type=click.Choice(['strict', 'lax', 'fix', 'off']), default='strict')
@click.option('--page-coordinate-consistency', help="How fierce to check PAGE multi-level coordinate consistency", type=click.Choice(['poly', 'baseline', 'both', 'off']), default='poly')
@click.argument('mets_url', default=None, required=False)
def workspace_validate(ctx, mets_url, download, skip, page_textequiv_consistency, page_coordinate_consistency):
    """
    Validate a workspace

    METS_URL can be a URL, an absolute path or a path relative to $PWD.
    If not given, use --mets accordingly.

    Check that the METS and its referenced file contents
    abide by the OCR-D specifications.
    """
    LOG = getLogger('ocrd.cli.workspace.validate')
    if mets_url:
        # The positional METS_URL is deprecated; point users at the
        # equivalent invocation of *this* subcommand (validate).
        LOG.warning(DeprecationWarning("Use 'ocrd workspace --mets METS validate' instead of argument 'METS_URL' ('%s')" % mets_url))
    else:
        mets_url = ctx.mets_url
    report = WorkspaceValidator.validate(
        ctx.resolver,
        mets_url,
        src_dir=ctx.directory,
        skip=skip,
        download=download,
        page_strictness=page_textequiv_consistency,
        page_coordinate_consistency=page_coordinate_consistency
    )
    print(report.to_xml())
    # Signal an invalid workspace to the calling shell via exit status 128.
    if not report.is_valid:
        sys.exit(128)
113
114
# ----------------------------------------------------------------------
115
# ocrd workspace clone
116
# ----------------------------------------------------------------------
117
118
@workspace_cli.command('clone', cls=command_with_replaced_help(
    (r' \[WORKSPACE_DIR\]', ''))) # XXX deprecated argument
@click.option('-f', '--clobber-mets', help="Overwrite existing METS file", default=False, is_flag=True)
@click.option('-a', '--download', is_flag=True, help="Download all files and change location in METS file after cloning")
@click.argument('mets_url')
@mets_find_options
# XXX deprecated
@click.argument('workspace_dir', default=None, required=False)
@pass_workspace
def workspace_clone(ctx, clobber_mets, download, file_grp, file_id, page_id, mimetype, include_fileGrp, exclude_fileGrp, mets_url, workspace_dir):
    """
    Create a workspace from METS_URL and return the directory

    METS_URL can be a URL, an absolute path or a path relative to $PWD.
    If METS_URL is not provided, use --mets accordingly.
    METS_URL can also be an OAI-PMH GetRecord URL wrapping a METS file.
    """
    log = getLogger('ocrd.cli.workspace.clone')
    if workspace_dir:
        # Deprecated positional argument overrides --directory.
        log.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR clone' instead of argument 'WORKSPACE_DIR' ('%s')" % workspace_dir))
        ctx.directory = workspace_dir

    # NOTE(review): file_grp is received but not forwarded below, unlike the
    # other file filters -- confirm whether that is intentional.
    clone_options = {
        'dst_dir': ctx.directory,
        'mets_basename': ctx.mets_basename,
        'clobber_mets': clobber_mets,
        'download': download,
        'ID': file_id,
        'pageId': page_id,
        'mimetype': mimetype,
        'include_fileGrp': include_fileGrp,
        'exclude_fileGrp': exclude_fileGrp,
    }
    workspace = ctx.resolver.workspace_from_url(mets_url, **clone_options)
    workspace.save_mets()
    print(workspace.directory)
154
155
# ----------------------------------------------------------------------
156
# ocrd workspace init
157
# ----------------------------------------------------------------------
158
159
@workspace_cli.command('init', cls=command_with_replaced_help(
    (r' \[DIRECTORY\]', ''))) # XXX deprecated argument
@click.option('-f', '--clobber-mets', help="Clobber mets.xml if it exists", is_flag=True, default=False)
# XXX deprecated
@click.argument('directory', default=None, required=False)
@pass_workspace
def workspace_init(ctx, clobber_mets, directory):
    """
    Create a workspace with an empty METS file in --directory.

    """
    log = getLogger('ocrd.cli.workspace.init')
    if directory:
        # Deprecated positional argument overrides --directory.
        log.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR init' instead of argument 'DIRECTORY' ('%s')" % directory))
        ctx.directory = directory
    init_options = {
        'directory': ctx.directory,
        'mets_basename': ctx.mets_basename,
        'clobber_mets': clobber_mets,
    }
    workspace = ctx.resolver.workspace_from_nothing(**init_options)
    workspace.save_mets()
    # Report where the new workspace lives.
    print(workspace.directory)
181
182
# ----------------------------------------------------------------------
183
# ocrd workspace add
184
# ----------------------------------------------------------------------
185
186
@workspace_cli.command('add')
@click.option('-G', '--file-grp', help="fileGrp USE", required=True, metavar='FILE_GRP')
@click.option('-i', '--file-id', help="ID for the file", required=True, metavar='FILE_ID')
@click.option('-m', '--mimetype', help="Media type of the file. Guessed from extension if not provided", required=False, metavar='TYPE')
@click.option('-g', '--page-id', help="ID of the physical page", metavar='PAGE_ID')
@click.option('-C', '--check-file-exists', help="Whether to ensure FNAME exists", is_flag=True, default=False)
@click.option('--ignore', help="Do not check whether file exists.", default=False, is_flag=True)
@click.option('--force', help="If file with ID already exists, replace it. No effect if --ignore is set.", default=False, is_flag=True)
@click.argument('fname', required=True)
@pass_workspace
def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, ignore, check_file_exists, force, fname):
    """
    Add a file or http(s) URL FNAME to METS in a workspace.
    If FNAME is not an http(s) URL and is not a workspace-local existing file, try to copy to workspace.
    """
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
        mets_server_url=ctx.mets_server_url,
    )

    log = getLogger('ocrd.cli.workspace.add')
    if not mimetype:
        # Fall back to the extension table; a miss is logged but not fatal.
        extension = Path(fname).suffix
        try:
            mimetype = EXT_TO_MIME[extension]
        except KeyError:
            log.error("Cannot guess mimetype from extension '%s' for '%s'. Set --mimetype explicitly" % (extension, fname))
        else:
            log.info("Guessed mimetype to be %s" % mimetype)

    log.debug("Adding '%s'", fname)
    local_filename = None
    is_remote = fname.startswith(('http://', 'https://'))
    if not is_remote:
        if not fname.startswith(ctx.directory):
            candidate = join(ctx.directory, fname)
            if not isabs(fname) and exists(candidate):
                # Relative path that already exists inside the workspace.
                fname = candidate
            else:
                log.debug("File '%s' is not in workspace, copying", fname)
                try:
                    fname = ctx.resolver.download_to_directory(ctx.directory, fname, subdir=file_grp)
                except FileNotFoundError:
                    # Only fatal when the caller asked for an existence check.
                    if check_file_exists:
                        log.error("File '%s' does not exist, halt execution!" % fname)
                        sys.exit(1)
        if check_file_exists and not exists(fname):
            log.error("File '%s' does not exist, halt execution!" % fname)
            sys.exit(1)
        if fname.startswith(ctx.directory):
            # Store workspace-relative paths.
            fname = relpath(fname, ctx.directory)
        local_filename = fname

    if not page_id:
        log.warning("You did not provide '--page-id/-g', so the file you added is not linked to a specific page.")
    workspace.add_file(
        file_grp,
        file_id=file_id,
        mimetype=mimetype,
        page_id=page_id,
        force=force,
        ignore=ignore,
        local_filename=local_filename,
        url=fname,
    )
    workspace.save_mets()
251
252
# ----------------------------------------------------------------------
253
# ocrd workspace bulk-add
254
# ----------------------------------------------------------------------
255
256
# pylint: disable=broad-except
257
@workspace_cli.command('bulk-add')
@click.option('-r', '--regex', help="Regular expression matching the FILE_GLOB filesystem paths to define named captures usable in the other parameters", required=True)
@click.option('-m', '--mimetype', help="Media type of the file. If not provided, guess from filename", required=False)
@click.option('-g', '--page-id', help="physical page ID of the file", required=False)
@click.option('-i', '--file-id', help="ID of the file. If not provided, derive from fileGrp and filename", required=False)
@click.option('-u', '--url', help="Remote URL of the file", required=False)
@click.option('-l', '--local-filename', help="Local filesystem path in the workspace directory (copied from source file if different)", required=False)
@click.option('-G', '--file-grp', help="File group USE of the file", required=True)
@click.option('-n', '--dry-run', help="Don't actually do anything to the METS or filesystem, just preview", default=False, is_flag=True)
@click.option('-S', '--source-path', 'src_path_option', help="File path to copy from (if different from FILE_GLOB values)", required=False)
@click.option('-I', '--ignore', help="Disable checking for existing file entries (faster)", default=False, is_flag=True)
@click.option('-f', '--force', help="Replace existing file entries with the same ID (no effect when --ignore is set, too)", default=False, is_flag=True)
@click.option('-s', '--skip', help="Skip files not matching --regex (instead of failing)", default=False, is_flag=True)
@click.argument('file_glob', nargs=-1, required=True)
@pass_workspace
def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_filename, file_grp, dry_run, file_glob, src_path_option, ignore, force, skip):
    """
    Add files in bulk to an OCR-D workspace.

    FILE_GLOB can either be a shell glob expression to match file names,
    or a list of expressions or '-', in which case expressions are read from STDIN.

    After globbing, --regex is matched against each expression resulting from FILE_GLOB, and can
    define named groups reusable in the --page-id, --file-id, --mimetype, --url, --source-path and
    --file-grp options, e.g. by referencing the group name 'grp' from the regex as '{{ grp }}'.

    If the FILE_GLOB expressions do not denote the file names themselves
    (but arbitrary strings for --regex matching), then use --source-path to set
    the actual file paths to use. (This could involve fixed strings or group references.)

    \b
    Examples:
        ocrd workspace bulk-add \\
                --regex '(?P<fileGrp>[^/]+)/page_(?P<pageid>.*)\\.[^.]+' \\
                --page-id 'PHYS_{{ pageid }}' \\
                --file-grp "{{ fileGrp }}" \\
                path/to/files/*/*.*
        \b
        echo "path/to/src/file.xml SEG/page_p0001.xml" \\
        | ocrd workspace bulk-add \\
                --regex '(?P<src>.*?) (?P<fileGrp>.+?)/page_(?P<pageid>.*)\\.(?P<ext>[^\\.]*)' \\
                --file-id 'FILE_{{ fileGrp }}_{{ pageid }}' \\
                --page-id 'PHYS_{{ pageid }}' \\
                --file-grp "{{ fileGrp }}" \\
                --local-filename '{{ fileGrp }}/FILE_{{ pageid }}.{{ ext }}' \\
                -

        \b
        { echo PHYS_0001 BIN FILE_0001_BIN.IMG-wolf BIN/FILE_0001_BIN.IMG-wolf.png; \\
          echo PHYS_0001 BIN FILE_0001_BIN BIN/FILE_0001_BIN.xml; \\
          echo PHYS_0002 BIN FILE_0002_BIN.IMG-wolf BIN/FILE_0002_BIN.IMG-wolf.png; \\
          echo PHYS_0002 BIN FILE_0002_BIN BIN/FILE_0002_BIN.xml; \\
        } | ocrd workspace bulk-add -r '(?P<pageid>.*) (?P<filegrp>.*) (?P<fileid>.*) (?P<local_filename>.*)' \\
          -G '{{ filegrp }}' -g '{{ pageid }}' -i '{{ fileid }}' -S '{{ local_filename }}' -
    """
    log = getLogger('ocrd.cli.workspace.bulk-add') # pylint: disable=redefined-outer-name
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
        mets_server_url=ctx.mets_server_url,
    )

    try:
        pat = re.compile(regex)
    except Exception as e:
        log.error("Invalid regex: %s" % e)
        sys.exit(1)

    # Collect the expressions to process: either lines from STDIN ('-')
    # or the expansion of every glob (kept verbatim when nothing matches).
    file_paths = []
    from_stdin = file_glob == ('-',)
    if from_stdin:
        file_paths += [Path(x.strip('\n')) for x in sys.stdin.readlines()]
    else:
        for fglob in file_glob:
            expanded = glob(fglob)
            if not expanded:
                file_paths += [Path(fglob)]
            else:
                file_paths += [Path(x) for x in expanded]

    for i, file_path in enumerate(file_paths):
        log.info("[%4d/%d] %s" % (i + 1, len(file_paths), file_path))

        # match regex
        m = pat.match(str(file_path))
        if not m:
            if skip:
                continue
            log.error("File '%s' not matched by regex: '%s'" % (file_path, regex))
            sys.exit(1)
        group_dict = m.groupdict()

        # set up file info
        file_dict = {'local_filename': local_filename, 'url': url, 'mimetype': mimetype, 'file_id': file_id, 'page_id': page_id, 'file_grp': file_grp}

        # Flag to track whether 'local_filename' should be 'src'
        local_filename_is_src = False

        # expand templates
        for param_name in file_dict:
            if not file_dict[param_name]:
                if param_name == 'local_filename':
                    local_filename_is_src = True
                    continue
                elif param_name in ['mimetype', 'file_id']:
                    # auto-filled below once the other
                    # replacements have happened
                    continue
                elif param_name == 'url':
                    # Remote URL is not required
                    continue
                raise ValueError(f"OcrdFile attribute '{param_name}' unset ({file_dict})")
            for group_name in group_dict:
                file_dict[param_name] = file_dict[param_name].replace('{{ %s }}' % group_name, group_dict[group_name])

        # Where to copy from
        if src_path_option:
            src_path = src_path_option
            for group_name in group_dict:
                src_path = src_path.replace('{{ %s }}' % group_name, group_dict[group_name])
            srcpath = Path(src_path)
        else:
            srcpath = file_path

        # derive --file-id from the source filename if --file-id was not explicitly set
        if not file_id:
            # srcpath is file_path itself unless --source-path was given,
            # so its stem covers both cases.
            file_dict['file_id'] = safe_filename('%s_%s' % (file_dict['file_grp'], srcpath.stem))
        if not mimetype:
            try:
                file_dict['mimetype'] = EXT_TO_MIME[srcpath.suffix]
            except KeyError:
                log.error("Cannot guess MIME type from extension '%s' for '%s'. Set --mimetype explicitly" % (srcpath.suffix, srcpath))

        # copy files if src != url
        if local_filename_is_src:
            file_dict['local_filename'] = srcpath
        else:
            destpath = Path(workspace.directory, file_dict['local_filename'])
            if srcpath != destpath and not destpath.exists():
                log.info("cp '%s' '%s'", srcpath, destpath)
                if not dry_run:
                    # --local-filename may point several levels below the
                    # workspace, so create all missing parent directories.
                    destpath.parent.mkdir(parents=True, exist_ok=True)
                    destpath.write_bytes(srcpath.read_bytes())

        # Add to workspace (or not)
        fileGrp = file_dict.pop('file_grp')
        if dry_run:
            log.info('workspace.add_file(%s)' % file_dict)
        else:
            workspace.add_file(fileGrp, ignore=ignore, force=force, **file_dict)

    # save changes to disk -- a dry run must leave the METS untouched
    if not dry_run:
        workspace.save_mets()
414
415
416
# ----------------------------------------------------------------------
417
# ocrd workspace find
418
# ----------------------------------------------------------------------
419
420
@workspace_cli.command('find')
@mets_find_options
@click.option('-k', '--output-field', help="Output field. Repeat for multiple fields, will be joined with tab",
              default=['local_filename'],
              show_default=True,
              multiple=True,
              type=click.Choice([
                  'url',
                  'mimetype',
                  'page_id',
                  'pageId',
                  'file_id',
                  'ID',
                  'file_grp',
                  'fileGrp',
                  'basename',
                  'basename_without_extension',
                  'local_filename',
              ]))
@click.option('--download', is_flag=True, help="Download found files to workspace and change location in METS file ")
@click.option('--undo-download', is_flag=True, help="Remove all downloaded files from the METS")
@click.option('--wait', type=int, default=0, help="Wait this many seconds between download requests")
@pass_workspace
def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, include_fileGrp, exclude_fileGrp, download, undo_download, wait):
    """
    Find files.

    (If any ``FILTER`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    # Accept snake_case aliases but work with the attribute names internally.
    snake_to_camel = {"file_id": "ID", "page_id": "pageId", "file_grp": "fileGrp"}
    output_field = [snake_to_camel.get(x, x) for x in output_field]
    modified_mets = False
    ret = []
    # Rows that still carry one value per output field. Rows replaced by an
    # undo-download notice have a single column and must not take part in
    # the page ID lookup below.
    field_rows = []
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        mets_server_url=ctx.mets_server_url,
    )
    for f in workspace.find_files(
            file_id=file_id,
            file_grp=file_grp,
            mimetype=mimetype,
            page_id=page_id,
            include_fileGrp=include_fileGrp,
            exclude_fileGrp=exclude_fileGrp,
        ):
        # A 'pageId' column temporarily holds the file ID; it is swapped for
        # the physical page in one batch after the loop.
        # NOTE(review): str(None) is 'None', so the trailing "or ''" looks
        # ineffective -- confirm the intended output for unset attributes.
        ret_entry = [f.ID if field == 'pageId' else str(getattr(f, field)) or '' for field in output_field]
        if download and not f.local_filename:
            workspace.download_file(f)
            modified_mets = True
            if wait:
                time.sleep(wait)
        if undo_download and f.local_filename:
            ret_entry = [f'Removed local_filename {f.local_filename}']
            f.local_filename = None
            modified_mets = True
        else:
            field_rows.append(ret_entry)
        ret.append(ret_entry)
    if modified_mets:
        workspace.save_mets()
    if 'pageId' in output_field:
        idx = output_field.index('pageId')
        fileIds = [fields[idx] for fields in field_rows]
        pages = workspace.mets.get_physical_pages(for_fileIds=fileIds)
        # field_rows shares its row objects with ret, so updating in place
        # also updates what gets printed.
        for fields, page in zip(field_rows, pages):
            fields[idx] = page or ''
    for fields in ret:
        print('\t'.join(fields))
489
490
# ----------------------------------------------------------------------
491
# ocrd workspace remove
492
# ----------------------------------------------------------------------
493
494
@workspace_cli.command('remove')
@click.option('-k', '--keep-file', help="Do not delete file from file system", default=False, is_flag=True)
@click.option('-f', '--force', help="Continue even if mets:file or file on file system does not exist", default=False, is_flag=True)
@click.argument('ID', nargs=-1)
@pass_workspace
def workspace_remove_file(ctx, id, force, keep_file):  # pylint: disable=redefined-builtin
    """
    Delete files (given by their ID attribute ``ID``).

    (If any ``ID`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    # Remove every requested entry, then persist the METS once.
    for file_id in id:
        workspace.remove_file(file_id, force=force, keep_file=keep_file)
    workspace.save_mets()
510
511
512
# ----------------------------------------------------------------------
513
# ocrd workspace rename-group
514
# ----------------------------------------------------------------------
515
516
@workspace_cli.command('rename-group')
@click.argument('OLD', nargs=1)
@click.argument('NEW', nargs=1)
@pass_workspace
def rename_group(ctx, old, new):
    """
    Rename fileGrp (USE attribute ``OLD`` to ``NEW``).
    """
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)
    # The group currently named OLD is renamed to NEW, then the METS is saved.
    workspace.rename_file_group(old, new)
    workspace.save_mets()
527
528
# ----------------------------------------------------------------------
529
# ocrd workspace remove-group
530
# ----------------------------------------------------------------------
531
532
@workspace_cli.command('remove-group')
@click.option('-r', '--recursive', help="Delete any files in the group before the group itself", default=False, is_flag=True)
@click.option('-f', '--force', help="Continue removing even if group or containing files not found in METS", default=False, is_flag=True)
@click.option('-k', '--keep-files', help="Do not delete files from file system", default=False, is_flag=True)
@click.argument('GROUP', nargs=-1)
@pass_workspace
def remove_group(ctx, group, recursive, force, keep_files):
    """
    Delete fileGrps (given by their USE attribute ``GROUP``).

    (If any ``GROUP`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
    )
    removal_options = {'recursive': recursive, 'force': force, 'keep_files': keep_files}
    # Remove every requested group, then persist the METS once.
    for group_name in group:
        workspace.remove_file_group(group_name, **removal_options)
    workspace.save_mets()
549
550
# ----------------------------------------------------------------------
551
# ocrd workspace prune-files
552
# ----------------------------------------------------------------------
553
554
@workspace_cli.command('prune-files')
@click.option('-G', '--file-grp', help="fileGrp USE", metavar='FILTER')
@click.option('-m', '--mimetype', help="Media type to look for", metavar='FILTER')
@click.option('-g', '--page-id', help="Page ID", metavar='FILTER')
@click.option('-i', '--file-id', help="ID", metavar='FILTER')
@pass_workspace
def prune_files(ctx, file_grp, mimetype, page_id, file_id):
    """
    Removes mets:files that point to non-existing local files

    (If any ``FILTER`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
    # Existence is checked from inside the workspace directory.
    with pushd_popd(workspace.directory):
        for f in workspace.find_files(
            file_id=file_id,
            file_grp=file_grp,
            mimetype=mimetype,
            page_id=page_id,
        ):
            try:
                if not f.local_filename or not exists(f.local_filename):
                    workspace.mets.remove_file(f.ID)
            except Exception as e:
                # %s (not %f): f is a file object, not a float, so a float
                # conversion would make the log call itself fail.
                ctx.log.exception("Error removing %s: %s", f, e)
                raise
        workspace.save_mets()
582
583
# ----------------------------------------------------------------------
584
# ocrd workspace list-group
585
# ----------------------------------------------------------------------
586
587
@workspace_cli.command('list-group')
@pass_workspace
def list_groups(ctx):
    """
    List fileGrp USE attributes
    """
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
    )
    # One group name per output line.
    group_names = workspace.mets.file_groups
    print("\n".join(group_names))
595
596
# ----------------------------------------------------------------------
597
# ocrd workspace list-page
598
# ----------------------------------------------------------------------
599
600
@workspace_cli.command('list-page')
@click.option('-k', '--output-field', help="Output field. Repeat for multiple fields, will be joined with tab",
              default=['ID'],
              show_default=True,
              multiple=True,
              type=click.Choice(METS_PAGE_DIV_ATTRIBUTE.names()))
@click.option('-f', '--output-format', help="Output format", type=click.Choice(['one-per-line', 'comma-separated', 'json']), default='one-per-line')
@click.option('-D', '--chunk-number', help="Partition the return value into n roughly equally sized chunks", default=1, type=int)
@click.option('-C', '--chunk-index', help="Output the nth chunk of results, -1 for all of them.", default=None, type=int)
@click.option('-r', '--page-id-range', help="Restrict the pages to those matching the provided range, based on the @ID attribute. Separate start/end with ..")
@click.option('-R', '--numeric-range', help="Restrict the pages to those in the range, in numerical document order. Separate start/end with ..")
@pass_workspace
def list_pages(ctx, output_field, output_format, chunk_number, chunk_index, page_id_range, numeric_range):
    """
    List physical page IDs

    (If any ``FILTER`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)

    # The page ID range only narrows the search when IDs are being output.
    find_kwargs = {}
    if page_id_range and 'ID' in output_field:
        find_kwargs['pageId'] = page_id_range
    page_ids = sorted({x.pageId for x in workspace.mets.find_files(**find_kwargs) if x.pageId})

    # NOTE(review): click hands multiple=True options over as a tuple, so this
    # comparison against a list presumably never holds -- confirm.
    if output_field == ['ID']:
        ret = [[x] for x in page_ids]
    else:
        page_divs = workspace.mets.get_physical_pages(for_pageIds=','.join(page_ids), return_divs=True)
        ret = [[page_div.get(k, 'None') for k in output_field] for page_div in page_divs]

    if numeric_range:
        # 1-based, inclusive "START..END" window over the rows.
        start, end = map(int, numeric_range.split('..'))
        ret = ret[start-1:end]

    chunks = partition_list(ret, chunk_number, chunk_index)
    lines = []
    if output_format in ('one-per-line', 'comma-separated'):
        # Both text formats join the fields of a row with tabs and differ
        # only in how rows of one chunk are joined.
        row_separator = '\n' if output_format == 'one-per-line' else ','
        for chunk in chunks:
            lines.append(row_separator.join("\t".join(entry) for entry in chunk))
    elif output_format == 'json':
        lines.append(dumps(chunks))
    print('\n'.join(lines))
655
656
# ----------------------------------------------------------------------
657
# ocrd workspace get-id
658
# ----------------------------------------------------------------------
659
660
@workspace_cli.command('get-id')
@pass_workspace
def get_id(ctx):
    """
    Get METS id if any
    """
    mets = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename).mets
    identifier = mets.unique_identifier
    # Print nothing at all when the METS carries no identifier.
    if not identifier:
        return
    print(identifier)
670
671
# ----------------------------------------------------------------------
672
# ocrd workspace set-id
673
# ----------------------------------------------------------------------
674
675
@workspace_cli.command('set-id')
@click.argument('ID')
@pass_workspace
def set_id(ctx, id):   # pylint: disable=redefined-builtin
    """
    Set METS ID.

    If one of the supported identifier mechanisms is used, will set this identifier.

    Otherwise will create a new <mods:identifier type="purl">{{ ID }}</mods:identifier>.
    """
    workspace_options = {
        'directory': ctx.directory,
        'mets_basename': ctx.mets_basename,
        'automatic_backup': ctx.automatic_backup,
    }
    workspace = Workspace(ctx.resolver, **workspace_options)
    # Assign the identifier on the METS object, then persist the change.
    workspace.mets.unique_identifier = id
    workspace.save_mets()
689
690
@workspace_cli.command('update-page')
@click.option('--set', 'attr_value_pairs', help=f"set mets:div ATTR to VALUE. possible keys: {METS_PAGE_DIV_ATTRIBUTE.names()}", metavar="ATTR VALUE", nargs=2, multiple=True)
@click.option('--order', help="DEPRECATED - use --set ATTR VALUE", metavar='ORDER')
@click.option('--orderlabel', help="DEPRECATED - use --set ATTR VALUE", metavar='ORDERLABEL')
@click.option('--contentids', help="DEPRECATED - use --set ATTR VALUE", metavar='CONTENTIDS')
@click.argument('PAGE_ID')
@pass_workspace
def update_page(ctx, attr_value_pairs, order, orderlabel, contentids, page_id):
    """
    Update the @ID, @ORDER, @ORDERLABEL, @LABEL or @CONTENTIDS attributes of the mets:div with @ID=PAGE_ID
    """
    # --set may be repeated; each occurrence contributes one (ATTR, VALUE) pair.
    update_kwargs = dict(attr_value_pairs)
    # The deprecated single-attribute options are applied last, so they win over
    # a --set pair that names the same attribute.
    deprecated_options = (('ORDER', order), ('ORDERLABEL', orderlabel), ('CONTENTIDS', contentids))
    for attr, value in deprecated_options:
        if value:
            update_kwargs[attr] = value
    try:
        workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
        workspace.mets.update_physical_page_attributes(page_id, **update_kwargs)
        workspace.save_mets()
    except Exception as err:
        # CLI boundary: report any failure as a one-line message and exit non-zero
        # instead of showing a traceback.
        print(f"Error: {err}")
        sys.exit(1)
715
716
# ----------------------------------------------------------------------
717
# ocrd workspace merge
718
# ----------------------------------------------------------------------
719
720
def _handle_json_option(ctx, param, value):
721
    return parse_json_string_or_file(value) if value else None
722
723
@workspace_cli.command('merge')
@click.argument('METS_PATH')
@click.option('--overwrite/--no-overwrite', is_flag=True, default=False, help="Overwrite on-disk file in case of file name conflicts with data from METS_PATH")
@click.option('--force/--no-force', is_flag=True, default=False, help="Overwrite mets:file from --mets with mets:file from METS_PATH if IDs clash")
@click.option('--copy-files/--no-copy-files', is_flag=True, help="Copy files as well", default=True, show_default=True)
@click.option('--fileGrp-mapping', help="JSON object mapping src to dest fileGrp", callback=_handle_json_option)
@click.option('--fileId-mapping', help="JSON object mapping src to dest file ID", callback=_handle_json_option)
@click.option('--pageId-mapping', help="JSON object mapping src to dest page ID", callback=_handle_json_option)
@mets_find_options
@pass_workspace
def merge(ctx, overwrite, force, copy_files, filegrp_mapping, fileid_mapping, pageid_mapping, file_grp, file_id, page_id, mimetype, include_fileGrp, exclude_fileGrp, mets_path):   # pylint: disable=redefined-builtin
    """
    Merges this workspace with the workspace that contains ``METS_PATH``

    Pass a JSON string or file to ``--fileGrp-mapping``, ``--fileId-mapping`` or ``--pageId-mapping``
    in order to rename all fileGrp, file ID or page ID values, respectively.

    The ``--file-id``, ``--page-id``, ``--mimetype`` and ``--file-grp`` options have
    the same semantics as in ``ocrd workspace find``, see ``ocrd workspace find --help``
    for an explanation.
    """
    mets_path = Path(mets_path)
    # All three mapping options share the _handle_json_option callback, so the
    # value has already been through parse_json_string_or_file (presumably a
    # parsed mapping by now - confirm against ocrd_utils). json.loads() raises
    # TypeError on anything but str/bytes/bytearray, so decode only when a raw
    # JSON string still arrives here.
    if filegrp_mapping and isinstance(filegrp_mapping, (str, bytes, bytearray)):
        filegrp_mapping = loads(filegrp_mapping)
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
    # The other workspace is opened from the directory and file name of METS_PATH.
    other_workspace = Workspace(ctx.resolver, directory=str(mets_path.parent), mets_basename=str(mets_path.name))
    workspace.merge(
        other_workspace,
        force=force,
        overwrite=overwrite,
        copy_files=copy_files,
        fileGrp_mapping=filegrp_mapping,
        fileId_mapping=fileid_mapping,
        pageId_mapping=pageid_mapping,
        file_grp=file_grp,
        file_id=file_id,
        page_id=page_id,
        mimetype=mimetype,
        include_fileGrp=include_fileGrp,
        exclude_fileGrp=exclude_fileGrp,
    )
    workspace.save_mets()
765
766
# ----------------------------------------------------------------------
767
# ocrd workspace backup
768
# ----------------------------------------------------------------------
769
770
@workspace_cli.group('backup')
@click.pass_context
def workspace_backup_cli(ctx): # pylint: disable=unused-argument
    """
    Backing and restoring workspaces - dev edition
    """
    # Intentionally empty: this callback only provides the 'backup' command
    # group to which the subcommands below attach themselves.
776
777
@workspace_backup_cli.command('add')
@pass_workspace
def workspace_backup_add(ctx):
    """
    Create a new backup
    """
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    WorkspaceBackupManager(workspace).add()
785
786
@workspace_backup_cli.command('list')
@pass_workspace
def workspace_backup_list(ctx):
    """
    List backups
    """
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    # Print one backup per line, in the order the manager reports them.
    for backup in WorkspaceBackupManager(workspace).list():
        print(backup)
795
796
@workspace_backup_cli.command('restore')
@click.option('-f', '--choose-first', help="Restore first matching version if more than one", is_flag=True)
@click.argument('bak') #, type=click.Path(dir_okay=False, readable=True, resolve_path=True))
@pass_workspace
def workspace_backup_restore(ctx, choose_first, bak):
    """
    Restore backup BAK
    """
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    manager = WorkspaceBackupManager(workspace)
    manager.restore(bak, choose_first)
806
807
@workspace_backup_cli.command('undo')
@pass_workspace
def workspace_backup_undo(ctx):
    """
    Restore the last backup
    """
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    WorkspaceBackupManager(workspace).undo()
815
816
817
# ----------------------------------------------------------------------
818
# ocrd workspace server
819
# ----------------------------------------------------------------------
820
821
@workspace_cli.group('server')
@pass_workspace
def workspace_serve_cli(ctx): # pylint: disable=unused-argument
    """Control a METS server for this workspace"""
    # Checked once at group level, before any 'server' subcommand runs.
    missing_url_message = "For METS server commands, you must provide '-U/--mets-server-url'"
    assert ctx.mets_server_url, missing_url_message
826
827
@workspace_serve_cli.command('stop')
@pass_workspace
def workspace_serve_stop(ctx): # pylint: disable=unused-argument
    """Stop the METS server"""
    workspace_options = {
        'directory': ctx.directory,
        'mets_basename': ctx.mets_basename,
        'mets_server_url': ctx.mets_server_url,
    }
    # Open the workspace with the server URL and call stop() on its METS object.
    mets = Workspace(ctx.resolver, **workspace_options).mets
    mets.stop()
838
839
@workspace_serve_cli.command('start')
@pass_workspace
def workspace_serve_start(ctx): # pylint: disable=unused-argument
    """
    Start a METS server

    (For TCP backend, pass a network interface to bind to as the '-U/--mets-server-url' parameter.)
    """
    # The workspace is opened without the server URL; the URL goes to the server itself.
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)
    server = OcrdMetsServer(workspace=workspace, url=ctx.mets_server_url)
    server.startup()
851