ocrd.cli.workspace.workspace_add_file()   F
last analyzed

Complexity

Conditions 14

Size

Total Lines 62
Code Lines 53

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 53
dl 0
loc 62
rs 3.6
c 0
b 0
f 0
cc 14
nop 9

How to fix   Long Method    Complexity    Many Parameters   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

Complexity

Complex functions like ocrd.cli.workspace.workspace_add_file() often do a lot of different things. To break such a function down, we need to identify a cohesive component within it. A common approach to find such a component is to look for variables or statements that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

There are several approaches to avoid long parameter lists:

1
"""
2
OCR-D CLI: workspace management
3
4
.. click:: ocrd.cli.workspace:workspace_cli
5
    :prog: ocrd workspace
6
    :nested: full
7
"""
8
from os import rmdir, unlink
9
from os.path import dirname, relpath, normpath, exists, join, isabs, isdir
10
from pathlib import Path
11
from json import loads, dumps
12
import sys
13
from glob import glob   # XXX pathlib.Path.glob does not support absolute globs
14
import re
15
import time
16
17
import click
18
19
from ocrd import Resolver, Workspace, WorkspaceValidator, WorkspaceBackupManager
20
from ocrd.mets_server import OcrdMetsServer
21
from ocrd_utils import (
22
    getLogger,
23
    initLogging,
24
    pushd_popd,
25
    EXT_TO_MIME,
26
    safe_filename,
27
    parse_json_string_or_file,
28
    partition_list,
29
    DEFAULT_METS_BASENAME,
30
)
31
from ocrd.decorators import mets_find_options
32
from . import command_with_replaced_help
33
from ocrd_models.constants import METS_PAGE_DIV_ATTRIBUTE
34
35
36
class WorkspaceCtx():
    """
    State shared by all ``ocrd workspace`` subcommands.

    Keeps a logger, a ``Resolver``, the METS/directory arguments as resolved
    by ``Resolver.resolve_mets_arguments`` and the automatic-backup setting.
    """

    def __init__(self, directory, mets_url, mets_basename=DEFAULT_METS_BASENAME, mets_server_url=None, automatic_backup=False):
        self.log = getLogger('ocrd.cli.workspace')
        # Any truthy basename (including the default) triggers the deprecation notice.
        if mets_basename:
            self.log.warning(DeprecationWarning('--mets-basename is deprecated. Use --mets/--directory instead.'))
        self.resolver = Resolver()
        resolved = self.resolver.resolve_mets_arguments(directory, mets_url, mets_basename, mets_server_url)
        self.directory, self.mets_url, self.mets_basename, self.mets_server_url = resolved
        self.automatic_backup = automatic_backup

    def workspace(self):
        """Build a new ``Workspace`` from the resolved arguments."""
        settings = {
            'directory': self.directory,
            'mets_basename': self.mets_basename,
            'automatic_backup': self.automatic_backup,
            'mets_server_url': self.mets_server_url,
        }
        return Workspace(self.resolver, **settings)

    def backup_manager(self):
        """Build a ``WorkspaceBackupManager`` around a freshly created workspace."""
        return WorkspaceBackupManager(self.workspace())
58
59
60
# Decorator created via click.make_pass_decorator for WorkspaceCtx; the subcommands
# below apply it and receive the context object as their first argument ``ctx``.
pass_workspace = click.make_pass_decorator(WorkspaceCtx)
61
62
63
# ----------------------------------------------------------------------
64
# ocrd workspace
65
# ----------------------------------------------------------------------
66
67
@click.group("workspace")
@click.option('-d', '--directory', envvar='WORKSPACE_DIR', type=click.Path(file_okay=False), metavar='WORKSPACE_DIR',
              help='Changes the workspace folder location [default: METS_URL directory or .]')
@click.option('-M', '--mets-basename', default=None, help='METS file basename. Deprecated, use --mets/--directory')
@click.option('-m', '--mets', default=None, metavar="METS_URL",
              help='The path/URL of the METS file [default: WORKSPACE_DIR/mets.xml]')
@click.option('-U', '--mets-server-url', 'mets_server_url', help="TCP host URI or UDS path of METS server")
@click.option('--backup', default=False, help="Backup mets.xml whenever it is saved.", is_flag=True)
@click.pass_context
def workspace_cli(ctx, directory, mets, mets_basename, mets_server_url, backup):
    """
    Managing workspaces

    A workspace comprises a METS file and a directory as point of reference.

    Operates on the file system directly or via a METS server
    (already running via some prior `server start` subcommand).
    """
    initLogging()
    # Store the shared context on click's context object so that
    # subcommands decorated with ``pass_workspace`` can pick it up.
    ctx.obj = WorkspaceCtx(
        directory,
        mets_url=mets,
        mets_basename=mets_basename,
        mets_server_url=mets_server_url,
        automatic_backup=backup
    )
93
94
95
# ----------------------------------------------------------------------
96
# ocrd workspace validate
97
# ----------------------------------------------------------------------
98
99
@workspace_cli.command('validate', cls=command_with_replaced_help(
    (r' \[METS_URL\]', ''))) # XXX deprecated argument
@pass_workspace
@click.option('-a', '--download', is_flag=True, help="Download all files")
@click.option('-s', '--skip', help="Tests to skip", default=[], multiple=True, type=click.Choice(
    ['imagefilename', 'alternativeimage_filename', 'alternativeimage_comments', 'dimension', 'pixel_density',
     'page', 'page_xsd', 'url', 'mets_fileid_page_pcgtsid', 'mets_unique_identifier', 'mets_files', 'mets_xsd']))
@click.option('--page-textequiv-consistency', '--page-strictness', type=click.Choice(['strict', 'lax', 'fix', 'off']),
              default='strict', help="How strict to check PAGE multi-level textequiv consistency")
@click.option('--page-coordinate-consistency', help="How fierce to check PAGE multi-level coordinate consistency",
              type=click.Choice(['poly', 'baseline', 'both', 'off']), default='poly')
@click.argument('mets_url', default=None, required=False)
def workspace_validate(ctx, mets_url, download, skip, page_textequiv_consistency, page_coordinate_consistency):
    """
    Validate a workspace

    METS_URL can be a URL, an absolute path or a path relative to $PWD.
    If not given, use --mets accordingly.

    Check that the METS and its referenced file contents
    abide by the OCR-D specifications.
    """
    LOG = getLogger('ocrd.cli.workspace.validate')
    if mets_url:
        # The positional METS_URL is deprecated; point users at the
        # equivalent invocation of *this* subcommand (validate).
        LOG.warning(DeprecationWarning("Use 'ocrd workspace --mets METS validate' instead of "
                                       "argument 'METS_URL' ('%s')" % mets_url))
    else:
        mets_url = ctx.mets_url
    report = WorkspaceValidator.validate(
        ctx.resolver,
        mets_url,
        src_dir=ctx.directory,
        skip=skip,
        download=download,
        page_strictness=page_textequiv_consistency,
        page_coordinate_consistency=page_coordinate_consistency
    )
    print(report.to_xml())
    # A failed validation is signalled to the shell with exit status 128.
    if not report.is_valid:
        sys.exit(128)
139
140
141
# ----------------------------------------------------------------------
142
# ocrd workspace clone
143
# ----------------------------------------------------------------------
144
145
@workspace_cli.command('clone', cls=command_with_replaced_help(
    (r' \[WORKSPACE_DIR\]', ''))) # XXX deprecated argument
@click.option('-f', '--clobber-mets', help="Overwrite existing METS file", default=False, is_flag=True)
@click.option('-a', '--download', is_flag=True, help="Download all selected files and add local FLocat "
              "path references in METS file afterwards")
@click.argument('mets_url')
@mets_find_options
# XXX deprecated
@click.argument('workspace_dir', default=None, required=False)
@pass_workspace
def workspace_clone(ctx, clobber_mets, download, file_grp, file_id, page_id, mimetype,
                    include_fileGrp, exclude_fileGrp, mets_url, workspace_dir):
    """
    Create a workspace from METS_URL and return the directory

    METS_URL can be a URL, an absolute path or a path relative to $PWD.
    METS_URL can also be an OAI-PMH GetRecord URL wrapping a METS file.

    Additional options pertain to the selection of files / fileGrps / pages
    to be downloaded, if --download is used.
    """
    log = getLogger('ocrd.cli.workspace.clone')
    if workspace_dir:
        # Deprecated positional argument still wins over --directory.
        deprecation = DeprecationWarning("Use 'ocrd workspace --directory DIR clone' instead of "
                                         "argument 'WORKSPACE_DIR' ('%s')" % workspace_dir)
        log.warning(deprecation)
        ctx.directory = workspace_dir

    assert not ctx.mets_server_url, \
        f"clone cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"

    # File filters forwarded unchanged to the resolver.
    file_selection = {
        'fileGrp': file_grp,
        'ID': file_id,
        'pageId': page_id,
        'mimetype': mimetype,
        'include_fileGrp': include_fileGrp,
        'exclude_fileGrp': exclude_fileGrp,
    }
    cloned = ctx.resolver.workspace_from_url(
        mets_url,
        dst_dir=ctx.directory,
        mets_basename=ctx.mets_basename,
        clobber_mets=clobber_mets,
        download=download,
        **file_selection,
    )
    cloned.save_mets()
    print(cloned.directory)
189
190
191
# ----------------------------------------------------------------------
192
# ocrd workspace init
193
# ----------------------------------------------------------------------
194
195
@workspace_cli.command('init', cls=command_with_replaced_help(
    (r' \[DIRECTORY\]', ''))) # XXX deprecated argument
@click.option('-f', '--clobber-mets', help="Clobber mets.xml if it exists", is_flag=True, default=False)
# XXX deprecated
@click.argument('directory', default=None, required=False)
@pass_workspace
def workspace_init(ctx, clobber_mets, directory):
    """
    Create a workspace with an empty METS file in DIRECTORY or CWD.

    """
    log = getLogger('ocrd.cli.workspace.init')
    if directory:
        # Deprecated positional argument still wins over --directory.
        deprecation = DeprecationWarning("Use 'ocrd workspace --directory DIR init' instead of "
                                         "argument 'DIRECTORY' ('%s')" % directory)
        log.warning(deprecation)
        ctx.directory = directory
    assert not ctx.mets_server_url, \
        f"init cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
    created = ctx.resolver.workspace_from_nothing(
        directory=ctx.directory, mets_basename=ctx.mets_basename, clobber_mets=clobber_mets)
    created.save_mets()
    print(created.directory)
220
221
222
# ----------------------------------------------------------------------
223
# ocrd workspace add
224
# ----------------------------------------------------------------------
225
226
def _guess_mimetype(fname, log):
    """Return the media type registered in EXT_TO_MIME for FNAME's extension, or None (after logging an error)."""
    suffix = Path(fname).suffix
    try:
        mimetype = EXT_TO_MIME[suffix]
    except KeyError:
        log.error("Cannot guess mimetype from extension '%s' for '%s'. "
                  "Set --mimetype explicitly", suffix, fname)
        return None
    log.info("Guessed mimetype to be %s", mimetype)
    return mimetype


def _localize_fname(ctx, fname, file_grp, check_file_exists, log):
    """Turn the non-URL FNAME into a workspace-relative path, copying the file into the workspace if needed."""
    def halt_missing():
        log.error("File '%s' does not exist, halt execution!", fname)
        sys.exit(1)

    if not fname.startswith(ctx.directory):
        if not isabs(fname) and exists(join(ctx.directory, fname)):
            # relative path that already exists below the workspace directory
            fname = join(ctx.directory, fname)
        else:
            log.debug("File '%s' is not in workspace, copying", fname)
            try:
                fname = ctx.resolver.download_to_directory(ctx.directory, fname, subdir=file_grp)
            except FileNotFoundError:
                # A missing source is only fatal when --check-file-exists was requested.
                if check_file_exists:
                    halt_missing()
    if check_file_exists and not exists(fname):
        halt_missing()
    if fname.startswith(ctx.directory):
        fname = relpath(fname, ctx.directory)
    return fname


@workspace_cli.command('add')
@click.option('-G', '--file-grp', help="fileGrp USE", required=True, metavar='FILE_GRP')
@click.option('-i', '--file-id', help="ID for the file", required=True, metavar='FILE_ID')
@click.option('-m', '--mimetype', help="Media type of the file. Guessed from extension if not provided",
              required=False, metavar='TYPE')
@click.option('-g', '--page-id', help="ID of the physical page", metavar='PAGE_ID')
@click.option('-C', '--check-file-exists', help="Whether to ensure FNAME exists", is_flag=True, default=False)
@click.option('--ignore', help="Do not check whether file exists.", default=False, is_flag=True)
@click.option('--force', help="If file with ID already exists, replace it. No effect if --ignore is set.",
              default=False, is_flag=True)
@click.argument('fname', required=True)
@pass_workspace
def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, ignore, check_file_exists, force, fname):
    """
    Add a file or http(s) URL FNAME to METS in a workspace.
    If FNAME is not an http(s) URL and is not a workspace-local existing file, try to copy to workspace.
    """
    workspace = ctx.workspace()

    log = getLogger('ocrd.cli.workspace.add')
    if not mimetype:
        guessed = _guess_mimetype(fname, log)
        # On failure the (unset) user value is passed on unchanged.
        if guessed is not None:
            mimetype = guessed

    log.debug("Adding '%s'", fname)
    # Only non-URL inputs get a local filename; URLs are referenced as-is.
    local_filename = None
    if not fname.startswith(('http://', 'https://')):
        fname = _localize_fname(ctx, fname, file_grp, check_file_exists, log)
        local_filename = fname

    if not page_id:
        log.warning("You did not provide '--page-id/-g', so the file you added is not linked to a specific page.")
    workspace.add_file(
        file_grp,
        file_id=file_id,
        mimetype=mimetype,
        page_id=page_id,
        force=force,
        ignore=ignore,
        local_filename=local_filename,
        url=fname,
    )
    workspace.save_mets()
288
289
290
# ----------------------------------------------------------------------
291
# ocrd workspace bulk-add
292
# ----------------------------------------------------------------------
293
294
# pylint: disable=broad-except
295
def _expand_groups(template, group_dict):
    """Replace every ``{{ name }}`` placeholder in TEMPLATE with the regex group captured under that name."""
    for group_name, group_value in group_dict.items():
        template = template.replace('{{ %s }}' % group_name, group_value)
    return template


@workspace_cli.command('bulk-add')
@click.option('-r', '--regex', help="Regular expression matching the FILE_GLOB filesystem paths "
              "to define named captures usable in the other parameters", required=True)
@click.option('-m', '--mimetype', help="Media type of the file. If not provided, guess from filename", required=False)
@click.option('-g', '--page-id', help="physical page ID of the file", required=False)
@click.option('-i', '--file-id', help="ID of the file. If not provided, derive from fileGrp and filename", required=False)
@click.option('-u', '--url', help="Remote URL of the file", required=False)
@click.option('-l', '--local-filename', help="Local filesystem path in the workspace directory "
              "(copied from source file if different)", required=False)
@click.option('-G', '--file-grp', help="File group USE of the file", required=True)
@click.option('-n', '--dry-run', help="Don't actually do anything to the METS or filesystem, just preview",
              default=False, is_flag=True)
@click.option('-S', '--source-path', 'src_path_option', help="File path to copy from (if different from FILE_GLOB values)",
              required=False)
@click.option('-I', '--ignore', help="Disable checking for existing file entries (faster)", default=False, is_flag=True)
@click.option('-f', '--force', help="Replace existing file entries with the same ID (no effect when --ignore is set, too)",
              default=False, is_flag=True)
@click.option('-s', '--skip', help="Skip files not matching --regex (instead of failing)", default=False, is_flag=True)
@click.argument('file_glob', nargs=-1, required=True)
@pass_workspace
def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_filename, file_grp, dry_run,
                           file_glob, src_path_option, ignore, force, skip):
    """
    Add files in bulk to an OCR-D workspace.

    FILE_GLOB can either be a shell glob expression to match file names,
    or a list of expressions or '-', in which case expressions are read from STDIN.

    After globbing, --regex is matched against each expression resulting from FILE_GLOB, and can
    define named groups reusable in the --page-id, --file-id, --mimetype, --url, --source-path and
    --file-grp options, e.g. by referencing the group name 'grp' from the regex as '{{ grp }}'.

    If the FILE_GLOB expressions do not denote the file names themselves
    (but arbitrary strings for --regex matching), then use --source-path to set
    the actual file paths to use. (This could involve fixed strings or group references.)

    \b
    Examples:
        ocrd workspace bulk-add \\
                --regex '(?P<fileGrp>[^/]+)/page_(?P<pageid>.*)\\.[^.]+' \\
                --page-id 'PHYS_{{ pageid }}' \\
                --file-grp "{{ fileGrp }}" \\
                path/to/files/*/*.*
        \b
        echo "path/to/src/file.xml SEG/page_p0001.xml" \\
        | ocrd workspace bulk-add \\
                --regex '(?P<src>.*?) (?P<fileGrp>.+?)/page_(?P<pageid>.*)\\.(?P<ext>[^\\.]*)' \\
                --file-id 'FILE_{{ fileGrp }}_{{ pageid }}' \\
                --page-id 'PHYS_{{ pageid }}' \\
                --file-grp "{{ fileGrp }}" \\
                --local-filename '{{ fileGrp }}/FILE_{{ pageid }}.{{ ext }}' \\
                -

        \b
        { echo PHYS_0001 BIN FILE_0001_BIN.IMG-wolf BIN/FILE_0001_BIN.IMG-wolf.png; \\
          echo PHYS_0001 BIN FILE_0001_BIN BIN/FILE_0001_BIN.xml; \\
          echo PHYS_0002 BIN FILE_0002_BIN.IMG-wolf BIN/FILE_0002_BIN.IMG-wolf.png; \\
          echo PHYS_0002 BIN FILE_0002_BIN BIN/FILE_0002_BIN.xml; \\
        } | ocrd workspace bulk-add -r '(?P<pageid>.*) (?P<filegrp>.*) (?P<fileid>.*) (?P<local_filename>.*)' \\
          -G '{{ filegrp }}' -g '{{ pageid }}' -i '{{ fileid }}' -S '{{ local_filename }}' -

    """
    log = getLogger('ocrd.cli.workspace.bulk-add')  # pylint: disable=redefined-outer-name
    workspace = ctx.workspace()

    try:
        pat = re.compile(regex)
    except Exception as e:  # pylint: disable=broad-except
        log.error("Invalid regex: %s", e)
        sys.exit(1)

    # Collect the expressions to process: from STDIN for a lone '-',
    # otherwise from globbing (a glob without matches is kept verbatim).
    file_paths = []
    from_stdin = file_glob == ('-',)
    if from_stdin:
        file_paths += [Path(x.strip('\n')) for x in sys.stdin.readlines()]
    else:
        for fglob in file_glob:
            expanded = glob(fglob)
            if not expanded:
                file_paths += [Path(fglob)]
            else:
                file_paths += [Path(x) for x in expanded]

    total = len(file_paths)
    for count, file_path in enumerate(file_paths, start=1):
        log.info("[%4d/%d] %s", count, total, file_path)

        # match regex
        m = pat.match(str(file_path))
        if not m:
            if skip:
                continue
            log.error("File '%s' not matched by regex: '%s'", file_path, regex)
            sys.exit(1)
        group_dict = m.groupdict()

        # set up file info
        file_dict = {'local_filename': local_filename,
                     'url': url,
                     'mimetype': mimetype,
                     'file_id': file_id,
                     'page_id': page_id,
                     'file_grp': file_grp}

        # Flag to track whether 'local_filename' should be 'src'
        local_filename_is_src = False

        # expand templates
        for param_name in file_dict:
            if file_dict[param_name]:
                file_dict[param_name] = _expand_groups(file_dict[param_name], group_dict)
            elif param_name == 'local_filename':
                local_filename_is_src = True
            elif param_name not in ('mimetype', 'file_id', 'url'):
                # mimetype/file_id are auto-filled below and url is optional;
                # any other unset attribute is an error
                raise ValueError(f"OcrdFile attribute '{param_name}' unset ({file_dict})")

        # Where to copy from
        if src_path_option:
            srcpath = Path(_expand_groups(src_path_option, group_dict))
        else:
            srcpath = file_path

        # derive --file-id from filename if --file-id not explicitly set
        if not file_id:
            file_dict['file_id'] = safe_filename('%s_%s' % (file_dict['file_grp'], srcpath.stem))
        if not mimetype:
            try:
                file_dict['mimetype'] = EXT_TO_MIME[srcpath.suffix]
            except KeyError:
                log.error("Cannot guess MIME type from extension '%s' for '%s'. "
                          "Set --mimetype explicitly", srcpath.suffix, srcpath)

        # copy files if src != url
        if local_filename_is_src:
            file_dict['local_filename'] = srcpath
        else:
            destpath = Path(workspace.directory, file_dict['local_filename'])
            if srcpath != destpath and not destpath.exists():
                log.info("cp '%s' '%s'", srcpath, destpath)
                if not dry_run:
                    # create all missing ancestors, not just the immediate parent
                    destpath.parent.mkdir(parents=True, exist_ok=True)
                    destpath.write_bytes(srcpath.read_bytes())

        # Add to workspace (or not)
        fileGrp = file_dict.pop('file_grp')
        if dry_run:
            log.info('workspace.add_file(%s)', file_dict)
        else:
            workspace.add_file(fileGrp, ignore=ignore, force=force, **file_dict)  # pylint: disable=redundant-keyword-arg

    # save changes to disk (a dry run must leave the METS untouched)
    if not dry_run:
        workspace.save_mets()
459
460
461
# ----------------------------------------------------------------------
462
# ocrd workspace find
463
# ----------------------------------------------------------------------
464
465
@workspace_cli.command('find')
@mets_find_options
@click.option('-k', '--output-field', help="Output field. Repeat for multiple fields, will be joined with tab",
              default=['local_filename'],
              show_default=True,
              multiple=True,
              type=click.Choice([
                  'url',
                  'mimetype',
                  'page_id',
                  'pageId',
                  'file_id',
                  'ID',
                  'file_grp',
                  'fileGrp',
                  'basename',
                  'basename_without_extension',
                  'local_filename',
              ]))
@click.option('--download', is_flag=True, help="Download found files to workspace and change location in METS file")
@click.option('--undo-download', is_flag=True, help="Remove all downloaded files from the METS and workspace")
@click.option('--keep-files', is_flag=True, help="Do not remove downloaded files from the workspace with --undo-download")
@click.option('--wait', type=int, default=0, help="Wait this many seconds between download requests")
@pass_workspace
def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field,
                   include_fileGrp, exclude_fileGrp, download, undo_download, keep_files, wait):
    """
    Find files.

    (If any ``FILTER`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    # Accept snake_case aliases but work with the camelCase attribute names.
    aliases = {"file_id": "ID", "page_id": "pageId", "file_grp": "fileGrp"}
    output_field = [aliases.get(name, name) for name in output_field]
    mets_changed = False
    rows = []
    workspace = ctx.workspace()
    with pushd_popd(workspace.directory):
        for f in workspace.find_files(
                file_id=file_id,
                file_grp=file_grp,
                mimetype=mimetype,
                page_id=page_id,
                include_fileGrp=include_fileGrp,
                exclude_fileGrp=exclude_fileGrp,
        ):
            if download and not f.local_filename:
                workspace.download_file(f)
                mets_changed = True
                if wait:
                    time.sleep(wait)
            if undo_download and f.url and f.local_filename:
                mets_changed = True
                if not keep_files:
                    ctx.log.debug("rm %s [cwd=%s]", f.local_filename, workspace.directory)
                    unlink(f.local_filename)
                f.local_filename = None
            row = []
            for field in output_field:
                if field == 'pageId':
                    # placeholder: the file ID, swapped for the page ID further down
                    row.append(f.ID)
                else:
                    # NOTE(review): str() of None is 'None', so the ``or ''`` fallback never fires
                    row.append(str(getattr(f, field)) or '')
            rows.append(row)
    if mets_changed:
        workspace.save_mets()
    if 'pageId' in output_field:
        # Resolve all page IDs with a single lookup keyed by the collected file IDs.
        idx = output_field.index('pageId')
        file_ids = [row[idx] for row in rows]
        pages = workspace.mets.get_physical_pages(for_fileIds=file_ids)
        for row, page in zip(rows, pages):
            row[idx] = page or ''
    for row in rows:
        print('\t'.join(row))
534
535
536
# ----------------------------------------------------------------------
537
# ocrd workspace remove
538
# ----------------------------------------------------------------------
539
540
@workspace_cli.command('remove')
@click.option('-k', '--keep-file', help="Do not delete file from file system", default=False, is_flag=True)
@click.option('-f', '--force', help="Continue even if mets:file or file on file system does not exist",
              default=False, is_flag=True)
@click.argument('ID', nargs=-1)
@pass_workspace
def workspace_remove_file(ctx, id, force, keep_file):  # pylint: disable=redefined-builtin
    """
    Delete files (given by their ID attribute ``ID``).

    (If any ``ID`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    assert not ctx.mets_server_url, \
        f"remove cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
    workspace = ctx.workspace()
    # Remove every requested ID first, then write the METS once.
    for file_id in id:
        workspace.remove_file(file_id, force=force, keep_file=keep_file)
    workspace.save_mets()
559
560
561
# ----------------------------------------------------------------------
562
# ocrd workspace rename-group
563
# ----------------------------------------------------------------------
564
565
@workspace_cli.command('rename-group')
@click.argument('OLD', nargs=1)
@click.argument('NEW', nargs=1)
@pass_workspace
def rename_group(ctx, old, new):
    """
    Rename fileGrp (USE attribute ``OLD`` to ``NEW``).
    """
    assert not ctx.mets_server_url, \
        f"rename-group cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
    workspace = ctx.workspace()
    # The group currently named OLD is renamed to NEW, then the METS is saved.
    workspace.rename_file_group(old, new)
    workspace.save_mets()
578
579
580
# ----------------------------------------------------------------------
581
# ocrd workspace remove-group
582
# ----------------------------------------------------------------------
583
584
@workspace_cli.command('remove-group')
@click.option('-r', '--recursive', help="Delete any files in the group before the group itself",
              default=False, is_flag=True)
@click.option('-f', '--force', help="Continue removing even if group or containing files not found in METS",
              default=False, is_flag=True)
@click.option('-k', '--keep-files', help="Do not delete files from file system", default=False, is_flag=True)
@click.argument('GROUP', nargs=-1)
@pass_workspace
def remove_group(ctx, group, recursive, force, keep_files):
    """
    Delete fileGrps (given by their USE attribute ``GROUP``).

    (If any ``GROUP`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    assert not ctx.mets_server_url, \
        f"remove-group cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
    workspace = ctx.workspace()
    # The same flags apply to every group; the METS is written once at the end.
    removal_flags = {'recursive': recursive, 'force': force, 'keep_files': keep_files}
    for group_name in group:
        workspace.remove_file_group(group_name, **removal_flags)
    workspace.save_mets()
605
606
607
# ----------------------------------------------------------------------
608
# ocrd workspace prune-files
609
# ----------------------------------------------------------------------
610
611
@workspace_cli.command('prune-files')
@click.option('-G', '--file-grp', help="fileGrp USE", metavar='FILTER')
@click.option('-m', '--mimetype', help="Media type to look for", metavar='FILTER')
@click.option('-g', '--page-id', help="Page ID", metavar='FILTER')
@click.option('-i', '--file-id', help="ID", metavar='FILTER')
@pass_workspace
def prune_files(ctx, file_grp, mimetype, page_id, file_id):
    """
    Removes mets:files that point to non-existing local files

    (If any ``FILTER`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    assert not ctx.mets_server_url, \
        f"prune-files cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
    workspace = ctx.workspace()
    # Local filenames are checked relative to the workspace directory.
    with pushd_popd(workspace.directory):
        for f in workspace.find_files(
            file_id=file_id,
            file_grp=file_grp,
            mimetype=mimetype,
            page_id=page_id,
        ):
            try:
                if not f.local_filename or not exists(f.local_filename):
                    workspace.mets.remove_file(f.ID)
            except Exception as e:
                # '%s' (not '%f'): the argument is a file object, not a float
                ctx.log.exception("Error removing %s: %s", f, e)
                raise
        workspace.save_mets()
641
642
643
# ----------------------------------------------------------------------
644
# ocrd workspace clean
645
# ----------------------------------------------------------------------
646
647
@workspace_cli.command('clean')
@click.option('-n', '--dry-run', help="Don't actually do anything to the filesystem, just preview",
              default=False, is_flag=True)
@click.option('-d', '--directories', help="Remove untracked directories in addition to untracked files",
              default=False, is_flag=True)
@click.argument('path_glob', nargs=-1, required=False)
@pass_workspace
def clean(ctx, dry_run, directories, path_glob):
    """
    Removes files and directories from the workspace that are not
    referenced by any mets:files.

    PATH_GLOB can be a shell glob expression to match file names,
    directory names (recursively), or plain paths. All paths are
    resolved w.r.t. the workspace.

    If no PATH_GLOB are specified, then all files and directories
    may match.
    """
    # NOTE: the docstring above is kept verbatim because click shows it as
    # the command's help text; developer notes live in comments instead.
    #
    # ctx         -- workspace context injected by ``pass_workspace``
    # dry_run     -- only log the ``unlink``/``rmdir`` calls that would happen
    # directories -- also remove untracked directories, deepest first
    # path_glob   -- tuple of glob expressions / paths (may be empty)
    workspace = ctx.workspace()
    # A set gives O(1) membership tests in the loops below.
    allowed_files = {normpath(f.local_filename) for f in workspace.find_files(local_only=True)}
    allowed_files.add(relpath(workspace.mets_target, start=workspace.directory))
    # Protect every ancestor directory of a tracked file, not just its direct
    # parent: such a directory is never empty, so ``rmdir`` on it would raise.
    allowed_dirs = set()
    for tracked in allowed_files:
        parent = dirname(tracked)
        # ``dirname`` reaches a fixed point ('' or '/'), which ends the walk.
        while parent not in allowed_dirs:
            allowed_dirs.add(parent)
            parent = dirname(parent)
    with pushd_popd(workspace.directory):
        if path_glob:
            paths = []
            for expression in path_glob:
                if isabs(expression):
                    expression = relpath(expression)
                # Keep a non-matching expression as a literal path.
                paths += glob(expression, recursive=True) or [expression]
            # Overlapping expressions may yield the same path more than once;
            # drop duplicates (keeping order) so nothing is removed twice.
            paths = list(dict.fromkeys(paths))
        else:
            paths = glob('**', recursive=True)
        file_paths = [path for path in paths if not isdir(path)]
        for path in file_paths:
            if normpath(path) in allowed_files:
                continue
            if dry_run:
                ctx.log.info('unlink(%s)', path)
            else:
                unlink(path)
        if not directories:
            return
        dir_paths = [path for path in paths if isdir(path)]
        # Deepest directories first, so children are removed before parents.
        for path in sorted(dir_paths, key=lambda p: p.count('/'), reverse=True):
            if normpath(path) in allowed_dirs:
                continue
            if dry_run:
                ctx.log.info('rmdir(%s)', path)
            else:
                rmdir(path)
697
698
699
# ----------------------------------------------------------------------
700
# ocrd workspace list-group
701
# ----------------------------------------------------------------------
702
703
@workspace_cli.command('list-group')
@pass_workspace
def list_groups(ctx):
    """
    List fileGrp USE attributes
    """
    # Emit one fileGrp per output line.
    file_groups = ctx.workspace().mets.file_groups
    print("\n".join(file_groups))
711
712
713
# ----------------------------------------------------------------------
714
# ocrd workspace list-page
715
# ----------------------------------------------------------------------
716
717
@workspace_cli.command('list-page')
@click.option('-k', '--output-field', help="Output field. Repeat for multiple fields, will be joined with tab",
              default=['ID'],
              show_default=True,
              multiple=True,
              type=click.Choice(METS_PAGE_DIV_ATTRIBUTE.names()))
@click.option('-f', '--output-format', help="Output format", type=click.Choice(['one-per-line', 'comma-separated', 'json']),
              default='one-per-line')
@click.option('-D', '--chunk-number', help="Partition the return value into n roughly equally sized chunks",
              default=1, type=int)
@click.option('-C', '--chunk-index', help="Output the nth chunk of results, -1 for all of them.", default=None, type=int)
@click.option('-r', '--page-id-range', help="Restrict the pages to those matching the provided range, "
              "based on the @ID attribute. Separate start/end with ..")
@click.option('-R', '--numeric-range', help="Restrict the pages to those in the range, in numerical document order. "
              "Separate start/end with ..")
@pass_workspace
def list_pages(ctx, output_field, output_format, chunk_number, chunk_index, page_id_range, numeric_range):
    """
    List physical page IDs

    (If any ``FILTER`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    # The docstring above is left verbatim (click uses it as help text).
    workspace = ctx.workspace()

    # Build one row (list of field values) per physical page.
    if page_id_range or list(output_field) != ['ID']:
        page_divs = workspace.mets.get_physical_pages(for_pageIds=page_id_range, return_divs=True)
        ret = [[page_div.get(field, 'None') for field in output_field]
               for page_div in page_divs]
    else:
        # Plain ID listing without range: the page IDs are enough.
        ret = [[page_id] for page_id in workspace.mets.physical_pages]

    # Optional 1-based, inclusive slice in document order.
    if numeric_range:
        first, last = (int(bound) for bound in numeric_range.split('..'))
        ret = ret[first - 1:last]

    chunks = partition_list(ret, chunk_number, chunk_index)

    # Text formats differ only in how the rows of one chunk are joined.
    row_separators = {'one-per-line': '\n', 'comma-separated': ','}
    lines = []
    if output_format in row_separators:
        separator = row_separators[output_format]
        lines = [separator.join("\t".join(entry) for entry in chunk)
                 for chunk in chunks]
    elif output_format == 'json':
        lines.append(dumps(chunks))
    print('\n'.join(lines))
772
773
774
# ----------------------------------------------------------------------
775
# ocrd workspace get-id
776
# ----------------------------------------------------------------------
777
778
@workspace_cli.command('get-id')
@pass_workspace
def get_id(ctx):
    """
    Get METS id if any
    """
    mets_id = ctx.workspace().mets.unique_identifier
    # Print nothing at all when the identifier is empty/unset.
    if not mets_id:
        return
    print(mets_id)
788
789
790
# ----------------------------------------------------------------------
791
# ocrd workspace set-id
792
# ----------------------------------------------------------------------
793
794
@workspace_cli.command('set-id')
@click.argument('ID')
@pass_workspace
def set_id(ctx, id):   # pylint: disable=redefined-builtin
    """
    Set METS ID.

    If one of the supported identifier mechanisms is used, will set this identifier.

    Otherwise will create a new <mods:identifier type="purl">{{ ID }}</mods:identifier>.
    """
    # Assign the identifier on the METS object, then persist the change.
    workspace = ctx.workspace()
    mets = workspace.mets
    mets.unique_identifier = id
    workspace.save_mets()
808
809
810
@workspace_cli.command('update-page')
@click.option('--set', 'attr_value_pairs', help="set mets:div ATTR to VALUE", metavar="ATTR VALUE",
              type=(click.Choice(METS_PAGE_DIV_ATTRIBUTE.names()), str), nargs=2, multiple=True)
@click.option('--order', help="DEPRECATED - use --set ATTR VALUE", metavar='ORDER')
@click.option('--orderlabel', help="DEPRECATED - use --set ATTR VALUE", metavar='ORDERLABEL')
@click.option('--contentids', help="DEPRECATED - use --set ATTR VALUE", metavar='CONTENTIDS')
@click.argument('PAGE_ID')
@pass_workspace
def update_page(ctx, attr_value_pairs, order, orderlabel, contentids, page_id):
    """
    Update the @ID, @ORDER, @ORDERLABEL, @LABEL or @CONTENTIDS attributes of the mets:div with @ID=PAGE_ID
    """
    # ctx              -- workspace context injected by ``pass_workspace``
    # attr_value_pairs -- (ATTR, VALUE) tuples collected from repeated ``--set``
    # order, orderlabel, contentids -- deprecated single-attribute options;
    #                     when given they override the same attribute from ``--set``
    # page_id          -- @ID of the mets:div to update
    #
    # On any failure (including the METS Server guard) the error is printed
    # and the process exits with status 1.
    update_kwargs = dict(attr_value_pairs)
    deprecated_options = (
        ('ORDER', order),
        ('ORDERLABEL', orderlabel),
        ('CONTENTIDS', contentids),
    )
    for attribute, value in deprecated_options:
        if value:
            update_kwargs[attribute] = value
    try:
        assert not ctx.mets_server_url, \
            f"update-page cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
        workspace = ctx.workspace()
        workspace.mets.update_physical_page_attributes(page_id, **update_kwargs)
        workspace.save_mets()
    except Exception as err:
        print(f"Error: {err}")
        sys.exit(1)
838
839
840
# ----------------------------------------------------------------------
841
# ocrd workspace merge
842
# ----------------------------------------------------------------------
843
844
def _handle_json_option(ctx, param, value):
845
    return parse_json_string_or_file(value) if value else None
846
847
848
@workspace_cli.command('merge')
@click.argument('METS_PATH')
@click.option('--overwrite/--no-overwrite', is_flag=True, default=False,
              help="Overwrite on-disk file in case of file name conflicts with data from METS_PATH")
@click.option('--force/--no-force', is_flag=True, default=False,
              help="Overwrite mets:file from --mets with mets:file from METS_PATH if IDs clash")
@click.option('--copy-files/--no-copy-files', is_flag=True, help="Copy files as well", default=True, show_default=True)
@click.option('--fileGrp-mapping', help="JSON object mapping src to dest fileGrp", callback=_handle_json_option)
@click.option('--fileId-mapping', help="JSON object mapping src to dest file ID", callback=_handle_json_option)
@click.option('--pageId-mapping', help="JSON object mapping src to dest page ID", callback=_handle_json_option)
@mets_find_options
@pass_workspace
def merge(ctx, overwrite, force, copy_files, filegrp_mapping, fileid_mapping, pageid_mapping,
          file_grp, file_id, page_id, mimetype, include_fileGrp, exclude_fileGrp, mets_path):  # pylint: disable=redefined-builtin
    """
    Merges this workspace with the workspace that contains ``METS_PATH``

    Pass a JSON string or file to ``--fileGrp-mapping``, ``--fileId-mapping`` or ``--pageId-mapping``
    in order to rename all fileGrp, file ID or page ID values, respectively.

    The ``--file-id``, ``--page-id``, ``--mimetype`` and ``--file-grp`` options have
    the same semantics as in ``ocrd workspace find``, see ``ocrd workspace find --help``
    for an explanation.
    """
    # NOTE: the docstring above is kept verbatim because click shows it as
    # the command's help text; developer notes live in comments instead.
    #
    # All options are forwarded to ``Workspace.merge``; the three ``*_mapping``
    # values have already passed through the ``_handle_json_option`` callback.
    mets_path = Path(mets_path)
    # The callback normally delivers an already-parsed mapping, and decoding
    # that a second time with ``loads`` fails. Only decode when a raw JSON
    # string is still present, which keeps the old path working as well.
    if isinstance(filegrp_mapping, str):
        filegrp_mapping = loads(filegrp_mapping)
    assert not ctx.mets_server_url, \
        f"merge cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
    workspace = ctx.workspace()
    # The other workspace is rooted at the directory containing METS_PATH.
    other_workspace = Workspace(ctx.resolver, directory=str(mets_path.parent), mets_basename=str(mets_path.name))
    workspace.merge(
        other_workspace,
        force=force,
        overwrite=overwrite,
        copy_files=copy_files,
        fileGrp_mapping=filegrp_mapping,
        fileId_mapping=fileid_mapping,
        pageId_mapping=pageid_mapping,
        file_grp=file_grp,
        file_id=file_id,
        page_id=page_id,
        mimetype=mimetype,
        include_fileGrp=include_fileGrp,
        exclude_fileGrp=exclude_fileGrp,
    )
    workspace.save_mets()
895
896
897
# ----------------------------------------------------------------------
898
# ocrd workspace backup
899
# ----------------------------------------------------------------------
900
901
@workspace_cli.group('backup')
@pass_workspace
def workspace_backup_cli(ctx):  # pylint: disable=unused-argument
    """
    Backing and restoring workspaces - dev edition
    """
    # Guard for the whole ``backup`` group: refuse to run with a METS Server.
    server_url = ctx.mets_server_url
    assert not server_url, "Workspace backups currently not interoperable with METS Server"
908
909
910
@workspace_backup_cli.command('add')
@pass_workspace
def workspace_backup_add(ctx):
    """
    Create a new backup
    """
    # Delegate straight to the context's backup manager.
    ctx.backup_manager().add()
918
919
920
@workspace_backup_cli.command('list')
@pass_workspace
def workspace_backup_list(ctx):
    """
    List backups
    """
    # One backup entry per output line, in the order the manager yields them.
    for backup in ctx.backup_manager().list():
        print(backup)
929
930
931
@workspace_backup_cli.command('restore')
@click.option('-f', '--choose-first', help="Restore first matching version if more than one", is_flag=True)
@click.argument('bak')  # type=click.Path(dir_okay=False, readable=True, resolve_path=True))
@pass_workspace
def workspace_backup_restore(ctx, choose_first, bak):
    """
    Restore backup BAK
    """
    # ``choose_first`` is passed through to the manager's ``restore``.
    ctx.backup_manager().restore(bak, choose_first)
941
942
943
@workspace_backup_cli.command('undo')
@pass_workspace
def workspace_backup_undo(ctx):
    """
    Restore the last backup
    """
    # Delegate straight to the context's backup manager.
    ctx.backup_manager().undo()
951
952
953
# ----------------------------------------------------------------------
954
# ocrd workspace server
955
# ----------------------------------------------------------------------
956
957
@workspace_cli.group('server')
@pass_workspace
def workspace_serve_cli(ctx):  # pylint: disable=unused-argument
    """Control a METS server for this workspace"""
    # Guard for the whole ``server`` group: a server URL is mandatory here.
    server_url = ctx.mets_server_url
    assert server_url, "For METS server commands, you must provide '-U/--mets-server-url'"
962
963
964
@workspace_serve_cli.command('stop')
@pass_workspace
def workspace_serve_stop(ctx):  # pylint: disable=unused-argument
    """Stop the METS server (saving changes to disk)"""
    # Forward the request to the workspace's METS object.
    ctx.workspace().mets.stop()
970
971
972
@workspace_serve_cli.command('reload')
@pass_workspace
def workspace_serve_reload(ctx):  # pylint: disable=unused-argument
    """Reload the METS server from disk"""
    # Forward the request to the workspace's METS object.
    ctx.workspace().mets.reload()
978
979
980
@workspace_serve_cli.command('save')
@pass_workspace
def workspace_serve_save(ctx):  # pylint: disable=unused-argument
    """Save the METS changes to disk"""
    # Forward the request to the workspace's METS object.
    ctx.workspace().mets.save()
986
987
988
@workspace_serve_cli.command('start')
@pass_workspace
def workspace_serve_start(ctx):  # pylint: disable=unused-argument
    """
    Start a METS server

    (For TCP backend, pass a network interface to bind to as the '-U/--mets-server-url' parameter.)
    """
    # The workspace is constructed directly from the context's resolver,
    # directory and METS basename rather than obtained via ``ctx.workspace()``.
    served_workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)
    server = OcrdMetsServer(workspace=served_workspace, url=ctx.mets_server_url)
    server.startup()
1000