Passed
Push — master ( 008d0f...015064 )
by Konstantin
54s queued 13s
created

ocrd.cli.workspace.clean_id()   A

Complexity

Conditions 2

Size

Total Lines 4
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 4
dl 0
loc 4
rs 10
c 0
b 0
f 0
cc 2
nop 1
1
import os
2
from os.path import relpath, exists, join
3
from pathlib import Path
4
import sys
5
from glob import glob   # XXX pathlib.Path.glob does not support absolute globs
6
import re
7
8
import click
9
10
from ocrd import Resolver, Workspace, WorkspaceValidator, WorkspaceBackupManager
11
from ocrd_models import OcrdFile
12
from ocrd_utils import getLogger, pushd_popd, EXT_TO_MIME
13
14
# Module-level logger for the ``ocrd workspace`` command group
log = getLogger('ocrd.cli.workspace')
15
16
def clean_id(dirty):
    """
    Sanitize a file or directory name so it can be used as an identifier.

    Every run of characters other than ASCII letters, ASCII digits, ``-``
    and ``_`` is collapsed into a single ``_``.

    Args:
        dirty (str): the name to sanitize.

    Returns:
        str: the sanitized name.

    Raises:
        ValueError: if ``dirty`` starts with an ASCII digit.
    """
    # re.match is already anchored at the start of the string
    if re.match(r'[0-9]', dirty):
        raise ValueError("Make sure files and directories do not begin with a numeral which will lead to invalid XML identifiers")
    # The hyphen is escaped so it cannot be misread as the start of a range
    return re.sub(r'[^A-Za-z0-9_\-]+', '_', dirty)
20
21
22
class WorkspaceCtx:
    """
    State shared by the ``ocrd workspace`` subcommands for one invocation.
    """

    def __init__(self, directory, mets_basename, automatic_backup):
        # Resolver instance reused by every subcommand of this invocation
        self.resolver = Resolver()
        # Workspace directory as handed in by the caller
        self.directory = directory
        # File name of the METS file
        self.mets_basename = mets_basename
        # Value of the --backup flag
        self.automatic_backup = automatic_backup
29
30
# Decorator passing the WorkspaceCtx object found on the click context to a command as its first argument
pass_workspace = click.make_pass_decorator(WorkspaceCtx)
31
32
# ----------------------------------------------------------------------
33
# ocrd workspace
34
# ----------------------------------------------------------------------
35
36
@click.group("workspace")
@click.option(
    '-d', '--directory',
    envvar='WORKSPACE_DIR',
    default='.',
    type=click.Path(file_okay=False),
    metavar='WORKSPACE_DIR',
    help='Changes the workspace folder location.',
    show_default=True)
@click.option(
    '-M', '--mets-basename',
    default="mets.xml",
    help='The basename of the METS file.',
    show_default=True)
@click.option(
    '--backup',
    default=False,
    help="Backup mets.xml whenever it is saved.",
    is_flag=True)
@click.pass_context
def workspace_cli(ctx, directory, mets_basename, backup):
    """
    Working with workspace
    """
    # Store the workspace location as an absolute path for all subcommands
    absolute_directory = os.path.abspath(directory)
    ctx.obj = WorkspaceCtx(absolute_directory, mets_basename, automatic_backup=backup)
46
47
# ----------------------------------------------------------------------
48
# ocrd workspace validate
49
# ----------------------------------------------------------------------
50
51
@workspace_cli.command('validate', help='''

    Validate a workspace

''')
@pass_workspace
@click.option('-a', '--download', is_flag=True, help="Download all files")
@click.option('-s', '--skip', help="Tests to skip", default=[], multiple=True, type=click.Choice(['imagefilename', 'dimension', 'mets_unique_identifier', 'mets_file_group_names', 'mets_files', 'pixel_density', 'page', 'url']))
@click.option('--page-textequiv-consistency', '--page-strictness', help="How strict to check PAGE multi-level textequiv consistency", type=click.Choice(['strict', 'lax', 'fix', 'off']), default='strict')
@click.option('--page-coordinate-consistency', help="How fierce to check PAGE multi-level coordinate consistency", type=click.Choice(['poly', 'baseline', 'both', 'off']), default='poly')
@click.argument('mets_url', nargs=-1)
def validate_workspace(ctx, mets_url, download, skip, page_textequiv_consistency, page_coordinate_consistency):
    """
    Validate a workspace, print the report as XML and exit with status 128
    if the report is not valid.

    ``mets_url`` arrives as a tuple because the argument is declared with
    ``nargs=-1``; only its first element is used. When it is empty, the
    METS basename configured on the command group (``-M``) is validated.
    """
    # Fall back to the configured basename instead of a hard-coded 'mets.xml'
    mets_url = mets_url[0] if mets_url else ctx.mets_basename
    report = WorkspaceValidator.validate(
        ctx.resolver,
        mets_url,
        src_dir=ctx.directory,
        skip=skip,
        download=download,
        page_strictness=page_textequiv_consistency,
        page_coordinate_consistency=page_coordinate_consistency
    )
    print(report.to_xml())
    if not report.is_valid:
        sys.exit(128)
79
80
# ----------------------------------------------------------------------
81
# ocrd workspace clone
82
# ----------------------------------------------------------------------
83
84
@workspace_cli.command('clone')
@click.option('-f', '--clobber-mets', help="Overwrite existing METS file", default=False, is_flag=True)
@click.option('-a', '--download', is_flag=True, help="Download all files and change location in METS file after cloning")
@click.argument('mets_url')
@click.argument('workspace_dir', default=None, required=False)
@pass_workspace
def workspace_clone(ctx, clobber_mets, download, mets_url, workspace_dir):
    """
    Create a workspace from a METS_URL and return the directory

    METS_URL can be a URL, an absolute path or a path relative to $PWD.

    If WORKSPACE_DIR is not provided, use the current working directory
    """
    # An omitted (or empty) WORKSPACE_DIR means the current directory
    target_dir = os.path.abspath(workspace_dir or '.')
    workspace = ctx.resolver.workspace_from_url(
        mets_url,
        dst_dir=target_dir,
        mets_basename=ctx.mets_basename,
        clobber_mets=clobber_mets,
        download=download,
    )
    workspace.save_mets()
    print(workspace.directory)
109
110
# ----------------------------------------------------------------------
111
# ocrd workspace init
112
# ----------------------------------------------------------------------
113
114
@workspace_cli.command('init')
@click.option('-f', '--clobber-mets', help="Clobber mets.xml if it exists", is_flag=True, default=False)
@click.argument('directory', required=False)
@pass_workspace
def workspace_create(ctx, clobber_mets, directory):
    """
    Create a workspace with an empty METS file in DIRECTORY.

    Use '.' for $PWD
    """
    # An omitted DIRECTORY means the current directory
    if not directory:
        directory = '.'
    workspace = ctx.resolver.workspace_from_nothing(
        directory=os.path.abspath(directory),
        mets_basename=ctx.mets_basename,
        clobber_mets=clobber_mets
    )
    workspace.save_mets()
    # Print the workspace location
    print(workspace.directory)
133
134
# ----------------------------------------------------------------------
135
# ocrd workspace add
136
# ----------------------------------------------------------------------
137
138
@workspace_cli.command('add')
@click.option('-G', '--file-grp', help="fileGrp USE", required=True)
@click.option('-i', '--file-id', help="ID for the file", required=True)
@click.option('-m', '--mimetype', help="Media type of the file", required=True)
@click.option('-g', '--page-id', help="ID of the physical page")
@click.option('-C', '--check-file-exists', help="Whether to ensure FNAME exists", is_flag=True, default=False)
@click.option('--ignore', help="Do not check whether file exists.", default=False, is_flag=True)
@click.option('--force', help="If file with ID already exists, replace it. No effect if --ignore is set.", default=False, is_flag=True)
@click.argument('fname', required=True)
@pass_workspace
def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, ignore, check_file_exists, force, fname):
    """
    Add a file or http(s) URL FNAME to METS in a workspace.
    If FNAME is not an http(s) URL and is not a workspace-local existing file, try to copy to workspace.
    """
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)

    log = getLogger('ocrd.cli.workspace.add')
    kwargs = {'fileGrp': file_grp, 'ID': file_id, 'mimetype': mimetype, 'pageId': page_id, 'force': force, 'ignore': ignore}

    is_remote = fname.startswith(('http://', 'https://'))
    if not is_remote:
        if not fname.startswith(ctx.directory):
            in_workspace = join(ctx.directory, fname)
            if exists(in_workspace):
                # FNAME was given relative to the workspace directory
                fname = in_workspace
            else:
                log.debug("File '%s' is not in workspace, copying", fname)
                try:
                    fname = ctx.resolver.download_to_directory(ctx.directory, fname, subdir=file_grp)
                except FileNotFoundError:
                    # Only fatal when the caller asked for the existence check
                    if check_file_exists:
                        log.error("File '%s' does not exist, halt execution!" % fname)
                        sys.exit(1)
        if check_file_exists and not exists(fname):
            log.error("File '%s' does not exist, halt execution!" % fname)
            sys.exit(1)
        # Paths under the workspace directory are recorded relative to it
        if fname.startswith(ctx.directory):
            fname = relpath(fname, ctx.directory)
        kwargs['local_filename'] = fname

    kwargs['url'] = fname
    workspace.mets.add_file(**kwargs)
    workspace.save_mets()
178
    workspace.save_mets()
179
180
# ----------------------------------------------------------------------
181
# ocrd workspace bulk-add
182
# ----------------------------------------------------------------------
183
184
# pylint: disable=bad-whitespace, broad-except
185
@workspace_cli.command('bulk-add')
@click.option('-r', '--regex', help="Regular expression matching the FILE_GLOB filesystem paths to define named captures usable in the other parameters", required=True)
@click.option('-m', '--mimetype', help="Media type of the file. If not provided, guess from filename", required=False)
@click.option('-g', '--page-id', help="physical page ID of the file", required=False)
@click.option('-i', '--file-id', help="ID of the file", required=True)
@click.option('-u', '--url', help="local filesystem path in the workspace directory (copied from source file if different)", required=True)
@click.option('-G', '--file-grp', help="File group USE of the file", required=True)
@click.option('-n', '--dry-run', help="Don't actually do anything to the METS or filesystem, just preview", default=False, is_flag=True)
@click.option('-I', '--ignore', help="Disable checking for existing file entries (faster)", default=False, is_flag=True)
@click.option('-f', '--force', help="Replace existing file entries with the same ID (no effect when --ignore is set, too)", default=False, is_flag=True)
@click.option('-s', '--skip', help="Skip files not matching --regex (instead of failing)", default=False, is_flag=True)
@click.argument('file_glob', nargs=-1, required=True)
@pass_workspace
def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, file_grp, dry_run, file_glob, ignore, force, skip):
    r"""
    Add files in bulk to an OCR-D workspace.

    FILE_GLOB can either be a shell glob expression or a list of files.

    --regex is applied to the absolute path of every file in FILE_GLOB and can
    define named groups that can be used in --page-id, --file-id, --mimetype, --url and
    --file-grp by referencing the named group 'grp' in the regex as '{{ grp }}'.

    \b
    Example:
        ocrd workspace bulk-add \\
                --regex '^.*/(?P<fileGrp>[^/]+)/page_(?P<pageid>.*)\.(?P<ext>[^\.]*)$' \\
                --file-id 'FILE_{{ fileGrp }}_{{ pageid }}' \\
                --page-id 'PHYS_{{ pageid }}' \\
                --file-grp "{{ fileGrp }}" \\
                --url '{{ fileGrp }}/FILE_{{ pageid }}.{{ ext }}' \\
                path/to/files/*/*.*

    """
    log = getLogger('ocrd.cli.workspace.bulk-add') # pylint: disable=redefined-outer-name
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)

    try:
        pat = re.compile(regex)
    except Exception as e:
        log.error("Invalid regex: %s", e)
        sys.exit(1)

    # Expand every glob and make the hits absolute, since --regex is matched
    # against absolute paths
    file_paths = [Path(x).resolve() for fglob in file_glob for x in glob(fglob)]

    # 1-based counter so that the last file is reported as N/N
    for i, file_path in enumerate(file_paths, start=1):
        log.info("[%4d/%d] %s", i, len(file_paths), file_path)

        # match regex
        m = pat.match(str(file_path))
        if not m:
            if skip:
                continue
            log.error("File not matched by regex: '%s'", file_path)
            sys.exit(1)
        group_dict = m.groupdict()

        # set up file info
        file_dict = {'url': url, 'mimetype': mimetype, 'ID': file_id, 'pageId': page_id, 'fileGrp': file_grp}

        # guess mime type
        if not file_dict['mimetype']:
            try:
                file_dict['mimetype'] = EXT_TO_MIME[file_path.suffix]
            except KeyError:
                log.error("Cannot guess mimetype from extension '%s' for '%s'. Set --mimetype explicitly", file_path.suffix, file_path)
                # Without a mimetype the entry cannot be added; stop cleanly
                # instead of failing later on a None value
                sys.exit(1)

        # expand templates
        for param_name, template in file_dict.items():
            # Optional parameters that were not given (e.g. --page-id) stay None
            if template is None:
                continue
            for group_name, group_value in group_dict.items():
                # A named group that did not take part in the match yields None
                template = template.replace('{{ %s }}' % group_name, group_value or '')
            file_dict[param_name] = template

        # copy files
        if file_dict['url']:
            urlpath = Path(workspace.directory, file_dict['url'])
            if not urlpath.exists():
                log.info("cp '%s' '%s'", file_path, urlpath)
                if not dry_run:
                    # The target may be nested more than one level deep
                    urlpath.parent.mkdir(parents=True, exist_ok=True)
                    urlpath.write_bytes(file_path.read_bytes())

        # Add to workspace (or not)
        fileGrp = file_dict.pop('fileGrp')
        if dry_run:
            log.info('workspace.add_file(%s)', file_dict)
        else:
            workspace.add_file(fileGrp, ignore=ignore, force=force, **file_dict)

    # save changes to disk; a dry run must leave the METS untouched
    if not dry_run:
        workspace.save_mets()
278
279
280
# ----------------------------------------------------------------------
281
# ocrd workspace find
282
# ----------------------------------------------------------------------
283
284
@workspace_cli.command('find')
@click.option('-G', '--file-grp', help="fileGrp USE", metavar='FILTER')
@click.option('-m', '--mimetype', help="Media type to look for", metavar='FILTER')
@click.option('-g', '--page-id', help="Page ID", metavar='FILTER')
@click.option('-i', '--file-id', help="ID", metavar='FILTER')
# pylint: disable=bad-continuation
@click.option('-k', '--output-field', help="Output field. Repeat for multiple fields, will be joined with tab",
        default=['url'],
        multiple=True,
        type=click.Choice([
            'url',
            'mimetype',
            'pageId',
            'ID',
            'fileGrp',
            'basename',
            'basename_without_extension',
            'local_filename',
        ]))
@click.option('--download', is_flag=True, help="Download found files to workspace and change location in METS file ")
@pass_workspace
def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, download):
    """
    Find files.

    (If any ``FILTER`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    modified_mets = False
    ret = []
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
    for f in workspace.mets.find_files(
            ID=file_id,
            fileGrp=file_grp,
            mimetype=mimetype,
            pageId=page_id,
        ):
        if download and not f.local_filename:
            workspace.download_file(f)
            modified_mets = True
        # 'pageId' columns temporarily hold the file ID; they are resolved to
        # page IDs in a single lookup below
        ret.append([f.ID if field == 'pageId' else getattr(f, field) or ''
                    for field in output_field])
    if modified_mets:
        workspace.save_mets()
    # Every column that asked for 'pageId' (the field may be repeated)
    page_columns = [col for col, field in enumerate(output_field) if field == 'pageId']
    if page_columns:
        file_ids = [fields[page_columns[0]] for fields in ret]
        pages = workspace.mets.get_physical_pages(for_fileIds=file_ids)
        for fields, page in zip(ret, pages):
            for col in page_columns:
                fields[col] = page or ''
    for fields in ret:
        print('\t'.join(fields))
336
337
# ----------------------------------------------------------------------
338
# ocrd workspace remove
339
# ----------------------------------------------------------------------
340
341
@workspace_cli.command('remove')
@click.option('-k', '--keep-file', help="Do not delete file from file system", default=False, is_flag=True)
@click.option('-f', '--force', help="Continue even if mets:file or file on file system does not exist", default=False, is_flag=True)
@click.argument('ID', nargs=-1)
@pass_workspace
def workspace_remove_file(ctx, id, force, keep_file):  # pylint: disable=redefined-builtin
    """
    Delete files (given by their ID attribute ``ID``).

    (If any ``ID`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    # ID is declared with nargs=-1, so ``id`` holds all IDs given on the command line
    for file_id in id:
        workspace.remove_file(file_id, force=force, keep_file=keep_file)
    workspace.save_mets()
357
358
359
# ----------------------------------------------------------------------
360
# ocrd workspace remove-group
361
# ----------------------------------------------------------------------
362
363
@workspace_cli.command('remove-group')
@click.option('-r', '--recursive', help="Delete any files in the group before the group itself", default=False, is_flag=True)
@click.option('-f', '--force', help="Continue removing even if group or containing files not found in METS", default=False, is_flag=True)
@click.option('-k', '--keep-files', help="Do not delete files from file system", default=False, is_flag=True)
@click.argument('GROUP', nargs=-1)
@pass_workspace
def remove_group(ctx, group, recursive, force, keep_files):
    """
    Delete fileGrps (given by their USE attribute ``GROUP``).

    (If any ``GROUP`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    # Honour --backup like the other commands that save the METS
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
    # GROUP is declared with nargs=-1, so ``group`` holds all names given
    for g in group:
        workspace.remove_file_group(g, recursive=recursive, force=force, keep_files=keep_files)
    workspace.save_mets()
380
381
# ----------------------------------------------------------------------
382
# ocrd workspace prune-files
383
# ----------------------------------------------------------------------
384
385
@workspace_cli.command('prune-files')
@click.option('-G', '--file-grp', help="fileGrp USE", metavar='FILTER')
@click.option('-m', '--mimetype', help="Media type to look for", metavar='FILTER')
@click.option('-g', '--page-id', help="Page ID", metavar='FILTER')
@click.option('-i', '--file-id', help="ID", metavar='FILTER')
@pass_workspace
def prune_files(ctx, file_grp, mimetype, page_id, file_id):
    """
    Removes mets:files that point to non-existing local files

    (If any ``FILTER`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    # Honour --backup like the other commands that save the METS
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
    # local_filename is checked from inside the workspace directory
    with pushd_popd(workspace.directory):
        # Collect the matches first so entries are not removed from the METS
        # while the search result is still being iterated
        candidates = list(workspace.mets.find_files(
            ID=file_id,
            fileGrp=file_grp,
            mimetype=mimetype,
            pageId=page_id,
        ))
        for f in candidates:
            try:
                if not f.local_filename or not exists(f.local_filename):
                    workspace.mets.remove_file(f.ID)
            except Exception as e:
                # %s, not %f: ``f`` is a file object, not a float
                log.exception("Error removing %s: %s", f, e)
                raise
        workspace.save_mets()
413
414
# ----------------------------------------------------------------------
415
# ocrd workspace list-group
416
# ----------------------------------------------------------------------
417
418
@workspace_cli.command('list-group', help="""

    List fileGrp USE attributes

""")
@pass_workspace
def list_groups(ctx):
    """
    Print the file groups of the workspace METS, one per line.
    """
    # Pass the configured METS basename so that -M/--mets-basename is honoured
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)
    print("\n".join(workspace.mets.file_groups))
427
428
# ----------------------------------------------------------------------
429
# ocrd workspace list-page
430
# ----------------------------------------------------------------------
431
432
@workspace_cli.command('list-page', help="""

    List page IDs

""")
@pass_workspace
def list_pages(ctx):
    """
    Print the physical pages of the workspace METS, one per line.
    """
    # Pass the configured METS basename so that -M/--mets-basename is honoured
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)
    print("\n".join(workspace.mets.physical_pages))
441
442
# ----------------------------------------------------------------------
443
# ocrd workspace get-id
444
# ----------------------------------------------------------------------
445
446
@workspace_cli.command('get-id', help="""

    Get METS id if any

""")
@pass_workspace
def get_id(ctx):
    """
    Print the unique identifier of the workspace METS; print nothing if it
    is empty or unset.
    """
    # Pass the configured METS basename so that -M/--mets-basename is honoured
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)
    ID = workspace.mets.unique_identifier
    if ID:
        print(ID)
457
458
# ----------------------------------------------------------------------
459
# ocrd workspace set-id
460
# ----------------------------------------------------------------------
461
462
@workspace_cli.command('set-id', help="""

    Set METS ID.

    If one of the supported identifier mechanisms is used, will set this identifier.

    Otherwise will create a new <mods:identifier type="purl">{{ ID }}</mods:identifier>.
""")
@click.argument('ID')
@pass_workspace
def set_id(ctx, id):   # pylint: disable=redefined-builtin
    """
    Assign ``id`` as the unique identifier of the workspace METS and save it.
    """
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    workspace.mets.unique_identifier = id
    workspace.save_mets()
476
477
# ----------------------------------------------------------------------
478
# ocrd workspace backup
479
# ----------------------------------------------------------------------
480
481
@workspace_cli.group('backup')
@click.pass_context
def workspace_backup_cli(ctx): # pylint: disable=unused-argument
    """
    Backing and restoring workspaces - dev edition
    """
    # Nothing to do here: this group only namespaces the backup subcommands
487
488
@workspace_backup_cli.command('add')
@pass_workspace
def workspace_backup_add(ctx):
    """
    Create a new backup
    """
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    WorkspaceBackupManager(workspace).add()
496
497
@workspace_backup_cli.command('list')
@pass_workspace
def workspace_backup_list(ctx):
    """
    List backups
    """
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    # One backup per output line
    for backup in WorkspaceBackupManager(workspace).list():
        print(backup)
506
507
@workspace_backup_cli.command('restore')
@click.option('-f', '--choose-first', help="Restore first matching version if more than one", is_flag=True)
@click.argument('bak') #, type=click.Path(dir_okay=False, readable=True, resolve_path=True))
@pass_workspace
def workspace_backup_restore(ctx, choose_first, bak):
    """
    Restore backup BAK
    """
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    WorkspaceBackupManager(workspace).restore(bak, choose_first)
517
518
@workspace_backup_cli.command('undo')
@pass_workspace
def workspace_backup_undo(ctx):
    """
    Restore the last backup
    """
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    WorkspaceBackupManager(workspace).undo()
526