Passed
Push — issue631-rest-api-language-det... (34c253...1cd800)
by Osma
04:27
created

annif.cli    Rating: D

Complexity

Total Complexity 58

Size/Duplication

Total Lines 740
Duplicated Lines 0%

Importance

Changes 0
Metric  Value
eloc    485
dl      0
loc     740
rs      4.5599
c       0
b       0
f       0
wmc     58

16 Functions

Rating  Name                 Duplication  Size  Complexity
A       run_train()          0            42    3
A       run_load_vocab()     0            36    4
B       run_suggest()        0            42    6
A       run_learn()          0            20    1
A       run_list_vocabs()    0            28    4
A       run_list_projects()  0            39    3
A       run_clear_project()  0            9     1
A       run_show_project()   0            18    1
B       run_index()          0            43    6
C       run_eval()           0            108   9
A       run_hyperopt()       0            36    2
A       run_completion()     0            16    2
C       run_optimize()       0            96    8
A       run_download()       0            58    3
A       run_app()            0            14    1
A       run_upload()         0            56    4
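
The Complexity column is the cyclomatic complexity of each function, and the module total of 58 (wmc above) is the sum of these per-function values. As a rough local cross-check, similar per-function numbers can be computed with the radon library; this report may come from a different tool, and its letter ratings appear to factor in size as well, so the ranks will not map one-to-one.

# Sketch: per-function cyclomatic complexity for annif/cli.py using radon.
# Assumes radon is installed (pip install radon) and the file is available
# locally; the report above may use another tool, so values need not match.

from radon.complexity import cc_rank, cc_visit

with open("annif/cli.py", encoding="utf-8") as src:
    code = src.read()

blocks = cc_visit(code)  # one entry per function/method/class in the file
total = sum(block.complexity for block in blocks)

for block in sorted(blocks, key=lambda b: b.complexity, reverse=True):
    print(f"{cc_rank(block.complexity)}  {block.name:<20} complexity {block.complexity}")

print(f"Total complexity: {total}")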

How to fix: Complexity

Complex modules like annif.cli often do a lot of different things. To break such a module down, we need to identify a cohesive component within it. A common approach to finding such a component is to look for functions or methods that share the same prefix or suffix.

Once you have determined the functions that belong together, you can apply the Extract Class refactoring; for a module of command functions like this one, moving the group into a separate module achieves the same goal. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often faster.
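
In annif.cli the cohesive components are groups of related commands rather than fields: run_upload and run_download both deal with Hugging Face Hub repositories, while run_eval, run_optimize and run_hyperopt share the evaluation set-up. The sketch below shows only the mechanics of such a split with Click; the module layout and command bodies are hypothetical placeholders, not Annif's actual code.

# Hypothetical split of a large Click CLI module by command group.
# Only the click API usage is real; names and bodies are placeholders.

import click


# --- would live in a separate module, e.g. hub_commands.py ---

@click.command("upload")
@click.argument("project_ids_pattern")
@click.argument("repo_id")
def run_upload(project_ids_pattern, repo_id):
    """Placeholder: upload projects matching the pattern to a Hub repository."""
    click.echo(f"Would upload {project_ids_pattern} to {repo_id}")


@click.command("download")
@click.argument("project_ids_pattern")
@click.argument("repo_id")
def run_download(project_ids_pattern, repo_id):
    """Placeholder: download projects matching the pattern from a Hub repository."""
    click.echo(f"Would download {project_ids_pattern} from {repo_id}")


# --- stays in cli.py: the main group only wires the commands together ---

@click.group()
def cli():
    """Top-level command group; command implementations live in their own modules."""


for command in (run_upload, run_download):
    cli.add_command(command)


if __name__ == "__main__":
    cli()

Each extracted module then carries its own share of the complexity, and the main module shrinks to wiring. For reference, the full annif.cli source under review follows.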

"""Definitions for command-line (Click) commands for invoking Annif
operations and printing the results to console."""

import collections
import importlib
import json
import os.path
import re
import sys

import click
import click_log
from flask.cli import FlaskGroup

import annif
import annif.corpus
import annif.parallel
import annif.project
import annif.registry
from annif import cli_util, hfh_util
from annif.exception import (
    NotInitializedException,
    NotSupportedException,
    OperationFailedException,
)
from annif.project import Access
from annif.util import metric_code

logger = annif.logger
click_log.basic_config(logger)

create_app = annif.create_flask_app
cli = FlaskGroup(
    create_app=create_app, add_default_commands=False, add_version_option=False
)
cli = click.version_option(message="%(version)s")(cli)
cli.params = [opt for opt in cli.params if opt.name not in ("env_file", "app")]


@cli.command("list-projects")
@cli_util.common_options
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_list_projects():
    """
    List available projects.
    \f
    Show a list of currently defined projects. Projects are defined in a
    configuration file, normally called ``projects.cfg``. See `Project
    configuration
    <https://github.com/NatLibFi/Annif/wiki/Project-configuration>`_
    for details.
    """

    column_headings = (
        "Project ID",
        "Project Name",
        "Vocabulary ID",
        "Language",
        "Trained",
        "Modification time",
    )
    table = [
        (
            proj.project_id,
            proj.name,
            proj.vocab.vocab_id if proj.vocab_spec else "-",
            proj.language,
            str(proj.is_trained),
            cli_util.format_datetime(proj.modification_time),
        )
        for proj in annif.registry.get_projects(min_access=Access.private).values()
    ]
    template = cli_util.make_list_template(column_headings, *table)
    header = template.format(*column_headings)
    click.echo(header)
    click.echo("-" * len(header))
    for row in table:
        click.echo(template.format(*row))


@cli.command("show-project")
@cli_util.project_id
@cli_util.common_options
def run_show_project(project_id):
    """
    Show information about a project.
    """

    proj = cli_util.get_project(project_id)
    click.echo(f"Project ID:        {proj.project_id}")
    click.echo(f"Project Name:      {proj.name}")
    click.echo(f"Language:          {proj.language}")
    click.echo(f"Vocabulary:        {proj.vocab.vocab_id}")
    click.echo(f"Vocab language:    {proj.vocab_lang}")
    click.echo(f"Access:            {proj.access.name}")
    click.echo(f"Backend:           {proj.backend.name}")
    click.echo(f"Trained:           {proj.is_trained}")
    click.echo(f"Modification time: {cli_util.format_datetime(proj.modification_time)}")


@cli.command("clear")
@cli_util.project_id
@cli_util.common_options
def run_clear_project(project_id):
    """
    Initialize the project to its original, untrained state.
    """
    proj = cli_util.get_project(project_id)
    proj.remove_model_data()


@cli.command("list-vocabs")
@cli_util.common_options
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_list_vocabs():
    """
    List available vocabularies.
    """

    column_headings = ("Vocabulary ID", "Languages", "Size", "Loaded")
    table = []
    for vocab in annif.registry.get_vocabs(min_access=Access.private).values():
        try:
            languages = ",".join(sorted(vocab.languages))
            size = len(vocab)
            loaded = True
        except NotInitializedException:
            languages = "-"
            size = "-"
            loaded = False
        row = (vocab.vocab_id, languages, str(size), str(loaded))
        table.append(row)

    template = cli_util.make_list_template(column_headings, *table)
    header = template.format(*column_headings)
    click.echo(header)
    click.echo("-" * len(header))
    for row in table:
        click.echo(template.format(*row))


@cli.command("load-vocab")
@click.argument("vocab_id", shell_complete=cli_util.complete_param)
@click.argument("subjectfile", type=click.Path(exists=True, dir_okay=False))
@click.option("--language", "-L", help="Language of subject file")
@click.option(
    "--force",
    "-f",
    default=False,
    is_flag=True,
    help="Replace existing vocabulary completely instead of updating it",
)
@cli_util.common_options
def run_load_vocab(vocab_id, language, force, subjectfile):
    """
    Load a vocabulary from a subject file.
    """
    vocab = cli_util.get_vocab(vocab_id)
    if annif.corpus.SubjectFileSKOS.is_rdf_file(subjectfile):
        # SKOS/RDF file supported by rdflib
        subjects = annif.corpus.SubjectFileSKOS(subjectfile)
        click.echo(f"Loading vocabulary from SKOS file {subjectfile}...")
    elif annif.corpus.SubjectFileCSV.is_csv_file(subjectfile):
        # CSV file
        subjects = annif.corpus.SubjectFileCSV(subjectfile)
        click.echo(f"Loading vocabulary from CSV file {subjectfile}...")
    else:
        # probably a TSV file - we need to know its language
        if not language:
            click.echo(
                "Please use --language option to set the language of a TSV vocabulary.",
                err=True,
            )
            sys.exit(1)
        click.echo(f"Loading vocabulary from TSV file {subjectfile}...")
        subjects = annif.corpus.SubjectFileTSV(subjectfile, language)
    vocab.load_vocabulary(subjects, force=force)


@cli.command("train")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--cached/--no-cached",
    "-c/-C",
    default=False,
    help="Reuse preprocessed training data from previous run",
)
@click.option(
    "--jobs",
    "-j",
    default=0,
    help="Number of parallel jobs (0 means choose automatically)",
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_train(project_id, paths, cached, docs_limit, jobs, backend_param):
    """
    Train a project on a collection of documents.
    \f
    This will train the project using the documents from ``PATHS`` (directories
    or possibly gzipped TSV files) in a single batch operation. If ``--cached``
    is set, preprocessed training data from the previous run is reused instead
    of documents input; see `Reusing preprocessed training data
    <https://github.com/NatLibFi/Annif/wiki/
    Reusing-preprocessed-training-data>`_.
    """
    proj = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, proj)
    if cached:
        if len(paths) > 0:
            raise click.UsageError(
                "Corpus paths cannot be given when using --cached option."
            )
        documents = "cached"
    else:
        documents = cli_util.open_documents(
            paths, proj.subjects, proj.vocab_lang, docs_limit
        )
    proj.train(documents, backend_params, jobs)


@cli.command("learn")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_learn(project_id, paths, docs_limit, backend_param):
    """
    Further train an existing project on a collection of documents.
    \f
    Similar to the ``train`` command. This will continue training an already
    trained project using the documents given by ``PATHS`` in a single batch
    operation. Not supported by all backends.
    """
    proj = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, proj)
    documents = cli_util.open_documents(
        paths, proj.subjects, proj.vocab_lang, docs_limit
    )
    proj.learn(documents, backend_params)


@cli.command("suggest")
@cli_util.project_id
@click.argument(
    "paths", type=click.Path(dir_okay=False, exists=True, allow_dash=True), nargs=-1
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_suggest(
    project_id, paths, limit, threshold, language, backend_param, docs_limit
):
    """
    Suggest subjects for a single document from standard input or for one or more
    document file(s) given its/their path(s).
    \f
    This will read a text document from standard input and suggest subjects for
    it, or if given path(s) to file(s), suggest subjects for it/them.
    """
    project = cli_util.get_project(project_id)
    lang = language or project.vocab_lang
    if lang not in project.vocab.languages:
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
    backend_params = cli_util.parse_backend_params(backend_param, project)

    if paths and not (len(paths) == 1 and paths[0] == "-"):
        docs = cli_util.open_text_documents(paths, docs_limit)
        results = project.suggest_corpus(docs, backend_params).filter(limit, threshold)
        for (
            suggestions,
            path,
        ) in zip(results, paths):
            click.echo(f"Suggestions for {path}")
            cli_util.show_hits(suggestions, project, lang)
    else:
        text = sys.stdin.read()
        suggestions = project.suggest([text], backend_params).filter(limit, threshold)[
            0
        ]
        cli_util.show_hits(suggestions, project, lang)


@cli.command("index")
@cli_util.project_id
@click.argument("directory", type=click.Path(exists=True, file_okay=False))
@click.option(
    "--suffix", "-s", default=".annif", help="File name suffix for result files"
)
@click.option(
    "--force/--no-force",
    "-f/-F",
    default=False,
    help="Force overwriting of existing result files",
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@cli_util.backend_param_option
@cli_util.common_options
def run_index(
    project_id, directory, suffix, force, limit, threshold, language, backend_param
):
    """
    Index a directory with documents, suggesting subjects for each document.
    Write the results in TSV files with the given suffix (``.annif`` by
    default).
    """
    project = cli_util.get_project(project_id)
    lang = language or project.vocab_lang
    if lang not in project.vocab.languages:
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
    backend_params = cli_util.parse_backend_params(backend_param, project)

    documents = annif.corpus.DocumentDirectory(directory, require_subjects=False)
    results = project.suggest_corpus(documents, backend_params).filter(limit, threshold)

    for (docfilename, _), suggestions in zip(documents, results):
        subjectfilename = re.sub(r"\.txt$", suffix, docfilename)
        if os.path.exists(subjectfilename) and not force:
            click.echo(
                "Not overwriting {} (use --force to override)".format(subjectfilename)
            )
            continue
        with open(subjectfilename, "w", encoding="utf-8") as subjfile:
            cli_util.show_hits(suggestions, project, lang, file=subjfile)


@cli.command("eval")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option(
    "--metric",
    "-m",
    default=[],
    multiple=True,
    help="Metric to calculate (default: all)",
)
@click.option(
    "--metrics-file",
    "-M",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write evaluation metrics in JSON format.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write non-aggregated results per subject.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_eval(
    project_id,
    paths,
    limit,
    threshold,
    docs_limit,
    metric,
    metrics_file,
    results_file,
    jobs,
    backend_param,
):
    """
    Suggest subjects for documents and evaluate the results by comparing
    against a gold standard.
    \f
    With this command the documents from ``PATHS`` (directories or possibly
    gzipped TSV files) will be assigned subject suggestions and then
    statistical measures are calculated that quantify how well the suggested
    subjects match the gold-standard subjects in the documents.

    Normally the output is the list of the metrics calculated across documents.
    If ``--results-file <FILENAME>`` option is given, the metrics are
    calculated separately for each subject, and written to the given file.
    """

    project = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, project)

    import annif.eval

    eval_batch = annif.eval.EvaluationBatch(project.subjects)

    if results_file:
        try:
            print("", end="", file=results_file)
            click.echo(
                "Writing per subject evaluation results to {!s}".format(
                    results_file.name
                )
            )
        except Exception as e:
            raise NotSupportedException(
                "cannot open results-file for writing: " + str(e)
            )
    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )
    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry, [project_id], backend_params, limit, threshold
    )

    with pool_class(jobs) as pool:
        for hit_sets, subject_sets in pool.imap_unordered(
            psmap.suggest_batch, corpus.doc_batches
        ):
            eval_batch.evaluate_many(hit_sets[project_id], subject_sets)

    template = "{0:<30}\t{1:{fmt_spec}}"
    metrics = eval_batch.results(
        metrics=metric, results_file=results_file, language=project.vocab_lang
    )
    for metric, score in metrics.items():
        if isinstance(score, int):
            fmt_spec = "d"
        elif isinstance(score, float):
            fmt_spec = ".04f"
        click.echo(template.format(metric + ":", score, fmt_spec=fmt_spec))
    if metrics_file:
        json.dump(
            {metric_code(mname): val for mname, val in metrics.items()},
            metrics_file,
            indent=2,
        )


@cli.command("run")
@click.option("--host", type=str, default="127.0.0.1")
@click.option("--port", type=int, default=5000)
@click.option("--log-level")
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_app(**kwargs):
    """
    Run Annif in server mode for development.
    \f
    The server is for development purposes only.
    """
    kwargs = {k: v for k, v in kwargs.items() if v is not None}
    cxapp = annif.create_cx_app()
    cxapp.run(**kwargs)


FILTER_BATCH_MAX_LIMIT = 15
OPTIMIZE_METRICS = ["Precision (doc avg)", "Recall (doc avg)", "F1 score (doc avg)"]


@cli.command("optimize")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_optimize(project_id, paths, jobs, docs_limit, backend_param):
    """
    Suggest subjects for documents, testing multiple limits and thresholds.
    \f
    This command will use different limit (maximum number of subjects) and
    score threshold values when assigning subjects to each document given by
    ``PATHS`` and compare the results against the gold standard subjects in the
    documents. The output is a list of parameter combinations and their scores.
    From the output, you can determine the optimum limit and threshold
    parameters depending on which measure you want to target.
    """
    project = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, project)
    filter_params = cli_util.generate_filter_params(FILTER_BATCH_MAX_LIMIT)

    import annif.eval

    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )

    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry,
        [project_id],
        backend_params,
        limit=FILTER_BATCH_MAX_LIMIT,
        threshold=0.0,
    )

    ndocs = 0
    suggestion_batches = []
    subject_set_batches = []
    with pool_class(jobs) as pool:
        for suggestion_batch, subject_sets in pool.imap_unordered(
            psmap.suggest_batch, corpus.doc_batches
        ):
            ndocs += len(suggestion_batch[project_id])
            suggestion_batches.append(suggestion_batch[project_id])
            subject_set_batches.append(subject_sets)

    from annif.suggestion import SuggestionResults

    orig_suggestion_results = SuggestionResults(suggestion_batches)

    click.echo("\t".join(("Limit", "Thresh.", "Prec.", "Rec.", "F1")))

    best_scores = collections.defaultdict(float)
    best_params = {}

    template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"
    import annif.eval

    for limit, threshold in filter_params:
        eval_batch = annif.eval.EvaluationBatch(project.subjects)
        filtered_results = orig_suggestion_results.filter(limit, threshold)
        for batch, subject_sets in zip(filtered_results.batches, subject_set_batches):
            eval_batch.evaluate_many(batch, subject_sets)
        results = eval_batch.results(metrics=OPTIMIZE_METRICS)
        for metric, score in results.items():
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = (limit, threshold)
        click.echo(
            template.format(
                limit,
                threshold,
                results["Precision (doc avg)"],
                results["Recall (doc avg)"],
                results["F1 score (doc avg)"],
            )
        )

    click.echo()
    template2 = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    for metric in OPTIMIZE_METRICS:
        click.echo(
            template2.format(
                metric,
                best_scores[metric],
                best_params[metric][0],
                best_params[metric][1],
            )
        )
    click.echo("Documents evaluated:\t{}".format(ndocs))


@cli.command("hyperopt")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--trials", "-T", default=10, help="Number of trials")
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel runs (0 means all CPUs)"
)
@click.option(
    "--metric", "-m", default="NDCG", help="Metric to optimize (default: NDCG)"
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file path to write trial results as CSV.
    File directory must exist, existing file will be overwritten.""",
)
@cli_util.docs_limit_option
@cli_util.common_options
def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric, results_file):
    """
    Optimize the hyperparameters of a project using validation documents from
    ``PATHS``. Not supported by all backends. Output is a list of trial results
    and a report of the best performing parameters.
    """
    proj = cli_util.get_project(project_id)
    documents = cli_util.open_documents(
        paths, proj.subjects, proj.vocab_lang, docs_limit
    )
    click.echo(f"Looking for optimal hyperparameters using {trials} trials")
    rec = proj.hyperopt(documents, trials, jobs, metric, results_file)
    click.echo(f"Got best {metric} score {rec.score:.4f} with:")
    click.echo("---")
    for line in rec.lines:
        click.echo(line)
    click.echo("---")


@cli.command("upload")
@click.argument("project_ids_pattern", shell_complete=cli_util.complete_param)
@click.argument("repo_id")
@click.option(
    "--token",
    help="""Authentication token, obtained from the Hugging Face Hub.
    Will default to the stored token.""",
)
@click.option(
    "--revision",
    help="""An optional git revision to commit from. Defaults to the head of the "main"
    branch.""",
)
@click.option(
    "--commit-message",
    help="""The summary / title / first line of the generated commit.""",
)
@cli_util.common_options
def run_upload(project_ids_pattern, repo_id, token, revision, commit_message):
    """
    Upload selected projects and their vocabularies to a Hugging Face Hub repository.
    \f
    This command zips the project directories and vocabularies of the projects
    that match the given `project_ids_pattern` to archive files, and uploads the
    archives along with the project configurations to the specified Hugging Face
    Hub repository. An authentication token and commit message can be given with
    options.
    """
    from huggingface_hub import HfApi
    from huggingface_hub.utils import HfHubHTTPError, HFValidationError

    projects = hfh_util.get_matching_projects(project_ids_pattern)
    click.echo(f"Uploading project(s): {', '.join([p.project_id for p in projects])}")

    commit_message = (
        commit_message
        if commit_message is not None
        else f"Upload project(s) {project_ids_pattern} with Annif"
    )

    fobjs, operations = [], []
    try:
        fobjs, operations = hfh_util.prepare_commits(projects, repo_id)
        api = HfApi()
        api.create_commit(
            repo_id=repo_id,
            operations=operations,
            commit_message=commit_message,
            revision=revision,
            token=token,
        )
    except (HfHubHTTPError, HFValidationError) as err:
        raise OperationFailedException(str(err))
    finally:
        for fobj in fobjs:
            fobj.close()


@cli.command("download")
@click.argument("project_ids_pattern")
@click.argument("repo_id")
@click.option(
    "--token",
    help="""Authentication token, obtained from the Hugging Face Hub.
    Will default to the stored token.""",
)
@click.option(
    "--revision",
    help="""
    An optional Git revision id which can be a branch name, a tag, or a commit
    hash.
    """,
)
@click.option(
    "--force",
    "-f",
    default=False,
    is_flag=True,
    help="Replace an existing project/vocabulary/config with the downloaded one",
)
@cli_util.common_options
def run_download(project_ids_pattern, repo_id, token, revision, force):
    """
    Download selected projects and their vocabularies from a Hugging Face Hub
    repository.
    \f
    This command downloads the project and vocabulary archives and the
    configuration files of the projects that match the given
    `project_ids_pattern` from the specified Hugging Face Hub repository and
    unzips the archives to `data/` directory and places the configuration files
    to `projects.d/` directory. An authentication token and revision can
    be given with options.
    """

    project_ids = hfh_util.get_matching_project_ids_from_hf_hub(
        project_ids_pattern, repo_id, token, revision
    )
    click.echo(f"Downloading project(s): {', '.join(project_ids)}")

    vocab_ids = set()
    for project_id in project_ids:
        project_zip_cache_path = hfh_util.download_from_hf_hub(
            f"projects/{project_id}.zip", repo_id, token, revision
        )
        hfh_util.unzip_archive(project_zip_cache_path, force)
        config_file_cache_path = hfh_util.download_from_hf_hub(
            f"{project_id}.cfg", repo_id, token, revision
        )
        vocab_ids.add(hfh_util.get_vocab_id_from_config(config_file_cache_path))
        hfh_util.copy_project_config(config_file_cache_path, force)

    for vocab_id in vocab_ids:
        vocab_zip_cache_path = hfh_util.download_from_hf_hub(
            f"vocabs/{vocab_id}.zip", repo_id, token, revision
        )
        hfh_util.unzip_archive(vocab_zip_cache_path, force)


@cli.command("completion")
@click.option("--bash", "shell", flag_value="bash")
@click.option("--zsh", "shell", flag_value="zsh")
@click.option("--fish", "shell", flag_value="fish")
def run_completion(shell):
    """Generate the script for tab-key autocompletion for the given shell. To enable the
    completion support in your current bash terminal session run\n
        source <(annif completion --bash)
    """

    if shell is None:
        raise click.UsageError("Shell not given, try --bash, --zsh or --fish")

    script = os.popen(f"_ANNIF_COMPLETE={shell}_source annif").read()
    click.echo(f"# Generated by Annif {importlib.metadata.version('annif')}")
    click.echo(script)


if __name__ == "__main__":
    cli()