annif.cli    Rating: F

Complexity

Total Complexity 68

Size/Duplication

Total Lines 804
Duplicated Lines 0 %

Importance

Changes 0

Metric  Value
eloc    521
dl      0
loc     804
rs      2.96
c       0
b       0
f       0
wmc     68

17 Functions

Rating  Name                    Duplication  Size  Complexity
A       run_clear_project()     0            9     1
A       run_train()             0            42    3
A       run_load_vocab()        0            36    4
B       run_suggest()           0            42    6
A       run_learn()             0            20    1
A       run_list_vocabs()       0            28    4
A       run_list_projects()     0            39    3
B       run_index()             0            43    6
C       run_eval()              0            108   9
A       run_show_project()      0            18    1
A       run_hyperopt()          0            36    2
C       run_optimize()          0            96    8
A       run_app()               0            14    1
B       run_download()          0            68    3
B       run_upload()            0            68    6
A       run_completion()        0            24    2
B       run_detect_language()   0            31    8

How to fix

Complexity

Complex classes like annif.cli often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often faster.
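In annif.cli the complexity lives in module-level Click commands rather than class fields, so the same idea applies at module granularity: commands that share a prefix or a dependency can be moved into their own module and re-registered on the main group. For example, run_upload and run_download are the only commands that need the Hugging Face Hub logic in hfh_util. A minimal sketch of that direction (the module name annif/hfh_cli.py and the trimmed-down option list are illustrative, not part of the current codebase):

# annif/hfh_cli.py (hypothetical): Hugging Face Hub commands extracted from
# annif.cli so that Hub-specific logic and imports live in one place.
import click

from annif import cli_util, hfh_util


@click.command("upload")
@click.argument("project_ids_pattern", shell_complete=cli_util.complete_param)
@click.argument("repo_id")
@click.option("--token", help="Authentication token from the Hugging Face Hub")
@cli_util.common_options
def run_upload(project_ids_pattern, repo_id, token):
    """Upload selected projects to a Hugging Face Hub repository."""
    projects = hfh_util.get_matching_projects(project_ids_pattern)
    click.echo(f"Uploading project(s): {', '.join(p.project_id for p in projects)}")
    # ... prepare and create the commit as the current run_upload does ...


# annif/cli.py would then only register the extracted commands, e.g.:
#     from annif import hfh_cli
#     cli.add_command(hfh_cli.run_upload)
#     cli.add_command(hfh_cli.run_download)

Because the commands are re-registered on the same Click group, the CLI surface stays unchanged while the Hub-specific complexity (and the huggingface_hub imports) moves out of annif.cli. The analyzed source of annif.cli follows.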

"""Definitions for command-line (Click) commands for invoking Annif
operations and printing the results to console."""

import collections
import importlib
import json
import os.path
import re
import sys

import click
import click_log
from flask.cli import FlaskGroup

import annif
import annif.corpus
import annif.parallel
import annif.project
import annif.registry
from annif import cli_util, hfh_util
from annif.exception import (
    NotInitializedException,
    NotSupportedException,
    OperationFailedException,
)
from annif.project import Access
from annif.simplemma_util import detect_language
from annif.util import metric_code

logger = annif.logger
click_log.basic_config(logger)

create_app = annif.create_flask_app
cli = FlaskGroup(
    create_app=create_app, add_default_commands=False, add_version_option=False
)
cli = click.version_option(message="%(version)s")(cli)
cli.params = [opt for opt in cli.params if opt.name not in ("env_file", "app")]

@cli.command("list-projects")
@cli_util.common_options
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_list_projects():
    """
    List available projects.
    \f
    Show a list of currently defined projects. Projects are defined in a
    configuration file, normally called ``projects.cfg``. See `Project
    configuration
    <https://github.com/NatLibFi/Annif/wiki/Project-configuration>`_
    for details.
    """

    column_headings = (
        "Project ID",
        "Project Name",
        "Vocabulary ID",
        "Language",
        "Trained",
        "Modification time",
    )
    table = [
        (
            proj.project_id,
            proj.name,
            proj.vocab.vocab_id if proj.vocab_spec else "-",
            proj.language,
            str(proj.is_trained),
            cli_util.format_datetime(proj.modification_time),
        )
        for proj in annif.registry.get_projects(min_access=Access.private).values()
    ]
    template = cli_util.make_list_template(column_headings, *table)
    header = template.format(*column_headings)
    click.echo(header)
    click.echo("-" * len(header))
    for row in table:
        click.echo(template.format(*row))

@cli.command("show-project")
@cli_util.project_id
@cli_util.common_options
def run_show_project(project_id):
    """
    Show information about a project.
    """

    proj = cli_util.get_project(project_id)
    click.echo(f"Project ID:        {proj.project_id}")
    click.echo(f"Project Name:      {proj.name}")
    click.echo(f"Language:          {proj.language}")
    click.echo(f"Vocabulary:        {proj.vocab.vocab_id}")
    click.echo(f"Vocab language:    {proj.vocab_lang}")
    click.echo(f"Access:            {proj.access.name}")
    click.echo(f"Backend:           {proj.backend.name}")
    click.echo(f"Trained:           {proj.is_trained}")
    click.echo(f"Modification time: {cli_util.format_datetime(proj.modification_time)}")

@cli.command("clear")
@cli_util.project_id
@cli_util.common_options
def run_clear_project(project_id):
    """
    Initialize the project to its original, untrained state.
    """
    proj = cli_util.get_project(project_id)
    proj.remove_model_data()

@cli.command("list-vocabs")
@cli_util.common_options
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_list_vocabs():
    """
    List available vocabularies.
    """

    column_headings = ("Vocabulary ID", "Languages", "Size", "Loaded")
    table = []
    for vocab in annif.registry.get_vocabs(min_access=Access.private).values():
        try:
            languages = ",".join(sorted(vocab.languages))
            size = len(vocab)
            loaded = True
        except NotInitializedException:
            languages = "-"
            size = "-"
            loaded = False
        row = (vocab.vocab_id, languages, str(size), str(loaded))
        table.append(row)

    template = cli_util.make_list_template(column_headings, *table)
    header = template.format(*column_headings)
    click.echo(header)
    click.echo("-" * len(header))
    for row in table:
        click.echo(template.format(*row))

@cli.command("load-vocab")
@click.argument("vocab_id", shell_complete=cli_util.complete_param)
@click.argument("vocab_file", type=click.Path(exists=True, dir_okay=False))
@click.option("--language", "-L", help="Language of TSV vocabulary file")
@click.option(
    "--force",
    "-f",
    default=False,
    is_flag=True,
    help="Replace existing vocabulary completely instead of updating it",
)
@cli_util.common_options
def run_load_vocab(vocab_id, language, force, vocab_file):
    """
    Load a vocabulary from a subject file.
    """
    vocab = cli_util.get_vocab(vocab_id)
    if annif.vocab.VocabFileSKOS.is_rdf_file(vocab_file):
        # SKOS/RDF file supported by rdflib
        vocab_file = annif.vocab.VocabFileSKOS(vocab_file)
        click.echo(f"Loading vocabulary from SKOS file {vocab_file}...")
    elif annif.vocab.VocabFileCSV.is_csv_file(vocab_file):
        # CSV file
        vocab_file = annif.vocab.VocabFileCSV(vocab_file)
        click.echo(f"Loading vocabulary from CSV file {vocab_file}...")
    else:
        # probably a TSV file - we need to know its language
        if not language:
            click.echo(
                "Please use --language option to set the language of a TSV vocabulary.",
                err=True,
            )
            sys.exit(1)
        click.echo(f"Loading vocabulary from TSV file {vocab_file}...")
        vocab_file = annif.vocab.VocabFileTSV(vocab_file, language)
    vocab.load_vocabulary(vocab_file, force=force)

@cli.command("train")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--cached/--no-cached",
    "-c/-C",
    default=False,
    help="Reuse preprocessed training data from previous run",
)
@click.option(
    "--jobs",
    "-j",
    default=0,
    help="Number of parallel jobs (0 means choose automatically)",
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_train(project_id, paths, cached, docs_limit, jobs, backend_param):
    """
    Train a project on a collection of documents.
    \f
    This will train the project using the documents from ``PATHS`` (directories
    or possibly gzipped TSV files) in a single batch operation. If ``--cached``
    is set, preprocessed training data from the previous run is reused instead
    of documents input; see `Reusing preprocessed training data
    <https://github.com/NatLibFi/Annif/wiki/
    Reusing-preprocessed-training-data>`_.
    """
    proj = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, proj)
    if cached:
        if len(paths) > 0:
            raise click.UsageError(
                "Corpus paths cannot be given when using --cached option."
            )
        documents = "cached"
    else:
        documents = cli_util.open_documents(
            paths, proj.subjects, proj.vocab_lang, docs_limit
        )
    proj.train(documents, backend_params, jobs)

@cli.command("learn")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_learn(project_id, paths, docs_limit, backend_param):
    """
    Further train an existing project on a collection of documents.
    \f
    Similar to the ``train`` command. This will continue training an already
    trained project using the documents given by ``PATHS`` in a single batch
    operation. Not supported by all backends.
    """
    proj = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, proj)
    documents = cli_util.open_documents(
        paths, proj.subjects, proj.vocab_lang, docs_limit
    )
    proj.learn(documents, backend_params)

@cli.command("suggest")
@cli_util.project_id
@click.argument(
    "paths", type=click.Path(dir_okay=False, exists=True, allow_dash=True), nargs=-1
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_suggest(
    project_id, paths, limit, threshold, language, backend_param, docs_limit
):
    """
    Suggest subjects for a single document from standard input or for one or more
    document file(s) given its/their path(s).
    \f
    This will read a text document from standard input and suggest subjects for
    it, or if given path(s) to file(s), suggest subjects for it/them.
    """
    project = cli_util.get_project(project_id)
    lang = language or project.vocab_lang
    if lang not in project.vocab.languages:
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
    backend_params = cli_util.parse_backend_params(backend_param, project)

    if paths and not (len(paths) == 1 and paths[0] == "-"):
        docs = cli_util.open_text_documents(paths, docs_limit)
        results = project.suggest_corpus(docs, backend_params).filter(limit, threshold)
        for (
            suggestions,
            path,
        ) in zip(results, paths):
            click.echo(f"Suggestions for {path}")
            cli_util.show_hits(suggestions, project, lang)
    else:
        text = sys.stdin.read()
        suggestions = project.suggest([text], backend_params).filter(limit, threshold)[
            0
        ]
        cli_util.show_hits(suggestions, project, lang)

@cli.command("index")
@cli_util.project_id
@click.argument("directory", type=click.Path(exists=True, file_okay=False))
@click.option(
    "--suffix", "-s", default=".annif", help="File name suffix for result files"
)
@click.option(
    "--force/--no-force",
    "-f/-F",
    default=False,
    help="Force overwriting of existing result files",
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@cli_util.backend_param_option
@cli_util.common_options
def run_index(
    project_id, directory, suffix, force, limit, threshold, language, backend_param
):
    """
    Index a directory with documents, suggesting subjects for each document.
    Write the results in TSV files with the given suffix (``.annif`` by
    default).
    """
    project = cli_util.get_project(project_id)
    lang = language or project.vocab_lang
    if lang not in project.vocab.languages:
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
    backend_params = cli_util.parse_backend_params(backend_param, project)

    documents = annif.corpus.DocumentDirectory(directory, require_subjects=False)
    results = project.suggest_corpus(documents, backend_params).filter(limit, threshold)

    for (docfilename, _), suggestions in zip(documents, results):
        subjectfilename = re.sub(r"\.txt$", suffix, docfilename)
        if os.path.exists(subjectfilename) and not force:
            click.echo(
                "Not overwriting {} (use --force to override)".format(subjectfilename)
            )
            continue
        with open(subjectfilename, "w", encoding="utf-8") as subjfile:
            cli_util.show_hits(suggestions, project, lang, file=subjfile)

@cli.command("eval")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option(
    "--metric",
    "-m",
    default=[],
    multiple=True,
    help="Metric to calculate (default: all)",
)
@click.option(
    "--metrics-file",
    "-M",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write evaluation metrics in JSON format.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write non-aggregated results per subject.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_eval(
    project_id,
    paths,
    limit,
    threshold,
    docs_limit,
    metric,
    metrics_file,
    results_file,
    jobs,
    backend_param,
):
    """
    Suggest subjects for documents and evaluate the results by comparing
    against a gold standard.
    \f
    With this command the documents from ``PATHS`` (directories or possibly
    gzipped TSV files) will be assigned subject suggestions and then
    statistical measures are calculated that quantify how well the suggested
    subjects match the gold-standard subjects in the documents.

    Normally the output is the list of the metrics calculated across documents.
    If ``--results-file <FILENAME>`` option is given, the metrics are
    calculated separately for each subject, and written to the given file.
    """

    project = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, project)

    import annif.eval

    eval_batch = annif.eval.EvaluationBatch(project.subjects)

    if results_file:
        try:
            print("", end="", file=results_file)
            click.echo(
                "Writing per subject evaluation results to {!s}".format(
                    results_file.name
                )
            )
        except Exception as e:
            raise NotSupportedException(
                "cannot open results-file for writing: " + str(e)
            )
    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )
    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry, [project_id], backend_params, limit, threshold
    )

    with pool_class(jobs) as pool:
        for hit_sets, subject_sets in pool.imap_unordered(
            psmap.suggest_batch, corpus.doc_batches
        ):
            eval_batch.evaluate_many(hit_sets[project_id], subject_sets)

    template = "{0:<30}\t{1:{fmt_spec}}"
    metrics = eval_batch.results(
        metrics=metric, results_file=results_file, language=project.vocab_lang
    )
    for metric, score in metrics.items():
        if isinstance(score, int):
            fmt_spec = "d"
        elif isinstance(score, float):
            fmt_spec = ".04f"
        click.echo(template.format(metric + ":", score, fmt_spec=fmt_spec))
    if metrics_file:
        json.dump(
            {metric_code(mname): val for mname, val in metrics.items()},
            metrics_file,
            indent=2,
        )

@cli.command("run")
@click.option("--host", type=str, default="127.0.0.1")
@click.option("--port", type=int, default=5000)
@click.option("--log-level")
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_app(**kwargs):
    """
    Run Annif in server mode for development.
    \f
    The server is for development purposes only.
    """
    kwargs = {k: v for k, v in kwargs.items() if v is not None}
    cxapp = annif.create_cx_app()
    cxapp.run(**kwargs)

FILTER_BATCH_MAX_LIMIT = 15
OPTIMIZE_METRICS = ["Precision (doc avg)", "Recall (doc avg)", "F1 score (doc avg)"]


@cli.command("optimize")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_optimize(project_id, paths, jobs, docs_limit, backend_param):
    """
    Suggest subjects for documents, testing multiple limits and thresholds.
    \f
    This command will use different limit (maximum number of subjects) and
    score threshold values when assigning subjects to each document given by
    ``PATHS`` and compare the results against the gold standard subjects in the
    documents. The output is a list of parameter combinations and their scores.
    From the output, you can determine the optimum limit and threshold
    parameters depending on which measure you want to target.
    """
    project = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, project)
    filter_params = cli_util.generate_filter_params(FILTER_BATCH_MAX_LIMIT)

    import annif.eval

    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )

    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry,
        [project_id],
        backend_params,
        limit=FILTER_BATCH_MAX_LIMIT,
        threshold=0.0,
    )

    ndocs = 0
    suggestion_batches = []
    subject_set_batches = []
    with pool_class(jobs) as pool:
        for suggestion_batch, subject_sets in pool.imap_unordered(
            psmap.suggest_batch, corpus.doc_batches
        ):
            ndocs += len(suggestion_batch[project_id])
            suggestion_batches.append(suggestion_batch[project_id])
            subject_set_batches.append(subject_sets)

    from annif.suggestion import SuggestionResults

    orig_suggestion_results = SuggestionResults(suggestion_batches)

    click.echo("\t".join(("Limit", "Thresh.", "Prec.", "Rec.", "F1")))

    best_scores = collections.defaultdict(float)
    best_params = {}

    template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"
    import annif.eval

    for limit, threshold in filter_params:
        eval_batch = annif.eval.EvaluationBatch(project.subjects)
        filtered_results = orig_suggestion_results.filter(limit, threshold)
        for batch, subject_sets in zip(filtered_results.batches, subject_set_batches):
            eval_batch.evaluate_many(batch, subject_sets)
        results = eval_batch.results(metrics=OPTIMIZE_METRICS)
        for metric, score in results.items():
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = (limit, threshold)
        click.echo(
            template.format(
                limit,
                threshold,
                results["Precision (doc avg)"],
                results["Recall (doc avg)"],
                results["F1 score (doc avg)"],
            )
        )

    click.echo()
    template2 = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    for metric in OPTIMIZE_METRICS:
        click.echo(
            template2.format(
                metric,
                best_scores[metric],
                best_params[metric][0],
                best_params[metric][1],
            )
        )
    click.echo("Documents evaluated:\t{}".format(ndocs))

@cli.command("hyperopt")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--trials", "-T", default=10, help="Number of trials")
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel runs (0 means all CPUs)"
)
@click.option(
    "--metric", "-m", default="NDCG", help="Metric to optimize (default: NDCG)"
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file path to write trial results as CSV.
    File directory must exist, existing file will be overwritten.""",
)
@cli_util.docs_limit_option
@cli_util.common_options
def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric, results_file):
    """
    Optimize the hyperparameters of a project using validation documents from
    ``PATHS``. Not supported by all backends. Output is a list of trial results
    and a report of the best performing parameters.
    """
    proj = cli_util.get_project(project_id)
    documents = cli_util.open_documents(
        paths, proj.subjects, proj.vocab_lang, docs_limit
    )
    click.echo(f"Looking for optimal hyperparameters using {trials} trials")
    rec = proj.hyperopt(documents, trials, jobs, metric, results_file)
    click.echo(f"Got best {metric} score {rec.score:.4f} with:")
    click.echo("---")
    for line in rec.lines:
        click.echo(line)
    click.echo("---")

@cli.command("upload")
@click.argument("project_ids_pattern", shell_complete=cli_util.complete_param)
@click.argument("repo_id")
@click.option(
    "--token",
    help="""Authentication token, obtained from the Hugging Face Hub.
    Will default to the stored token.""",
)
@click.option(
    "--revision",
    help="""An optional git revision to commit from. Defaults to the head of the "main"
    branch.""",
)
@click.option(
    "--commit-message",
    help="""The summary / title / first line of the generated commit.""",
)
@click.option(
    "--modelcard/--no-modelcard",
    default=True,
    help="Update or create a Model Card with upload.",
)
@cli_util.common_options
def run_upload(
    project_ids_pattern, repo_id, token, revision, commit_message, modelcard
):
    """
    Upload selected projects and their vocabularies to a Hugging Face Hub repository.
    \f
    This command zips the project directories and vocabularies of the projects
    that match the given `project_ids_pattern` to archive files, and uploads the
    archives along with the project configurations to the specified Hugging Face
    Hub repository. An authentication token and commit message can be given with
    options. If the README.md does not exist in the repository it is
    created with default contents and metadata of the uploaded projects, if it exists,
    its metadata are updated as necessary.
    """
    from huggingface_hub import HfApi
    from huggingface_hub.utils import HfHubHTTPError, HFValidationError

    projects = hfh_util.get_matching_projects(project_ids_pattern)
    click.echo(f"Uploading project(s): {', '.join([p.project_id for p in projects])}")

    commit_message = (
        commit_message
        if commit_message is not None
        else f"Upload project(s) {project_ids_pattern} with Annif"
    )

    fobjs, operations = [], []
    try:
        fobjs, operations = hfh_util.prepare_commits(projects, repo_id, token)
        api = HfApi()
        api.create_commit(
            repo_id=repo_id,
            operations=operations,
            commit_message=commit_message,
            revision=revision,
            token=token,
        )
    except (HfHubHTTPError, HFValidationError) as err:
        raise OperationFailedException(str(err))
    else:
        if modelcard:
            hfh_util.upsert_modelcard(repo_id, projects, token, revision)
    finally:
        for fobj in fobjs:
            fobj.close()

@cli.command("download")
@click.argument("project_ids_pattern")
@click.argument("repo_id")
@click.option(
    "--token",
    help="""Authentication token, obtained from the Hugging Face Hub.
    Will default to the stored token.""",
)
@click.option(
    "--revision",
    help="""
    An optional Git revision id which can be a branch name, a tag, or a commit
    hash.
    """,
)
@click.option(
    "--force",
    "-f",
    default=False,
    is_flag=True,
    help="Replace an existing project/vocabulary/config with the downloaded one",
)
@click.option(
    "--trust-repo",
    default=False,
    is_flag=True,
    help="Allow download from the repository even when it has no entries in the cache",
)
@cli_util.common_options
def run_download(project_ids_pattern, repo_id, token, revision, force, trust_repo):
    """
    Download selected projects and their vocabularies from a Hugging Face Hub
    repository.
    \f
    This command downloads the project and vocabulary archives and the
    configuration files of the projects that match the given
    `project_ids_pattern` from the specified Hugging Face Hub repository and
    unzips the archives to `data/` directory and places the configuration files
    to `projects.d/` directory. An authentication token and revision can be given with
    options. If the repository hasn’t been used for downloads previously
    (i.e., it doesn’t appear in the Hugging Face Hub cache on local system), the
    `--trust-repo` option needs to be used.
    """

    hfh_util.check_is_download_allowed(trust_repo, repo_id)

    project_ids = hfh_util.get_matching_project_ids_from_hf_hub(
        project_ids_pattern, repo_id, token, revision
    )
    click.echo(f"Downloading project(s): {', '.join(project_ids)}")

    vocab_ids = set()
    for project_id in project_ids:
        project_zip_cache_path = hfh_util.download_from_hf_hub(
            f"projects/{project_id}.zip", repo_id, token, revision
        )
        hfh_util.unzip_archive(project_zip_cache_path, force)
        config_file_cache_path = hfh_util.download_from_hf_hub(
            f"{project_id}.cfg", repo_id, token, revision
        )
        vocab_ids.add(hfh_util.get_vocab_id_from_config(config_file_cache_path))
        hfh_util.copy_project_config(config_file_cache_path, force)

    for vocab_id in vocab_ids:
        vocab_zip_cache_path = hfh_util.download_from_hf_hub(
            f"vocabs/{vocab_id}.zip", repo_id, token, revision
        )
        hfh_util.unzip_archive(vocab_zip_cache_path, force)

@cli.command("completion")
@click.option("--bash", "shell", flag_value="bash")
@click.option("--zsh", "shell", flag_value="zsh")
@click.option("--fish", "shell", flag_value="fish")
def run_completion(shell):
    """Generate the script for tab-key autocompletion for the given shell. To enable the
    completion support in your current bash terminal session run\n
        source <(annif completion --bash)

    To enable the completion support in all new sessions first add the completion script
    in your home directory:\n
        annif completion --bash > ~/.annif-complete.bash

    Then make the script to be automatically sourced for new terminal sessions by adding
    the following to your ~/.bashrc file (or in some alternative startup file)\n
        source ~/.annif-complete.bash
    """

    if shell is None:
        raise click.UsageError("Shell not given, try --bash, --zsh or --fish")

    script = os.popen(f"_ANNIF_COMPLETE={shell}_source annif").read()
    click.echo(f"# Generated by Annif {importlib.metadata.version('annif')}")
    click.echo(script)

@cli.command("detect-language")
@click.argument("languages")
@click.argument(
    "paths", type=click.Path(dir_okay=False, exists=True, allow_dash=True), nargs=-1
)
def run_detect_language(languages, paths):
    """
    Detect the language of a single text document from standard input or for one or more
    document file(s) given its/their path(s).
    """

    langs = tuple(languages.split(","))

    def detect_language_and_show(text, languages):
        try:
            proportions = detect_language(text, languages)
        except ValueError as e:
            raise click.UsageError(e)
        for lang, score in proportions.items():
            if lang == "unk":
                lang = "?"
            click.echo(f"{lang}\t{score:.04f}")

    if paths and not (len(paths) == 1 and paths[0] == "-"):
        doclist = cli_util.open_text_documents(paths, docs_limit=None)
        for doc, path in zip(doclist.documents, paths):
            click.echo(f"Detected languages for {path}")
            detect_language_and_show(doc.text, langs)
    else:
        text = sys.stdin.read()
        detect_language_and_show(text, langs)


if __name__ == "__main__":
    cli()