annif.cli   F
last analyzed

Complexity

Total Complexity 68

Size/Duplication

Total Lines 813
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
eloc 529
dl 0
loc 813
rs 2.96
c 0
b 0
f 0
wmc 68

17 Functions

Rating   Name   Duplication   Size   Complexity  
A run_clear_project() 0 9 1
A run_train() 0 42 3
A run_load_vocab() 0 36 4
A run_learn() 0 20 1
A run_list_vocabs() 0 28 4
A run_list_projects() 0 39 3
A run_show_project() 0 18 1
A run_hyperopt() 0 36 2
A run_completion() 0 24 2
C run_optimize() 0 96 8
B run_download() 0 68 3
B run_suggest() 0 51 6
A run_app() 0 14 1
B run_upload() 0 68 6
B run_detect_language() 0 31 8
B run_index() 0 43 6
C run_eval() 0 108 9

How to fix   Complexity   

Complexity

Complex classes like annif.cli often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
"""Definitions for command-line (Click) commands for invoking Annif
2
operations and printing the results to console."""
3
4
import collections
5
import importlib
6
import json
7
import os.path
8
import re
9
import sys
10
11
import click
12
import click_log
13
from flask.cli import FlaskGroup
14
15
import annif
16
import annif.parallel
17
import annif.project
18
import annif.registry
19
from annif import cli_util, hfh_util
20
from annif.corpus import Document, DocumentDirectory
21
from annif.exception import (
22
    NotInitializedException,
23
    NotSupportedException,
24
    OperationFailedException,
25
)
26
from annif.project import Access
27
from annif.simplemma_util import detect_language
28
from annif.util import metric_code
29
30
# Reuse the package-level Annif logger and let click_log configure it.
logger = annif.logger
click_log.basic_config(logger)

create_app = annif.create_flask_app

# Root command group: a FlaskGroup created without Flask's default commands
# and without its own version option; the version option added here prints
# only the version string.
cli = click.version_option(message="%(version)s")(
    FlaskGroup(
        create_app=create_app,
        add_default_commands=False,
        add_version_option=False,
    )
)

# Drop the group parameters named "env_file" and "app" from the command line.
cli.params = [param for param in cli.params if param.name not in ("env_file", "app")]
39
40
41
@cli.command("list-projects")
@cli_util.common_options
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_list_projects():
    """
    List available projects.
    \f
    Show a list of currently defined projects. Projects are defined in a
    configuration file, normally called ``projects.cfg``. See `Project
    configuration
    <https://github.com/NatLibFi/Annif/wiki/Project-configuration>`_
    for details.
    """

    column_headings = (
        "Project ID",
        "Project Name",
        "Vocabulary ID",
        "Language",
        "Trained",
        "Modification time",
    )

    # One row per registered project, including those with private access.
    rows = []
    for project in annif.registry.get_projects(min_access=Access.private).values():
        rows.append(
            (
                project.project_id,
                project.name,
                # Only touch the vocabulary when the project declares one.
                project.vocab.vocab_id if project.vocab_spec else "-",
                project.language,
                str(project.is_trained),
                cli_util.format_datetime(project.modification_time),
            )
        )

    # The template is derived from the headings and all rows together.
    row_template = cli_util.make_list_template(column_headings, *rows)
    heading_line = row_template.format(*column_headings)
    click.echo(heading_line)
    click.echo("-" * len(heading_line))
    for cells in rows:
        click.echo(row_template.format(*cells))
80
81
82
@cli.command("show-project")
@cli_util.project_id
@cli_util.common_options
def run_show_project(project_id):
    """
    Show information about a project.
    """

    project = cli_util.get_project(project_id)

    # Each attribute is read immediately before it is printed, so a failure
    # on a later attribute still leaves the earlier lines on the console.
    click.echo(f"Project ID:        {project.project_id}")
    click.echo(f"Project Name:      {project.name}")
    click.echo(f"Language:          {project.language}")
    click.echo(f"Vocabulary:        {project.vocab.vocab_id}")
    click.echo(f"Vocab language:    {project.vocab_lang}")
    click.echo(f"Access:            {project.access.name}")
    click.echo(f"Backend:           {project.backend.name}")
    click.echo(f"Trained:           {project.is_trained}")
    modified = cli_util.format_datetime(project.modification_time)
    click.echo(f"Modification time: {modified}")
100
101
102
@cli.command("clear")
@cli_util.project_id
@cli_util.common_options
def run_clear_project(project_id):
    """
    Initialize the project to its original, untrained state.
    """
    # Look the project up and delete its model data in one step.
    cli_util.get_project(project_id).remove_model_data()
111
112
113
@cli.command("list-vocabs")
@cli_util.common_options
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_list_vocabs():
    """
    List available vocabularies.
    """

    column_headings = ("Vocabulary ID", "Languages", "Size", "Loaded")

    def describe(vocabulary):
        # Build one table row; a vocabulary that raises
        # NotInitializedException is listed with placeholder values.
        try:
            language_list = ",".join(sorted(vocabulary.languages))
            concept_count = len(vocabulary)
        except NotInitializedException:
            return (vocabulary.vocab_id, "-", "-", str(False))
        return (vocabulary.vocab_id, language_list, str(concept_count), str(True))

    vocabularies = annif.registry.get_vocabs(min_access=Access.private)
    rows = [describe(vocabulary) for vocabulary in vocabularies.values()]

    row_template = cli_util.make_list_template(column_headings, *rows)
    heading_line = row_template.format(*column_headings)
    click.echo(heading_line)
    click.echo("-" * len(heading_line))
    for cells in rows:
        click.echo(row_template.format(*cells))
141
142
143
@cli.command("load-vocab")
@click.argument("vocab_id", shell_complete=cli_util.complete_param)
@click.argument("vocab_file", type=click.Path(exists=True, dir_okay=False))
@click.option("--language", "-L", help="Language of TSV vocabulary file")
@click.option(
    "--force",
    "-f",
    default=False,
    is_flag=True,
    help="Replace existing vocabulary completely instead of updating it",
)
@cli_util.common_options
def run_load_vocab(vocab_id, language, force, vocab_file):
    """
    Load a vocabulary from a subject file.
    """
    vocab = cli_util.get_vocab(vocab_id)

    # In every branch the progress message is printed before the path is
    # wrapped, so the message always interpolates the path given on the
    # command line rather than the wrapper object.
    if annif.vocab.VocabFileSKOS.is_rdf_file(vocab_file):
        # SKOS/RDF file supported by rdflib
        click.echo(f"Loading vocabulary from SKOS file {vocab_file}...")
        vocab_source = annif.vocab.VocabFileSKOS(vocab_file)
    elif annif.vocab.VocabFileCSV.is_csv_file(vocab_file):
        # CSV file
        click.echo(f"Loading vocabulary from CSV file {vocab_file}...")
        vocab_source = annif.vocab.VocabFileCSV(vocab_file)
    else:
        # probably a TSV file - we need to know its language
        if not language:
            click.echo(
                "Please use --language option to set the language of a TSV vocabulary.",
                err=True,
            )
            sys.exit(1)
        click.echo(f"Loading vocabulary from TSV file {vocab_file}...")
        vocab_source = annif.vocab.VocabFileTSV(vocab_file, language)

    vocab.load_vocabulary(vocab_source, force=force)
179
180
181
@cli.command("train")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--cached/--no-cached",
    "-c/-C",
    default=False,
    help="Reuse preprocessed training data from previous run",
)
@click.option(
    "--jobs",
    "-j",
    default=0,
    help="Number of parallel jobs (0 means choose automatically)",
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_train(project_id, paths, cached, docs_limit, jobs, backend_param):
    """
    Train a project on a collection of documents.
    \f
    This will train the project using the documents from ``PATHS`` (directories
    or possibly gzipped TSV files) in a single batch operation. If ``--cached``
    is set, preprocessed training data from the previous run is reused instead
    of documents input; see `Reusing preprocessed training data
    <https://github.com/NatLibFi/Annif/wiki/
    Reusing-preprocessed-training-data>`_.
    """
    project = cli_util.get_project(project_id)
    params = cli_util.parse_backend_params(backend_param, project)

    # --cached and explicit corpus paths are mutually exclusive.
    if cached and len(paths) > 0:
        raise click.UsageError(
            "Corpus paths cannot be given when using --cached option."
        )

    # The literal string "cached" is passed to train() in place of a corpus
    # when --cached is given.
    training_data = (
        "cached"
        if cached
        else cli_util.open_documents(
            paths, project.subjects, project.vocab_lang, docs_limit
        )
    )
    project.train(training_data, params, jobs)
223
224
225
@cli.command("learn")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_learn(project_id, paths, docs_limit, backend_param):
    """
    Further train an existing project on a collection of documents.
    \f
    Similar to the ``train`` command. This will continue training an already
    trained project using the documents given by ``PATHS`` in a single batch
    operation. Not supported by all backends.
    """
    project = cli_util.get_project(project_id)
    params = cli_util.parse_backend_params(backend_param, project)
    # Open the corpus with the project's subjects and vocabulary language.
    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )
    project.learn(corpus, params)
245
246
247
@cli.command("suggest")
@cli_util.project_id
@click.argument(
    "paths", type=click.Path(dir_okay=False, exists=True, allow_dash=True), nargs=-1
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@cli_util.docs_limit_option
@cli_util.backend_param_option
@click.option(
    "--metadata",
    "-D",
    multiple=True,
    help="Additional metadata for a document read from standard input. "
    + "Syntax: `-D <field>=<value>`.",
)
@cli_util.common_options
def run_suggest(
    project_id, paths, limit, threshold, language, backend_param, metadata, docs_limit
):
    """
    Suggest subjects for a single document from standard input (optionally
    with metadata) or for one or more document file(s) given its/their
    path(s).
    \f
    This will read a text document from standard input and suggest subjects for
    it, or if given path(s) to file(s), suggest subjects for it/them.
    """
    project = cli_util.get_project(project_id)

    # Fall back to the project's vocabulary language when none is requested.
    label_lang = language or project.vocab_lang
    if label_lang not in project.vocab.languages:
        raise click.BadParameter(
            f'language "{label_lang}" not supported by vocabulary'
        )
    params = cli_util.parse_backend_params(backend_param, project)

    # No paths at all, or the single path "-", means: read one document from
    # standard input.
    use_stdin = not paths or (len(paths) == 1 and paths[0] == "-")
    if use_stdin:
        text = sys.stdin.read()
        doc_metadata = cli_util.parse_metadata(metadata)
        batch = project.suggest([Document(text=text, metadata=doc_metadata)], params)
        hits = batch.filter(limit, threshold)[0]
        cli_util.show_hits(hits, project, label_lang)
        return

    corpus = cli_util.open_text_documents(paths, docs_limit)
    all_hits = project.suggest_corpus(corpus, params).filter(limit, threshold)
    # Results are paired with the paths in the order they were given.
    for hits, path in zip(all_hits, paths):
        click.echo(f"Suggestions for {path}")
        cli_util.show_hits(hits, project, label_lang)
298
299
300
@cli.command("index")
@cli_util.project_id
@click.argument("directory", type=click.Path(exists=True, file_okay=False))
@click.option(
    "--suffix", "-s", default=".annif", help="File name suffix for result files"
)
@click.option(
    "--force/--no-force",
    "-f/-F",
    default=False,
    help="Force overwriting of existing result files",
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@cli_util.backend_param_option
@cli_util.common_options
def run_index(
    project_id, directory, suffix, force, limit, threshold, language, backend_param
):
    """
    Index a directory with documents, suggesting subjects for each document.
    Write the results in TSV files with the given suffix (``.annif`` by
    default).
    """
    project = cli_util.get_project(project_id)
    lang = language or project.vocab_lang
    if lang not in project.vocab.languages:
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
    backend_params = cli_util.parse_backend_params(backend_param, project)

    corpus = DocumentDirectory(directory, require_subjects=False)
    results = project.suggest_corpus(corpus, backend_params).filter(limit, threshold)

    for doc, suggestions in zip(corpus.documents, results):
        # Swap a trailing ".txt" or ".json" for the requested suffix. A
        # function replacement is used so that the user-supplied suffix is
        # inserted literally; a plain string would be treated as a template
        # in which backslashes and group references are interpreted.
        subjectfilename = re.sub(r"\.(txt|json)$", lambda _match: suffix, doc.file_path)
        if os.path.exists(subjectfilename) and not force:
            click.echo(
                f"Not overwriting {subjectfilename} (use --force to override)"
            )
            continue
        with open(subjectfilename, "w", encoding="utf-8") as subjfile:
            cli_util.show_hits(suggestions, project, lang, file=subjfile)
343
344
345
@cli.command("eval")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option(
    "--metric",
    "-m",
    default=[],
    multiple=True,
    help="Metric to calculate (default: all)",
)
@click.option(
    "--metrics-file",
    "-M",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write evaluation metrics in JSON format.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write non-aggregated results per subject.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_eval(
    project_id,
    paths,
    limit,
    threshold,
    docs_limit,
    metric,
    metrics_file,
    results_file,
    jobs,
    backend_param,
):
    """
    Suggest subjects for documents and evaluate the results by comparing
    against a gold standard.
    \f
    With this command the documents from ``PATHS`` (directories or possibly
    gzipped TSV files) will be assigned subject suggestions and then
    statistical measures are calculated that quantify how well the suggested
    subjects match the gold-standard subjects in the documents.

    Normally the output is the list of the metrics calculated across documents.
    If ``--results-file <FILENAME>`` option is given, the metrics are
    calculated separately for each subject, and written to the given file.
    """

    project = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, project)

    import numbers

    import annif.eval

    eval_batch = annif.eval.EvaluationBatch(project.subjects)

    if results_file:
        try:
            # The option is declared with lazy=True; writing an empty string
            # here presumably forces the file to be opened, so that a bad
            # path is reported before the evaluation starts.
            print("", end="", file=results_file)
            click.echo(
                "Writing per subject evaluation results to {!s}".format(
                    results_file.name
                )
            )
        except Exception as e:
            # Keep the original exception attached as the cause.
            raise NotSupportedException(
                "cannot open results-file for writing: " + str(e)
            ) from e
    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )
    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry, [project_id], backend_params, limit, threshold
    )

    with pool_class(jobs) as pool:
        for hit_sets, subject_sets in pool.imap_unordered(
            psmap.suggest_batch, corpus.doc_batches
        ):
            eval_batch.evaluate_many(hit_sets[project_id], subject_sets)

    template = "{0:<30}\t{1:{fmt_spec}}"
    metrics = eval_batch.results(
        metrics=metric, results_file=results_file, language=project.vocab_lang
    )
    # A separate loop variable avoids shadowing the ``metric`` option, and the
    # format spec is chosen for every score so that an unexpected value type
    # can neither reuse the previous score's spec nor leave it undefined.
    for metric_name, score in metrics.items():
        if isinstance(score, numbers.Integral):
            fmt_spec = "d"
        elif isinstance(score, numbers.Real):
            fmt_spec = ".04f"
        else:
            fmt_spec = ""
        click.echo(template.format(metric_name + ":", score, fmt_spec=fmt_spec))
    if metrics_file:
        json.dump(
            {metric_code(mname): val for mname, val in metrics.items()},
            metrics_file,
            indent=2,
        )
454
455
456
@cli.command("run")
@click.option("--host", type=str, default="127.0.0.1")
@click.option("--port", type=int, default=5000)
@click.option("--log-level")
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_app(**kwargs):
    """
    Run Annif in server mode for development.
    \f
    The server is for development purposes only.
    """
    # Forward only the options that actually have a value.
    run_options = {name: value for name, value in kwargs.items() if value is not None}
    annif.create_cx_app().run(**run_options)
470
471
472
# Limit passed to the suggestion step and to the filter parameter generator
# of the ``optimize`` command.
FILTER_BATCH_MAX_LIMIT = 15
# Metrics reported and optimized by the ``optimize`` command.
OPTIMIZE_METRICS = ["Precision (doc avg)", "Recall (doc avg)", "F1 score (doc avg)"]


@cli.command("optimize")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_optimize(project_id, paths, jobs, docs_limit, backend_param):
    """
    Suggest subjects for documents, testing multiple limits and thresholds.
    \f
    This command will use different limit (maximum number of subjects) and
    score threshold values when assigning subjects to each document given by
    ``PATHS`` and compare the results against the gold standard subjects in the
    documents. The output is a list of parameter combinations and their scores.
    From the output, you can determine the optimum limit and threshold
    parameters depending on which measure you want to target.
    """
    project = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, project)
    filter_params = cli_util.generate_filter_params(FILTER_BATCH_MAX_LIMIT)

    # Imported once, inside the function, as in the other evaluation command.
    import annif.eval
    from annif.suggestion import SuggestionResults

    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )

    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    # Suggestions are produced once with the maximum limit and a zero
    # threshold; the limit/threshold combinations are applied by filtering.
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry,
        [project_id],
        backend_params,
        limit=FILTER_BATCH_MAX_LIMIT,
        threshold=0.0,
    )

    ndocs = 0
    suggestion_batches = []
    subject_set_batches = []
    with pool_class(jobs) as pool:
        for suggestion_batch, subject_sets in pool.imap_unordered(
            psmap.suggest_batch, corpus.doc_batches
        ):
            # Both lists are appended to in the same iteration, so entry i of
            # one corresponds to entry i of the other.
            ndocs += len(suggestion_batch[project_id])
            suggestion_batches.append(suggestion_batch[project_id])
            subject_set_batches.append(subject_sets)

    orig_suggestion_results = SuggestionResults(suggestion_batches)

    click.echo("\t".join(("Limit", "Thresh.", "Prec.", "Rec.", "F1")))

    best_scores = collections.defaultdict(float)
    best_params = {}

    template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"

    for limit, threshold in filter_params:
        eval_batch = annif.eval.EvaluationBatch(project.subjects)
        filtered_results = orig_suggestion_results.filter(limit, threshold)
        for batch, subject_sets in zip(filtered_results.batches, subject_set_batches):
            eval_batch.evaluate_many(batch, subject_sets)
        results = eval_batch.results(metrics=OPTIMIZE_METRICS)
        for metric, score in results.items():
            # ">=" means that among equal scores the combination tried last
            # is the one recorded.
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = (limit, threshold)
        click.echo(
            template.format(
                limit,
                threshold,
                results["Precision (doc avg)"],
                results["Recall (doc avg)"],
                results["F1 score (doc avg)"],
            )
        )

    click.echo()
    template2 = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    for metric in OPTIMIZE_METRICS:
        best_limit, best_threshold = best_params[metric]
        click.echo(
            template2.format(metric, best_scores[metric], best_limit, best_threshold)
        )
    click.echo(f"Documents evaluated:\t{ndocs}")
572
573
574
@cli.command("hyperopt")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--trials", "-T", default=10, help="Number of trials")
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel runs (0 means all CPUs)"
)
@click.option(
    "--metric", "-m", default="NDCG", help="Metric to optimize (default: NDCG)"
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file path to write trial results as TSV.
    File directory must exist, existing file will be overwritten.""",
)
@cli_util.docs_limit_option
@cli_util.common_options
def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric, results_file):
    """
    Optimize the hyperparameters of a project using validation documents from
    ``PATHS``. Not supported by all backends. Output is a list of trial results
    and a report of the best performing parameters.
    """
    project = cli_util.get_project(project_id)
    validation_docs = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )
    click.echo(f"Looking for optimal hyperparameters using {trials} trials")
    best = project.hyperopt(validation_docs, trials, jobs, metric, results_file)
    click.echo(f"Got best {metric} score {best.score:.4f} with:")

    # The recommended parameter lines are framed by "---" separators.
    separator = "---"
    click.echo(separator)
    for param_line in best.lines:
        click.echo(param_line)
    click.echo(separator)
610
611
612
@cli.command("upload")
@click.argument("project_ids_pattern", shell_complete=cli_util.complete_param)
@click.argument("repo_id")
@click.option(
    "--token",
    help="""Authentication token, obtained from the Hugging Face Hub.
    Will default to the stored token.""",
)
@click.option(
    "--revision",
    help="""An optional git revision to commit from. Defaults to the head of the "main"
    branch.""",
)
@click.option(
    "--commit-message",
    help="""The summary / title / first line of the generated commit.""",
)
@click.option(
    "--modelcard/--no-modelcard",
    default=True,
    help="Update or create a Model Card with upload.",
)
@cli_util.common_options
def run_upload(
    project_ids_pattern, repo_id, token, revision, commit_message, modelcard
):
    """
    Upload selected projects and their vocabularies to a Hugging Face Hub repository.
    \f
    This command zips the project directories and vocabularies of the projects
    that match the given `project_ids_pattern` to archive files, and uploads the
    archives along with the project configurations to the specified Hugging Face
    Hub repository. An authentication token and commit message can be given with
    options. If the README.md does not exist in the repository it is
    created with default contents and metadata of the uploaded projects, if it exists,
    its metadata are updated as necessary.
    """
    from huggingface_hub import HfApi
    from huggingface_hub.utils import HfHubHTTPError, HFValidationError

    projects = hfh_util.get_matching_projects(project_ids_pattern)
    click.echo(f"Uploading project(s): {', '.join([p.project_id for p in projects])}")

    # Use a generated commit message when none was given on the command line.
    if commit_message is None:
        commit_message = f"Upload project(s) {project_ids_pattern} with Annif"

    # Start with empty lists so the ``finally`` clause is safe even when
    # preparing the commit fails.
    fobjs, operations = [], []
    try:
        fobjs, operations = hfh_util.prepare_commits(projects, repo_id, token)
        api = HfApi()
        api.create_commit(
            repo_id=repo_id,
            operations=operations,
            commit_message=commit_message,
            revision=revision,
            token=token,
        )
    except (HfHubHTTPError, HFValidationError) as err:
        # Re-raise as an Annif exception, keeping the Hub error as its cause.
        raise OperationFailedException(str(err)) from err
    else:
        # The model card is only touched after a successful commit.
        if modelcard:
            hfh_util.upsert_modelcard(repo_id, projects, token, revision)
    finally:
        for fobj in fobjs:
            fobj.close()
680
681
682
@cli.command("download")
@click.argument("project_ids_pattern")
@click.argument("repo_id")
@click.option(
    "--token",
    help="""Authentication token, obtained from the Hugging Face Hub.
    Will default to the stored token.""",
)
@click.option(
    "--revision",
    help="""
    An optional Git revision id which can be a branch name, a tag, or a commit
    hash.
    """,
)
@click.option(
    "--force",
    "-f",
    default=False,
    is_flag=True,
    help="Replace an existing project/vocabulary/config with the downloaded one",
)
@click.option(
    "--trust-repo",
    default=False,
    is_flag=True,
    help="Allow download from the repository even when it has no entries in the cache",
)
@cli_util.common_options
def run_download(project_ids_pattern, repo_id, token, revision, force, trust_repo):
    """
    Download selected projects and their vocabularies from a Hugging Face Hub
    repository.
    \f
    This command downloads the project and vocabulary archives and the
    configuration files of the projects that match the given
    `project_ids_pattern` from the specified Hugging Face Hub repository and
    unzips the archives to `data/` directory and places the configuration files
    to `projects.d/` directory. An authentication token and revision can be given with
    options. If the repository hasn’t been used for downloads previously
    (i.e., it doesn’t appear in the Hugging Face Hub cache on local system), the
    `--trust-repo` option needs to be used.
    """

    hfh_util.check_is_download_allowed(trust_repo, repo_id)

    def fetch(repo_path):
        # Download one file from the repository with the shared credentials
        # and revision, returning whatever download_from_hf_hub returns.
        return hfh_util.download_from_hf_hub(repo_path, repo_id, token, revision)

    matching_ids = hfh_util.get_matching_project_ids_from_hf_hub(
        project_ids_pattern, repo_id, token, revision
    )
    click.echo(f"Downloading project(s): {', '.join(matching_ids)}")

    # A set, so a vocabulary shared by several projects is fetched only once.
    needed_vocab_ids = set()
    for project_id in matching_ids:
        project_archive = fetch(f"projects/{project_id}.zip")
        hfh_util.unzip_archive(project_archive, force)
        project_config = fetch(f"{project_id}.cfg")
        needed_vocab_ids.add(hfh_util.get_vocab_id_from_config(project_config))
        hfh_util.copy_project_config(project_config, force)

    for vocab_id in needed_vocab_ids:
        vocab_archive = fetch(f"vocabs/{vocab_id}.zip")
        hfh_util.unzip_archive(vocab_archive, force)
750
751
752
@cli.command("completion")
@click.option("--bash", "shell", flag_value="bash")
@click.option("--zsh", "shell", flag_value="zsh")
@click.option("--fish", "shell", flag_value="fish")
def run_completion(shell):
    """Generate the script for tab-key autocompletion for the given shell. To enable the
    completion support in your current bash terminal session run\n
        source <(annif completion --bash)

    To enable the completion support in all new sessions first add the completion script
    in your home directory:\n
        annif completion --bash > ~/.annif-complete.bash

    Then make the script to be automatically sourced for new terminal sessions by adding
    the following to your ~/.bashrc file (or in some alternative startup file)\n
        source ~/.annif-complete.bash
    """
    # Import the submodule explicitly: ``import importlib`` alone does not
    # guarantee that ``importlib.metadata`` is available as an attribute.
    from importlib.metadata import version as package_version

    if shell is None:
        raise click.UsageError("Shell not given, try --bash, --zsh or --fish")

    # ``shell`` can only be one of the three flag values declared above, so
    # interpolating it into the command line is safe. The pipe is closed as
    # soon as the script has been read.
    with os.popen(f"_ANNIF_COMPLETE={shell}_source annif") as pipe:
        script = pipe.read()
    click.echo(f"# Generated by Annif {package_version('annif')}")
    click.echo(script)
776
777
778
@cli.command("detect-language")
@click.argument("languages")
@click.argument(
    "paths", type=click.Path(dir_okay=False, exists=True, allow_dash=True), nargs=-1
)
def run_detect_language(languages, paths):
    """
    Detect the language of a single text document from standard input or for one or more
    document file(s) given its/their path(s).
    """

    # The candidate languages are given as one comma-separated argument.
    langs = tuple(languages.split(","))

    def detect_language_and_show(text, candidate_langs):
        # Print one "<language>\t<score>" line per detected language.
        try:
            proportions = detect_language(text, candidate_langs)
        except ValueError as e:
            # UsageError expects a message string; passing the exception
            # object itself makes str() of the resulting error fail.
            raise click.UsageError(str(e)) from e
        for lang, score in proportions.items():
            if lang == "unk":
                lang = "?"
            click.echo(f"{lang}\t{score:.04f}")

    # No paths at all, or the single path "-", means: read from standard input.
    if paths and not (len(paths) == 1 and paths[0] == "-"):
        doclist = cli_util.open_text_documents(paths, docs_limit=None)
        for doc, path in zip(doclist.documents, paths):
            click.echo(f"Detected languages for {path}")
            detect_language_and_show(doc.text, langs)
    else:
        text = sys.stdin.read()
        detect_language_and_show(text, langs)
809
810
811
# Invoke the command group when this module is executed as a script.
if __name__ == "__main__":
    cli()
813