annif.cli   F
last analyzed

Complexity

Total Complexity 68

Size/Duplication

Total Lines 812
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
eloc 529
dl 0
loc 812
rs 2.96
c 0
b 0
f 0
wmc 68

17 Functions

Rating   Name   Duplication   Size   Complexity  
A run_clear_project() 0 9 1
A run_train() 0 42 3
A run_load_vocab() 0 36 4
A run_learn() 0 20 1
A run_list_vocabs() 0 28 4
A run_list_projects() 0 39 3
A run_show_project() 0 18 1
A run_hyperopt() 0 36 2
A run_completion() 0 24 2
C run_optimize() 0 96 8
B run_download() 0 68 3
B run_suggest() 0 50 6
A run_app() 0 14 1
B run_upload() 0 68 6
B run_detect_language() 0 31 8
B run_index() 0 43 6
C run_eval() 0 108 9

How to fix   Complexity   

Complexity

Complex modules like annif.cli often do a lot of different things. To break such a module down, we need to identify a cohesive component within it. A common approach to find such a component is to look for functions that share the same prefixes, or suffixes.

Once you have determined the functions that belong together, you can apply the Extract Module refactoring. If the component makes sense as a class, Extract Class is also a candidate, and is often faster.

1
"""Definitions for command-line (Click) commands for invoking Annif
2
operations and printing the results to console."""
3
4
import collections
import importlib
import importlib.metadata
import json
import os.path
import re
import sys

import click
import click_log
from flask.cli import FlaskGroup

import annif
import annif.parallel
import annif.project
import annif.registry
from annif import cli_util, hfh_util
from annif.corpus import Document, DocumentDirectory
from annif.exception import (
    NotInitializedException,
    NotSupportedException,
    OperationFailedException,
)
from annif.project import Access
from annif.simplemma_util import detect_language
from annif.util import metric_code
29
30
# Shared application logger, registered with click_log so that the
# verbosity options used by the commands below take effect.
logger = annif.logger
click_log.basic_config(logger)

# Build the top-level CLI as a Flask command group, without Flask's default
# commands or its own version option; a plain click version option is
# attached instead.
create_app = annif.create_flask_app
cli = FlaskGroup(
    create_app=create_app, add_default_commands=False, add_version_option=False
)
cli = click.version_option(message="%(version)s")(cli)
# Hide Flask-specific global options that are not relevant for Annif.
cli.params = [opt for opt in cli.params if opt.name not in ("env_file", "app")]
39
40
41
@cli.command("list-projects")
@cli_util.common_options
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_list_projects():
    """
    List available projects.
    \f
    Show a list of currently defined projects. Projects are defined in a
    configuration file, normally called ``projects.cfg``. See `Project
    configuration
    <https://github.com/NatLibFi/Annif/wiki/Project-configuration>`_
    for details.
    """

    headings = (
        "Project ID",
        "Project Name",
        "Vocabulary ID",
        "Language",
        "Trained",
        "Modification time",
    )
    # One row per project; private access shows every configured project.
    rows = []
    for proj in annif.registry.get_projects(min_access=Access.private).values():
        vocab_id = proj.vocab.vocab_id if proj.vocab_spec else "-"
        rows.append(
            (
                proj.project_id,
                proj.name,
                vocab_id,
                proj.language,
                str(proj.is_trained),
                cli_util.format_datetime(proj.modification_time),
            )
        )
    # Column widths are derived from both the headings and the row contents.
    template = cli_util.make_list_template(headings, *rows)
    header = template.format(*headings)
    click.echo(header)
    click.echo("-" * len(header))
    for row in rows:
        click.echo(template.format(*row))
80
81
82
@cli.command("show-project")
@cli_util.project_id
@cli_util.common_options
def run_show_project(project_id):
    """
    Show information about a project.
    """

    proj = cli_util.get_project(project_id)
    # Label/value pairs; labels are left-justified to 18 columns so all
    # values line up in one column, matching the original fixed-width output.
    fields = (
        ("Project ID:", proj.project_id),
        ("Project Name:", proj.name),
        ("Language:", proj.language),
        ("Vocabulary:", proj.vocab.vocab_id),
        ("Vocab language:", proj.vocab_lang),
        ("Access:", proj.access.name),
        ("Backend:", proj.backend.name),
        ("Trained:", proj.is_trained),
        ("Modification time:", cli_util.format_datetime(proj.modification_time)),
    )
    for label, value in fields:
        click.echo(f"{label:<18} {value}")
100
101
102
@cli.command("clear")
@cli_util.project_id
@cli_util.common_options
def run_clear_project(project_id):
    """
    Initialize the project to its original, untrained state.
    """
    # Removing the model data is all that is needed to reset a project.
    cli_util.get_project(project_id).remove_model_data()
111
112
113
@cli.command("list-vocabs")
@cli_util.common_options
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_list_vocabs():
    """
    List available vocabularies.
    """

    headings = ("Vocabulary ID", "Languages", "Size", "Loaded")

    def describe(vocab):
        # A vocabulary that has not been loaded yet cannot report its
        # languages or size, so show placeholders for those columns.
        try:
            languages = ",".join(sorted(vocab.languages))
            size = len(vocab)
        except NotInitializedException:
            return (vocab.vocab_id, "-", "-", str(False))
        return (vocab.vocab_id, languages, str(size), str(True))

    rows = [
        describe(vocab)
        for vocab in annif.registry.get_vocabs(min_access=Access.private).values()
    ]

    template = cli_util.make_list_template(headings, *rows)
    header = template.format(*headings)
    click.echo(header)
    click.echo("-" * len(header))
    for row in rows:
        click.echo(template.format(*row))
141
142
143
@cli.command("load-vocab")
@click.argument("vocab_id", shell_complete=cli_util.complete_param)
@click.argument("vocab_file", type=click.Path(exists=True, dir_okay=False))
@click.option("--language", "-L", help="Language of TSV vocabulary file")
@click.option(
    "--force",
    "-f",
    default=False,
    is_flag=True,
    help="Replace existing vocabulary completely instead of updating it",
)
@cli_util.common_options
def run_load_vocab(vocab_id, language, force, vocab_file):
    """
    Load a vocabulary from a subject file.

    The file format (SKOS/RDF, CSV or TSV) is detected from the file itself;
    a TSV file additionally requires the --language option.
    """
    vocab = cli_util.get_vocab(vocab_id)
    # BUGFIX: echo the file path *before* wrapping it in a VocabFile object.
    # Previously the SKOS and CSV branches rebound ``vocab_file`` first, so
    # the message printed the wrapper object instead of the path (the TSV
    # branch printed the path, inconsistently). A separate ``source`` local
    # also avoids shadowing the CLI argument.
    if annif.vocab.VocabFileSKOS.is_rdf_file(vocab_file):
        # SKOS/RDF file supported by rdflib
        click.echo(f"Loading vocabulary from SKOS file {vocab_file}...")
        source = annif.vocab.VocabFileSKOS(vocab_file)
    elif annif.vocab.VocabFileCSV.is_csv_file(vocab_file):
        # CSV file
        click.echo(f"Loading vocabulary from CSV file {vocab_file}...")
        source = annif.vocab.VocabFileCSV(vocab_file)
    else:
        # probably a TSV file - we need to know its language
        if not language:
            click.echo(
                "Please use --language option to set the language of a TSV vocabulary.",
                err=True,
            )
            sys.exit(1)
        click.echo(f"Loading vocabulary from TSV file {vocab_file}...")
        source = annif.vocab.VocabFileTSV(vocab_file, language)
    vocab.load_vocabulary(source, force=force)
179
180
181
@cli.command("train")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--cached/--no-cached",
    "-c/-C",
    default=False,
    help="Reuse preprocessed training data from previous run",
)
@click.option(
    "--jobs",
    "-j",
    default=0,
    help="Number of parallel jobs (0 means choose automatically)",
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_train(project_id, paths, cached, docs_limit, jobs, backend_param):
    """
    Train a project on a collection of documents.
    \f
    This will train the project using the documents from ``PATHS`` (directories
    or possibly gzipped TSV files) in a single batch operation. If ``--cached``
    is set, preprocessed training data from the previous run is reused instead
    of documents input; see `Reusing preprocessed training data
    <https://github.com/NatLibFi/Annif/wiki/
    Reusing-preprocessed-training-data>`_.
    """
    proj = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, proj)
    if not cached:
        documents = cli_util.open_documents(
            paths, proj.subjects, proj.vocab_lang, docs_limit
        )
    else:
        # --cached reuses preprocessed data, so explicit corpus paths
        # would be ignored; treat the combination as a usage error.
        if paths:
            raise click.UsageError(
                "Corpus paths cannot be given when using --cached option."
            )
        documents = "cached"
    proj.train(documents, backend_params, jobs)
223
224
225
@cli.command("learn")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_learn(project_id, paths, docs_limit, backend_param):
    """
    Further train an existing project on a collection of documents.
    \f
    Similar to the ``train`` command. This will continue training an already
    trained project using the documents given by ``PATHS`` in a single batch
    operation. Not supported by all backends.
    """
    proj = cli_util.get_project(project_id)
    params = cli_util.parse_backend_params(backend_param, proj)
    corpus = cli_util.open_documents(paths, proj.subjects, proj.vocab_lang, docs_limit)
    proj.learn(corpus, params)
245
246
247
@cli.command("suggest")
@cli_util.project_id
@click.argument(
    "paths", type=click.Path(dir_okay=False, exists=True, allow_dash=True), nargs=-1
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@cli_util.docs_limit_option
@cli_util.backend_param_option
@click.option(
    "--metadata",
    "-D",
    multiple=True,
    help="Additional metadata for a document read from standard input. "
    + "Syntax: `-D <field>=<value>`.",
)
@cli_util.common_options
def run_suggest(
    project_id, paths, limit, threshold, language, backend_param, metadata, docs_limit
):
    """
    Suggest subjects for a single document from standard input or for one or more
    document file(s) given its/their path(s).
    \f
    This will read a text document from standard input and suggest subjects for
    it, or if given path(s) to file(s), suggest subjects for it/them.
    """
    project = cli_util.get_project(project_id)
    lang = language or project.vocab_lang
    if lang not in project.vocab.languages:
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
    backend_params = cli_util.parse_backend_params(backend_param, project)

    # No paths (or a single "-") means the document comes from stdin.
    use_stdin = not paths or (len(paths) == 1 and paths[0] == "-")
    if use_stdin:
        text = sys.stdin.read()
        doc_metadata = cli_util.parse_metadata(metadata)
        suggestions = project.suggest(
            [Document(text=text, metadata=doc_metadata)], backend_params
        ).filter(limit, threshold)[0]
        cli_util.show_hits(suggestions, project, lang)
    else:
        docs = cli_util.open_text_documents(paths, docs_limit)
        results = project.suggest_corpus(docs, backend_params).filter(limit, threshold)
        for suggestions, path in zip(results, paths):
            click.echo(f"Suggestions for {path}")
            cli_util.show_hits(suggestions, project, lang)
297
298
299
@cli.command("index")
@cli_util.project_id
@click.argument("directory", type=click.Path(exists=True, file_okay=False))
@click.option(
    "--suffix", "-s", default=".annif", help="File name suffix for result files"
)
@click.option(
    "--force/--no-force",
    "-f/-F",
    default=False,
    help="Force overwriting of existing result files",
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@cli_util.backend_param_option
@cli_util.common_options
def run_index(
    project_id, directory, suffix, force, limit, threshold, language, backend_param
):
    """
    Index a directory with documents, suggesting subjects for each document.
    Write the results in TSV files with the given suffix (``.annif`` by
    default).
    """
    project = cli_util.get_project(project_id)
    lang = language or project.vocab_lang
    if lang not in project.vocab.languages:
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
    backend_params = cli_util.parse_backend_params(backend_param, project)

    documents = DocumentDirectory(directory, require_subjects=False)
    results = project.suggest_corpus(documents, backend_params).filter(limit, threshold)

    for (docfilename, _), suggestions in zip(documents, results):
        # Result file name: same as the document, with .txt replaced by suffix.
        subjectfilename = re.sub(r"\.txt$", suffix, docfilename)
        if force or not os.path.exists(subjectfilename):
            with open(subjectfilename, "w", encoding="utf-8") as subjfile:
                cli_util.show_hits(suggestions, project, lang, file=subjfile)
        else:
            click.echo(
                "Not overwriting {} (use --force to override)".format(subjectfilename)
            )
342
343
344
@cli.command("eval")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option(
    "--metric",
    "-m",
    default=[],
    multiple=True,
    help="Metric to calculate (default: all)",
)
@click.option(
    "--metrics-file",
    "-M",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write evaluation metrics in JSON format.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write non-aggregated results per subject.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_eval(
    project_id,
    paths,
    limit,
    threshold,
    docs_limit,
    metric,
    metrics_file,
    results_file,
    jobs,
    backend_param,
):
    """
    Suggest subjects for documents and evaluate the results by comparing
    against a gold standard.
    \f
    With this command the documents from ``PATHS`` (directories or possibly
    gzipped TSV files) will be assigned subject suggestions and then
    statistical measures are calculated that quantify how well the suggested
    subjects match the gold-standard subjects in the documents.

    Normally the output is the list of the metrics calculated across documents.
    If ``--results-file <FILENAME>`` option is given, the metrics are
    calculated separately for each subject, and written to the given file.
    """

    project = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, project)

    # Imported lazily so the CLI starts quickly for commands that don't
    # need the evaluation machinery.
    import annif.eval

    eval_batch = annif.eval.EvaluationBatch(project.subjects)

    if results_file:
        # Verify the lazily-opened file is writable before doing the
        # (potentially expensive) evaluation work.
        try:
            print("", end="", file=results_file)
            click.echo(
                "Writing per subject evaluation results to {!s}".format(
                    results_file.name
                )
            )
        except Exception as e:
            raise NotSupportedException(
                "cannot open results-file for writing: " + str(e)
            )
    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )
    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry, [project_id], backend_params, limit, threshold
    )

    with pool_class(jobs) as pool:
        for hit_sets, subject_sets in pool.imap_unordered(
            psmap.suggest_batch, corpus.doc_batches
        ):
            eval_batch.evaluate_many(hit_sets[project_id], subject_sets)

    template = "{0:<30}\t{1:{fmt_spec}}"
    metrics = eval_batch.results(
        metrics=metric, results_file=results_file, language=project.vocab_lang
    )
    # BUGFIX: use a distinct loop variable ("mname") instead of rebinding the
    # "metric" CLI option, and always assign fmt_spec so a non-int, non-float
    # score can no longer raise NameError at the format call.
    for mname, score in metrics.items():
        if isinstance(score, int):
            fmt_spec = "d"
        elif isinstance(score, float):
            fmt_spec = ".04f"
        else:
            fmt_spec = ""  # default formatting for any other score type
        click.echo(template.format(mname + ":", score, fmt_spec=fmt_spec))
    if metrics_file:
        json.dump(
            {metric_code(mname): val for mname, val in metrics.items()},
            metrics_file,
            indent=2,
        )
453
454
455
@cli.command("run")
@click.option("--host", type=str, default="127.0.0.1")
@click.option("--port", type=int, default=5000)
@click.option("--log-level")
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_app(**kwargs):
    """
    Run Annif in server mode for development.
    \f
    The server is for development purposes only.
    """
    # Drop options the user did not set so the app's own defaults apply.
    options = {key: value for key, value in kwargs.items() if value is not None}
    annif.create_cx_app().run(**options)
469
470
471
FILTER_BATCH_MAX_LIMIT = 15
472
OPTIMIZE_METRICS = ["Precision (doc avg)", "Recall (doc avg)", "F1 score (doc avg)"]
473
474
475
@cli.command("optimize")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_optimize(project_id, paths, jobs, docs_limit, backend_param):
    """
    Suggest subjects for documents, testing multiple limits and thresholds.
    \f
    This command will use different limit (maximum number of subjects) and
    score threshold values when assigning subjects to each document given by
    ``PATHS`` and compare the results against the gold standard subjects in the
    documents. The output is a list of parameter combinations and their scores.
    From the output, you can determine the optimum limit and threshold
    parameters depending on which measure you want to target.
    """
    project = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, project)
    filter_params = cli_util.generate_filter_params(FILTER_BATCH_MAX_LIMIT)

    # Lazy imports grouped here; annif.eval used to be imported twice in this
    # function - once is enough.
    import annif.eval
    from annif.suggestion import SuggestionResults

    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )

    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    # Suggest once with the loosest filter; stricter (limit, threshold)
    # combinations are applied afterwards without re-running the backend.
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry,
        [project_id],
        backend_params,
        limit=FILTER_BATCH_MAX_LIMIT,
        threshold=0.0,
    )

    ndocs = 0
    suggestion_batches = []
    subject_set_batches = []
    with pool_class(jobs) as pool:
        for suggestion_batch, subject_sets in pool.imap_unordered(
            psmap.suggest_batch, corpus.doc_batches
        ):
            ndocs += len(suggestion_batch[project_id])
            suggestion_batches.append(suggestion_batch[project_id])
            subject_set_batches.append(subject_sets)

    orig_suggestion_results = SuggestionResults(suggestion_batches)

    click.echo("\t".join(("Limit", "Thresh.", "Prec.", "Rec.", "F1")))

    best_scores = collections.defaultdict(float)
    best_params = {}

    template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"
    for limit, threshold in filter_params:
        # Evaluate each (limit, threshold) combination on the cached results.
        eval_batch = annif.eval.EvaluationBatch(project.subjects)
        filtered_results = orig_suggestion_results.filter(limit, threshold)
        for batch, subject_sets in zip(filtered_results.batches, subject_set_batches):
            eval_batch.evaluate_many(batch, subject_sets)
        results = eval_batch.results(metrics=OPTIMIZE_METRICS)
        for metric, score in results.items():
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = (limit, threshold)
        click.echo(
            template.format(
                limit,
                threshold,
                results["Precision (doc avg)"],
                results["Recall (doc avg)"],
                results["F1 score (doc avg)"],
            )
        )

    click.echo()
    template2 = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    for metric in OPTIMIZE_METRICS:
        click.echo(
            template2.format(
                metric,
                best_scores[metric],
                best_params[metric][0],
                best_params[metric][1],
            )
        )
    click.echo("Documents evaluated:\t{}".format(ndocs))
571
572
573
@cli.command("hyperopt")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--trials", "-T", default=10, help="Number of trials")
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel runs (0 means all CPUs)"
)
@click.option(
    "--metric", "-m", default="NDCG", help="Metric to optimize (default: NDCG)"
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file path to write trial results as CSV.
    File directory must exist, existing file will be overwritten.""",
)
@cli_util.docs_limit_option
@cli_util.common_options
def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric, results_file):
    """
    Optimize the hyperparameters of a project using validation documents from
    ``PATHS``. Not supported by all backends. Output is a list of trial results
    and a report of the best performing parameters.
    """
    proj = cli_util.get_project(project_id)
    corpus = cli_util.open_documents(paths, proj.subjects, proj.vocab_lang, docs_limit)
    click.echo(f"Looking for optimal hyperparameters using {trials} trials")
    # The backend returns a recommendation with the best score and the
    # corresponding configuration lines.
    rec = proj.hyperopt(corpus, trials, jobs, metric, results_file)
    click.echo(f"Got best {metric} score {rec.score:.4f} with:")
    click.echo("---")
    for line in rec.lines:
        click.echo(line)
    click.echo("---")
609
610
611
@cli.command("upload")
@click.argument("project_ids_pattern", shell_complete=cli_util.complete_param)
@click.argument("repo_id")
@click.option(
    "--token",
    help="""Authentication token, obtained from the Hugging Face Hub.
    Will default to the stored token.""",
)
@click.option(
    "--revision",
    help="""An optional git revision to commit from. Defaults to the head of the "main"
    branch.""",
)
@click.option(
    "--commit-message",
    help="""The summary / title / first line of the generated commit.""",
)
@click.option(
    "--modelcard/--no-modelcard",
    default=True,
    help="Update or create a Model Card with upload.",
)
@cli_util.common_options
def run_upload(
    project_ids_pattern, repo_id, token, revision, commit_message, modelcard
):
    """
    Upload selected projects and their vocabularies to a Hugging Face Hub repository.
    \f
    This command zips the project directories and vocabularies of the projects
    that match the given `project_ids_pattern` to archive files, and uploads the
    archives along with the project configurations to the specified Hugging Face
    Hub repository. An authentication token and commit message can be given with
    options. If the README.md does not exist in the repository it is
    created with default contents and metadata of the uploaded projects, if it exists,
    its metadata are updated as necessary.
    """
    from huggingface_hub import HfApi
    from huggingface_hub.utils import HfHubHTTPError, HFValidationError

    projects = hfh_util.get_matching_projects(project_ids_pattern)
    click.echo(f"Uploading project(s): {', '.join([p.project_id for p in projects])}")

    if commit_message is None:
        commit_message = f"Upload project(s) {project_ids_pattern} with Annif"

    fobjs, operations = [], []
    try:
        fobjs, operations = hfh_util.prepare_commits(projects, repo_id, token)
        HfApi().create_commit(
            repo_id=repo_id,
            operations=operations,
            commit_message=commit_message,
            revision=revision,
            token=token,
        )
    except (HfHubHTTPError, HFValidationError) as err:
        raise OperationFailedException(str(err))
    else:
        # Update the model card only after a successful commit.
        if modelcard:
            hfh_util.upsert_modelcard(repo_id, projects, token, revision)
    finally:
        # Close the archive file objects opened by prepare_commits.
        for fobj in fobjs:
            fobj.close()
679
680
681
@cli.command("download")
@click.argument("project_ids_pattern")
@click.argument("repo_id")
@click.option(
    "--token",
    help="""Authentication token, obtained from the Hugging Face Hub.
    Will default to the stored token.""",
)
@click.option(
    "--revision",
    help="""
    An optional Git revision id which can be a branch name, a tag, or a commit
    hash.
    """,
)
@click.option(
    "--force",
    "-f",
    default=False,
    is_flag=True,
    help="Replace an existing project/vocabulary/config with the downloaded one",
)
@click.option(
    "--trust-repo",
    default=False,
    is_flag=True,
    help="Allow download from the repository even when it has no entries in the cache",
)
@cli_util.common_options
def run_download(project_ids_pattern, repo_id, token, revision, force, trust_repo):
    """
    Download selected projects and their vocabularies from a Hugging Face Hub
    repository.
    \f
    This command downloads the project and vocabulary archives and the
    configuration files of the projects that match the given
    `project_ids_pattern` from the specified Hugging Face Hub repository and
    unzips the archives to `data/` directory and places the configuration files
    to `projects.d/` directory. An authentication token and revision can be given with
    options. If the repository hasn’t been used for downloads previously
    (i.e., it doesn’t appear in the Hugging Face Hub cache on local system), the
    `--trust-repo` option needs to be used.
    """

    hfh_util.check_is_download_allowed(trust_repo, repo_id)

    project_ids = hfh_util.get_matching_project_ids_from_hf_hub(
        project_ids_pattern, repo_id, token, revision
    )
    click.echo(f"Downloading project(s): {', '.join(project_ids)}")

    # Fetch each project archive and its config; collect the vocabularies the
    # configs refer to so each one is downloaded only once afterwards.
    vocab_ids = set()
    for project_id in project_ids:
        archive_path = hfh_util.download_from_hf_hub(
            f"projects/{project_id}.zip", repo_id, token, revision
        )
        hfh_util.unzip_archive(archive_path, force)
        config_path = hfh_util.download_from_hf_hub(
            f"{project_id}.cfg", repo_id, token, revision
        )
        vocab_ids.add(hfh_util.get_vocab_id_from_config(config_path))
        hfh_util.copy_project_config(config_path, force)

    for vocab_id in vocab_ids:
        archive_path = hfh_util.download_from_hf_hub(
            f"vocabs/{vocab_id}.zip", repo_id, token, revision
        )
        hfh_util.unzip_archive(archive_path, force)
749
750
751
@cli.command("completion")
@click.option("--bash", "shell", flag_value="bash")
@click.option("--zsh", "shell", flag_value="zsh")
@click.option("--fish", "shell", flag_value="fish")
def run_completion(shell):
    """Generate the script for tab-key autocompletion for the given shell. To enable the
    completion support in your current bash terminal session run\n
        source <(annif completion --bash)

    To enable the completion support in all new sessions first add the completion script
    in your home directory:\n
        annif completion --bash > ~/.annif-complete.bash

    Then make the script to be automatically sourced for new terminal sessions by adding
    the following to your ~/.bashrc file (or in some alternative startup file)\n
        source ~/.annif-complete.bash
    """

    if shell is None:
        raise click.UsageError("Shell not given, try --bash, --zsh or --fish")

    # Ask click's completion machinery (via the _ANNIF_COMPLETE hook) to emit
    # the shell-specific script; the context manager closes the pipe.
    with os.popen(f"_ANNIF_COMPLETE={shell}_source annif") as pipe:
        script = pipe.read()
    version = importlib.metadata.version("annif")
    click.echo(f"# Generated by Annif {version}")
    click.echo(script)
775
776
777
@cli.command("detect-language")
@click.argument("languages")
@click.argument(
    "paths", type=click.Path(dir_okay=False, exists=True, allow_dash=True), nargs=-1
)
def run_detect_language(languages, paths):
    """
    Detect the language of a single text document from standard input or for one or more
    document file(s) given its/their path(s).
    """

    langs = tuple(languages.split(","))

    def show_detected(text, languages):
        # detect_language raises ValueError for bad language codes;
        # surface it as a CLI usage error.
        try:
            proportions = detect_language(text, languages)
        except ValueError as e:
            raise click.UsageError(e)
        for lang, score in proportions.items():
            label = "?" if lang == "unk" else lang
            click.echo(f"{label}\t{score:.04f}")

    # No paths (or a single "-") means the document comes from stdin.
    read_stdin = not paths or (len(paths) == 1 and paths[0] == "-")
    if read_stdin:
        show_detected(sys.stdin.read(), langs)
    else:
        doclist = cli_util.open_text_documents(paths, docs_limit=None)
        for doc, path in zip(doclist.documents, paths):
            click.echo(f"Detected languages for {path}")
            show_detected(doc.text, langs)
808
809
810
# Allow invoking the CLI by executing this module directly.
if __name__ == "__main__":
    cli()
812