annif.cli.run_eval()   C
last analyzed

Complexity

Conditions 9

Size

Total Lines 108
Code Lines 74

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 9
eloc 74
nop 10
dl 0
loc 108
rs 5.5175
c 0
b 0
f 0

How to fix   Long Method    Many Parameters   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method: move a cohesive part of the body into its own, well-named method.

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different, data.

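There are several approaches to avoid long parameter lists: group related parameters into a single parameter object, pass a whole object instead of several of its fields, or let the method obtain a value itself instead of receiving it as a parameter.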

1
"""Definitions for command-line (Click) commands for invoking Annif
2
operations and printing the results to console."""
3
4
import collections
5
import importlib
6
import json
7
import os.path
8
import re
9
import sys
10
11
import click
12
import click_log
13
from flask.cli import FlaskGroup
14
15
import annif
16
import annif.parallel
17
import annif.project
18
import annif.registry
19
from annif import cli_util, hfh_util
20
from annif.corpus import Document, DocumentDirectory
21
from annif.exception import (
22
    NotInitializedException,
23
    NotSupportedException,
24
    OperationFailedException,
25
)
26
from annif.project import Access
27
from annif.simplemma_util import detect_language
28
from annif.util import metric_code, suggestion_to_dict
29
30
# Package-level logger, configured through click_log.
logger = annif.logger
click_log.basic_config(logger)

# Application factory handed to the Flask command group below.
create_app = annif.create_flask_app

# Root command group: no default Flask commands and no built-in version
# option; a version option that prints only the version string is added
# explicitly instead.
cli = click.version_option(message="%(version)s")(
    FlaskGroup(
        create_app=create_app,
        add_default_commands=False,
        add_version_option=False,
    )
)

# Drop the group-level options named "env_file" and "app".
cli.params = [
    param for param in cli.params if param.name not in ("env_file", "app")
]
39
40
41
@cli.command("list-projects")
@cli_util.common_options
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_list_projects():
    """
    List available projects.
    \f
    Show a list of currently defined projects. Projects are defined in a
    configuration file, normally called ``projects.cfg``. See `Project
    configuration
    <https://github.com/NatLibFi/Annif/wiki/Project-configuration>`_
    for details.
    """

    headings = (
        "Project ID",
        "Project Name",
        "Vocabulary ID",
        "Language",
        "Trained",
        "Modification time",
    )

    # One row of already-stringified cells per project, including private ones.
    rows = []
    for project in annif.registry.get_projects(min_access=Access.private).values():
        rows.append(
            (
                project.project_id,
                project.name,
                # Projects without a vocabulary spec show a placeholder.
                project.vocab.vocab_id if project.vocab_spec else "-",
                project.language,
                str(project.is_trained),
                cli_util.format_datetime(project.modification_time),
            )
        )

    # The template is built from the headings and all rows together.
    row_template = cli_util.make_list_template(headings, *rows)
    header_line = row_template.format(*headings)
    click.echo(header_line)
    click.echo("-" * len(header_line))
    for cells in rows:
        click.echo(row_template.format(*cells))
80
81
82
@cli.command("show-project")
@cli_util.project_id
@cli_util.common_options
def run_show_project(project_id):
    """
    Show information about a project.
    """

    project = cli_util.get_project(project_id)

    def summary_lines():
        # A generator keeps evaluation lazy: each attribute is read only
        # right before its own line is printed, one after the other.
        yield f"Project ID:        {project.project_id}"
        yield f"Project Name:      {project.name}"
        yield f"Language:          {project.language}"
        yield f"Vocabulary:        {project.vocab.vocab_id}"
        yield f"Vocab language:    {project.vocab_lang}"
        yield f"Access:            {project.access.name}"
        yield f"Backend:           {project.backend.name}"
        yield f"Trained:           {project.is_trained}"
        yield (
            f"Modification time: {cli_util.format_datetime(project.modification_time)}"
        )

    for line in summary_lines():
        click.echo(line)
100
101
102
@cli.command("clear")
@cli_util.project_id
@cli_util.common_options
def run_clear_project(project_id):
    """
    Initialize the project to its original, untrained state.
    """
    # Look the project up by id and remove its model data.
    cli_util.get_project(project_id).remove_model_data()
111
112
113
@cli.command("list-vocabs")
@cli_util.common_options
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_list_vocabs():
    """
    List available vocabularies.
    """

    headings = ("Vocabulary ID", "Languages", "Size", "Loaded")

    rows = []
    for vocabulary in annif.registry.get_vocabs(min_access=Access.private).values():
        try:
            details = (
                ",".join(sorted(vocabulary.languages)),
                str(len(vocabulary)),
                str(True),
            )
        except NotInitializedException:
            # Listed with placeholders and marked as not loaded.
            details = ("-", "-", str(False))
        rows.append((vocabulary.vocab_id, *details))

    row_template = cli_util.make_list_template(headings, *rows)
    header_line = row_template.format(*headings)
    click.echo(header_line)
    click.echo("-" * len(header_line))
    for cells in rows:
        click.echo(row_template.format(*cells))
141
142
143
@cli.command("load-vocab")
@click.argument("vocab_id", shell_complete=cli_util.complete_param)
@click.argument("vocab_file", type=click.Path(exists=True, dir_okay=False))
@click.option("--language", "-L", help="Language of TSV vocabulary file")
@click.option(
    "--force",
    "-f",
    default=False,
    is_flag=True,
    help="Replace existing vocabulary completely instead of updating it",
)
@cli_util.common_options
def run_load_vocab(vocab_id, language, force, vocab_file):
    """
    Load a vocabulary from a subject file.
    """
    # vocab_id:   id of the vocabulary to load into.
    # language:   language of the file; only required for TSV input.
    # force:      passed through to load_vocabulary() as ``force``.
    # vocab_file: path of the file to load (must exist, not a directory).
    #
    # Exits with status 1 when a TSV file is given without --language.
    vocab = cli_util.get_vocab(vocab_id)

    # ``vocab_file`` stays the plain path; the wrapper object gets its own
    # name so that every progress message shows the path rather than the
    # wrapper object, in all three branches alike.
    if annif.vocab.VocabFileSKOS.is_rdf_file(vocab_file):
        # SKOS/RDF file supported by rdflib
        click.echo(f"Loading vocabulary from SKOS file {vocab_file}...")
        source = annif.vocab.VocabFileSKOS(vocab_file)
    elif annif.vocab.VocabFileCSV.is_csv_file(vocab_file):
        # CSV file
        click.echo(f"Loading vocabulary from CSV file {vocab_file}...")
        source = annif.vocab.VocabFileCSV(vocab_file)
    else:
        # probably a TSV file - we need to know its language
        if not language:
            click.echo(
                "Please use --language option to set the language of a TSV vocabulary.",
                err=True,
            )
            sys.exit(1)
        click.echo(f"Loading vocabulary from TSV file {vocab_file}...")
        source = annif.vocab.VocabFileTSV(vocab_file, language)
    vocab.load_vocabulary(source, force=force)
179
180
181
@cli.command("train")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--cached/--no-cached",
    "-c/-C",
    default=False,
    help="Reuse preprocessed training data from previous run",
)
@click.option(
    "--jobs",
    "-j",
    default=0,
    help="Number of parallel jobs (0 means choose automatically)",
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_train(project_id, paths, cached, docs_limit, jobs, backend_param):
    """
    Train a project on a collection of documents.
    \f
    This will train the project using the documents from ``PATHS`` (directories
    or possibly gzipped TSV files) in a single batch operation. If ``--cached``
    is set, preprocessed training data from the previous run is reused instead
    of documents input; see `Reusing preprocessed training data
    <https://github.com/NatLibFi/Annif/wiki/
    Reusing-preprocessed-training-data>`_.
    """
    project = cli_util.get_project(project_id)
    params = cli_util.parse_backend_params(backend_param, project)

    # --cached and explicit corpus paths are mutually exclusive.
    if cached and paths:
        raise click.UsageError(
            "Corpus paths cannot be given when using --cached option."
        )

    # The literal string "cached" is passed in place of a corpus when the
    # preprocessed data of the previous run is to be reused.
    training_data = (
        "cached"
        if cached
        else cli_util.open_documents(
            paths, project.subjects, project.vocab_lang, docs_limit
        )
    )
    project.train(training_data, params, jobs)
223
224
225
@cli.command("learn")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_learn(project_id, paths, docs_limit, backend_param):
    """
    Further train an existing project on a collection of documents.
    \f
    Similar to the ``train`` command. This will continue training an already
    trained project using the documents given by ``PATHS`` in a single batch
    operation. Not supported by all backends.
    """
    project = cli_util.get_project(project_id)
    # Backend parameters are parsed before the corpus is opened.
    params = cli_util.parse_backend_params(backend_param, project)
    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )
    project.learn(corpus, params)
245
246
247
@cli.command("suggest")
@cli_util.project_id
@click.argument(
    "paths", type=click.Path(dir_okay=False, exists=True, allow_dash=True), nargs=-1
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@cli_util.docs_limit_option
@cli_util.backend_param_option
@click.option(
    "--metadata",
    "-D",
    multiple=True,
    help="Additional metadata for a document read from standard input. "
    + "Syntax: `-D <field>=<value>`.",
)
@cli_util.common_options
def run_suggest(
    project_id, paths, limit, threshold, language, backend_param, metadata, docs_limit
):
    """
    Suggest subjects for a single document from standard input (optionally
    with metadata) or for one or more document file(s) given its/their
    path(s).
    \f
    This will read a text document from standard input and suggest subjects for
    it, or if given path(s) to file(s), suggest subjects for it/them.
    """
    project = cli_util.get_project(project_id)

    # Label language: explicit option first, project vocabulary language next.
    lang = language or project.vocab_lang
    if lang not in project.vocab.languages:
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
    params = cli_util.parse_backend_params(backend_param, project)

    # No paths at all, or the single path "-", both mean standard input.
    from_stdin = not paths or (len(paths) == 1 and paths[0] == "-")

    if from_stdin:
        document = Document(
            text=sys.stdin.read(), metadata=cli_util.parse_metadata(metadata)
        )
        hits = project.suggest([document], params).filter(limit, threshold)[0]
        cli_util.show_hits(hits, project, lang)
        return

    corpus = cli_util.open_text_documents(paths, docs_limit)
    filtered = project.suggest_corpus(corpus, params).filter(limit, threshold)
    # Results are paired with the given paths positionally.
    for hits, path in zip(filtered, paths):
        click.echo(f"Suggestions for {path}")
        cli_util.show_hits(hits, project, lang)
298
299
300
@cli.command("index")
@cli_util.project_id
@click.argument("directory", type=click.Path(exists=True, file_okay=False))
@click.option(
    "--suffix", "-s", default=".annif", help="File name suffix for result files"
)
@click.option(
    "--force/--no-force",
    "-f/-F",
    default=False,
    help="Force overwriting of existing result files",
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@cli_util.backend_param_option
@cli_util.common_options
def run_index(
    project_id, directory, suffix, force, limit, threshold, language, backend_param
):
    """
    Index a directory with documents, suggesting subjects for each document.
    Write the results in TSV files with the given suffix (``.annif`` by
    default).
    """
    # Raises click.BadParameter when the requested label language is not
    # among the languages of the project vocabulary.
    project = cli_util.get_project(project_id)
    lang = language or project.vocab_lang
    if lang not in project.vocab.languages:
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
    backend_params = cli_util.parse_backend_params(backend_param, project)

    corpus = DocumentDirectory(directory, require_subjects=False)
    results = project.suggest_corpus(corpus, backend_params).filter(limit, threshold)

    # Matches a trailing ".txt" or ".json" extension of a document path.
    doc_extension = re.compile(r"\.(txt|json)$")

    for doc, suggestions in zip(corpus.documents, results):
        # The suffix comes from the command line, so it is inserted through a
        # callable: a plain replacement string would have its backslashes
        # interpreted as escapes or group references by re.sub.
        subjectfilename = doc_extension.sub(lambda _match: suffix, doc.file_path)
        if os.path.exists(subjectfilename) and not force:
            click.echo(
                "Not overwriting {} (use --force to override)".format(subjectfilename)
            )
            continue
        with open(subjectfilename, "w", encoding="utf-8") as subjfile:
            cli_util.show_hits(suggestions, project, lang, file=subjfile)
343
344
345
@cli.command("index-file")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True, dir_okay=False), nargs=-1)
@click.option(
    "--suffix", "-s", default=".annif.jsonl", help="File name suffix for result files"
)
@click.option(
    "--force/--no-force",
    "-f/-F",
    default=False,
    help="Force overwriting of existing result files",
)
@click.option(
    "--include-doc/--no-include-doc",
    "-i/-I",
    default=True,
    help="Include input documents in output",
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@cli_util.backend_param_option
@cli_util.common_options
def run_index_file(
    project_id,
    paths,
    suffix,
    force,
    include_doc,
    limit,
    threshold,
    language,
    backend_param,
):
    """
    Index a file with documents, suggesting subjects for each document.
    Write the results in JSONL files with the given suffix (``.annif.jsonl`` by
    default).
    """
    project = cli_util.get_project(project_id)

    # Label language: explicit option first, project vocabulary language next.
    lang = language or project.vocab_lang
    if lang not in project.vocab.languages:
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
    params = cli_util.parse_backend_params(backend_param, project)

    for path in paths:
        corpus = cli_util.open_doc_path(
            path, project.subjects, lang, require_subjects=False
        )
        filtered = project.suggest_corpus(corpus, params).filter(limit, threshold)

        # Strip the last extension (and a trailing ".gz", if any) from the
        # input path, then append the result suffix.
        stem = re.sub(r"(\.[^.]+)?(\.gz)?$", "", path)
        outfilename = stem + suffix
        if not force and os.path.exists(outfilename):
            click.echo(
                "Not overwriting {} (use --force to override)".format(outfilename)
            )
            continue

        # One JSON object per line, one line per document.
        with open(outfilename, "w", encoding="utf-8") as outfile:
            for doc, suggestions in zip(corpus.documents, filtered):
                record = {}
                if include_doc:
                    record = doc.as_dict(project.subjects, lang)
                record["results"] = [
                    suggestion_to_dict(hit, project.subjects, lang)
                    for hit in suggestions
                ]
                outfile.write(json.dumps(record) + "\n")
413
414
415
def _score_format_spec(score):
    """Return the format spec used to print one metric value."""
    if isinstance(score, int):
        return "d"
    if isinstance(score, float):
        return ".04f"
    # Any other type is printed with its default formatting instead of
    # leaving the spec undefined or reusing the one of the previous metric.
    return ""


def _announce_results_file(results_file):
    """Check that the results file can be written to and tell the user."""
    try:
        # The option is declared with lazy=True; writing an empty string is
        # presumably what triggers the actual open, so problems with the
        # file surface here rather than after the evaluation has run.
        print("", end="", file=results_file)
        click.echo(
            "Writing per subject evaluation results to {!s}".format(
                results_file.name
            )
        )
    except Exception as e:
        raise NotSupportedException(
            "cannot open results-file for writing: " + str(e)
        ) from e


@cli.command("eval")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option(
    "--metric",
    "-m",
    default=[],
    multiple=True,
    help="Metric to calculate (default: all)",
)
@click.option(
    "--metrics-file",
    "-M",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write evaluation metrics in JSON format.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write non-aggregated results per subject.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_eval(
    project_id,
    paths,
    limit,
    threshold,
    docs_limit,
    metric,
    metrics_file,
    results_file,
    jobs,
    backend_param,
):
    """
    Suggest subjects for documents and evaluate the results by comparing
    against a gold standard.
    \f
    With this command the documents from ``PATHS`` (directories or possibly
    gzipped TSV files) will be assigned subject suggestions and then
    statistical measures are calculated that quantify how well the suggested
    subjects match the gold-standard subjects in the documents.

    Normally the output is the list of the metrics calculated across documents.
    If ``--results-file <FILENAME>`` option is given, the metrics are
    calculated separately for each subject, and written to the given file.
    """
    # Raises NotSupportedException when the results file cannot be written.

    project = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, project)

    # Function-local import, kept ahead of every other use of ``annif`` in
    # this function because it makes ``annif`` a local name here.
    import annif.eval

    eval_batch = annif.eval.EvaluationBatch(project.subjects)

    if results_file:
        _announce_results_file(results_file)

    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )
    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry, [project_id], backend_params, limit, threshold
    )

    # Batches come back in whatever order the workers finish them; each one
    # is fed into the same evaluation batch.
    with pool_class(jobs) as pool:
        for hit_sets, subject_sets in pool.imap_unordered(
            psmap.suggest_batch, corpus.doc_batches
        ):
            eval_batch.evaluate_many(hit_sets[project_id], subject_sets)

    template = "{0:<30}\t{1:{fmt_spec}}"
    metrics = eval_batch.results(
        metrics=metric, results_file=results_file, language=project.vocab_lang
    )
    # A distinct loop name keeps the ``metric`` option value intact.
    for metric_name, score in metrics.items():
        click.echo(
            template.format(
                metric_name + ":", score, fmt_spec=_score_format_spec(score)
            )
        )
    if metrics_file:
        json.dump(
            {metric_code(mname): val for mname, val in metrics.items()},
            metrics_file,
            indent=2,
        )
524
525
526
@cli.command("run")
@click.option("--host", type=str, default="127.0.0.1")
@click.option("--port", type=int, default=5000)
@click.option("--log-level")
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_app(**kwargs):
    """
    Run Annif in server mode for development.
    \f
    The server is for development purposes only.
    """
    # Forward only the options that actually carry a value.
    given = {}
    for option, value in kwargs.items():
        if value is not None:
            given[option] = value
    annif.create_cx_app().run(**given)
540
541
542
# Highest limit (maximum number of subjects) tried by the optimize command;
# also the limit used when collecting the unfiltered suggestions.
FILTER_BATCH_MAX_LIMIT = 15
# Metrics reported by the optimize command, in output column order.
OPTIMIZE_METRICS = ["Precision (doc avg)", "Recall (doc avg)", "F1 score (doc avg)"]


@cli.command("optimize")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_optimize(project_id, paths, jobs, docs_limit, backend_param):
    """
    Suggest subjects for documents, testing multiple limits and thresholds.
    \f
    This command will use different limit (maximum number of subjects) and
    score threshold values when assigning subjects to each document given by
    ``PATHS`` and compare the results against the gold standard subjects in the
    documents. The output is a list of parameter combinations and their scores.
    From the output, you can determine the optimum limit and threshold
    parameters depending on which measure you want to target.
    """
    project = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, project)
    filter_params = cli_util.generate_filter_params(FILTER_BATCH_MAX_LIMIT)

    # Function-local import; one import is enough for the whole function.
    # It makes ``annif`` a local name, so it stays ahead of every other use
    # of ``annif`` below.
    import annif.eval

    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )

    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    # Suggestions are collected once, with the widest limit and no threshold;
    # every (limit, threshold) combination is then applied as a filter.
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry,
        [project_id],
        backend_params,
        limit=FILTER_BATCH_MAX_LIMIT,
        threshold=0.0,
    )

    ndocs = 0
    suggestion_batches = []
    subject_set_batches = []
    with pool_class(jobs) as pool:
        for suggestion_batch, subject_sets in pool.imap_unordered(
            psmap.suggest_batch, corpus.doc_batches
        ):
            # Both lists are appended to in the same iteration, so
            # suggestions and gold-standard subjects stay aligned by index.
            ndocs += len(suggestion_batch[project_id])
            suggestion_batches.append(suggestion_batch[project_id])
            subject_set_batches.append(subject_sets)

    from annif.suggestion import SuggestionResults

    orig_suggestion_results = SuggestionResults(suggestion_batches)

    click.echo("\t".join(("Limit", "Thresh.", "Prec.", "Rec.", "F1")))

    # Best score seen so far per metric (starting from 0.0) and the
    # (limit, threshold) pair that produced it.
    best_scores = collections.defaultdict(float)
    best_params = {}

    template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"

    for limit, threshold in filter_params:
        eval_batch = annif.eval.EvaluationBatch(project.subjects)
        filtered_results = orig_suggestion_results.filter(limit, threshold)
        for batch, subject_sets in zip(filtered_results.batches, subject_set_batches):
            eval_batch.evaluate_many(batch, subject_sets)
        results = eval_batch.results(metrics=OPTIMIZE_METRICS)
        for metric, score in results.items():
            # ">=": on a tie the combination tried later wins.
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = (limit, threshold)
        click.echo(
            template.format(
                limit,
                threshold,
                results["Precision (doc avg)"],
                results["Recall (doc avg)"],
                results["F1 score (doc avg)"],
            )
        )

    click.echo()
    template2 = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    for metric in OPTIMIZE_METRICS:
        click.echo(
            template2.format(
                metric,
                best_scores[metric],
                best_params[metric][0],
                best_params[metric][1],
            )
        )
    click.echo("Documents evaluated:\t{}".format(ndocs))
642
643
644
@cli.command("hyperopt")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--trials", "-T", default=10, help="Number of trials")
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel runs (0 means all CPUs)"
)
@click.option(
    "--metric", "-m", default="NDCG", help="Metric to optimize (default: NDCG)"
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file path to write trial results as TSV.
    File directory must exist, existing file will be overwritten.""",
)
@cli_util.docs_limit_option
@cli_util.common_options
def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric, results_file):
    """
    Optimize the hyperparameters of a project using validation documents from
    ``PATHS``. Not supported by all backends. Output is a list of trial results
    and a report of the best performing parameters.
    """
    project = cli_util.get_project(project_id)
    validation_docs = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )

    click.echo(f"Looking for optimal hyperparameters using {trials} trials")
    recommendation = project.hyperopt(
        validation_docs, trials, jobs, metric, results_file
    )

    # Report: headline with the best score, then the recommendation lines
    # framed by separator lines.
    separator = "---"
    click.echo(f"Got best {metric} score {recommendation.score:.4f} with:")
    click.echo(separator)
    for report_line in recommendation.lines:
        click.echo(report_line)
    click.echo(separator)
680
681
682
@cli.command("upload")
@click.argument("project_ids_pattern", shell_complete=cli_util.complete_param)
@click.argument("repo_id")
@click.option(
    "--token",
    help="""Authentication token, obtained from the Hugging Face Hub.
    Will default to the stored token.""",
)
@click.option(
    "--revision",
    help="""An optional git revision to commit from. Defaults to the head of the "main"
    branch.""",
)
@click.option(
    "--commit-message",
    help="""The summary / title / first line of the generated commit.""",
)
@click.option(
    "--modelcard/--no-modelcard",
    default=True,
    help="Update or create a Model Card with upload.",
)
@cli_util.common_options
def run_upload(
    project_ids_pattern, repo_id, token, revision, commit_message, modelcard
):
    """
    Upload selected projects and their vocabularies to a Hugging Face Hub repository.
    \f
    This command zips the project directories and vocabularies of the projects
    that match the given `project_ids_pattern` to archive files, and uploads the
    archives along with the project configurations to the specified Hugging Face
    Hub repository. An authentication token and commit message can be given with
    options. If the README.md does not exist in the repository it is
    created with default contents and metadata of the uploaded projects, if it exists,
    its metadata are updated as necessary.
    """
    # huggingface_hub is imported only when the command actually runs.
    from huggingface_hub import HfApi
    from huggingface_hub.utils import HfHubHTTPError, HFValidationError

    matched = hfh_util.get_matching_projects(project_ids_pattern)
    click.echo(f"Uploading project(s): {', '.join([p.project_id for p in matched])}")

    # Fall back to a generated commit message when none was given.
    if commit_message is None:
        commit_message = f"Upload project(s) {project_ids_pattern} with Annif"

    # Initialised up front so the cleanup in ``finally`` works even when
    # preparing the commit fails.
    open_files = []
    commit_operations = []
    try:
        open_files, commit_operations = hfh_util.prepare_commits(
            matched, repo_id, token
        )
        hub = HfApi()
        hub.create_commit(
            repo_id=repo_id,
            operations=commit_operations,
            commit_message=commit_message,
            revision=revision,
            token=token,
        )
    except (HfHubHTTPError, HFValidationError) as err:
        raise OperationFailedException(str(err))
    else:
        # The model card is touched only after a successful commit.
        if modelcard:
            hfh_util.upsert_modelcard(repo_id, matched, token, revision)
    finally:
        for handle in open_files:
            handle.close()
750
751
752
@cli.command("download")
@click.argument("project_ids_pattern")
@click.argument("repo_id")
@click.option(
    "--token",
    help="""Authentication token, obtained from the Hugging Face Hub.
    Will default to the stored token.""",
)
@click.option(
    "--revision",
    help="""
    An optional Git revision id which can be a branch name, a tag, or a commit
    hash.
    """,
)
@click.option(
    "--force",
    "-f",
    default=False,
    is_flag=True,
    help="Replace an existing project/vocabulary/config with the downloaded one",
)
@click.option(
    "--trust-repo",
    default=False,
    is_flag=True,
    help="Allow download from the repository even when it has no entries in the cache",
)
@cli_util.common_options
def run_download(project_ids_pattern, repo_id, token, revision, force, trust_repo):
    """
    Download selected projects and their vocabularies from a Hugging Face Hub
    repository.
    \f
    This command downloads the project and vocabulary archives and the
    configuration files of the projects that match the given
    `project_ids_pattern` from the specified Hugging Face Hub repository and
    unzips the archives to `data/` directory and places the configuration files
    to `projects.d/` directory. An authentication token and revision can be given with
    options. If the repository hasn’t been used for downloads previously
    (i.e., it doesn’t appear in the Hugging Face Hub cache on local system), the
    `--trust-repo` option needs to be used.
    """

    hfh_util.check_is_download_allowed(trust_repo, repo_id)

    def fetch(repo_path):
        # Every download uses the same repository, token and revision.
        return hfh_util.download_from_hf_hub(repo_path, repo_id, token, revision)

    matched_ids = hfh_util.get_matching_project_ids_from_hf_hub(
        project_ids_pattern, repo_id, token, revision
    )
    click.echo(f"Downloading project(s): {', '.join(matched_ids)}")

    # Vocabulary ids are gathered in a set, so a vocabulary shared by several
    # projects is downloaded only once.
    needed_vocabs = set()
    for matched_id in matched_ids:
        project_archive = fetch(f"projects/{matched_id}.zip")
        hfh_util.unzip_archive(project_archive, force)
        project_config = fetch(f"{matched_id}.cfg")
        needed_vocabs.add(hfh_util.get_vocab_id_from_config(project_config))
        hfh_util.copy_project_config(project_config, force)

    for vocab_id in needed_vocabs:
        hfh_util.unzip_archive(fetch(f"vocabs/{vocab_id}.zip"), force)
820
821
822
@cli.command("completion")
@click.option("--bash", "shell", flag_value="bash")
@click.option("--zsh", "shell", flag_value="zsh")
@click.option("--fish", "shell", flag_value="fish")
def run_completion(shell):
    """Generate the script for tab-key autocompletion for the given shell. To enable the
    completion support in your current bash terminal session run\n
        source <(annif completion --bash)

    To enable the completion support in all new sessions first add the completion script
    in your home directory:\n
        annif completion --bash > ~/.annif-complete.bash

    Then make the script to be automatically sourced for new terminal sessions by adding
    the following to your ~/.bashrc file (or in some alternative startup file)\n
        source ~/.annif-complete.bash
    """
    # ``import importlib`` alone does not guarantee that the ``metadata``
    # submodule is loaded, so it is imported explicitly here.
    import importlib.metadata

    # ``shell`` is one of the flag values "bash", "zsh" or "fish", or None
    # when no flag was given.
    if shell is None:
        raise click.UsageError("Shell not given, try --bash, --zsh or --fish")

    # Run ``annif`` with the _ANNIF_COMPLETE variable set to "<shell>_source"
    # and capture what it prints; the pipe is closed once it has been read.
    with os.popen(f"_ANNIF_COMPLETE={shell}_source annif") as pipe:
        script = pipe.read()
    click.echo(f"# Generated by Annif {importlib.metadata.version('annif')}")
    click.echo(script)
846
847
848
@cli.command("detect-language")
@click.argument("languages")
@click.argument(
    "paths", type=click.Path(dir_okay=False, exists=True, allow_dash=True), nargs=-1
)
def run_detect_language(languages, paths):
    """
    Detect the language of a single text document from standard input or for one or more
    document file(s) given its/their path(s).
    """

    # The candidate languages arrive as one comma-separated argument.
    candidates = tuple(languages.split(","))

    def report(text):
        # A ValueError from the detector is turned into a usage error.
        try:
            proportions = detect_language(text, candidates)
        except ValueError as e:
            raise click.UsageError(e)
        for code, share in proportions.items():
            # The "unk" entry is displayed as "?".
            label = "?" if code == "unk" else code
            click.echo(f"{label}\t{share:.04f}")

    # No paths at all, or the single path "-", both mean standard input.
    from_stdin = not paths or (len(paths) == 1 and paths[0] == "-")
    if from_stdin:
        report(sys.stdin.read())
        return

    doclist = cli_util.open_text_documents(paths, docs_limit=None)
    for doc, path in zip(doclist.documents, paths):
        click.echo(f"Detected languages for {path}")
        report(doc.text)
879
880
881
# Invoke the command group when this module is executed as a script.
if __name__ == "__main__":
    cli()
883