annif.cli.run_index_file()   C
last analyzed

Complexity

Conditions 8

Size

Total Lines 87
Code Lines 68

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 8
eloc 68
nop 11
dl 0
loc 87
rs 6.1806
c 0
b 0
f 0

How to fix   Long Method    Many Parameters   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method — moving a cohesive part of the method body into a new, well-named method — and Replace Temp with Query.

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

There are several approaches to avoid long parameter lists, such as introducing a parameter object that groups related arguments, or passing a whole object instead of several of its individual attributes.

1
"""Definitions for command-line (Click) commands for invoking Annif
2
operations and printing the results to console."""
3
4
import collections
5
import importlib
6
import json
7
import os.path
8
import re
9
import sys
10
11
import click
12
import click_log
13
from flask.cli import FlaskGroup
14
15
import annif
16
import annif.parallel
17
import annif.project
18
import annif.registry
19
from annif import cli_util, hfh_util
20
from annif.corpus import Document, DocumentDirectory
21
from annif.exception import (
22
    NotInitializedException,
23
    NotSupportedException,
24
    OperationFailedException,
25
)
26
from annif.project import Access
27
from annif.simplemma_util import detect_language
28
from annif.util import metric_code, suggestion_to_dict
29
30
# Module-level logger shared by all CLI commands; click_log wires it to the
# verbosity options used by the commands below.
logger = annif.logger
click_log.basic_config(logger)

# Build the CLI as a Flask application group so commands can run inside the
# Flask app context, but without Flask's default commands or version option.
create_app = annif.create_flask_app
cli = FlaskGroup(
    create_app=create_app, add_default_commands=False, add_version_option=False
)
# Report Annif's own version string instead of Flask's default message.
cli = click.version_option(message="%(version)s")(cli)
# Hide FlaskGroup's --env-file and --app options, which are not relevant here.
cli.params = [opt for opt in cli.params if opt.name not in ("env_file", "app")]
39
40
41
@cli.command("list-projects")
42
@cli_util.common_options
43
@click_log.simple_verbosity_option(logger, default="ERROR")
44
def run_list_projects():
45
    """
46
    List available projects.
47
    \f
48
    Show a list of currently defined projects. Projects are defined in a
49
    configuration file, normally called ``projects.cfg``. See `Project
50
    configuration
51
    <https://github.com/NatLibFi/Annif/wiki/Project-configuration>`_
52
    for details.
53
    """
54
55
    column_headings = (
56
        "Project ID",
57
        "Project Name",
58
        "Vocabulary ID",
59
        "Language",
60
        "Trained",
61
        "Modification time",
62
    )
63
    table = [
64
        (
65
            proj.project_id,
66
            proj.name,
67
            proj.vocab.vocab_id if proj.vocab_spec else "-",
68
            proj.language,
69
            str(proj.is_trained),
70
            cli_util.format_datetime(proj.modification_time),
71
        )
72
        for proj in annif.registry.get_projects(min_access=Access.private).values()
73
    ]
74
    template = cli_util.make_list_template(column_headings, *table)
75
    header = template.format(*column_headings)
76
    click.echo(header)
77
    click.echo("-" * len(header))
78
    for row in table:
79
        click.echo(template.format(*row))
80
81
82
@cli.command("show-project")
83
@cli_util.project_id
84
@cli_util.common_options
85
def run_show_project(project_id):
86
    """
87
    Show information about a project.
88
    """
89
90
    proj = cli_util.get_project(project_id)
91
    click.echo(f"Project ID:        {proj.project_id}")
92
    click.echo(f"Project Name:      {proj.name}")
93
    click.echo(f"Language:          {proj.language}")
94
    click.echo(f"Vocabulary:        {proj.vocab.vocab_id}")
95
    click.echo(f"Vocab language:    {proj.vocab_lang}")
96
    click.echo(f"Access:            {proj.access.name}")
97
    click.echo(f"Backend:           {proj.backend.name}")
98
    click.echo(f"Trained:           {proj.is_trained}")
99
    click.echo(f"Modification time: {cli_util.format_datetime(proj.modification_time)}")
100
101
102
@cli.command("clear")
103
@cli_util.project_id
104
@cli_util.common_options
105
def run_clear_project(project_id):
106
    """
107
    Initialize the project to its original, untrained state.
108
    """
109
    proj = cli_util.get_project(project_id)
110
    proj.remove_model_data()
111
112
113
@cli.command("list-vocabs")
114
@cli_util.common_options
115
@click_log.simple_verbosity_option(logger, default="ERROR")
116
def run_list_vocabs():
117
    """
118
    List available vocabularies.
119
    """
120
121
    column_headings = ("Vocabulary ID", "Languages", "Size", "Loaded")
122
    table = []
123
    for vocab in annif.registry.get_vocabs(min_access=Access.private).values():
124
        try:
125
            languages = ",".join(sorted(vocab.languages))
126
            size = len(vocab)
127
            loaded = True
128
        except NotInitializedException:
129
            languages = "-"
130
            size = "-"
131
            loaded = False
132
        row = (vocab.vocab_id, languages, str(size), str(loaded))
133
        table.append(row)
134
135
    template = cli_util.make_list_template(column_headings, *table)
136
    header = template.format(*column_headings)
137
    click.echo(header)
138
    click.echo("-" * len(header))
139
    for row in table:
140
        click.echo(template.format(*row))
141
142
143
@cli.command("load-vocab")
144
@click.argument("vocab_id", shell_complete=cli_util.complete_param)
145
@click.argument("vocab_file", type=click.Path(exists=True, dir_okay=False))
146
@click.option("--language", "-L", help="Language of TSV vocabulary file")
147
@click.option(
148
    "--force",
149
    "-f",
150
    default=False,
151
    is_flag=True,
152
    help="Replace existing vocabulary completely instead of updating it",
153
)
154
@cli_util.common_options
155
def run_load_vocab(vocab_id, language, force, vocab_file):
156
    """
157
    Load a vocabulary from a subject file.
158
    """
159
    vocab = cli_util.get_vocab(vocab_id)
160
    if annif.vocab.VocabFileSKOS.is_rdf_file(vocab_file):
161
        # SKOS/RDF file supported by rdflib
162
        vocab_file = annif.vocab.VocabFileSKOS(vocab_file)
163
        click.echo(f"Loading vocabulary from SKOS file {vocab_file}...")
164
    elif annif.vocab.VocabFileCSV.is_csv_file(vocab_file):
165
        # CSV file
166
        vocab_file = annif.vocab.VocabFileCSV(vocab_file)
167
        click.echo(f"Loading vocabulary from CSV file {vocab_file}...")
168
    else:
169
        # probably a TSV file - we need to know its language
170
        if not language:
171
            click.echo(
172
                "Please use --language option to set the language of a TSV vocabulary.",
173
                err=True,
174
            )
175
            sys.exit(1)
176
        click.echo(f"Loading vocabulary from TSV file {vocab_file}...")
177
        vocab_file = annif.vocab.VocabFileTSV(vocab_file, language)
178
    vocab.load_vocabulary(vocab_file, force=force)
179
180
181
@cli.command("train")
182
@cli_util.project_id
183
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
184
@click.option(
185
    "--cached/--no-cached",
186
    "-c/-C",
187
    default=False,
188
    help="Reuse preprocessed training data from previous run",
189
)
190
@click.option(
191
    "--jobs",
192
    "-j",
193
    default=0,
194
    help="Number of parallel jobs (0 means choose automatically)",
195
)
196
@cli_util.docs_limit_option
197
@cli_util.backend_param_option
198
@cli_util.common_options
199
def run_train(project_id, paths, cached, docs_limit, jobs, backend_param):
200
    """
201
    Train a project on a collection of documents.
202
    \f
203
    This will train the project using the documents from ``PATHS`` (directories
204
    or possibly gzipped TSV files) in a single batch operation. If ``--cached``
205
    is set, preprocessed training data from the previous run is reused instead
206
    of documents input; see `Reusing preprocessed training data
207
    <https://github.com/NatLibFi/Annif/wiki/
208
    Reusing-preprocessed-training-data>`_.
209
    """
210
    proj = cli_util.get_project(project_id)
211
    backend_params = cli_util.parse_backend_params(backend_param, proj)
212
    if cached:
213
        if len(paths) > 0:
214
            raise click.UsageError(
215
                "Corpus paths cannot be given when using --cached option."
216
            )
217
        documents = "cached"
218
    else:
219
        documents = cli_util.open_documents(
220
            paths, proj.subjects, proj.vocab_lang, docs_limit
221
        )
222
    proj.train(documents, backend_params, jobs)
223
224
225
@cli.command("learn")
226
@cli_util.project_id
227
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
228
@cli_util.docs_limit_option
229
@cli_util.backend_param_option
230
@cli_util.common_options
231
def run_learn(project_id, paths, docs_limit, backend_param):
232
    """
233
    Further train an existing project on a collection of documents.
234
    \f
235
    Similar to the ``train`` command. This will continue training an already
236
    trained project using the documents given by ``PATHS`` in a single batch
237
    operation. Not supported by all backends.
238
    """
239
    proj = cli_util.get_project(project_id)
240
    backend_params = cli_util.parse_backend_params(backend_param, proj)
241
    documents = cli_util.open_documents(
242
        paths, proj.subjects, proj.vocab_lang, docs_limit
243
    )
244
    proj.learn(documents, backend_params)
245
246
247
@cli.command("suggest")
248
@cli_util.project_id
249
@click.argument(
250
    "paths", type=click.Path(dir_okay=False, exists=True, allow_dash=True), nargs=-1
251
)
252
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
253
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
254
@click.option("--language", "-L", help="Language of subject labels")
255
@cli_util.docs_limit_option
256
@cli_util.backend_param_option
257
@click.option(
258
    "--metadata",
259
    "-D",
260
    multiple=True,
261
    help="Additional metadata for a document read from standard input. "
262
    + "Syntax: `-D <field>=<value>`.",
263
)
264
@cli_util.common_options
265
def run_suggest(
266
    project_id, paths, limit, threshold, language, backend_param, metadata, docs_limit
267
):
268
    """
269
    Suggest subjects for a single document from standard input (optionally
270
    with metadata) or for one or more document file(s) given its/their
271
    path(s).
272
    \f
273
    This will read a text document from standard input and suggest subjects for
274
    it, or if given path(s) to file(s), suggest subjects for it/them.
275
    """
276
    project = cli_util.get_project(project_id)
277
    lang = language or project.vocab_lang
278
    if lang not in project.vocab.languages:
279
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
280
    backend_params = cli_util.parse_backend_params(backend_param, project)
281
282
    if paths and not (len(paths) == 1 and paths[0] == "-"):
283
        docs = cli_util.open_text_documents(paths, docs_limit)
284
        results = project.suggest_corpus(docs, backend_params).filter(limit, threshold)
285
        for (
286
            suggestions,
287
            path,
288
        ) in zip(results, paths):
289
            click.echo(f"Suggestions for {path}")
290
            cli_util.show_hits(suggestions, project, lang)
291
    else:
292
        text = sys.stdin.read()
293
        doc_metadata = cli_util.parse_metadata(metadata)
294
        suggestions = project.suggest(
295
            [Document(text=text, metadata=doc_metadata)], backend_params
296
        ).filter(limit, threshold)[0]
297
        cli_util.show_hits(suggestions, project, lang)
298
299
300
@cli.command("index")
301
@cli_util.project_id
302
@click.argument("directory", type=click.Path(exists=True, file_okay=False))
303
@click.option(
304
    "--suffix", "-s", default=".annif", help="File name suffix for result files"
305
)
306
@click.option(
307
    "--force/--no-force",
308
    "-f/-F",
309
    default=False,
310
    help="Force overwriting of existing result files",
311
)
312
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
313
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
314
@click.option("--language", "-L", help="Language of subject labels")
315
@cli_util.backend_param_option
316
@cli_util.common_options
317
def run_index(
318
    project_id, directory, suffix, force, limit, threshold, language, backend_param
319
):
320
    """
321
    Index a directory with documents, suggesting subjects for each document.
322
    Write the results in TSV files with the given suffix (``.annif`` by
323
    default).
324
    """
325
    project = cli_util.get_project(project_id)
326
    lang = language or project.vocab_lang
327
    if lang not in project.vocab.languages:
328
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
329
    backend_params = cli_util.parse_backend_params(backend_param, project)
330
331
    corpus = DocumentDirectory(directory, require_subjects=False)
332
    results = project.suggest_corpus(corpus, backend_params).filter(limit, threshold)
333
334
    for doc, suggestions in zip(corpus.documents, results):
335
        subjectfilename = re.sub(r"\.(txt|json)$", suffix, doc.file_path)
336
        if os.path.exists(subjectfilename) and not force:
337
            click.echo(
338
                "Not overwriting {} (use --force to override)".format(subjectfilename)
339
            )
340
            continue
341
        with open(subjectfilename, "w", encoding="utf-8") as subjfile:
342
            cli_util.show_hits(suggestions, project, lang, file=subjfile)
343
344
345
@cli.command("index-file")
346
@cli_util.project_id
347
@click.argument("paths", type=click.Path(exists=True, dir_okay=False), nargs=-1)
348
@click.option(
349
    "--suffix", "-s", default=".annif.jsonl", help="File name suffix for result files"
350
)
351
@click.option(
352
    "--gzip/--no-gzip",
353
    "-z/-Z",
354
    "use_gzip",
355
    default=False,
356
    help="Gzip compress result files",
357
)
358
@click.option(
359
    "--output",
360
    "-O",
361
    type=click.Path(dir_okay=False, writable=True),
362
    default=None,
363
    help="Redirect all output to the given file (or '-' for stdout)",
364
)
365
@click.option(
366
    "--force/--no-force",
367
    "-f/-F",
368
    default=False,
369
    help="Force overwriting of existing result files",
370
)
371
@click.option(
372
    "--include-doc/--no-include-doc",
373
    "-i/-I",
374
    default=True,
375
    help="Include input documents in output",
376
)
377
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
378
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
379
@click.option("--language", "-L", help="Language of subject labels")
380
@cli_util.backend_param_option
381
@cli_util.common_options
382
def run_index_file(
383
    project_id,
384
    paths,
385
    suffix,
386
    use_gzip,
387
    output,
388
    force,
389
    include_doc,
390
    limit,
391
    threshold,
392
    language,
393
    backend_param,
394
):
395
    """
396
    Index file(s) containing documents, suggesting subjects for each document.
397
    Write the results in JSONL files with the given suffix (``.annif.jsonl`` by
398
    default).
399
    """
400
401
    project = cli_util.get_project(project_id)
402
    lang = language or project.vocab_lang
403
    if lang not in project.vocab.languages:
404
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
405
    backend_params = cli_util.parse_backend_params(backend_param, project)
406
407
    for path in paths:
408
        corpus = cli_util.open_doc_path(
409
            path, project.subjects, lang, require_subjects=False
410
        )
411
        results = project.suggest_corpus(corpus, backend_params).filter(
412
            limit, threshold
413
        )
414
415
        stream_cm = cli_util.get_output_stream(path, suffix, output, use_gzip, force)
416
        if stream_cm is None:
417
            continue
418
419
        with stream_cm as stream:
420
            for doc, suggestions in zip(corpus.documents, results):
421
                if include_doc:
422
                    output_data = doc.as_dict(project.subjects, lang)
423
                else:
424
                    output_data = {}
425
                    if doc.document_id:
426
                        output_data["document_id"] = doc.document_id
427
                output_data["results"] = [
428
                    suggestion_to_dict(suggestion, project.subjects, lang)
429
                    for suggestion in suggestions
430
                ]
431
                stream.write(json.dumps(output_data) + "\n")
432
433
434
@cli.command("eval")
435
@cli_util.project_id
436
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
437
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
438
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
439
@click.option(
440
    "--metric",
441
    "-m",
442
    default=[],
443
    multiple=True,
444
    help="Metric to calculate (default: all)",
445
)
446
@click.option(
447
    "--metrics-file",
448
    "-M",
449
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
450
    help="""Specify file in order to write evaluation metrics in JSON format.
451
    File directory must exist, existing file will be overwritten.""",
452
)
453
@click.option(
454
    "--results-file",
455
    "-r",
456
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
457
    help="""Specify file in order to write non-aggregated results per subject.
458
    File directory must exist, existing file will be overwritten.""",
459
)
460
@click.option(
461
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
462
)
463
@cli_util.docs_limit_option
464
@cli_util.backend_param_option
465
@cli_util.common_options
466
def run_eval(
467
    project_id,
468
    paths,
469
    limit,
470
    threshold,
471
    docs_limit,
472
    metric,
473
    metrics_file,
474
    results_file,
475
    jobs,
476
    backend_param,
477
):
478
    """
479
    Suggest subjects for documents and evaluate the results by comparing
480
    against a gold standard.
481
    \f
482
    With this command the documents from ``PATHS`` (directories or possibly
483
    gzipped TSV files) will be assigned subject suggestions and then
484
    statistical measures are calculated that quantify how well the suggested
485
    subjects match the gold-standard subjects in the documents.
486
487
    Normally the output is the list of the metrics calculated across documents.
488
    If ``--results-file <FILENAME>`` option is given, the metrics are
489
    calculated separately for each subject, and written to the given file.
490
    """
491
492
    project = cli_util.get_project(project_id)
493
    backend_params = cli_util.parse_backend_params(backend_param, project)
494
495
    import annif.eval
496
497
    eval_batch = annif.eval.EvaluationBatch(project.subjects)
498
499
    if results_file:
500
        try:
501
            print("", end="", file=results_file)
502
            click.echo(
503
                "Writing per subject evaluation results to {!s}".format(
504
                    results_file.name
505
                )
506
            )
507
        except Exception as e:
508
            raise NotSupportedException(
509
                "cannot open results-file for writing: " + str(e)
510
            )
511
    corpus = cli_util.open_documents(
512
        paths, project.subjects, project.vocab_lang, docs_limit
513
    )
514
    jobs, pool_class = annif.parallel.get_pool(jobs)
515
516
    project.initialize(parallel=True)
517
    psmap = annif.parallel.ProjectSuggestMap(
518
        project.registry, [project_id], backend_params, limit, threshold
519
    )
520
521
    with pool_class(jobs) as pool:
522
        for hit_sets, subject_sets in pool.imap_unordered(
523
            psmap.suggest_batch, corpus.doc_batches
524
        ):
525
            eval_batch.evaluate_many(hit_sets[project_id], subject_sets)
526
527
    template = "{0:<30}\t{1:{fmt_spec}}"
528
    metrics = eval_batch.results(
529
        metrics=metric, results_file=results_file, language=project.vocab_lang
530
    )
531
    for metric, score in metrics.items():
532
        if isinstance(score, int):
533
            fmt_spec = "d"
534
        elif isinstance(score, float):
535
            fmt_spec = ".04f"
536
        click.echo(template.format(metric + ":", score, fmt_spec=fmt_spec))
537
    if metrics_file:
538
        json.dump(
539
            {metric_code(mname): val for mname, val in metrics.items()},
540
            metrics_file,
541
            indent=2,
542
        )
543
544
545
@cli.command("run")
546
@click.option("--host", type=str, default="127.0.0.1")
547
@click.option("--port", type=int, default=5000)
548
@click.option("--log-level")
549
@click_log.simple_verbosity_option(logger, default="ERROR")
550
def run_app(**kwargs):
551
    """
552
    Run Annif in server mode for development.
553
    \f
554
    The server is for development purposes only.
555
    """
556
    kwargs = {k: v for k, v in kwargs.items() if v is not None}
557
    cxapp = annif.create_cx_app()
558
    cxapp.run(**kwargs)
559
560
561
FILTER_BATCH_MAX_LIMIT = 15
562
OPTIMIZE_METRICS = ["Precision (doc avg)", "Recall (doc avg)", "F1 score (doc avg)"]
563
564
565
@cli.command("optimize")
566
@cli_util.project_id
567
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
568
@click.option(
569
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
570
)
571
@cli_util.docs_limit_option
572
@cli_util.backend_param_option
573
@cli_util.common_options
574
def run_optimize(project_id, paths, jobs, docs_limit, backend_param):
575
    """
576
    Suggest subjects for documents, testing multiple limits and thresholds.
577
    \f
578
    This command will use different limit (maximum number of subjects) and
579
    score threshold values when assigning subjects to each document given by
580
    ``PATHS`` and compare the results against the gold standard subjects in the
581
    documents. The output is a list of parameter combinations and their scores.
582
    From the output, you can determine the optimum limit and threshold
583
    parameters depending on which measure you want to target.
584
    """
585
    project = cli_util.get_project(project_id)
586
    backend_params = cli_util.parse_backend_params(backend_param, project)
587
    filter_params = cli_util.generate_filter_params(FILTER_BATCH_MAX_LIMIT)
588
589
    import annif.eval
590
591
    corpus = cli_util.open_documents(
592
        paths, project.subjects, project.vocab_lang, docs_limit
593
    )
594
595
    jobs, pool_class = annif.parallel.get_pool(jobs)
596
597
    project.initialize(parallel=True)
598
    psmap = annif.parallel.ProjectSuggestMap(
599
        project.registry,
600
        [project_id],
601
        backend_params,
602
        limit=FILTER_BATCH_MAX_LIMIT,
603
        threshold=0.0,
604
    )
605
606
    ndocs = 0
607
    suggestion_batches = []
608
    subject_set_batches = []
609
    with pool_class(jobs) as pool:
610
        for suggestion_batch, subject_sets in pool.imap_unordered(
611
            psmap.suggest_batch, corpus.doc_batches
612
        ):
613
            ndocs += len(suggestion_batch[project_id])
614
            suggestion_batches.append(suggestion_batch[project_id])
615
            subject_set_batches.append(subject_sets)
616
617
    from annif.suggestion import SuggestionResults
618
619
    orig_suggestion_results = SuggestionResults(suggestion_batches)
620
621
    click.echo("\t".join(("Limit", "Thresh.", "Prec.", "Rec.", "F1")))
622
623
    best_scores = collections.defaultdict(float)
624
    best_params = {}
625
626
    template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"
627
    import annif.eval
628
629
    for limit, threshold in filter_params:
630
        eval_batch = annif.eval.EvaluationBatch(project.subjects)
631
        filtered_results = orig_suggestion_results.filter(limit, threshold)
632
        for batch, subject_sets in zip(filtered_results.batches, subject_set_batches):
633
            eval_batch.evaluate_many(batch, subject_sets)
634
        results = eval_batch.results(metrics=OPTIMIZE_METRICS)
635
        for metric, score in results.items():
636
            if score >= best_scores[metric]:
637
                best_scores[metric] = score
638
                best_params[metric] = (limit, threshold)
639
        click.echo(
640
            template.format(
641
                limit,
642
                threshold,
643
                results["Precision (doc avg)"],
644
                results["Recall (doc avg)"],
645
                results["F1 score (doc avg)"],
646
            )
647
        )
648
649
    click.echo()
650
    template2 = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
651
    for metric in OPTIMIZE_METRICS:
652
        click.echo(
653
            template2.format(
654
                metric,
655
                best_scores[metric],
656
                best_params[metric][0],
657
                best_params[metric][1],
658
            )
659
        )
660
    click.echo("Documents evaluated:\t{}".format(ndocs))
661
662
663
@cli.command("hyperopt")
664
@cli_util.project_id
665
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
666
@click.option("--trials", "-T", default=10, help="Number of trials")
667
@click.option(
668
    "--jobs", "-j", default=1, help="Number of parallel runs (0 means all CPUs)"
669
)
670
@click.option(
671
    "--metric", "-m", default="NDCG", help="Metric to optimize (default: NDCG)"
672
)
673
@click.option(
674
    "--results-file",
675
    "-r",
676
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
677
    help="""Specify file path to write trial results as TSV.
678
    File directory must exist, existing file will be overwritten.""",
679
)
680
@cli_util.docs_limit_option
681
@cli_util.common_options
682
def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric, results_file):
683
    """
684
    Optimize the hyperparameters of a project using validation documents from
685
    ``PATHS``. Not supported by all backends. Output is a list of trial results
686
    and a report of the best performing parameters.
687
    """
688
    proj = cli_util.get_project(project_id)
689
    documents = cli_util.open_documents(
690
        paths, proj.subjects, proj.vocab_lang, docs_limit
691
    )
692
    click.echo(f"Looking for optimal hyperparameters using {trials} trials")
693
    rec = proj.hyperopt(documents, trials, jobs, metric, results_file)
694
    click.echo(f"Got best {metric} score {rec.score:.4f} with:")
695
    click.echo("---")
696
    for line in rec.lines:
697
        click.echo(line)
698
    click.echo("---")
699
700
701
@cli.command("upload")
702
@click.argument("project_ids_pattern", shell_complete=cli_util.complete_param)
703
@click.argument("repo_id")
704
@click.option(
705
    "--token",
706
    help="""Authentication token, obtained from the Hugging Face Hub.
707
    Will default to the stored token.""",
708
)
709
@click.option(
710
    "--revision",
711
    help="""An optional git revision to commit from. Defaults to the head of the "main"
712
    branch.""",
713
)
714
@click.option(
715
    "--commit-message",
716
    help="""The summary / title / first line of the generated commit.""",
717
)
718
@click.option(
719
    "--modelcard/--no-modelcard",
720
    default=True,
721
    help="Update or create a Model Card with upload.",
722
)
723
@cli_util.common_options
724
def run_upload(
725
    project_ids_pattern, repo_id, token, revision, commit_message, modelcard
726
):
727
    """
728
    Upload selected projects and their vocabularies to a Hugging Face Hub repository.
729
    \f
730
    This command zips the project directories and vocabularies of the projects
731
    that match the given `project_ids_pattern` to archive files, and uploads the
732
    archives along with the project configurations to the specified Hugging Face
733
    Hub repository. An authentication token and commit message can be given with
734
    options. If the README.md does not exist in the repository it is
735
    created with default contents and metadata of the uploaded projects, if it exists,
736
    its metadata are updated as necessary.
737
    """
738
    from huggingface_hub import HfApi
739
    from huggingface_hub.utils import HfHubHTTPError, HFValidationError
740
741
    projects = hfh_util.get_matching_projects(project_ids_pattern)
742
    click.echo(f"Uploading project(s): {', '.join([p.project_id for p in projects])}")
743
744
    commit_message = (
745
        commit_message
746
        if commit_message is not None
747
        else f"Upload project(s) {project_ids_pattern} with Annif"
748
    )
749
750
    fobjs, operations = [], []
751
    try:
752
        fobjs, operations = hfh_util.prepare_commits(projects, repo_id, token)
753
        api = HfApi()
754
        api.create_commit(
755
            repo_id=repo_id,
756
            operations=operations,
757
            commit_message=commit_message,
758
            revision=revision,
759
            token=token,
760
        )
761
    except (HfHubHTTPError, HFValidationError) as err:
762
        raise OperationFailedException(str(err))
763
    else:
764
        if modelcard:
765
            hfh_util.upsert_modelcard(repo_id, projects, token, revision)
766
    finally:
767
        for fobj in fobjs:
768
            fobj.close()
769
770
771
@cli.command("download")
772
@click.argument("project_ids_pattern")
773
@click.argument("repo_id")
774
@click.option(
775
    "--token",
776
    help="""Authentication token, obtained from the Hugging Face Hub.
777
    Will default to the stored token.""",
778
)
779
@click.option(
780
    "--revision",
781
    help="""
782
    An optional Git revision id which can be a branch name, a tag, or a commit
783
    hash.
784
    """,
785
)
786
@click.option(
787
    "--force",
788
    "-f",
789
    default=False,
790
    is_flag=True,
791
    help="Replace an existing project/vocabulary/config with the downloaded one",
792
)
793
@click.option(
794
    "--trust-repo",
795
    default=False,
796
    is_flag=True,
797
    help="Allow download from the repository even when it has no entries in the cache",
798
)
799
@cli_util.common_options
800
def run_download(project_ids_pattern, repo_id, token, revision, force, trust_repo):
801
    """
802
    Download selected projects and their vocabularies from a Hugging Face Hub
803
    repository.
804
    \f
805
    This command downloads the project and vocabulary archives and the
806
    configuration files of the projects that match the given
807
    `project_ids_pattern` from the specified Hugging Face Hub repository and
808
    unzips the archives to `data/` directory and places the configuration files
809
    to `projects.d/` directory. An authentication token and revision can be given with
810
    options. If the repository hasn’t been used for downloads previously
811
    (i.e., it doesn’t appear in the Hugging Face Hub cache on local system), the
812
    `--trust-repo` option needs to be used.
813
    """
814
815
    hfh_util.check_is_download_allowed(trust_repo, repo_id)
816
817
    project_ids = hfh_util.get_matching_project_ids_from_hf_hub(
818
        project_ids_pattern, repo_id, token, revision
819
    )
820
    click.echo(f"Downloading project(s): {', '.join(project_ids)}")
821
822
    vocab_ids = set()
823
    for project_id in project_ids:
824
        project_zip_cache_path = hfh_util.download_from_hf_hub(
825
            f"projects/{project_id}.zip", repo_id, token, revision
826
        )
827
        hfh_util.unzip_archive(project_zip_cache_path, force)
828
        config_file_cache_path = hfh_util.download_from_hf_hub(
829
            f"{project_id}.cfg", repo_id, token, revision
830
        )
831
        vocab_ids.add(hfh_util.get_vocab_id_from_config(config_file_cache_path))
832
        hfh_util.copy_project_config(config_file_cache_path, force)
833
834
    for vocab_id in vocab_ids:
835
        vocab_zip_cache_path = hfh_util.download_from_hf_hub(
836
            f"vocabs/{vocab_id}.zip", repo_id, token, revision
837
        )
838
        hfh_util.unzip_archive(vocab_zip_cache_path, force)
839
840
841
@cli.command("completion")
842
@click.option("--bash", "shell", flag_value="bash")
843
@click.option("--zsh", "shell", flag_value="zsh")
844
@click.option("--fish", "shell", flag_value="fish")
845
def run_completion(shell):
846
    """Generate the script for tab-key autocompletion for the given shell. To enable the
847
    completion support in your current bash terminal session run\n
848
        source <(annif completion --bash)
849
850
    To enable the completion support in all new sessions first add the completion script
851
    in your home directory:\n
852
        annif completion --bash > ~/.annif-complete.bash
853
854
    Then make the script to be automatically sourced for new terminal sessions by adding
855
    the following to your ~/.bashrc file (or in some alternative startup file)\n
856
        source ~/.annif-complete.bash
857
    """
858
859
    if shell is None:
860
        raise click.UsageError("Shell not given, try --bash, --zsh or --fish")
861
862
    script = os.popen(f"_ANNIF_COMPLETE={shell}_source annif").read()
863
    click.echo(f"# Generated by Annif {importlib.metadata.version('annif')}")
864
    click.echo(script)
865
866
867
@cli.command("detect-language")
868
@click.argument("languages")
869
@click.argument(
870
    "paths", type=click.Path(dir_okay=False, exists=True, allow_dash=True), nargs=-1
871
)
872
def run_detect_language(languages, paths):
873
    """
874
    Detect the language of a single text document from standard input or for one or more
875
    document file(s) given its/their path(s).
876
    """
877
878
    langs = tuple(languages.split(","))
879
880
    def detect_language_and_show(text, languages):
881
        try:
882
            proportions = detect_language(text, languages)
883
        except ValueError as e:
884
            raise click.UsageError(e)
885
        for lang, score in proportions.items():
886
            if lang == "unk":
887
                lang = "?"
888
            click.echo(f"{lang}\t{score:.04f}")
889
890
    if paths and not (len(paths) == 1 and paths[0] == "-"):
891
        doclist = cli_util.open_text_documents(paths, docs_limit=None)
892
        for doc, path in zip(doclist.documents, paths):
893
            click.echo(f"Detected languages for {path}")
894
            detect_language_and_show(doc.text, langs)
895
    else:
896
        text = sys.stdin.read()
897
        detect_language_and_show(text, langs)
898
899
900
if __name__ == "__main__":
901
    cli()
902