annif.cli.run_optimize()   C
last analyzed

Complexity

Conditions 8

Size

Total Lines 96
Code Lines 66

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 8
eloc 66
nop 5
dl 0
loc 96
rs 6.246
c 0
b 0
f 0

How to fix: Long Method

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
"""Definitions for command-line (Click) commands for invoking Annif
2
operations and printing the results to console."""
3
4
import collections
import importlib
import importlib.metadata
import json
import os.path
import re
import sys

import click
import click_log
from flask.cli import FlaskGroup

import annif
import annif.parallel
import annif.project
import annif.registry
from annif import cli_util, hfh_util
from annif.corpus import Document, DocumentDirectory
from annif.exception import (
    NotInitializedException,
    NotSupportedException,
    OperationFailedException,
)
from annif.project import Access
from annif.simplemma_util import detect_language
from annif.util import metric_code, suggestion_to_dict
29
30
# Shared module-level logger; click_log wires it to Click's verbosity options.
logger = annif.logger
click_log.basic_config(logger)

# Factory used by FlaskGroup to create the Flask app on demand.
create_app = annif.create_flask_app
cli = FlaskGroup(
    create_app=create_app, add_default_commands=False, add_version_option=False
)
cli = click.version_option(message="%(version)s")(cli)
# Drop Flask's own --env-file/--app options from the group; Annif manages
# its configuration itself.
cli.params = [opt for opt in cli.params if opt.name not in ("env_file", "app")]
39
40
41
@cli.command("list-projects")
@cli_util.common_options
def run_list_projects():
    """
    List available projects.
    \f
    Show a list of currently defined projects. Projects are defined in a
    configuration file, normally called ``projects.cfg``. See `Project
    configuration
    <https://github.com/NatLibFi/Annif/wiki/Project-configuration>`_
    for details.
    """

    headings = (
        "Project ID",
        "Project Name",
        "Vocabulary ID",
        "Language",
        "Trained",
        "Modification time",
    )
    # Collect one row per project, including private ones.
    rows = []
    for project in annif.registry.get_projects(min_access=Access.private).values():
        rows.append(
            (
                project.project_id,
                project.name,
                project.vocab.vocab_id if project.vocab_spec else "-",
                project.language,
                str(project.is_trained),
                cli_util.format_datetime(project.modification_time),
            )
        )
    # Column widths are derived from both the headings and the row contents.
    template = cli_util.make_list_template(headings, *rows)
    header_line = template.format(*headings)
    click.echo(header_line)
    click.echo("-" * len(header_line))
    for row in rows:
        click.echo(template.format(*row))
79
80
81
@cli.command("show-project")
@cli_util.project_id
@cli_util.common_options
def run_show_project(project_id):
    """
    Show information about a project.
    """

    # Print one aligned "label: value" line per project attribute.
    proj = cli_util.get_project(project_id)
    info_lines = (
        f"Project ID:        {proj.project_id}",
        f"Project Name:      {proj.name}",
        f"Language:          {proj.language}",
        f"Vocabulary:        {proj.vocab.vocab_id}",
        f"Vocab language:    {proj.vocab_lang}",
        f"Access:            {proj.access.name}",
        f"Backend:           {proj.backend.name}",
        f"Trained:           {proj.is_trained}",
        f"Modification time: {cli_util.format_datetime(proj.modification_time)}",
    )
    for line in info_lines:
        click.echo(line)
99
100
101
@cli.command("clear")
@cli_util.project_id
@cli_util.common_options
def run_clear_project(project_id):
    """
    Initialize the project to its original, untrained state.
    """
    # Look up the project and discard its trained model data.
    cli_util.get_project(project_id).remove_model_data()
110
111
112
@cli.command("list-vocabs")
@cli_util.common_options
def run_list_vocabs():
    """
    List available vocabularies.
    """

    headings = ("Vocabulary ID", "Languages", "Size", "Loaded")
    rows = []
    for vocab in annif.registry.get_vocabs(min_access=Access.private).values():
        try:
            langs = ",".join(sorted(vocab.languages))
            size = str(len(vocab))
            loaded = "True"
        except NotInitializedException:
            # Vocabulary is configured but its data has not been loaded yet.
            langs, size, loaded = "-", "-", "False"
        rows.append((vocab.vocab_id, langs, size, loaded))

    template = cli_util.make_list_template(headings, *rows)
    header_line = template.format(*headings)
    click.echo(header_line)
    click.echo("-" * len(header_line))
    for row in rows:
        click.echo(template.format(*row))
139
140
141
@cli.command("load-vocab")
@click.argument("vocab_id", shell_complete=cli_util.complete_param)
@click.argument("vocab_file", type=click.Path(exists=True, dir_okay=False))
@click.option("--language", "-L", help="Language of TSV vocabulary file")
@click.option(
    "--force",
    "-f",
    default=False,
    is_flag=True,
    help="Replace existing vocabulary completely instead of updating it",
)
@cli_util.common_options
def run_load_vocab(vocab_id, language, force, vocab_file):
    """
    Load a vocabulary from a subject file.
    """
    vocab = cli_util.get_vocab(vocab_id)
    # Dispatch on file type: SKOS/RDF and CSV are self-describing, while TSV
    # requires the --language option. NOTE(review): vocab_file is rebound from
    # a path string to a VocabFile* object; in the SKOS and CSV branches the
    # echo happens after the rebind (prints the object), in the TSV branch
    # before it (prints the path) — presumably intentional, confirm.
    if annif.vocab.VocabFileSKOS.is_rdf_file(vocab_file):
        # SKOS/RDF file supported by rdflib
        vocab_file = annif.vocab.VocabFileSKOS(vocab_file)
        click.echo(f"Loading vocabulary from SKOS file {vocab_file}...")
    elif annif.vocab.VocabFileCSV.is_csv_file(vocab_file):
        # CSV file
        vocab_file = annif.vocab.VocabFileCSV(vocab_file)
        click.echo(f"Loading vocabulary from CSV file {vocab_file}...")
    else:
        # probably a TSV file - we need to know its language
        if not language:
            click.echo(
                "Please use --language option to set the language of a TSV vocabulary.",
                err=True,
            )
            sys.exit(1)
        click.echo(f"Loading vocabulary from TSV file {vocab_file}...")
        vocab_file = annif.vocab.VocabFileTSV(vocab_file, language)
    # --force replaces the whole vocabulary instead of updating in place.
    vocab.load_vocabulary(vocab_file, force=force)
177
178
179
@cli.command("train")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--cached/--no-cached",
    "-c/-C",
    default=False,
    help="Reuse preprocessed training data from previous run",
)
@click.option(
    "--jobs",
    "-j",
    default=0,
    help="Number of parallel jobs (0 means choose automatically)",
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_train(project_id, paths, cached, docs_limit, jobs, backend_param):
    """
    Train a project on a collection of documents.
    \f
    This will train the project using the documents from ``PATHS`` (directories
    or possibly gzipped TSV files) in a single batch operation. If ``--cached``
    is set, preprocessed training data from the previous run is reused instead
    of documents input; see `Reusing preprocessed training data
    <https://github.com/NatLibFi/Annif/wiki/
    Reusing-preprocessed-training-data>`_.
    """
    proj = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, proj)
    if not cached:
        documents = cli_util.open_documents(
            paths, proj.subjects, proj.vocab_lang, docs_limit
        )
    else:
        # --cached means "reuse previous data": giving paths too is an error.
        if paths:
            raise click.UsageError(
                "Corpus paths cannot be given when using --cached option."
            )
        documents = "cached"
    proj.train(documents, backend_params, jobs)
221
222
223
@cli.command("learn")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_learn(project_id, paths, docs_limit, backend_param):
    """
    Further train an existing project on a collection of documents.
    \f
    Similar to the ``train`` command. This will continue training an already
    trained project using the documents given by ``PATHS`` in a single batch
    operation. Not supported by all backends.
    """
    # Open the documents and feed them to the project's incremental learning.
    project = cli_util.get_project(project_id)
    params = cli_util.parse_backend_params(backend_param, project)
    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )
    project.learn(corpus, params)
243
244
245
@cli.command("suggest")
@cli_util.project_id
@click.argument(
    "paths", type=click.Path(dir_okay=False, exists=True, allow_dash=True), nargs=-1
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@cli_util.docs_limit_option
@cli_util.backend_param_option
@click.option(
    "--metadata",
    "-D",
    multiple=True,
    help="Additional metadata for a document read from standard input. "
    + "Syntax: `-D <field>=<value>`.",
)
@cli_util.common_options
def run_suggest(
    project_id, paths, limit, threshold, language, backend_param, metadata, docs_limit
):
    """
    Suggest subjects for a single document from standard input (optionally
    with metadata) or for one or more document file(s) given its/their
    path(s).
    \f
    This will read a text document from standard input and suggest subjects for
    it, or if given path(s) to file(s), suggest subjects for it/them.
    """
    project = cli_util.get_project(project_id)
    lang = language if language else project.vocab_lang
    if lang not in project.vocab.languages:
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
    backend_params = cli_util.parse_backend_params(backend_param, project)

    # "-" as the only path (or no paths at all) means: read from stdin.
    use_stdin = not paths or (len(paths) == 1 and paths[0] == "-")
    if use_stdin:
        text = sys.stdin.read()
        doc = Document(text=text, metadata=cli_util.parse_metadata(metadata))
        suggestions = project.suggest([doc], backend_params).filter(limit, threshold)[0]
        cli_util.show_hits(suggestions, project, lang)
    else:
        docs = cli_util.open_text_documents(paths, docs_limit)
        results = project.suggest_corpus(docs, backend_params).filter(limit, threshold)
        for suggestions, path in zip(results, paths):
            click.echo(f"Suggestions for {path}")
            cli_util.show_hits(suggestions, project, lang)
296
297
298
@cli.command("index")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--suffix", "-s", default=".annif", help="File name suffix for result files"
)
@click.option(
    "--force/--no-force",
    "-f/-F",
    default=False,
    help="Force overwriting of existing result files",
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@click.option(
    "--gzip/--no-gzip",
    "-z/-Z",
    "use_gzip",
    default=False,
    help="Gzip compress result files",
)
@click.option(
    "--output",
    "-O",
    type=click.Path(dir_okay=False, writable=True),
    default=None,
    help="Redirect all output to the given file (or '-' for stdout)",
)
@click.option(
    "--include-doc/--no-include-doc",
    "-i/-I",
    default=True,
    help="Include input documents in output",
)
@cli_util.backend_param_option
@cli_util.common_options
def run_index(
    project_id,
    paths,
    suffix,
    force,
    limit,
    threshold,
    language,
    backend_param,
    use_gzip=False,
    output=None,
    include_doc=True,
):
    """
    Index documents from directories or files, suggesting subjects for each document.
    Write the results in TSV files (for directories) or JSONL files (for files) with
    the given suffix (.jsonl suffix will be added to JSONL files).
    """
    project = cli_util.get_project(project_id)
    # Fall back to the vocabulary language when no --language was given.
    lang = language or project.vocab_lang
    if lang not in project.vocab.languages:
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
    backend_params = cli_util.parse_backend_params(backend_param, project)

    # Helper function to process a directory: writes one result file per
    # input document, replacing its .txt/.json extension with the suffix.
    def process_directory(directory):
        corpus = DocumentDirectory(directory, require_subjects=False)
        results = project.suggest_corpus(corpus, backend_params).filter(
            limit, threshold
        )

        for doc, suggestions in zip(corpus.documents, results):
            subjectfilename = re.sub(r"\.(txt|json)$", suffix, doc.file_path)
            # Existing result files are skipped unless --force was given.
            if os.path.exists(subjectfilename) and not force:
                click.echo(
                    "Not overwriting {} (use --force to override)".format(
                        subjectfilename
                    )
                )
                continue
            with open(subjectfilename, "w", encoding="utf-8") as subjfile:
                cli_util.show_hits(suggestions, project, lang, file=subjfile)

    # Helper function to process a file: writes one JSONL record per document
    # to a result file (or to --output / stdout).
    def process_file(path):
        corpus = cli_util.open_doc_path(
            path, project.subjects, lang, require_subjects=False
        )
        results = project.suggest_corpus(corpus, backend_params).filter(
            limit, threshold
        )

        # Make sure the result file name ends with ".jsonl".
        jsonl_suffix = suffix + ".jsonl" if not suffix.endswith(".jsonl") else suffix
        stream_cm = cli_util.get_output_stream(
            path, jsonl_suffix, output, use_gzip, force
        )
        # No writable stream available (e.g. existing file without --force).
        if stream_cm is None:
            return

        with stream_cm as stream:
            for doc, suggestions in zip(corpus.documents, results):
                if include_doc:
                    output_data = doc.as_dict(project.subjects, lang)
                else:
                    # Keep only the document id (when present) in the record.
                    output_data = {}
                    if doc.document_id:
                        output_data["document_id"] = doc.document_id
                output_data["results"] = [
                    suggestion_to_dict(suggestion, project.subjects, lang)
                    for suggestion in suggestions
                ]
                stream.write(json.dumps(output_data) + "\n")

    # Process paths in the order they were given
    for path in paths:
        if os.path.isdir(path):
            process_directory(path)
        elif os.path.isfile(path):
            process_file(path)
414
415
416
@cli.command("eval")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option(
    "--metric",
    "-m",
    default=[],
    multiple=True,
    help="Metric to calculate (default: all)",
)
@click.option(
    "--metrics-file",
    "-M",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write evaluation metrics in JSON format.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write non-aggregated results per subject.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_eval(
    project_id,
    paths,
    limit,
    threshold,
    docs_limit,
    metric,
    metrics_file,
    results_file,
    jobs,
    backend_param,
):
    """
    Suggest subjects for documents and evaluate the results by comparing
    against a gold standard.
    \f
    With this command the documents from ``PATHS`` (directories or possibly
    gzipped TSV files) will be assigned subject suggestions and then
    statistical measures are calculated that quantify how well the suggested
    subjects match the gold-standard subjects in the documents.

    Normally the output is the list of the metrics calculated across documents.
    If ``--results-file <FILENAME>`` option is given, the metrics are
    calculated separately for each subject, and written to the given file.
    """

    project = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, project)

    # Imported lazily: evaluation pulls in heavy dependencies.
    import annif.eval

    eval_batch = annif.eval.EvaluationBatch(project.subjects)

    if results_file:
        try:
            # Verify the lazy file is actually writable before doing any
            # expensive evaluation work.
            print("", end="", file=results_file)
            click.echo(
                "Writing per subject evaluation results to {!s}".format(
                    results_file.name
                )
            )
        except Exception as e:
            # Chain the original error so the root cause stays visible.
            raise NotSupportedException(
                "cannot open results-file for writing: " + str(e)
            ) from e
    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )
    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry, [project_id], backend_params, limit, threshold
    )

    with pool_class(jobs) as pool:
        for hit_sets, subject_sets in pool.imap_unordered(
            psmap.suggest_batch, corpus.doc_batches
        ):
            eval_batch.evaluate_many(hit_sets[project_id], subject_sets)

    template = "{0:<30}\t{1:{fmt_spec}}"
    metrics = eval_batch.results(
        metrics=metric, results_file=results_file, language=project.vocab_lang
    )
    # NOTE: the loop variable is deliberately not named "metric" -- that would
    # shadow the --metric option tuple used above.
    for metric_name, score in metrics.items():
        if isinstance(score, int):
            fmt_spec = "d"
        elif isinstance(score, float):
            fmt_spec = ".04f"
        else:
            # Previously fmt_spec was left unbound for non-numeric scores,
            # causing a NameError; fall back to plain string formatting.
            fmt_spec = ""
        click.echo(template.format(metric_name + ":", score, fmt_spec=fmt_spec))
    if metrics_file:
        json.dump(
            {metric_code(mname): val for mname, val in metrics.items()},
            metrics_file,
            indent=2,
        )
525
526
527
@cli.command("run")
@click.option("--host", type=str, default="127.0.0.1")
@click.option("--port", type=int, default=5000)
@click_log.simple_verbosity_option(logger)
def run_app(**kwargs):
    """
    Run Annif in server mode for development.
    \f
    The server is for development purposes only.
    """
    # Drop unset options so the app's own defaults apply.
    options = {key: val for key, val in kwargs.items() if val is not None}
    annif.create_cx_app().run(**options)
540
541
542
FILTER_BATCH_MAX_LIMIT = 15
543
OPTIMIZE_METRICS = ["Precision (doc avg)", "Recall (doc avg)", "F1 score (doc avg)"]
544
545
546
@cli.command("optimize")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_optimize(project_id, paths, jobs, docs_limit, backend_param):
    """
    Suggest subjects for documents, testing multiple limits and thresholds.
    \f
    This command will use different limit (maximum number of subjects) and
    score threshold values when assigning subjects to each document given by
    ``PATHS`` and compare the results against the gold standard subjects in the
    documents. The output is a list of parameter combinations and their scores.
    From the output, you can determine the optimum limit and threshold
    parameters depending on which measure you want to target.
    """
    project = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, project)
    filter_params = cli_util.generate_filter_params(FILTER_BATCH_MAX_LIMIT)

    # Imported lazily: evaluation pulls in heavy dependencies.
    # (A duplicated "import annif.eval" further down was removed.)
    import annif.eval

    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )

    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry,
        [project_id],
        backend_params,
        limit=FILTER_BATCH_MAX_LIMIT,
        threshold=0.0,
    )

    # Step 1: collect suggestions once with the loosest filter; each
    # limit/threshold combination is later applied to these cached results.
    ndocs = 0
    suggestion_batches = []
    subject_set_batches = []
    with pool_class(jobs) as pool:
        for suggestion_batch, subject_sets in pool.imap_unordered(
            psmap.suggest_batch, corpus.doc_batches
        ):
            ndocs += len(suggestion_batch[project_id])
            suggestion_batches.append(suggestion_batch[project_id])
            subject_set_batches.append(subject_sets)

    from annif.suggestion import SuggestionResults

    orig_suggestion_results = SuggestionResults(suggestion_batches)

    # Step 2: evaluate every limit/threshold combination and track the best
    # score seen for each metric.
    click.echo("\t".join(("Limit", "Thresh.", "Prec.", "Rec.", "F1")))

    best_scores = collections.defaultdict(float)
    best_params = {}

    template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"
    for limit, threshold in filter_params:
        eval_batch = annif.eval.EvaluationBatch(project.subjects)
        filtered_results = orig_suggestion_results.filter(limit, threshold)
        for batch, subject_sets in zip(filtered_results.batches, subject_set_batches):
            eval_batch.evaluate_many(batch, subject_sets)
        results = eval_batch.results(metrics=OPTIMIZE_METRICS)
        for metric, score in results.items():
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = (limit, threshold)
        click.echo(
            template.format(
                limit,
                threshold,
                results["Precision (doc avg)"],
                results["Recall (doc avg)"],
                results["F1 score (doc avg)"],
            )
        )

    # Step 3: report the winning parameters per metric.
    click.echo()
    template2 = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    for metric in OPTIMIZE_METRICS:
        click.echo(
            template2.format(
                metric,
                best_scores[metric],
                best_params[metric][0],
                best_params[metric][1],
            )
        )
    click.echo("Documents evaluated:\t{}".format(ndocs))
642
643
644
@cli.command("hyperopt")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--trials", "-T", default=10, help="Number of trials")
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel runs (0 means all CPUs)"
)
@click.option(
    "--metric", "-m", default="NDCG", help="Metric to optimize (default: NDCG)"
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file path to write trial results as TSV.
    File directory must exist, existing file will be overwritten.""",
)
@cli_util.docs_limit_option
@cli_util.common_options
def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric, results_file):
    """
    Optimize the hyperparameters of a project using validation documents from
    ``PATHS``. Not supported by all backends. Output is a list of trial results
    and a report of the best performing parameters.
    """
    # Run the backend's hyperparameter search and print the best result.
    project = cli_util.get_project(project_id)
    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )
    click.echo(f"Looking for optimal hyperparameters using {trials} trials")
    best = project.hyperopt(corpus, trials, jobs, metric, results_file)
    click.echo(f"Got best {metric} score {best.score:.4f} with:")
    click.echo("---")
    for line in best.lines:
        click.echo(line)
    click.echo("---")
680
681
682
@cli.command("upload")
@click.argument("project_ids_pattern", shell_complete=cli_util.complete_param)
@click.argument("repo_id")
@click.option(
    "--token",
    help="""Authentication token, obtained from the Hugging Face Hub.
    Will default to the stored token.""",
)
@click.option(
    "--revision",
    help="""An optional git revision to commit from. Defaults to the head of the "main"
    branch.""",
)
@click.option(
    "--commit-message",
    help="""The summary / title / first line of the generated commit.""",
)
@click.option(
    "--modelcard/--no-modelcard",
    default=True,
    help="Update or create a Model Card with upload.",
)
@cli_util.common_options
def run_upload(
    project_ids_pattern, repo_id, token, revision, commit_message, modelcard
):
    """
    Upload selected projects and their vocabularies to a Hugging Face Hub repository.
    \f
    This command zips the project directories and vocabularies of the projects
    that match the given `project_ids_pattern` to archive files, and uploads the
    archives along with the project configurations to the specified Hugging Face
    Hub repository. An authentication token and commit message can be given with
    options. If the README.md does not exist in the repository it is
    created with default contents and metadata of the uploaded projects, if it exists,
    its metadata are updated as necessary.
    """
    # Imported lazily so huggingface_hub is only required for this command.
    from huggingface_hub import HfApi
    from huggingface_hub.utils import HfHubHTTPError, HFValidationError

    projects = hfh_util.get_matching_projects(project_ids_pattern)
    click.echo(f"Uploading project(s): {', '.join([p.project_id for p in projects])}")

    # Default commit message mentions the pattern that selected the projects.
    commit_message = (
        commit_message
        if commit_message is not None
        else f"Upload project(s) {project_ids_pattern} with Annif"
    )

    fobjs, operations = [], []
    try:
        # prepare_commits opens file objects for the archives; they must stay
        # open until the commit is created, hence the finally-close below.
        fobjs, operations = hfh_util.prepare_commits(projects, repo_id, token)
        api = HfApi()
        api.create_commit(
            repo_id=repo_id,
            operations=operations,
            commit_message=commit_message,
            revision=revision,
            token=token,
        )
    except (HfHubHTTPError, HFValidationError) as err:
        raise OperationFailedException(str(err))
    else:
        # Only touch the model card after a successful commit.
        if modelcard:
            hfh_util.upsert_modelcard(repo_id, projects, token, revision)
    finally:
        for fobj in fobjs:
            fobj.close()
750
751
752
@cli.command("download")
@click.argument("project_ids_pattern")
@click.argument("repo_id")
@click.option(
    "--token",
    help="""Authentication token, obtained from the Hugging Face Hub.
    Will default to the stored token.""",
)
@click.option(
    "--revision",
    help="""
    An optional Git revision id which can be a branch name, a tag, or a commit
    hash.
    """,
)
@click.option(
    "--force",
    "-f",
    default=False,
    is_flag=True,
    help="Replace an existing project/vocabulary/config with the downloaded one",
)
@click.option(
    "--trust-repo",
    default=False,
    is_flag=True,
    help="Allow download from the repository even when it has no entries in the cache",
)
@cli_util.common_options
def run_download(project_ids_pattern, repo_id, token, revision, force, trust_repo):
    """
    Download selected projects and their vocabularies from a Hugging Face Hub
    repository.
    \f
    This command downloads the project and vocabulary archives and the
    configuration files of the projects that match the given
    `project_ids_pattern` from the specified Hugging Face Hub repository and
    unzips the archives to `data/` directory and places the configuration files
    to `projects.d/` directory. An authentication token and revision can be given with
    options. If the repository hasn’t been used for downloads previously
    (i.e., it doesn’t appear in the Hugging Face Hub cache on local system), the
    `--trust-repo` option needs to be used.
    """

    hfh_util.check_is_download_allowed(trust_repo, repo_id)

    project_ids = hfh_util.get_matching_project_ids_from_hf_hub(
        project_ids_pattern, repo_id, token, revision
    )
    click.echo(f"Downloading project(s): {', '.join(project_ids)}")

    # Fetch each project archive and its config; collect the vocabularies
    # the configs refer to so each one is downloaded only once.
    vocab_ids = set()
    for proj_id in project_ids:
        archive_path = hfh_util.download_from_hf_hub(
            f"projects/{proj_id}.zip", repo_id, token, revision
        )
        hfh_util.unzip_archive(archive_path, force)
        config_path = hfh_util.download_from_hf_hub(
            f"{proj_id}.cfg", repo_id, token, revision
        )
        vocab_ids.add(hfh_util.get_vocab_id_from_config(config_path))
        hfh_util.copy_project_config(config_path, force)

    for vid in vocab_ids:
        vocab_archive = hfh_util.download_from_hf_hub(
            f"vocabs/{vid}.zip", repo_id, token, revision
        )
        hfh_util.unzip_archive(vocab_archive, force)
820
821
822
@cli.command("completion")
@click.option("--bash", "shell", flag_value="bash")
@click.option("--zsh", "shell", flag_value="zsh")
@click.option("--fish", "shell", flag_value="fish")
def run_completion(shell):
    """Generate the script for tab-key autocompletion for the given shell. To enable the
    completion support in your current bash terminal session run\n
        source <(annif completion --bash)

    To enable the completion support in all new sessions first add the completion script
    in your home directory:\n
        annif completion --bash > ~/.annif-complete.bash

    Then make the script to be automatically sourced for new terminal sessions by adding
    the following to your ~/.bashrc file (or in some alternative startup file)\n
        source ~/.annif-complete.bash
    """

    if shell is None:
        raise click.UsageError("Shell not given, try --bash, --zsh or --fish")

    # Click emits the completion script when invoked with _ANNIF_COMPLETE set.
    version = importlib.metadata.version("annif")
    script = os.popen(f"_ANNIF_COMPLETE={shell}_source annif").read()
    click.echo(f"# Generated by Annif {version}")
    click.echo(script)
846
847
848
@cli.command("detect-language")
@click.argument("languages")
@click.argument(
    "paths", type=click.Path(dir_okay=False, exists=True, allow_dash=True), nargs=-1
)
def run_detect_language(languages, paths):
    """
    Detect the language of a single text document from standard input or for one or more
    document file(s) given its/their path(s).
    """

    # LANGUAGES is a comma-separated list of candidate language codes.
    candidate_langs = tuple(languages.split(","))

    def show_detected(text, langs):
        try:
            proportions = detect_language(text, langs)
        except ValueError as err:
            raise click.UsageError(err)
        for code, score in proportions.items():
            label = "?" if code == "unk" else code
            click.echo(f"{label}\t{score:.04f}")

    # "-" as the only path (or no paths at all) means: read from stdin.
    read_stdin = not paths or (len(paths) == 1 and paths[0] == "-")
    if read_stdin:
        show_detected(sys.stdin.read(), candidate_langs)
    else:
        doclist = cli_util.open_text_documents(paths, docs_limit=None)
        for doc, path in zip(doclist.documents, paths):
            click.echo(f"Detected languages for {path}")
            show_detected(doc.text, candidate_langs)
879
880
881
# Allow running this module directly as a script.
if __name__ == "__main__":
    cli()
883