annif.cli.run_clear_project()   A
last analyzed

Complexity

Conditions 1

Size

Total Lines 9
Code Lines 6

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 6
dl 0
loc 9
rs 10
c 0
b 0
f 0
cc 1
nop 1
1
"""Definitions for command-line (Click) commands for invoking Annif
2
operations and printing the results to console."""
3
4
import collections
5
import gzip
6
import importlib
7
import json
8
import os.path
9
import re
10
import sys
11
12
import click
13
import click_log
14
from flask.cli import FlaskGroup
15
16
import annif
17
import annif.parallel
18
import annif.project
19
import annif.registry
20
from annif import cli_util, hfh_util
21
from annif.corpus import Document, DocumentDirectory
22
from annif.exception import (
23
    NotInitializedException,
24
    NotSupportedException,
25
    OperationFailedException,
26
)
27
from annif.project import Access
28
from annif.simplemma_util import detect_language
29
from annif.util import metric_code, suggestion_to_dict
30
31
# Shared logger for the whole CLI; click_log attaches verbosity handling to it.
logger = annif.logger
click_log.basic_config(logger)

# Build the CLI as a Flask application group so commands run inside an app
# context; Flask's default commands and version option are replaced below.
create_app = annif.create_flask_app
cli = FlaskGroup(
    create_app=create_app, add_default_commands=False, add_version_option=False
)
# Show only the bare version number for --version.
cli = click.version_option(message="%(version)s")(cli)
# Drop the env_file/app options (added by FlaskGroup) from the CLI interface.
cli.params = [opt for opt in cli.params if opt.name not in ("env_file", "app")]
40
41
42
@cli.command("list-projects")
@cli_util.common_options
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_list_projects():
    """
    List available projects.
    \f
    Show a list of currently defined projects. Projects are defined in a
    configuration file, normally called ``projects.cfg``. See `Project
    configuration
    <https://github.com/NatLibFi/Annif/wiki/Project-configuration>`_
    for details.
    """

    headings = (
        "Project ID",
        "Project Name",
        "Vocabulary ID",
        "Language",
        "Trained",
        "Modification time",
    )
    # One row per project; min_access=private lists every defined project.
    rows = []
    for proj in annif.registry.get_projects(min_access=Access.private).values():
        rows.append(
            (
                proj.project_id,
                proj.name,
                proj.vocab.vocab_id if proj.vocab_spec else "-",
                proj.language,
                str(proj.is_trained),
                cli_util.format_datetime(proj.modification_time),
            )
        )
    fmt = cli_util.make_list_template(headings, *rows)
    header_line = fmt.format(*headings)
    click.echo(header_line)
    click.echo("-" * len(header_line))
    for row in rows:
        click.echo(fmt.format(*row))
81
82
83
@cli.command("show-project")
@cli_util.project_id
@cli_util.common_options
def run_show_project(project_id):
    """
    Show information about a project.
    """

    proj = cli_util.get_project(project_id)
    # Padding inside the literals keeps the values column-aligned.
    info_lines = [
        f"Project ID:        {proj.project_id}",
        f"Project Name:      {proj.name}",
        f"Language:          {proj.language}",
        f"Vocabulary:        {proj.vocab.vocab_id}",
        f"Vocab language:    {proj.vocab_lang}",
        f"Access:            {proj.access.name}",
        f"Backend:           {proj.backend.name}",
        f"Trained:           {proj.is_trained}",
        f"Modification time: {cli_util.format_datetime(proj.modification_time)}",
    ]
    for line in info_lines:
        click.echo(line)
101
102
103
@cli.command("clear")
@cli_util.project_id
@cli_util.common_options
def run_clear_project(project_id):
    """
    Initialize the project to its original, untrained state.
    """
    # Removing the stored model data resets the project to untrained.
    cli_util.get_project(project_id).remove_model_data()
112
113
114
@cli.command("list-vocabs")
@cli_util.common_options
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_list_vocabs():
    """
    List available vocabularies.
    """

    headings = ("Vocabulary ID", "Languages", "Size", "Loaded")
    rows = []
    for vocab in annif.registry.get_vocabs(min_access=Access.private).values():
        # A vocabulary that has not been loaded cannot report its languages
        # or size; show placeholders in that case.
        try:
            langs = ",".join(sorted(vocab.languages))
            size = len(vocab)
            loaded = True
        except NotInitializedException:
            langs, size, loaded = "-", "-", False
        rows.append((vocab.vocab_id, langs, str(size), str(loaded)))

    fmt = cli_util.make_list_template(headings, *rows)
    header_line = fmt.format(*headings)
    click.echo(header_line)
    click.echo("-" * len(header_line))
    for row in rows:
        click.echo(fmt.format(*row))
142
143
144
@cli.command("load-vocab")
@click.argument("vocab_id", shell_complete=cli_util.complete_param)
@click.argument("vocab_file", type=click.Path(exists=True, dir_okay=False))
@click.option("--language", "-L", help="Language of TSV vocabulary file")
@click.option(
    "--force",
    "-f",
    default=False,
    is_flag=True,
    help="Replace existing vocabulary completely instead of updating it",
)
@cli_util.common_options
def run_load_vocab(vocab_id, language, force, vocab_file):
    """
    Load a vocabulary from a subject file.
    """
    # NOTE(review): annif.vocab is not imported at the top of this file; it
    # appears to be made available via another module's import — confirm.
    vocab = cli_util.get_vocab(vocab_id)
    if annif.vocab.VocabFileSKOS.is_rdf_file(vocab_file):
        # SKOS/RDF file supported by rdflib.
        # Echo the file path *before* rebinding vocab_file to the wrapper
        # object; previously the message printed the object repr instead.
        click.echo(f"Loading vocabulary from SKOS file {vocab_file}...")
        vocab_file = annif.vocab.VocabFileSKOS(vocab_file)
    elif annif.vocab.VocabFileCSV.is_csv_file(vocab_file):
        # CSV file
        click.echo(f"Loading vocabulary from CSV file {vocab_file}...")
        vocab_file = annif.vocab.VocabFileCSV(vocab_file)
    else:
        # probably a TSV file - we need to know its language
        if not language:
            click.echo(
                "Please use --language option to set the language of a TSV vocabulary.",
                err=True,
            )
            sys.exit(1)
        click.echo(f"Loading vocabulary from TSV file {vocab_file}...")
        vocab_file = annif.vocab.VocabFileTSV(vocab_file, language)
    vocab.load_vocabulary(vocab_file, force=force)
180
181
182
@cli.command("train")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--cached/--no-cached",
    "-c/-C",
    default=False,
    help="Reuse preprocessed training data from previous run",
)
@click.option(
    "--jobs",
    "-j",
    default=0,
    help="Number of parallel jobs (0 means choose automatically)",
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_train(project_id, paths, cached, docs_limit, jobs, backend_param):
    """
    Train a project on a collection of documents.
    \f
    This will train the project using the documents from ``PATHS`` (directories
    or possibly gzipped TSV files) in a single batch operation. If ``--cached``
    is set, preprocessed training data from the previous run is reused instead
    of documents input; see `Reusing preprocessed training data
    <https://github.com/NatLibFi/Annif/wiki/
    Reusing-preprocessed-training-data>`_.
    """
    project = cli_util.get_project(project_id)
    params = cli_util.parse_backend_params(backend_param, project)
    if cached:
        # --cached is mutually exclusive with explicit corpus paths.
        if paths:
            raise click.UsageError(
                "Corpus paths cannot be given when using --cached option."
            )
        docs = "cached"
    else:
        docs = cli_util.open_documents(
            paths, project.subjects, project.vocab_lang, docs_limit
        )
    project.train(docs, params, jobs)
224
225
226
@cli.command("learn")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_learn(project_id, paths, docs_limit, backend_param):
    """
    Further train an existing project on a collection of documents.
    \f
    Similar to the ``train`` command. This will continue training an already
    trained project using the documents given by ``PATHS`` in a single batch
    operation. Not supported by all backends.
    """
    project = cli_util.get_project(project_id)
    params = cli_util.parse_backend_params(backend_param, project)
    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )
    project.learn(corpus, params)
246
247
248
@cli.command("suggest")
@cli_util.project_id
@click.argument(
    "paths", type=click.Path(dir_okay=False, exists=True, allow_dash=True), nargs=-1
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@cli_util.docs_limit_option
@cli_util.backend_param_option
@click.option(
    "--metadata",
    "-D",
    multiple=True,
    help="Additional metadata for a document read from standard input. "
    + "Syntax: `-D <field>=<value>`.",
)
@cli_util.common_options
def run_suggest(
    project_id, paths, limit, threshold, language, backend_param, metadata, docs_limit
):
    """
    Suggest subjects for a single document from standard input (optionally
    with metadata) or for one or more document file(s) given its/their
    path(s).
    \f
    This will read a text document from standard input and suggest subjects for
    it, or if given path(s) to file(s), suggest subjects for it/them.
    """
    project = cli_util.get_project(project_id)
    label_lang = language or project.vocab_lang
    if label_lang not in project.vocab.languages:
        raise click.BadParameter(
            f'language "{label_lang}" not supported by vocabulary'
        )
    backend_params = cli_util.parse_backend_params(backend_param, project)

    # No paths at all, or a single "-", means read from standard input.
    read_stdin = not paths or (len(paths) == 1 and paths[0] == "-")
    if read_stdin:
        text = sys.stdin.read()
        doc = Document(text=text, metadata=cli_util.parse_metadata(metadata))
        hits = project.suggest([doc], backend_params).filter(limit, threshold)[0]
        cli_util.show_hits(hits, project, label_lang)
    else:
        docs = cli_util.open_text_documents(paths, docs_limit)
        filtered = project.suggest_corpus(docs, backend_params).filter(
            limit, threshold
        )
        for hits, path in zip(filtered, paths):
            click.echo(f"Suggestions for {path}")
            cli_util.show_hits(hits, project, label_lang)
299
300
301
@cli.command("index")
@cli_util.project_id
@click.argument("directory", type=click.Path(exists=True, file_okay=False))
@click.option(
    "--suffix", "-s", default=".annif", help="File name suffix for result files"
)
@click.option(
    "--force/--no-force",
    "-f/-F",
    default=False,
    help="Force overwriting of existing result files",
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@cli_util.backend_param_option
@cli_util.common_options
def run_index(
    project_id, directory, suffix, force, limit, threshold, language, backend_param
):
    """
    Index a directory with documents, suggesting subjects for each document.
    Write the results in TSV files with the given suffix (``.annif`` by
    default).
    """
    project = cli_util.get_project(project_id)
    label_lang = language or project.vocab_lang
    if label_lang not in project.vocab.languages:
        raise click.BadParameter(
            f'language "{label_lang}" not supported by vocabulary'
        )
    backend_params = cli_util.parse_backend_params(backend_param, project)

    corpus = DocumentDirectory(directory, require_subjects=False)
    filtered = project.suggest_corpus(corpus, backend_params).filter(
        limit, threshold
    )

    for doc, hits in zip(corpus.documents, filtered):
        # The result file replaces the document's .txt/.json extension.
        outfile_name = re.sub(r"\.(txt|json)$", suffix, doc.file_path)
        if os.path.exists(outfile_name) and not force:
            click.echo(f"Not overwriting {outfile_name} (use --force to override)")
            continue
        with open(outfile_name, "w", encoding="utf-8") as out:
            cli_util.show_hits(hits, project, label_lang, file=out)
344
345
346
@cli.command("index-file")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True, dir_okay=False), nargs=-1)
@click.option(
    "--suffix", "-s", default=".annif.jsonl", help="File name suffix for result files"
)
@click.option(
    "--gzip/--no-gzip",
    "-z/-Z",
    "use_gzip",
    default=False,
    help="Gzip compress result files",
)
@click.option(
    "--force/--no-force",
    "-f/-F",
    default=False,
    help="Force overwriting of existing result files",
)
@click.option(
    "--include-doc/--no-include-doc",
    "-i/-I",
    default=True,
    help="Include input documents in output",
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@cli_util.backend_param_option
@cli_util.common_options
def run_index_file(
    project_id,
    paths,
    suffix,
    use_gzip,
    force,
    include_doc,
    limit,
    threshold,
    language,
    backend_param,
):
    """
    Index file(s) containing documents, suggesting subjects for each document.
    Write the results in JSONL files with the given suffix (``.annif.jsonl`` by
    default).
    """
    project = cli_util.get_project(project_id)
    label_lang = language or project.vocab_lang
    if label_lang not in project.vocab.languages:
        raise click.BadParameter(
            f'language "{label_lang}" not supported by vocabulary'
        )
    backend_params = cli_util.parse_backend_params(backend_param, project)

    for path in paths:
        docs = cli_util.open_doc_path(
            path, project.subjects, label_lang, require_subjects=False
        )
        filtered = project.suggest_corpus(docs, backend_params).filter(
            limit, threshold
        )

        # Strip the existing extension (and a possible .gz) before adding
        # the output suffix.
        outfilename = re.sub(r"(\.[^.]+)?(\.gz)?$", "", path) + suffix
        opener = gzip.open if use_gzip else open
        if use_gzip:
            outfilename += ".gz"

        if os.path.exists(outfilename) and not force:
            click.echo(
                "Not overwriting {} (use --force to override)".format(outfilename)
            )
            continue

        with opener(outfilename, "wt", encoding="utf-8") as outfile:
            for doc, hits in zip(docs.documents, filtered):
                record = doc.as_dict(project.subjects, label_lang) if include_doc else {}
                record["results"] = [
                    suggestion_to_dict(hit, project.subjects, label_lang)
                    for hit in hits
                ]
                outfile.write(json.dumps(record) + "\n")
429
430
431
@cli.command("eval")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option(
    "--metric",
    "-m",
    default=[],
    multiple=True,
    help="Metric to calculate (default: all)",
)
@click.option(
    "--metrics-file",
    "-M",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write evaluation metrics in JSON format.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write non-aggregated results per subject.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_eval(
    project_id,
    paths,
    limit,
    threshold,
    docs_limit,
    metric,
    metrics_file,
    results_file,
    jobs,
    backend_param,
):
    """
    Suggest subjects for documents and evaluate the results by comparing
    against a gold standard.
    \f
    With this command the documents from ``PATHS`` (directories or possibly
    gzipped TSV files) will be assigned subject suggestions and then
    statistical measures are calculated that quantify how well the suggested
    subjects match the gold-standard subjects in the documents.

    Normally the output is the list of the metrics calculated across documents.
    If ``--results-file <FILENAME>`` option is given, the metrics are
    calculated separately for each subject, and written to the given file.
    """

    project = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, project)

    # Deferred import: only needed by evaluation commands.
    import annif.eval

    eval_batch = annif.eval.EvaluationBatch(project.subjects)

    if results_file:
        # The click.File is lazy; write an empty string to force it open so
        # that problems surface before the (potentially slow) evaluation.
        try:
            print("", end="", file=results_file)
            click.echo(
                "Writing per subject evaluation results to {!s}".format(
                    results_file.name
                )
            )
        except Exception as e:
            # Chain the original error so the root cause stays visible.
            raise NotSupportedException(
                "cannot open results-file for writing: " + str(e)
            ) from e
    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )
    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry, [project_id], backend_params, limit, threshold
    )

    # Suggest in parallel batches and accumulate results into eval_batch.
    with pool_class(jobs) as pool:
        for hit_sets, subject_sets in pool.imap_unordered(
            psmap.suggest_batch, corpus.doc_batches
        ):
            eval_batch.evaluate_many(hit_sets[project_id], subject_sets)

    template = "{0:<30}\t{1:{fmt_spec}}"
    metrics = eval_batch.results(
        metrics=metric, results_file=results_file, language=project.vocab_lang
    )
    # Loop variable renamed from "metric" to avoid shadowing the --metric
    # option tuple.
    for metric_name, score in metrics.items():
        if isinstance(score, int):
            fmt_spec = "d"
        elif isinstance(score, float):
            fmt_spec = ".04f"
        else:
            # Fallback for non-numeric scores; previously fmt_spec could be
            # unbound (or stale from the prior iteration) here.
            fmt_spec = ""
        click.echo(template.format(metric_name + ":", score, fmt_spec=fmt_spec))
    if metrics_file:
        json.dump(
            {metric_code(mname): val for mname, val in metrics.items()},
            metrics_file,
            indent=2,
        )
540
541
542
@cli.command("run")
@click.option("--host", type=str, default="127.0.0.1")
@click.option("--port", type=int, default=5000)
@click.option("--log-level")
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_app(**kwargs):
    """
    Run Annif in server mode for development.
    \f
    The server is for development purposes only.
    """
    # Drop options the user did not set so the app's own defaults apply.
    options = {key: val for key, val in kwargs.items() if val is not None}
    annif.create_cx_app().run(**options)
556
557
558
FILTER_BATCH_MAX_LIMIT = 15
559
OPTIMIZE_METRICS = ["Precision (doc avg)", "Recall (doc avg)", "F1 score (doc avg)"]
560
561
562
@cli.command("optimize")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_optimize(project_id, paths, jobs, docs_limit, backend_param):
    """
    Suggest subjects for documents, testing multiple limits and thresholds.
    \f
    This command will use different limit (maximum number of subjects) and
    score threshold values when assigning subjects to each document given by
    ``PATHS`` and compare the results against the gold standard subjects in the
    documents. The output is a list of parameter combinations and their scores.
    From the output, you can determine the optimum limit and threshold
    parameters depending on which measure you want to target.
    """
    project = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, project)
    filter_params = cli_util.generate_filter_params(FILTER_BATCH_MAX_LIMIT)

    # Deferred import: only needed by evaluation commands. (A duplicate
    # "import annif.eval" further down was removed.)
    import annif.eval

    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )

    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    # Suggest with the widest limit/lowest threshold once; the per-parameter
    # results below are produced by filtering these suggestions.
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry,
        [project_id],
        backend_params,
        limit=FILTER_BATCH_MAX_LIMIT,
        threshold=0.0,
    )

    ndocs = 0
    suggestion_batches = []
    subject_set_batches = []
    with pool_class(jobs) as pool:
        for suggestion_batch, subject_sets in pool.imap_unordered(
            psmap.suggest_batch, corpus.doc_batches
        ):
            ndocs += len(suggestion_batch[project_id])
            suggestion_batches.append(suggestion_batch[project_id])
            subject_set_batches.append(subject_sets)

    from annif.suggestion import SuggestionResults

    orig_suggestion_results = SuggestionResults(suggestion_batches)

    click.echo("\t".join(("Limit", "Thresh.", "Prec.", "Rec.", "F1")))

    best_scores = collections.defaultdict(float)
    best_params = {}

    template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"

    # Evaluate each (limit, threshold) combination against the gold standard.
    for limit, threshold in filter_params:
        eval_batch = annif.eval.EvaluationBatch(project.subjects)
        filtered_results = orig_suggestion_results.filter(limit, threshold)
        for batch, subject_sets in zip(filtered_results.batches, subject_set_batches):
            eval_batch.evaluate_many(batch, subject_sets)
        results = eval_batch.results(metrics=OPTIMIZE_METRICS)
        for metric, score in results.items():
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = (limit, threshold)
        click.echo(
            template.format(
                limit,
                threshold,
                results["Precision (doc avg)"],
                results["Recall (doc avg)"],
                results["F1 score (doc avg)"],
            )
        )

    click.echo()
    template2 = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    for metric in OPTIMIZE_METRICS:
        click.echo(
            template2.format(
                metric,
                best_scores[metric],
                best_params[metric][0],
                best_params[metric][1],
            )
        )
    click.echo("Documents evaluated:\t{}".format(ndocs))
658
659
660
@cli.command("hyperopt")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--trials", "-T", default=10, help="Number of trials")
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel runs (0 means all CPUs)"
)
@click.option(
    "--metric", "-m", default="NDCG", help="Metric to optimize (default: NDCG)"
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file path to write trial results as TSV.
    File directory must exist, existing file will be overwritten.""",
)
@cli_util.docs_limit_option
@cli_util.common_options
def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric, results_file):
    """
    Optimize the hyperparameters of a project using validation documents from
    ``PATHS``. Not supported by all backends. Output is a list of trial results
    and a report of the best performing parameters.
    """
    project = cli_util.get_project(project_id)
    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )
    click.echo(f"Looking for optimal hyperparameters using {trials} trials")
    # The backend returns a recommendation with the best score and the
    # corresponding configuration lines.
    recommendation = project.hyperopt(corpus, trials, jobs, metric, results_file)
    click.echo(f"Got best {metric} score {recommendation.score:.4f} with:")
    click.echo("---")
    for line in recommendation.lines:
        click.echo(line)
    click.echo("---")
696
697
698
@cli.command("upload")
@click.argument("project_ids_pattern", shell_complete=cli_util.complete_param)
@click.argument("repo_id")
@click.option(
    "--token",
    help="""Authentication token, obtained from the Hugging Face Hub.
    Will default to the stored token.""",
)
@click.option(
    "--revision",
    help="""An optional git revision to commit from. Defaults to the head of the "main"
    branch.""",
)
@click.option(
    "--commit-message",
    help="""The summary / title / first line of the generated commit.""",
)
@click.option(
    "--modelcard/--no-modelcard",
    default=True,
    help="Update or create a Model Card with upload.",
)
@cli_util.common_options
def run_upload(
    project_ids_pattern, repo_id, token, revision, commit_message, modelcard
):
    """
    Upload selected projects and their vocabularies to a Hugging Face Hub repository.
    \f
    This command zips the project directories and vocabularies of the projects
    that match the given `project_ids_pattern` to archive files, and uploads the
    archives along with the project configurations to the specified Hugging Face
    Hub repository. An authentication token and commit message can be given with
    options. If the README.md does not exist in the repository it is
    created with default contents and metadata of the uploaded projects, if it exists,
    its metadata are updated as necessary.
    """
    # Deferred imports: huggingface_hub is an optional dependency.
    from huggingface_hub import HfApi
    from huggingface_hub.utils import HfHubHTTPError, HFValidationError

    projects = hfh_util.get_matching_projects(project_ids_pattern)
    click.echo(f"Uploading project(s): {', '.join([p.project_id for p in projects])}")

    if commit_message is None:
        commit_message = f"Upload project(s) {project_ids_pattern} with Annif"

    fobjs, operations = [], []
    try:
        fobjs, operations = hfh_util.prepare_commits(projects, repo_id, token)
        api = HfApi()
        api.create_commit(
            repo_id=repo_id,
            operations=operations,
            commit_message=commit_message,
            revision=revision,
            token=token,
        )
    except (HfHubHTTPError, HFValidationError) as err:
        # Chain the original Hub error so the root cause stays visible.
        raise OperationFailedException(str(err)) from err
    else:
        if modelcard:
            hfh_util.upsert_modelcard(repo_id, projects, token, revision)
    finally:
        # Close the archive file objects even if the commit failed.
        for fobj in fobjs:
            fobj.close()
766
767
768
@cli.command("download")
@click.argument("project_ids_pattern")
@click.argument("repo_id")
@click.option(
    "--token",
    help="""Authentication token, obtained from the Hugging Face Hub.
    Will default to the stored token.""",
)
@click.option(
    "--revision",
    help="""
    An optional Git revision id which can be a branch name, a tag, or a commit
    hash.
    """,
)
@click.option(
    "--force",
    "-f",
    default=False,
    is_flag=True,
    help="Replace an existing project/vocabulary/config with the downloaded one",
)
@click.option(
    "--trust-repo",
    default=False,
    is_flag=True,
    help="Allow download from the repository even when it has no entries in the cache",
)
@cli_util.common_options
def run_download(project_ids_pattern, repo_id, token, revision, force, trust_repo):
    """
    Download selected projects and their vocabularies from a Hugging Face Hub
    repository.
    \f
    This command downloads the project and vocabulary archives and the
    configuration files of the projects that match the given
    `project_ids_pattern` from the specified Hugging Face Hub repository and
    unzips the archives to `data/` directory and places the configuration files
    to `projects.d/` directory. An authentication token and revision can be given with
    options. If the repository hasn’t been used for downloads previously
    (i.e., it doesn’t appear in the Hugging Face Hub cache on local system), the
    `--trust-repo` option needs to be used.
    """

    hfh_util.check_is_download_allowed(trust_repo, repo_id)

    project_ids = hfh_util.get_matching_project_ids_from_hf_hub(
        project_ids_pattern, repo_id, token, revision
    )
    click.echo(f"Downloading project(s): {', '.join(project_ids)}")

    vocab_ids = set()
    for proj_id in project_ids:
        archive_path = hfh_util.download_from_hf_hub(
            f"projects/{proj_id}.zip", repo_id, token, revision
        )
        hfh_util.unzip_archive(archive_path, force)
        config_path = hfh_util.download_from_hf_hub(
            f"{proj_id}.cfg", repo_id, token, revision
        )
        vocab_ids.add(hfh_util.get_vocab_id_from_config(config_path))
        hfh_util.copy_project_config(config_path, force)

    # Vocabularies are collected into a set so each one shared by several
    # projects is downloaded only once.
    for vocab_id in vocab_ids:
        archive_path = hfh_util.download_from_hf_hub(
            f"vocabs/{vocab_id}.zip", repo_id, token, revision
        )
        hfh_util.unzip_archive(archive_path, force)
836
837
838
@cli.command("completion")
@click.option("--bash", "shell", flag_value="bash")
@click.option("--zsh", "shell", flag_value="zsh")
@click.option("--fish", "shell", flag_value="fish")
def run_completion(shell):
    """Generate the script for tab-key autocompletion for the given shell. To enable the
    completion support in your current bash terminal session run\n
        source <(annif completion --bash)

    To enable the completion support in all new sessions first add the completion script
    in your home directory:\n
        annif completion --bash > ~/.annif-complete.bash

    Then make the script to be automatically sourced for new terminal sessions by adding
    the following to your ~/.bashrc file (or in some alternative startup file)\n
        source ~/.annif-complete.bash
    """

    if shell is None:
        raise click.UsageError("Shell not given, try --bash, --zsh or --fish")

    # Explicit import needed: "import importlib" at the top of the file does
    # not guarantee that the importlib.metadata submodule is loaded.
    import importlib.metadata

    # Click emits the completion script when _ANNIF_COMPLETE is set.
    script = os.popen(f"_ANNIF_COMPLETE={shell}_source annif").read()
    click.echo(f"# Generated by Annif {importlib.metadata.version('annif')}")
    click.echo(script)
862
863
864
@cli.command("detect-language")
@click.argument("languages")
@click.argument(
    "paths", type=click.Path(dir_okay=False, exists=True, allow_dash=True), nargs=-1
)
def run_detect_language(languages, paths):
    """
    Detect the language of a single text document from standard input or for one or more
    document file(s) given its/their path(s).
    """

    candidate_langs = tuple(languages.split(","))

    def show_language_proportions(text, langs):
        # detect_language raises ValueError for e.g. unknown language codes;
        # surface that to the user as a usage error.
        try:
            proportions = detect_language(text, langs)
        except ValueError as err:
            raise click.UsageError(err)
        for code, score in proportions.items():
            label = "?" if code == "unk" else code
            click.echo(f"{label}\t{score:.04f}")

    # No paths at all, or a single "-", means read from standard input.
    if not paths or (len(paths) == 1 and paths[0] == "-"):
        show_language_proportions(sys.stdin.read(), candidate_langs)
    else:
        doclist = cli_util.open_text_documents(paths, docs_limit=None)
        for doc, path in zip(doclist.documents, paths):
            click.echo(f"Detected languages for {path}")
            show_language_proportions(doc.text, candidate_langs)
895
896
897
# Allow running this module directly (e.g. "python cli.py") in addition to
# the installed "annif" entry point.
if __name__ == "__main__":
    cli()
899