Passed
Pull Request — main (#889)
by Osma
06:07 queued 03:13
created

annif.cli.run_index()   F

Complexity

Conditions 19

Size

Total Lines 134
Code Lines 98

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 19
eloc 98
nop 11
dl 0
loc 134
rs 0.4854
c 0
b 0
f 0

How to fix   Long Method    Complexity    Many Parameters   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method and Decompose Conditional.

Complexity

Complex classes like annif.cli.run_index() often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

There are several approaches to avoid long parameter lists, such as introducing a parameter object or splitting the method into smaller functions with focused responsibilities.

1
"""Definitions for command-line (Click) commands for invoking Annif
2
operations and printing the results to console."""
3
4
import collections
5
import importlib
6
import json
7
import os.path
8
import re
9
import sys
10
11
import click
12
import click_log
13
from flask.cli import FlaskGroup
14
15
import annif
16
import annif.parallel
17
import annif.project
18
import annif.registry
19
from annif import cli_util, hfh_util
20
from annif.corpus import Document, DocumentDirectory
21
from annif.exception import (
22
    NotInitializedException,
23
    NotSupportedException,
24
    OperationFailedException,
25
)
26
from annif.project import Access
27
from annif.simplemma_util import detect_language
28
from annif.util import metric_code, suggestion_to_dict
29
30
# Module-level logger shared with the rest of Annif; click_log wires it up to
# the CLI verbosity options.
logger = annif.logger
click_log.basic_config(logger)

# Build the CLI around the Flask app factory.  Flask's default commands and
# version option are disabled so Annif can provide its own.
create_app = annif.create_flask_app
cli = FlaskGroup(
    create_app=create_app, add_default_commands=False, add_version_option=False
)
cli = click.version_option(message="%(version)s")(cli)
# Drop FlaskGroup's --env-file/--app options, which are not relevant to Annif.
cli.params = [opt for opt in cli.params if opt.name not in ("env_file", "app")]
39
40
41
@cli.command("list-projects")
42
@cli_util.common_options
43
def run_list_projects():
44
    """
45
    List available projects.
46
    \f
47
    Show a list of currently defined projects. Projects are defined in a
48
    configuration file, normally called ``projects.cfg``. See `Project
49
    configuration
50
    <https://github.com/NatLibFi/Annif/wiki/Project-configuration>`_
51
    for details.
52
    """
53
54
    column_headings = (
55
        "Project ID",
56
        "Project Name",
57
        "Vocabulary ID",
58
        "Language",
59
        "Trained",
60
        "Modification time",
61
    )
62
    table = [
63
        (
64
            proj.project_id,
65
            proj.name,
66
            proj.vocab.vocab_id if proj.vocab_spec else "-",
67
            proj.language,
68
            str(proj.is_trained),
69
            cli_util.format_datetime(proj.modification_time),
70
        )
71
        for proj in annif.registry.get_projects(min_access=Access.private).values()
72
    ]
73
    template = cli_util.make_list_template(column_headings, *table)
74
    header = template.format(*column_headings)
75
    click.echo(header)
76
    click.echo("-" * len(header))
77
    for row in table:
78
        click.echo(template.format(*row))
79
80
81
@cli.command("show-project")
82
@cli_util.project_id
83
@cli_util.common_options
84
def run_show_project(project_id):
85
    """
86
    Show information about a project.
87
    """
88
89
    proj = cli_util.get_project(project_id)
90
    click.echo(f"Project ID:        {proj.project_id}")
91
    click.echo(f"Project Name:      {proj.name}")
92
    click.echo(f"Language:          {proj.language}")
93
    click.echo(f"Vocabulary:        {proj.vocab.vocab_id}")
94
    click.echo(f"Vocab language:    {proj.vocab_lang}")
95
    click.echo(f"Access:            {proj.access.name}")
96
    click.echo(f"Backend:           {proj.backend.name}")
97
    click.echo(f"Trained:           {proj.is_trained}")
98
    click.echo(f"Modification time: {cli_util.format_datetime(proj.modification_time)}")
99
100
101
@cli.command("clear")
102
@cli_util.project_id
103
@cli_util.common_options
104
def run_clear_project(project_id):
105
    """
106
    Initialize the project to its original, untrained state.
107
    """
108
    proj = cli_util.get_project(project_id)
109
    proj.remove_model_data()
110
111
112
@cli.command("list-vocabs")
113
@cli_util.common_options
114
def run_list_vocabs():
115
    """
116
    List available vocabularies.
117
    """
118
119
    column_headings = ("Vocabulary ID", "Languages", "Size", "Loaded")
120
    table = []
121
    for vocab in annif.registry.get_vocabs(min_access=Access.private).values():
122
        try:
123
            languages = ",".join(sorted(vocab.languages))
124
            size = len(vocab)
125
            loaded = True
126
        except NotInitializedException:
127
            languages = "-"
128
            size = "-"
129
            loaded = False
130
        row = (vocab.vocab_id, languages, str(size), str(loaded))
131
        table.append(row)
132
133
    template = cli_util.make_list_template(column_headings, *table)
134
    header = template.format(*column_headings)
135
    click.echo(header)
136
    click.echo("-" * len(header))
137
    for row in table:
138
        click.echo(template.format(*row))
139
140
141
@cli.command("load-vocab")
142
@click.argument("vocab_id", shell_complete=cli_util.complete_param)
143
@click.argument("vocab_file", type=click.Path(exists=True, dir_okay=False))
144
@click.option("--language", "-L", help="Language of TSV vocabulary file")
145
@click.option(
146
    "--force",
147
    "-f",
148
    default=False,
149
    is_flag=True,
150
    help="Replace existing vocabulary completely instead of updating it",
151
)
152
@cli_util.common_options
153
def run_load_vocab(vocab_id, language, force, vocab_file):
154
    """
155
    Load a vocabulary from a subject file.
156
    """
157
    vocab = cli_util.get_vocab(vocab_id)
158
    if annif.vocab.VocabFileSKOS.is_rdf_file(vocab_file):
159
        # SKOS/RDF file supported by rdflib
160
        vocab_file = annif.vocab.VocabFileSKOS(vocab_file)
161
        click.echo(f"Loading vocabulary from SKOS file {vocab_file}...")
162
    elif annif.vocab.VocabFileCSV.is_csv_file(vocab_file):
163
        # CSV file
164
        vocab_file = annif.vocab.VocabFileCSV(vocab_file)
165
        click.echo(f"Loading vocabulary from CSV file {vocab_file}...")
166
    else:
167
        # probably a TSV file - we need to know its language
168
        if not language:
169
            click.echo(
170
                "Please use --language option to set the language of a TSV vocabulary.",
171
                err=True,
172
            )
173
            sys.exit(1)
174
        click.echo(f"Loading vocabulary from TSV file {vocab_file}...")
175
        vocab_file = annif.vocab.VocabFileTSV(vocab_file, language)
176
    vocab.load_vocabulary(vocab_file, force=force)
177
178
179
@cli.command("train")
180
@cli_util.project_id
181
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
182
@click.option(
183
    "--cached/--no-cached",
184
    "-c/-C",
185
    default=False,
186
    help="Reuse preprocessed training data from previous run",
187
)
188
@click.option(
189
    "--jobs",
190
    "-j",
191
    default=0,
192
    help="Number of parallel jobs (0 means choose automatically)",
193
)
194
@cli_util.docs_limit_option
195
@cli_util.backend_param_option
196
@cli_util.common_options
197
def run_train(project_id, paths, cached, docs_limit, jobs, backend_param):
198
    """
199
    Train a project on a collection of documents.
200
    \f
201
    This will train the project using the documents from ``PATHS`` (directories
202
    or possibly gzipped TSV files) in a single batch operation. If ``--cached``
203
    is set, preprocessed training data from the previous run is reused instead
204
    of documents input; see `Reusing preprocessed training data
205
    <https://github.com/NatLibFi/Annif/wiki/
206
    Reusing-preprocessed-training-data>`_.
207
    """
208
    proj = cli_util.get_project(project_id)
209
    backend_params = cli_util.parse_backend_params(backend_param, proj)
210
    if cached:
211
        if len(paths) > 0:
212
            raise click.UsageError(
213
                "Corpus paths cannot be given when using --cached option."
214
            )
215
        documents = "cached"
216
    else:
217
        documents = cli_util.open_documents(
218
            paths, proj.subjects, proj.vocab_lang, docs_limit
219
        )
220
    proj.train(documents, backend_params, jobs)
221
222
223
@cli.command("learn")
224
@cli_util.project_id
225
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
226
@cli_util.docs_limit_option
227
@cli_util.backend_param_option
228
@cli_util.common_options
229
def run_learn(project_id, paths, docs_limit, backend_param):
230
    """
231
    Further train an existing project on a collection of documents.
232
    \f
233
    Similar to the ``train`` command. This will continue training an already
234
    trained project using the documents given by ``PATHS`` in a single batch
235
    operation. Not supported by all backends.
236
    """
237
    proj = cli_util.get_project(project_id)
238
    backend_params = cli_util.parse_backend_params(backend_param, proj)
239
    documents = cli_util.open_documents(
240
        paths, proj.subjects, proj.vocab_lang, docs_limit
241
    )
242
    proj.learn(documents, backend_params)
243
244
245
@cli.command("suggest")
246
@cli_util.project_id
247
@click.argument(
248
    "paths", type=click.Path(dir_okay=False, exists=True, allow_dash=True), nargs=-1
249
)
250
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
251
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
252
@click.option("--language", "-L", help="Language of subject labels")
253
@cli_util.docs_limit_option
254
@cli_util.backend_param_option
255
@click.option(
256
    "--metadata",
257
    "-D",
258
    multiple=True,
259
    help="Additional metadata for a document read from standard input. "
260
    + "Syntax: `-D <field>=<value>`.",
261
)
262
@cli_util.common_options
263
def run_suggest(
264
    project_id, paths, limit, threshold, language, backend_param, metadata, docs_limit
265
):
266
    """
267
    Suggest subjects for a single document from standard input (optionally
268
    with metadata) or for one or more document file(s) given its/their
269
    path(s).
270
    \f
271
    This will read a text document from standard input and suggest subjects for
272
    it, or if given path(s) to file(s), suggest subjects for it/them.
273
    """
274
    project = cli_util.get_project(project_id)
275
    lang = language or project.vocab_lang
276
    if lang not in project.vocab.languages:
277
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
278
    backend_params = cli_util.parse_backend_params(backend_param, project)
279
280
    if paths and not (len(paths) == 1 and paths[0] == "-"):
281
        docs = cli_util.open_text_documents(paths, docs_limit)
282
        results = project.suggest_corpus(docs, backend_params).filter(limit, threshold)
283
        for (
284
            suggestions,
285
            path,
286
        ) in zip(results, paths):
287
            click.echo(f"Suggestions for {path}")
288
            cli_util.show_hits(suggestions, project, lang)
289
    else:
290
        text = sys.stdin.read()
291
        doc_metadata = cli_util.parse_metadata(metadata)
292
        suggestions = project.suggest(
293
            [Document(text=text, metadata=doc_metadata)], backend_params
294
        ).filter(limit, threshold)[0]
295
        cli_util.show_hits(suggestions, project, lang)
296
297
298
@cli.command("index")
299
@cli_util.project_id
300
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
301
@click.option(
302
    "--suffix", "-s", default=".annif", help="File name suffix for result files"
303
)
304
@click.option(
305
    "--force/--no-force",
306
    "-f/-F",
307
    default=False,
308
    help="Force overwriting of existing result files",
309
)
310
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
311
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
312
@click.option("--language", "-L", help="Language of subject labels")
313
@click.option(
314
    "--gzip/--no-gzip",
315
    "-z/-Z",
316
    "use_gzip",
317
    default=False,
318
    help="Gzip compress result files",
319
)
320
@click.option(
321
    "--output",
322
    "-O",
323
    type=click.Path(dir_okay=False, writable=True),
324
    default=None,
325
    help="Redirect all output to the given file (or '-' for stdout)",
326
)
327
@click.option(
328
    "--include-doc/--no-include-doc",
329
    "-i/-I",
330
    default=True,
331
    help="Include input documents in output",
332
)
333
@cli_util.backend_param_option
334
@cli_util.common_options
335
def run_index(
336
    project_id,
337
    paths,
338
    suffix,
339
    force,
340
    limit,
341
    threshold,
342
    language,
343
    backend_param,
344
    use_gzip=False,
345
    output=None,
346
    include_doc=True,
347
):
348
    """
349
    Index documents from directories or files, suggesting subjects for each document.
350
    Write the results in TSV files (for directories) or JSONL files (for files) with
351
    the given suffix (.jsonl suffix will be added to JSONL files).
352
    """
353
    project = cli_util.get_project(project_id)
354
    lang = language or project.vocab_lang
355
    if lang not in project.vocab.languages:
356
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
357
    backend_params = cli_util.parse_backend_params(backend_param, project)
358
359
    # Helper function to process a directory
360
    def process_directory(directory):
361
        corpus = DocumentDirectory(directory, require_subjects=False)
362
        results = project.suggest_corpus(corpus, backend_params).filter(
363
            limit, threshold
364
        )
365
366
        for doc, suggestions in zip(corpus.documents, results):
367
            subjectfilename = re.sub(r"\.(txt|json)$", suffix, doc.file_path)
368
            if os.path.exists(subjectfilename) and not force:
369
                click.echo(
370
                    "Not overwriting {} (use --force to override)".format(
371
                        subjectfilename
372
                    )
373
                )
374
                continue
375
            with open(subjectfilename, "w", encoding="utf-8") as subjfile:
376
                cli_util.show_hits(suggestions, project, lang, file=subjfile)
377
378
    # Helper function to process a file
379
    def process_file(path):
380
        corpus = cli_util.open_doc_path(
381
            path, project.subjects, lang, require_subjects=False
382
        )
383
        results = project.suggest_corpus(corpus, backend_params).filter(
384
            limit, threshold
385
        )
386
387
        # Determine output stream
388
        if output == "-":
389
            stream_cm = cli_util.get_output_stream(path, suffix, "-", use_gzip, force)
390
        else:
391
            # For individual files, generate output filename based on input path
392
            if output:
393
                outfilename = output + (
394
                    ".gz" if use_gzip and not output.endswith(".gz") else ""
395
                )
396
                stream_cm = cli_util.get_output_stream(
397
                    path, suffix, outfilename, use_gzip, force
398
                )
399
            else:
400
                outfilename = (
401
                    re.sub(r"(\.[^.]+)?(\.gz)?$", "", path) + suffix + ".jsonl"
402
                )
403
                if use_gzip and not outfilename.endswith(".gz"):
404
                    outfilename += ".gz"
405
                stream_cm = cli_util.get_output_stream(
406
                    path, suffix, outfilename, use_gzip, force
407
                )
408
409
        if stream_cm is None:
410
            return
411
412
        with stream_cm as stream:
413
            for doc, suggestions in zip(corpus.documents, results):
414
                if include_doc:
415
                    output_data = doc.as_dict(project.subjects, lang)
416
                else:
417
                    output_data = {}
418
                    if doc.document_id:
419
                        output_data["document_id"] = doc.document_id
420
                output_data["results"] = [
421
                    suggestion_to_dict(suggestion, project.subjects, lang)
422
                    for suggestion in suggestions
423
                ]
424
                stream.write(json.dumps(output_data) + "\n")
425
426
    # Process paths in the order they were given
427
    for path in paths:
428
        if os.path.isdir(path):
429
            process_directory(path)
430
        elif os.path.isfile(path):
431
            process_file(path)
432
433
434
@cli.command("eval")
435
@cli_util.project_id
436
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
437
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
438
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
439
@click.option(
440
    "--metric",
441
    "-m",
442
    default=[],
443
    multiple=True,
444
    help="Metric to calculate (default: all)",
445
)
446
@click.option(
447
    "--metrics-file",
448
    "-M",
449
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
450
    help="""Specify file in order to write evaluation metrics in JSON format.
451
    File directory must exist, existing file will be overwritten.""",
452
)
453
@click.option(
454
    "--results-file",
455
    "-r",
456
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
457
    help="""Specify file in order to write non-aggregated results per subject.
458
    File directory must exist, existing file will be overwritten.""",
459
)
460
@click.option(
461
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
462
)
463
@cli_util.docs_limit_option
464
@cli_util.backend_param_option
465
@cli_util.common_options
466
def run_eval(
467
    project_id,
468
    paths,
469
    limit,
470
    threshold,
471
    docs_limit,
472
    metric,
473
    metrics_file,
474
    results_file,
475
    jobs,
476
    backend_param,
477
):
478
    """
479
    Suggest subjects for documents and evaluate the results by comparing
480
    against a gold standard.
481
    \f
482
    With this command the documents from ``PATHS`` (directories or possibly
483
    gzipped TSV files) will be assigned subject suggestions and then
484
    statistical measures are calculated that quantify how well the suggested
485
    subjects match the gold-standard subjects in the documents.
486
487
    Normally the output is the list of the metrics calculated across documents.
488
    If ``--results-file <FILENAME>`` option is given, the metrics are
489
    calculated separately for each subject, and written to the given file.
490
    """
491
492
    project = cli_util.get_project(project_id)
493
    backend_params = cli_util.parse_backend_params(backend_param, project)
494
495
    import annif.eval
496
497
    eval_batch = annif.eval.EvaluationBatch(project.subjects)
498
499
    if results_file:
500
        try:
501
            print("", end="", file=results_file)
502
            click.echo(
503
                "Writing per subject evaluation results to {!s}".format(
504
                    results_file.name
505
                )
506
            )
507
        except Exception as e:
508
            raise NotSupportedException(
509
                "cannot open results-file for writing: " + str(e)
510
            )
511
    corpus = cli_util.open_documents(
512
        paths, project.subjects, project.vocab_lang, docs_limit
513
    )
514
    jobs, pool_class = annif.parallel.get_pool(jobs)
515
516
    project.initialize(parallel=True)
517
    psmap = annif.parallel.ProjectSuggestMap(
518
        project.registry, [project_id], backend_params, limit, threshold
519
    )
520
521
    with pool_class(jobs) as pool:
522
        for hit_sets, subject_sets in pool.imap_unordered(
523
            psmap.suggest_batch, corpus.doc_batches
524
        ):
525
            eval_batch.evaluate_many(hit_sets[project_id], subject_sets)
526
527
    template = "{0:<30}\t{1:{fmt_spec}}"
528
    metrics = eval_batch.results(
529
        metrics=metric, results_file=results_file, language=project.vocab_lang
530
    )
531
    for metric, score in metrics.items():
532
        if isinstance(score, int):
533
            fmt_spec = "d"
534
        elif isinstance(score, float):
535
            fmt_spec = ".04f"
536
        click.echo(template.format(metric + ":", score, fmt_spec=fmt_spec))
537
    if metrics_file:
538
        json.dump(
539
            {metric_code(mname): val for mname, val in metrics.items()},
540
            metrics_file,
541
            indent=2,
542
        )
543
544
545
@cli.command("run")
546
@click.option("--host", type=str, default="127.0.0.1")
547
@click.option("--port", type=int, default=5000)
548
@click_log.simple_verbosity_option(logger)
549
def run_app(**kwargs):
550
    """
551
    Run Annif in server mode for development.
552
    \f
553
    The server is for development purposes only.
554
    """
555
    kwargs = {k: v for k, v in kwargs.items() if v is not None}
556
    cxapp = annif.create_cx_app()
557
    cxapp.run(**kwargs)
558
559
560
FILTER_BATCH_MAX_LIMIT = 15
561
OPTIMIZE_METRICS = ["Precision (doc avg)", "Recall (doc avg)", "F1 score (doc avg)"]
562
563
564
@cli.command("optimize")
565
@cli_util.project_id
566
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
567
@click.option(
568
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
569
)
570
@cli_util.docs_limit_option
571
@cli_util.backend_param_option
572
@cli_util.common_options
573
def run_optimize(project_id, paths, jobs, docs_limit, backend_param):
574
    """
575
    Suggest subjects for documents, testing multiple limits and thresholds.
576
    \f
577
    This command will use different limit (maximum number of subjects) and
578
    score threshold values when assigning subjects to each document given by
579
    ``PATHS`` and compare the results against the gold standard subjects in the
580
    documents. The output is a list of parameter combinations and their scores.
581
    From the output, you can determine the optimum limit and threshold
582
    parameters depending on which measure you want to target.
583
    """
584
    project = cli_util.get_project(project_id)
585
    backend_params = cli_util.parse_backend_params(backend_param, project)
586
    filter_params = cli_util.generate_filter_params(FILTER_BATCH_MAX_LIMIT)
587
588
    import annif.eval
589
590
    corpus = cli_util.open_documents(
591
        paths, project.subjects, project.vocab_lang, docs_limit
592
    )
593
594
    jobs, pool_class = annif.parallel.get_pool(jobs)
595
596
    project.initialize(parallel=True)
597
    psmap = annif.parallel.ProjectSuggestMap(
598
        project.registry,
599
        [project_id],
600
        backend_params,
601
        limit=FILTER_BATCH_MAX_LIMIT,
602
        threshold=0.0,
603
    )
604
605
    ndocs = 0
606
    suggestion_batches = []
607
    subject_set_batches = []
608
    with pool_class(jobs) as pool:
609
        for suggestion_batch, subject_sets in pool.imap_unordered(
610
            psmap.suggest_batch, corpus.doc_batches
611
        ):
612
            ndocs += len(suggestion_batch[project_id])
613
            suggestion_batches.append(suggestion_batch[project_id])
614
            subject_set_batches.append(subject_sets)
615
616
    from annif.suggestion import SuggestionResults
617
618
    orig_suggestion_results = SuggestionResults(suggestion_batches)
619
620
    click.echo("\t".join(("Limit", "Thresh.", "Prec.", "Rec.", "F1")))
621
622
    best_scores = collections.defaultdict(float)
623
    best_params = {}
624
625
    template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"
626
    import annif.eval
627
628
    for limit, threshold in filter_params:
629
        eval_batch = annif.eval.EvaluationBatch(project.subjects)
630
        filtered_results = orig_suggestion_results.filter(limit, threshold)
631
        for batch, subject_sets in zip(filtered_results.batches, subject_set_batches):
632
            eval_batch.evaluate_many(batch, subject_sets)
633
        results = eval_batch.results(metrics=OPTIMIZE_METRICS)
634
        for metric, score in results.items():
635
            if score >= best_scores[metric]:
636
                best_scores[metric] = score
637
                best_params[metric] = (limit, threshold)
638
        click.echo(
639
            template.format(
640
                limit,
641
                threshold,
642
                results["Precision (doc avg)"],
643
                results["Recall (doc avg)"],
644
                results["F1 score (doc avg)"],
645
            )
646
        )
647
648
    click.echo()
649
    template2 = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
650
    for metric in OPTIMIZE_METRICS:
651
        click.echo(
652
            template2.format(
653
                metric,
654
                best_scores[metric],
655
                best_params[metric][0],
656
                best_params[metric][1],
657
            )
658
        )
659
    click.echo("Documents evaluated:\t{}".format(ndocs))
660
661
662
@cli.command("hyperopt")
663
@cli_util.project_id
664
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
665
@click.option("--trials", "-T", default=10, help="Number of trials")
666
@click.option(
667
    "--jobs", "-j", default=1, help="Number of parallel runs (0 means all CPUs)"
668
)
669
@click.option(
670
    "--metric", "-m", default="NDCG", help="Metric to optimize (default: NDCG)"
671
)
672
@click.option(
673
    "--results-file",
674
    "-r",
675
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
676
    help="""Specify file path to write trial results as TSV.
677
    File directory must exist, existing file will be overwritten.""",
678
)
679
@cli_util.docs_limit_option
680
@cli_util.common_options
681
def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric, results_file):
682
    """
683
    Optimize the hyperparameters of a project using validation documents from
684
    ``PATHS``. Not supported by all backends. Output is a list of trial results
685
    and a report of the best performing parameters.
686
    """
687
    proj = cli_util.get_project(project_id)
688
    documents = cli_util.open_documents(
689
        paths, proj.subjects, proj.vocab_lang, docs_limit
690
    )
691
    click.echo(f"Looking for optimal hyperparameters using {trials} trials")
692
    rec = proj.hyperopt(documents, trials, jobs, metric, results_file)
693
    click.echo(f"Got best {metric} score {rec.score:.4f} with:")
694
    click.echo("---")
695
    for line in rec.lines:
696
        click.echo(line)
697
    click.echo("---")
698
699
700
@cli.command("upload")
701
@click.argument("project_ids_pattern", shell_complete=cli_util.complete_param)
702
@click.argument("repo_id")
703
@click.option(
704
    "--token",
705
    help="""Authentication token, obtained from the Hugging Face Hub.
706
    Will default to the stored token.""",
707
)
708
@click.option(
709
    "--revision",
710
    help="""An optional git revision to commit from. Defaults to the head of the "main"
711
    branch.""",
712
)
713
@click.option(
714
    "--commit-message",
715
    help="""The summary / title / first line of the generated commit.""",
716
)
717
@click.option(
718
    "--modelcard/--no-modelcard",
719
    default=True,
720
    help="Update or create a Model Card with upload.",
721
)
722
@cli_util.common_options
723
def run_upload(
724
    project_ids_pattern, repo_id, token, revision, commit_message, modelcard
725
):
726
    """
727
    Upload selected projects and their vocabularies to a Hugging Face Hub repository.
728
    \f
729
    This command zips the project directories and vocabularies of the projects
730
    that match the given `project_ids_pattern` to archive files, and uploads the
731
    archives along with the project configurations to the specified Hugging Face
732
    Hub repository. An authentication token and commit message can be given with
733
    options. If the README.md does not exist in the repository it is
734
    created with default contents and metadata of the uploaded projects, if it exists,
735
    its metadata are updated as necessary.
736
    """
737
    from huggingface_hub import HfApi
738
    from huggingface_hub.utils import HfHubHTTPError, HFValidationError
739
740
    projects = hfh_util.get_matching_projects(project_ids_pattern)
741
    click.echo(f"Uploading project(s): {', '.join([p.project_id for p in projects])}")
742
743
    commit_message = (
744
        commit_message
745
        if commit_message is not None
746
        else f"Upload project(s) {project_ids_pattern} with Annif"
747
    )
748
749
    fobjs, operations = [], []
750
    try:
751
        fobjs, operations = hfh_util.prepare_commits(projects, repo_id, token)
752
        api = HfApi()
753
        api.create_commit(
754
            repo_id=repo_id,
755
            operations=operations,
756
            commit_message=commit_message,
757
            revision=revision,
758
            token=token,
759
        )
760
    except (HfHubHTTPError, HFValidationError) as err:
761
        raise OperationFailedException(str(err))
762
    else:
763
        if modelcard:
764
            hfh_util.upsert_modelcard(repo_id, projects, token, revision)
765
    finally:
766
        for fobj in fobjs:
767
            fobj.close()
768
769
770
@cli.command("download")
771
@click.argument("project_ids_pattern")
772
@click.argument("repo_id")
773
@click.option(
774
    "--token",
775
    help="""Authentication token, obtained from the Hugging Face Hub.
776
    Will default to the stored token.""",
777
)
778
@click.option(
779
    "--revision",
780
    help="""
781
    An optional Git revision id which can be a branch name, a tag, or a commit
782
    hash.
783
    """,
784
)
785
@click.option(
786
    "--force",
787
    "-f",
788
    default=False,
789
    is_flag=True,
790
    help="Replace an existing project/vocabulary/config with the downloaded one",
791
)
792
@click.option(
793
    "--trust-repo",
794
    default=False,
795
    is_flag=True,
796
    help="Allow download from the repository even when it has no entries in the cache",
797
)
798
@cli_util.common_options
799
def run_download(project_ids_pattern, repo_id, token, revision, force, trust_repo):
800
    """
801
    Download selected projects and their vocabularies from a Hugging Face Hub
802
    repository.
803
    \f
804
    This command downloads the project and vocabulary archives and the
805
    configuration files of the projects that match the given
806
    `project_ids_pattern` from the specified Hugging Face Hub repository and
807
    unzips the archives to `data/` directory and places the configuration files
808
    to `projects.d/` directory. An authentication token and revision can be given with
809
    options. If the repository hasn’t been used for downloads previously
810
    (i.e., it doesn’t appear in the Hugging Face Hub cache on local system), the
811
    `--trust-repo` option needs to be used.
812
    """
813
814
    hfh_util.check_is_download_allowed(trust_repo, repo_id)
815
816
    project_ids = hfh_util.get_matching_project_ids_from_hf_hub(
817
        project_ids_pattern, repo_id, token, revision
818
    )
819
    click.echo(f"Downloading project(s): {', '.join(project_ids)}")
820
821
    vocab_ids = set()
822
    for project_id in project_ids:
823
        project_zip_cache_path = hfh_util.download_from_hf_hub(
824
            f"projects/{project_id}.zip", repo_id, token, revision
825
        )
826
        hfh_util.unzip_archive(project_zip_cache_path, force)
827
        config_file_cache_path = hfh_util.download_from_hf_hub(
828
            f"{project_id}.cfg", repo_id, token, revision
829
        )
830
        vocab_ids.add(hfh_util.get_vocab_id_from_config(config_file_cache_path))
831
        hfh_util.copy_project_config(config_file_cache_path, force)
832
833
    for vocab_id in vocab_ids:
834
        vocab_zip_cache_path = hfh_util.download_from_hf_hub(
835
            f"vocabs/{vocab_id}.zip", repo_id, token, revision
836
        )
837
        hfh_util.unzip_archive(vocab_zip_cache_path, force)
838
839
840
@cli.command("completion")
841
@click.option("--bash", "shell", flag_value="bash")
842
@click.option("--zsh", "shell", flag_value="zsh")
843
@click.option("--fish", "shell", flag_value="fish")
844
def run_completion(shell):
845
    """Generate the script for tab-key autocompletion for the given shell. To enable the
846
    completion support in your current bash terminal session run\n
847
        source <(annif completion --bash)
848
849
    To enable the completion support in all new sessions first add the completion script
850
    in your home directory:\n
851
        annif completion --bash > ~/.annif-complete.bash
852
853
    Then make the script to be automatically sourced for new terminal sessions by adding
854
    the following to your ~/.bashrc file (or in some alternative startup file)\n
855
        source ~/.annif-complete.bash
856
    """
857
858
    if shell is None:
859
        raise click.UsageError("Shell not given, try --bash, --zsh or --fish")
860
861
    script = os.popen(f"_ANNIF_COMPLETE={shell}_source annif").read()
862
    click.echo(f"# Generated by Annif {importlib.metadata.version('annif')}")
863
    click.echo(script)
864
865
866
@cli.command("detect-language")
867
@click.argument("languages")
868
@click.argument(
869
    "paths", type=click.Path(dir_okay=False, exists=True, allow_dash=True), nargs=-1
870
)
871
def run_detect_language(languages, paths):
872
    """
873
    Detect the language of a single text document from standard input or for one or more
874
    document file(s) given its/their path(s).
875
    """
876
877
    langs = tuple(languages.split(","))
878
879
    def detect_language_and_show(text, languages):
880
        try:
881
            proportions = detect_language(text, languages)
882
        except ValueError as e:
883
            raise click.UsageError(e)
884
        for lang, score in proportions.items():
885
            if lang == "unk":
886
                lang = "?"
887
            click.echo(f"{lang}\t{score:.04f}")
888
889
    if paths and not (len(paths) == 1 and paths[0] == "-"):
890
        doclist = cli_util.open_text_documents(paths, docs_limit=None)
891
        for doc, path in zip(doclist.documents, paths):
892
            click.echo(f"Detected languages for {path}")
893
            detect_language_and_show(doc.text, langs)
894
    else:
895
        text = sys.stdin.read()
896
        detect_language_and_show(text, langs)
897
898
899
if __name__ == "__main__":
900
    cli()
901