Passed
Push — issue686-cli-command-list-proj... ( 267ee5...0cc9fe )
by Juho
05:55 queued 03:04
created

annif.cli.completion()   A

Complexity

Conditions 2

Size

Total Lines 16
Code Lines 10

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 2
eloc 10
nop 1
dl 0
loc 16
rs 9.9
c 0
b 0
f 0
1
"""Definitions for command-line (Click) commands for invoking Annif
2
operations and printing the results to console."""
3
4
5
import collections
6
import importlib
7
import json
8
import os.path
9
import re
10
import sys
11
12
import click
13
import click_log
14
from flask.cli import FlaskGroup
15
16
import annif
17
import annif.corpus
18
import annif.parallel
19
import annif.project
20
import annif.registry
21
from annif import cli_util
22
from annif.exception import NotInitializedException, NotSupportedException
23
from annif.project import Access
24
from annif.util import metric_code
25
26
logger = annif.logger
27
click_log.basic_config(logger)
28
29
cli = FlaskGroup(create_app=annif.create_app, add_version_option=False)
30
cli = click.version_option(message="%(version)s")(cli)
31
32
33
@cli.command("list-projects")
@cli_util.common_options
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_list_projects():
    """
    List available projects.
    \f
    Show a list of currently defined projects. Projects are defined in a
    configuration file, normally called ``projects.cfg``. See `Project
    configuration
    <https://github.com/NatLibFi/Annif/wiki/Project-configuration>`_
    for details.
    """

    columns = (
        "Project ID",
        "Project Name",
        "Vocabulary ID",
        "Language",
        "Trained",
        "Modification time",
    )
    # Collect one row of display strings per project, including private ones.
    rows = []
    for proj in annif.registry.get_projects(min_access=Access.private).values():
        vocab_id = proj.vocab.vocab_id if proj.vocab_spec else "-"
        rows.append(
            (
                proj.project_id,
                proj.name,
                vocab_id,
                proj.language,
                str(proj.is_trained),
                cli_util.format_datetime(proj.modification_time),
            )
        )
    # Build a column-width-aware template from the headings and all rows.
    template = cli_util.make_list_template(columns, *rows)
    header = template.format(*columns)
    click.echo(header)
    click.echo("-" * len(header))
    for row in rows:
        click.echo(template.format(*row))
72
73
74
@cli.command("show-project")
@cli_util.project_id
@cli_util.common_options
def run_show_project(project_id):
    """
    Show information about a project.
    """

    proj = cli_util.get_project(project_id)
    # Label/value pairs printed in a fixed order; labels are left-padded to
    # 19 characters so the values line up in one column.
    fields = (
        ("Project ID:", proj.project_id),
        ("Project Name:", proj.name),
        ("Language:", proj.language),
        ("Vocabulary:", proj.vocab.vocab_id),
        ("Vocab language:", proj.vocab_lang),
        ("Access:", proj.access.name),
        ("Backend:", proj.backend.name),
        ("Trained:", proj.is_trained),
        ("Modification time:", cli_util.format_datetime(proj.modification_time)),
    )
    for label, value in fields:
        click.echo(f"{label:<19}{value}")
92
93
94
@cli.command("clear")
@cli_util.project_id
@cli_util.common_options
def run_clear_project(project_id):
    """
    Initialize the project to its original, untrained state.
    """
    # Look up the project and discard any trained model data it holds.
    cli_util.get_project(project_id).remove_model_data()
103
104
105
@cli.command("list-vocabs")
@cli_util.common_options
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_list_vocabs():
    """
    List available vocabularies.
    """

    column_headings = ("Vocabulary ID", "Languages", "Size", "Loaded")
    rows = []
    for vocab in annif.registry.get_vocabs(min_access=Access.private).values():
        # A vocabulary that has not been loaded yet cannot report its
        # languages or size; show placeholders for it instead.
        try:
            langs = ",".join(sorted(vocab.languages))
            size = str(len(vocab))
            loaded = str(True)
        except NotInitializedException:
            langs, size, loaded = "-", "-", str(False)
        rows.append((vocab.vocab_id, langs, size, loaded))

    template = cli_util.make_list_template(column_headings, *rows)
    header = template.format(*column_headings)
    click.echo(header)
    click.echo("-" * len(header))
    for row in rows:
        click.echo(template.format(*row))
133
134
135
@cli.command("load-vocab")
@click.argument("vocab_id", shell_complete=cli_util.complete_param)
@click.argument("subjectfile", type=click.Path(exists=True, dir_okay=False))
@click.option("--language", "-L", help="Language of subject file")
@click.option(
    "--force",
    "-f",
    default=False,
    is_flag=True,
    help="Replace existing vocabulary completely instead of updating it",
)
@cli_util.common_options
def run_load_vocab(vocab_id, language, force, subjectfile):
    """
    Load a vocabulary from a subject file.
    """
    vocab = cli_util.get_vocab(vocab_id)
    # The file format is detected by content: SKOS/RDF first, then CSV;
    # anything else is treated as TSV, which requires an explicit language.
    if annif.corpus.SubjectFileSKOS.is_rdf_file(subjectfile):
        subjects = annif.corpus.SubjectFileSKOS(subjectfile)
        click.echo(f"Loading vocabulary from SKOS file {subjectfile}...")
    elif annif.corpus.SubjectFileCSV.is_csv_file(subjectfile):
        subjects = annif.corpus.SubjectFileCSV(subjectfile)
        click.echo(f"Loading vocabulary from CSV file {subjectfile}...")
    elif language:
        click.echo(f"Loading vocabulary from TSV file {subjectfile}...")
        subjects = annif.corpus.SubjectFileTSV(subjectfile, language)
    else:
        # TSV files carry no language information of their own.
        click.echo(
            "Please use --language option to set the language of a TSV vocabulary.",
            err=True,
        )
        sys.exit(1)
    vocab.load_vocabulary(subjects, force=force)
171
172
173
@cli.command("train")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--cached/--no-cached",
    "-c/-C",
    default=False,
    help="Reuse preprocessed training data from previous run",
)
@click.option(
    "--jobs",
    "-j",
    default=0,
    help="Number of parallel jobs (0 means choose automatically)",
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_train(project_id, paths, cached, docs_limit, jobs, backend_param):
    """
    Train a project on a collection of documents.
    \f
    This will train the project using the documents from ``PATHS`` (directories
    or possibly gzipped TSV files) in a single batch operation. If ``--cached``
    is set, preprocessed training data from the previous run is reused instead
    of documents input; see `Reusing preprocessed training data
    <https://github.com/NatLibFi/Annif/wiki/
    Reusing-preprocessed-training-data>`_.
    """
    proj = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, proj)
    if not cached:
        documents = cli_util.open_documents(
            paths, proj.subjects, proj.vocab_lang, docs_limit
        )
    elif paths:
        # Corpus paths and --cached are mutually exclusive inputs.
        raise click.UsageError(
            "Corpus paths cannot be given when using --cached option."
        )
    else:
        # The "cached" sentinel tells the project to reuse its stored data.
        documents = "cached"
    proj.train(documents, backend_params, jobs)
215
216
217
@cli.command("learn")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_learn(project_id, paths, docs_limit, backend_param):
    """
    Further train an existing project on a collection of documents.
    \f
    Similar to the ``train`` command. This will continue training an already
    trained project using the documents given by ``PATHS`` in a single batch
    operation. Not supported by all backends.
    """
    proj = cli_util.get_project(project_id)
    params = cli_util.parse_backend_params(backend_param, proj)
    corpus = cli_util.open_documents(paths, proj.subjects, proj.vocab_lang, docs_limit)
    proj.learn(corpus, params)
237
238
239
@cli.command("suggest")
@cli_util.project_id
@click.argument(
    "paths", type=click.Path(dir_okay=False, exists=True, allow_dash=True), nargs=-1
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_suggest(
    project_id, paths, limit, threshold, language, backend_param, docs_limit
):
    """
    Suggest subjects for a single document from standard input or for one or more
    document file(s) given its/their path(s).
    \f
    This will read a text document from standard input and suggest subjects for
    it, or if given path(s) to file(s), suggest subjects for it/them.
    """
    project = cli_util.get_project(project_id)
    # Subject labels are shown in the vocabulary language unless overridden.
    lang = language or project.vocab_lang
    if lang not in project.vocab.languages:
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
    backend_params = cli_util.parse_backend_params(backend_param, project)

    # No paths, or the single path "-", means: read one document from stdin.
    from_stdin = not paths or (len(paths) == 1 and paths[0] == "-")
    if from_stdin:
        text = sys.stdin.read()
        filtered = project.suggest([text], backend_params).filter(limit, threshold)
        cli_util.show_hits(filtered[0], project, lang)
    else:
        docs = cli_util.open_text_documents(paths, docs_limit)
        results = project.suggest_corpus(docs, backend_params).filter(limit, threshold)
        for suggestions, path in zip(results, paths):
            click.echo(f"Suggestions for {path}")
            cli_util.show_hits(suggestions, project, lang)
281
282
283
@cli.command("index")
@cli_util.project_id
@click.argument("directory", type=click.Path(exists=True, file_okay=False))
@click.option(
    "--suffix", "-s", default=".annif", help="File name suffix for result files"
)
@click.option(
    "--force/--no-force",
    "-f/-F",
    default=False,
    help="Force overwriting of existing result files",
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@cli_util.backend_param_option
@cli_util.common_options
def run_index(
    project_id, directory, suffix, force, limit, threshold, language, backend_param
):
    """
    Index a directory with documents, suggesting subjects for each document.
    Write the results in TSV files with the given suffix (``.annif`` by
    default).
    """
    project = cli_util.get_project(project_id)
    lang = language or project.vocab_lang
    if lang not in project.vocab.languages:
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
    backend_params = cli_util.parse_backend_params(backend_param, project)

    # Gold-standard subjects are not needed here, only the document texts.
    corpus = annif.corpus.DocumentDirectory(
        directory, None, None, require_subjects=False
    )
    results = project.suggest_corpus(corpus, backend_params).filter(limit, threshold)

    for (docfilename, _), suggestions in zip(corpus, results):
        # Result file sits next to the document, with .txt swapped for suffix.
        outfilename = re.sub(r"\.txt$", suffix, docfilename)
        if not force and os.path.exists(outfilename):
            click.echo(
                "Not overwriting {} (use --force to override)".format(outfilename)
            )
            continue
        with open(outfilename, "w", encoding="utf-8") as outfile:
            cli_util.show_hits(suggestions, project, lang, file=outfile)
328
329
330
@cli.command("eval")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option(
    "--metric",
    "-m",
    default=[],
    multiple=True,
    help="Metric to calculate (default: all)",
)
@click.option(
    "--metrics-file",
    "-M",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write evaluation metrics in JSON format.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write non-aggregated results per subject.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_eval(
    project_id,
    paths,
    limit,
    threshold,
    docs_limit,
    metric,
    metrics_file,
    results_file,
    jobs,
    backend_param,
):
    """
    Suggest subjects for documents and evaluate the results by comparing
    against a gold standard.
    \f
    With this command the documents from ``PATHS`` (directories or possibly
    gzipped TSV files) will be assigned subject suggestions and then
    statistical measures are calculated that quantify how well the suggested
    subjects match the gold-standard subjects in the documents.

    Normally the output is the list of the metrics calculated across documents.
    If ``--results-file <FILENAME>`` option is given, the metrics are
    calculated separately for each subject, and written to the given file.
    """

    project = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, project)

    # Function-scope import — presumably to keep evaluation dependencies out
    # of the startup path of other commands; TODO confirm.
    import annif.eval

    eval_batch = annif.eval.EvaluationBatch(project.subjects)

    if results_file:
        # click.File(lazy=True) defers opening; the empty write forces the
        # file open now so a bad path fails before the long evaluation run.
        try:
            print("", end="", file=results_file)
            click.echo(
                "Writing per subject evaluation results to {!s}".format(
                    results_file.name
                )
            )
        except Exception as e:
            raise NotSupportedException(
                "cannot open results-file for writing: " + str(e)
            )
    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )
    # get_pool maps the --jobs value to a concrete job count and pool class.
    jobs, pool_class = annif.parallel.get_pool(jobs)

    # Prepare the project for use from pool worker processes.
    project.initialize(parallel=True)
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry, [project_id], backend_params, limit, threshold
    )

    # Batches may arrive in any order; every batch is folded into the same
    # accumulator, so ordering does not affect the aggregate metrics.
    with pool_class(jobs) as pool:
        for hit_sets, subject_sets in pool.imap_unordered(
            psmap.suggest_batch, corpus.doc_batches
        ):
            eval_batch.evaluate_many(hit_sets[project_id], subject_sets)

    template = "{0:<30}\t{1}"
    metrics = eval_batch.results(
        metrics=metric, results_file=results_file, language=project.vocab_lang
    )
    # NOTE(review): the loop variable shadows the --metric option tuple from
    # here on; harmless since the option is no longer read, but confusing.
    for metric, score in metrics.items():
        click.echo(template.format(metric + ":", score))
    if metrics_file:
        json.dump(
            {metric_code(mname): val for mname, val in metrics.items()},
            metrics_file,
            indent=2,
        )
435
436
437
FILTER_BATCH_MAX_LIMIT = 15
438
OPTIMIZE_METRICS = ["Precision (doc avg)", "Recall (doc avg)", "F1 score (doc avg)"]
439
440
441
@cli.command("optimize")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_optimize(project_id, paths, jobs, docs_limit, backend_param):
    """
    Suggest subjects for documents, testing multiple limits and thresholds.
    \f
    This command will use different limit (maximum number of subjects) and
    score threshold values when assigning subjects to each document given by
    ``PATHS`` and compare the results against the gold standard subjects in the
    documents. The output is a list of parameter combinations and their scores.
    From the output, you can determine the optimum limit and threshold
    parameters depending on which measure you want to target.
    """
    project = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, project)
    filter_params = cli_util.generate_filter_params(FILTER_BATCH_MAX_LIMIT)

    # Function-scope import — presumably to keep evaluation dependencies out
    # of the startup path of other commands; TODO confirm. (A second,
    # redundant `import annif.eval` further down has been removed.)
    import annif.eval

    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )

    # get_pool maps the --jobs value to a concrete job count and pool class.
    jobs, pool_class = annif.parallel.get_pool(jobs)

    # Prepare the project for use from pool worker processes.
    project.initialize(parallel=True)
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry,
        [project_id],
        backend_params,
        limit=FILTER_BATCH_MAX_LIMIT,
        threshold=0.0,
    )

    # Fetch suggestions once, unfiltered up to the maximum limit; each
    # (limit, threshold) combination below re-filters these cached results
    # instead of running the backend again.
    ndocs = 0
    suggestion_batches = []
    subject_set_batches = []
    with pool_class(jobs) as pool:
        for suggestion_batch, subject_sets in pool.imap_unordered(
            psmap.suggest_batch, corpus.doc_batches
        ):
            ndocs += len(suggestion_batch[project_id])
            suggestion_batches.append(suggestion_batch[project_id])
            subject_set_batches.append(subject_sets)

    from annif.suggestion import SuggestionResults

    orig_suggestion_results = SuggestionResults(suggestion_batches)

    click.echo("\t".join(("Limit", "Thresh.", "Prec.", "Rec.", "F1")))

    # Track the best score seen so far for each metric, and the parameter
    # combination that produced it.
    best_scores = collections.defaultdict(float)
    best_params = {}

    template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"

    for limit, threshold in filter_params:
        eval_batch = annif.eval.EvaluationBatch(project.subjects)
        filtered_results = orig_suggestion_results.filter(limit, threshold)
        for batch, subject_sets in zip(filtered_results.batches, subject_set_batches):
            eval_batch.evaluate_many(batch, subject_sets)
        results = eval_batch.results(metrics=OPTIMIZE_METRICS)
        for metric, score in results.items():
            # ">=" means a tie prefers the later (larger limit / threshold)
            # combination, matching the original behavior.
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = (limit, threshold)
        click.echo(
            template.format(
                limit,
                threshold,
                results["Precision (doc avg)"],
                results["Recall (doc avg)"],
                results["F1 score (doc avg)"],
            )
        )

    click.echo()
    template2 = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    for metric in OPTIMIZE_METRICS:
        click.echo(
            template2.format(
                metric,
                best_scores[metric],
                best_params[metric][0],
                best_params[metric][1],
            )
        )
    click.echo("Documents evaluated:\t{}".format(ndocs))
537
538
539
@cli.command("hyperopt")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--trials", "-T", default=10, help="Number of trials")
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel runs (0 means all CPUs)"
)
@click.option(
    "--metric", "-m", default="NDCG", help="Metric to optimize (default: NDCG)"
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file path to write trial results as CSV.
    File directory must exist, existing file will be overwritten.""",
)
@cli_util.docs_limit_option
@cli_util.common_options
def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric, results_file):
    """
    Optimize the hyperparameters of a project using validation documents from
    ``PATHS``. Not supported by all backends. Output is a list of trial results
    and a report of the best performing parameters.
    """
    proj = cli_util.get_project(project_id)
    corpus = cli_util.open_documents(paths, proj.subjects, proj.vocab_lang, docs_limit)
    click.echo(f"Looking for optimal hyperparameters using {trials} trials")
    # The backend performs the search and returns a recommendation record
    # with the winning score and configuration lines.
    rec = proj.hyperopt(corpus, trials, jobs, metric, results_file)
    click.echo(f"Got best {metric} score {rec.score:.4f} with:")
    click.echo("---")
    for line in rec.lines:
        click.echo(line)
    click.echo("---")
575
576
577
@cli.command("completion")
@click.option("--bash", "shell", flag_value="bash")
@click.option("--zsh", "shell", flag_value="zsh")
@click.option("--fish", "shell", flag_value="fish")
def completion(shell):
    """Generate the script for tab-key autocompletion for the given shell. To enable the
    completion support in your current bash terminal session run\n
        source <(annif completion --bash)
    """

    if shell is None:
        raise click.UsageError("Shell not given, try --bash, --zsh or --fish")

    # Explicit submodule import: the module-level `import importlib` does not
    # guarantee that the `importlib.metadata` attribute is available — it only
    # happens to work when some other import has already loaded the submodule.
    import importlib.metadata

    # Delegate to Click's completion machinery by re-invoking annif in a
    # subshell with the _ANNIF_COMPLETE environment variable set.
    script = os.popen(f"_ANNIF_COMPLETE={shell}_source annif").read()
    click.echo(f"# Generated by Annif {importlib.metadata.version('annif')}")
    click.echo(script)
593
594
595
# Allow running this module directly as a script; normally the `annif`
# console entry point invokes `cli` instead.
if __name__ == "__main__":
    cli()
597