Passed
Pull Request — main (#702)
by Juho
05:50 queued 02:56
created

annif.cli.run_app()   A

Complexity

Conditions 1

Size

Total Lines 13
Code Lines 8

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 8
nop 1
dl 0
loc 13
rs 10
c 0
b 0
f 0
1
"""Definitions for command-line (Click) commands for invoking Annif
2
operations and printing the results to console."""
3
4
import collections
import importlib
import importlib.metadata
import json
import os.path
import re
import sys

import click
import click_log
from flask.cli import FlaskGroup

import annif
import annif.corpus
import annif.parallel
import annif.project
import annif.registry
from annif import cli_util
from annif.exception import NotInitializedException, NotSupportedException
from annif.project import Access
from annif.util import metric_code
24
25
logger = annif.logger
26
click_log.basic_config(logger)
27
28
create_app = annif.create_flask_app
29
cli = FlaskGroup(
30
    create_app=create_app, add_default_commands=False, add_version_option=False
31
)
32
cli = click.version_option(message="%(version)s")(cli)
33
cli.params = [opt for opt in cli.params if opt.name not in ("env_file", "app")]
34
35
36
@cli.command("list-projects")
37
@cli_util.common_options
38
@click_log.simple_verbosity_option(logger, default="ERROR")
39
def run_list_projects():
40
    """
41
    List available projects.
42
    \f
43
    Show a list of currently defined projects. Projects are defined in a
44
    configuration file, normally called ``projects.cfg``. See `Project
45
    configuration
46
    <https://github.com/NatLibFi/Annif/wiki/Project-configuration>`_
47
    for details.
48
    """
49
50
    column_headings = (
51
        "Project ID",
52
        "Project Name",
53
        "Vocabulary ID",
54
        "Language",
55
        "Trained",
56
        "Modification time",
57
    )
58
    table = [
59
        (
60
            proj.project_id,
61
            proj.name,
62
            proj.vocab.vocab_id if proj.vocab_spec else "-",
63
            proj.language,
64
            str(proj.is_trained),
65
            cli_util.format_datetime(proj.modification_time),
66
        )
67
        for proj in annif.registry.get_projects(min_access=Access.private).values()
68
    ]
69
    template = cli_util.make_list_template(column_headings, *table)
70
    header = template.format(*column_headings)
71
    click.echo(header)
72
    click.echo("-" * len(header))
73
    for row in table:
74
        click.echo(template.format(*row))
75
76
77
@cli.command("show-project")
78
@cli_util.project_id
79
@cli_util.common_options
80
def run_show_project(project_id):
81
    """
82
    Show information about a project.
83
    """
84
85
    proj = cli_util.get_project(project_id)
86
    click.echo(f"Project ID:        {proj.project_id}")
87
    click.echo(f"Project Name:      {proj.name}")
88
    click.echo(f"Language:          {proj.language}")
89
    click.echo(f"Vocabulary:        {proj.vocab.vocab_id}")
90
    click.echo(f"Vocab language:    {proj.vocab_lang}")
91
    click.echo(f"Access:            {proj.access.name}")
92
    click.echo(f"Backend:           {proj.backend.name}")
93
    click.echo(f"Trained:           {proj.is_trained}")
94
    click.echo(f"Modification time: {cli_util.format_datetime(proj.modification_time)}")
95
96
97
@cli.command("clear")
98
@cli_util.project_id
99
@cli_util.common_options
100
def run_clear_project(project_id):
101
    """
102
    Initialize the project to its original, untrained state.
103
    """
104
    proj = cli_util.get_project(project_id)
105
    proj.remove_model_data()
106
107
108
@cli.command("list-vocabs")
109
@cli_util.common_options
110
@click_log.simple_verbosity_option(logger, default="ERROR")
111
def run_list_vocabs():
112
    """
113
    List available vocabularies.
114
    """
115
116
    column_headings = ("Vocabulary ID", "Languages", "Size", "Loaded")
117
    table = []
118
    for vocab in annif.registry.get_vocabs(min_access=Access.private).values():
119
        try:
120
            languages = ",".join(sorted(vocab.languages))
121
            size = len(vocab)
122
            loaded = True
123
        except NotInitializedException:
124
            languages = "-"
125
            size = "-"
126
            loaded = False
127
        row = (vocab.vocab_id, languages, str(size), str(loaded))
128
        table.append(row)
129
130
    template = cli_util.make_list_template(column_headings, *table)
131
    header = template.format(*column_headings)
132
    click.echo(header)
133
    click.echo("-" * len(header))
134
    for row in table:
135
        click.echo(template.format(*row))
136
137
138
@cli.command("load-vocab")
139
@click.argument("vocab_id", shell_complete=cli_util.complete_param)
140
@click.argument("subjectfile", type=click.Path(exists=True, dir_okay=False))
141
@click.option("--language", "-L", help="Language of subject file")
142
@click.option(
143
    "--force",
144
    "-f",
145
    default=False,
146
    is_flag=True,
147
    help="Replace existing vocabulary completely instead of updating it",
148
)
149
@cli_util.common_options
150
def run_load_vocab(vocab_id, language, force, subjectfile):
151
    """
152
    Load a vocabulary from a subject file.
153
    """
154
    vocab = cli_util.get_vocab(vocab_id)
155
    if annif.corpus.SubjectFileSKOS.is_rdf_file(subjectfile):
156
        # SKOS/RDF file supported by rdflib
157
        subjects = annif.corpus.SubjectFileSKOS(subjectfile)
158
        click.echo(f"Loading vocabulary from SKOS file {subjectfile}...")
159
    elif annif.corpus.SubjectFileCSV.is_csv_file(subjectfile):
160
        # CSV file
161
        subjects = annif.corpus.SubjectFileCSV(subjectfile)
162
        click.echo(f"Loading vocabulary from CSV file {subjectfile}...")
163
    else:
164
        # probably a TSV file - we need to know its language
165
        if not language:
166
            click.echo(
167
                "Please use --language option to set the language of a TSV vocabulary.",
168
                err=True,
169
            )
170
            sys.exit(1)
171
        click.echo(f"Loading vocabulary from TSV file {subjectfile}...")
172
        subjects = annif.corpus.SubjectFileTSV(subjectfile, language)
173
    vocab.load_vocabulary(subjects, force=force)
174
175
176
@cli.command("train")
177
@cli_util.project_id
178
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
179
@click.option(
180
    "--cached/--no-cached",
181
    "-c/-C",
182
    default=False,
183
    help="Reuse preprocessed training data from previous run",
184
)
185
@click.option(
186
    "--jobs",
187
    "-j",
188
    default=0,
189
    help="Number of parallel jobs (0 means choose automatically)",
190
)
191
@cli_util.docs_limit_option
192
@cli_util.backend_param_option
193
@cli_util.common_options
194
def run_train(project_id, paths, cached, docs_limit, jobs, backend_param):
195
    """
196
    Train a project on a collection of documents.
197
    \f
198
    This will train the project using the documents from ``PATHS`` (directories
199
    or possibly gzipped TSV files) in a single batch operation. If ``--cached``
200
    is set, preprocessed training data from the previous run is reused instead
201
    of documents input; see `Reusing preprocessed training data
202
    <https://github.com/NatLibFi/Annif/wiki/
203
    Reusing-preprocessed-training-data>`_.
204
    """
205
    proj = cli_util.get_project(project_id)
206
    backend_params = cli_util.parse_backend_params(backend_param, proj)
207
    if cached:
208
        if len(paths) > 0:
209
            raise click.UsageError(
210
                "Corpus paths cannot be given when using --cached option."
211
            )
212
        documents = "cached"
213
    else:
214
        documents = cli_util.open_documents(
215
            paths, proj.subjects, proj.vocab_lang, docs_limit
216
        )
217
    proj.train(documents, backend_params, jobs)
218
219
220
@cli.command("learn")
221
@cli_util.project_id
222
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
223
@cli_util.docs_limit_option
224
@cli_util.backend_param_option
225
@cli_util.common_options
226
def run_learn(project_id, paths, docs_limit, backend_param):
227
    """
228
    Further train an existing project on a collection of documents.
229
    \f
230
    Similar to the ``train`` command. This will continue training an already
231
    trained project using the documents given by ``PATHS`` in a single batch
232
    operation. Not supported by all backends.
233
    """
234
    proj = cli_util.get_project(project_id)
235
    backend_params = cli_util.parse_backend_params(backend_param, proj)
236
    documents = cli_util.open_documents(
237
        paths, proj.subjects, proj.vocab_lang, docs_limit
238
    )
239
    proj.learn(documents, backend_params)
240
241
242
@cli.command("suggest")
243
@cli_util.project_id
244
@click.argument(
245
    "paths", type=click.Path(dir_okay=False, exists=True, allow_dash=True), nargs=-1
246
)
247
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
248
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
249
@click.option("--language", "-L", help="Language of subject labels")
250
@cli_util.docs_limit_option
251
@cli_util.backend_param_option
252
@cli_util.common_options
253
def run_suggest(
254
    project_id, paths, limit, threshold, language, backend_param, docs_limit
255
):
256
    """
257
    Suggest subjects for a single document from standard input or for one or more
258
    document file(s) given its/their path(s).
259
    \f
260
    This will read a text document from standard input and suggest subjects for
261
    it, or if given path(s) to file(s), suggest subjects for it/them.
262
    """
263
    project = cli_util.get_project(project_id)
264
    lang = language or project.vocab_lang
265
    if lang not in project.vocab.languages:
266
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
267
    backend_params = cli_util.parse_backend_params(backend_param, project)
268
269
    if paths and not (len(paths) == 1 and paths[0] == "-"):
270
        docs = cli_util.open_text_documents(paths, docs_limit)
271
        results = project.suggest_corpus(docs, backend_params).filter(limit, threshold)
272
        for (
273
            suggestions,
274
            path,
275
        ) in zip(results, paths):
276
            click.echo(f"Suggestions for {path}")
277
            cli_util.show_hits(suggestions, project, lang)
278
    else:
279
        text = sys.stdin.read()
280
        suggestions = project.suggest([text], backend_params).filter(limit, threshold)[
281
            0
282
        ]
283
        cli_util.show_hits(suggestions, project, lang)
284
285
286
@cli.command("index")
287
@cli_util.project_id
288
@click.argument("directory", type=click.Path(exists=True, file_okay=False))
289
@click.option(
290
    "--suffix", "-s", default=".annif", help="File name suffix for result files"
291
)
292
@click.option(
293
    "--force/--no-force",
294
    "-f/-F",
295
    default=False,
296
    help="Force overwriting of existing result files",
297
)
298
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
299
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
300
@click.option("--language", "-L", help="Language of subject labels")
301
@cli_util.backend_param_option
302
@cli_util.common_options
303
def run_index(
304
    project_id, directory, suffix, force, limit, threshold, language, backend_param
305
):
306
    """
307
    Index a directory with documents, suggesting subjects for each document.
308
    Write the results in TSV files with the given suffix (``.annif`` by
309
    default).
310
    """
311
    project = cli_util.get_project(project_id)
312
    lang = language or project.vocab_lang
313
    if lang not in project.vocab.languages:
314
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
315
    backend_params = cli_util.parse_backend_params(backend_param, project)
316
317
    documents = annif.corpus.DocumentDirectory(directory, require_subjects=False)
318
    results = project.suggest_corpus(documents, backend_params).filter(limit, threshold)
319
320
    for (docfilename, _), suggestions in zip(documents, results):
321
        subjectfilename = re.sub(r"\.txt$", suffix, docfilename)
322
        if os.path.exists(subjectfilename) and not force:
323
            click.echo(
324
                "Not overwriting {} (use --force to override)".format(subjectfilename)
325
            )
326
            continue
327
        with open(subjectfilename, "w", encoding="utf-8") as subjfile:
328
            cli_util.show_hits(suggestions, project, lang, file=subjfile)
329
330
331
@cli.command("eval")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option(
    "--metric",
    "-m",
    default=[],
    multiple=True,
    help="Metric to calculate (default: all)",
)
@click.option(
    "--metrics-file",
    "-M",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write evaluation metrics in JSON format.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write non-aggregated results per subject.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_eval(
    project_id,
    paths,
    limit,
    threshold,
    docs_limit,
    metric,
    metrics_file,
    results_file,
    jobs,
    backend_param,
):
    """
    Suggest subjects for documents and evaluate the results by comparing
    against a gold standard.
    \f
    With this command the documents from ``PATHS`` (directories or possibly
    gzipped TSV files) will be assigned subject suggestions and then
    statistical measures are calculated that quantify how well the suggested
    subjects match the gold-standard subjects in the documents.

    Normally the output is the list of the metrics calculated across documents.
    If ``--results-file <FILENAME>`` option is given, the metrics are
    calculated separately for each subject, and written to the given file.
    """

    project = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, project)

    # Imported lazily so plain CLI startup stays fast.
    import annif.eval

    eval_batch = annif.eval.EvaluationBatch(project.subjects)

    if results_file:
        # Probe the lazily-opened file immediately so a bad path fails fast,
        # before any expensive suggestion work has been done.
        try:
            print("", end="", file=results_file)
            click.echo(
                "Writing per subject evaluation results to {!s}".format(
                    results_file.name
                )
            )
        except Exception as e:
            raise NotSupportedException(
                "cannot open results-file for writing: " + str(e)
            )
    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )
    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry, [project_id], backend_params, limit, threshold
    )

    # Suggest in parallel; evaluation itself is accumulated sequentially.
    with pool_class(jobs) as pool:
        for hit_sets, subject_sets in pool.imap_unordered(
            psmap.suggest_batch, corpus.doc_batches
        ):
            eval_batch.evaluate_many(hit_sets[project_id], subject_sets)

    template = "{0:<30}\t{1:{fmt_spec}}"
    metrics = eval_batch.results(
        metrics=metric, results_file=results_file, language=project.vocab_lang
    )
    for metric_name, score in metrics.items():
        # Pick a format spec by value type, with a default-format fallback so
        # fmt_spec is always defined (previously a score that was neither int
        # nor float caused an UnboundLocalError). The loop variable no longer
        # shadows the `metric` option parameter.
        if isinstance(score, int):
            fmt_spec = "d"
        elif isinstance(score, float):
            fmt_spec = ".04f"
        else:
            fmt_spec = ""
        click.echo(template.format(metric_name + ":", score, fmt_spec=fmt_spec))
    if metrics_file:
        json.dump(
            {metric_code(mname): val for mname, val in metrics.items()},
            metrics_file,
            indent=2,
        )
440
441
442
@cli.command("run")
443
@click.option("--port", type=int, default=5000)
444
@click.option("--log-level")
445
@click_log.simple_verbosity_option(logger, default="ERROR")
446
def run_app(**kwargs):
447
    """
448
    Run Annif in server mode for development.
449
    \f
450
    The server is for development purposes only.
451
    """
452
    kwargs = {k: v for k, v in kwargs.items() if v is not None}
453
    cxapp = annif.create_cx_app()
454
    cxapp.run(**kwargs)
455
456
457
FILTER_BATCH_MAX_LIMIT = 15
458
OPTIMIZE_METRICS = ["Precision (doc avg)", "Recall (doc avg)", "F1 score (doc avg)"]
459
460
461
@cli.command("optimize")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_optimize(project_id, paths, jobs, docs_limit, backend_param):
    """
    Suggest subjects for documents, testing multiple limits and thresholds.
    \f
    This command will use different limit (maximum number of subjects) and
    score threshold values when assigning subjects to each document given by
    ``PATHS`` and compare the results against the gold standard subjects in the
    documents. The output is a list of parameter combinations and their scores.
    From the output, you can determine the optimum limit and threshold
    parameters depending on which measure you want to target.
    """
    project = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, project)
    filter_params = cli_util.generate_filter_params(FILTER_BATCH_MAX_LIMIT)

    # Imported lazily so plain CLI startup stays fast. (The duplicate import
    # that previously appeared again before the evaluation loop was removed.)
    import annif.eval

    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )

    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry,
        [project_id],
        backend_params,
        limit=FILTER_BATCH_MAX_LIMIT,
        threshold=0.0,
    )

    # Run the expensive suggest step once with the loosest filter; each
    # (limit, threshold) combination is evaluated later by re-filtering
    # these cached suggestion batches.
    ndocs = 0
    suggestion_batches = []
    subject_set_batches = []
    with pool_class(jobs) as pool:
        for suggestion_batch, subject_sets in pool.imap_unordered(
            psmap.suggest_batch, corpus.doc_batches
        ):
            ndocs += len(suggestion_batch[project_id])
            suggestion_batches.append(suggestion_batch[project_id])
            subject_set_batches.append(subject_sets)

    from annif.suggestion import SuggestionResults

    orig_suggestion_results = SuggestionResults(suggestion_batches)

    click.echo("\t".join(("Limit", "Thresh.", "Prec.", "Rec.", "F1")))

    best_scores = collections.defaultdict(float)
    best_params = {}

    template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"
    for limit, threshold in filter_params:
        eval_batch = annif.eval.EvaluationBatch(project.subjects)
        filtered_results = orig_suggestion_results.filter(limit, threshold)
        for batch, subject_sets in zip(filtered_results.batches, subject_set_batches):
            eval_batch.evaluate_many(batch, subject_sets)
        results = eval_batch.results(metrics=OPTIMIZE_METRICS)
        # Track the best score (and its parameters) seen so far per metric.
        for metric, score in results.items():
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = (limit, threshold)
        click.echo(
            template.format(
                limit,
                threshold,
                results["Precision (doc avg)"],
                results["Recall (doc avg)"],
                results["F1 score (doc avg)"],
            )
        )

    click.echo()
    template2 = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    for metric in OPTIMIZE_METRICS:
        click.echo(
            template2.format(
                metric,
                best_scores[metric],
                best_params[metric][0],
                best_params[metric][1],
            )
        )
    click.echo("Documents evaluated:\t{}".format(ndocs))
557
558
559
@cli.command("hyperopt")
560
@cli_util.project_id
561
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
562
@click.option("--trials", "-T", default=10, help="Number of trials")
563
@click.option(
564
    "--jobs", "-j", default=1, help="Number of parallel runs (0 means all CPUs)"
565
)
566
@click.option(
567
    "--metric", "-m", default="NDCG", help="Metric to optimize (default: NDCG)"
568
)
569
@click.option(
570
    "--results-file",
571
    "-r",
572
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
573
    help="""Specify file path to write trial results as CSV.
574
    File directory must exist, existing file will be overwritten.""",
575
)
576
@cli_util.docs_limit_option
577
@cli_util.common_options
578
def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric, results_file):
579
    """
580
    Optimize the hyperparameters of a project using validation documents from
581
    ``PATHS``. Not supported by all backends. Output is a list of trial results
582
    and a report of the best performing parameters.
583
    """
584
    proj = cli_util.get_project(project_id)
585
    documents = cli_util.open_documents(
586
        paths, proj.subjects, proj.vocab_lang, docs_limit
587
    )
588
    click.echo(f"Looking for optimal hyperparameters using {trials} trials")
589
    rec = proj.hyperopt(documents, trials, jobs, metric, results_file)
590
    click.echo(f"Got best {metric} score {rec.score:.4f} with:")
591
    click.echo("---")
592
    for line in rec.lines:
593
        click.echo(line)
594
    click.echo("---")
595
596
597
@cli.command("completion")
@click.option("--bash", "shell", flag_value="bash")
@click.option("--zsh", "shell", flag_value="zsh")
@click.option("--fish", "shell", flag_value="fish")
def run_completion(shell):
    """Generate the script for tab-key autocompletion for the given shell. To enable the
    completion support in your current bash terminal session run\n
        source <(annif completion --bash)
    """

    if shell is None:
        raise click.UsageError("Shell not given, try --bash, --zsh or --fish")

    # Click prints the completion script and exits when the program is run
    # with _ANNIF_COMPLETE=<shell>_source in the environment. The shell value
    # is restricted to the flag values above, so the command string is safe.
    script = os.popen(f"_ANNIF_COMPLETE={shell}_source annif").read()
    # importlib.metadata is explicitly imported at the top of the file;
    # previously this attribute access relied on some other module having
    # imported the submodule as a side effect.
    click.echo(f"# Generated by Annif {importlib.metadata.version('annif')}")
    click.echo(script)
613
614
615
if __name__ == "__main__":
616
    cli()
617