Passed
Pull Request — main (#702)
by Juho
06:19 queued 03:07
created

annif.cli.run_app()   A

Complexity

Conditions 1

Size

Total Lines 13
Code Lines 8

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 8
nop 1
dl 0
loc 13
rs 10
c 0
b 0
f 0
1
"""Definitions for command-line (Click) commands for invoking Annif
2
operations and printing the results to console."""
3
4
import collections
5
import importlib
6
import json
7
import os.path
8
import re
9
import sys
10
11
import click
12
import click_log
13
from flask.cli import FlaskGroup
14
15
import annif
16
import annif.corpus
17
import annif.parallel
18
import annif.project
19
import annif.registry
20
from annif import cli_util
21
from annif.exception import NotInitializedException, NotSupportedException
22
from annif.project import Access
23
from annif.util import metric_code
24
25
# Shared CLI logger; click_log attaches verbosity handling to it so that
# the --verbosity option of each command controls this logger.
logger = annif.logger
click_log.basic_config(logger)

# Flask application factory used by the FlaskGroup-based CLI entry point.
create_app = annif.create_flask_app
# Top-level Click command group. Flask's default commands and version option
# are disabled; Annif registers its own commands and --version below.
cli = FlaskGroup(
    create_app=create_app, add_default_commands=False, add_version_option=False
)
cli = click.version_option(message="%(version)s")(cli)
33
34
35
@cli.command("list-projects")
@cli_util.common_options
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_list_projects():
    """
    List available projects.
    \f
    Show a list of currently defined projects. Projects are defined in a
    configuration file, normally called ``projects.cfg``. See `Project
    configuration
    <https://github.com/NatLibFi/Annif/wiki/Project-configuration>`_
    for details.
    """

    headings = (
        "Project ID",
        "Project Name",
        "Vocabulary ID",
        "Language",
        "Trained",
        "Modification time",
    )
    # One row per registered project; private projects are included.
    rows = []
    for project in annif.registry.get_projects(min_access=Access.private).values():
        rows.append(
            (
                project.project_id,
                project.name,
                project.vocab.vocab_id if project.vocab_spec else "-",
                project.language,
                str(project.is_trained),
                cli_util.format_datetime(project.modification_time),
            )
        )
    # Column widths are derived from both the headings and the row contents.
    template = cli_util.make_list_template(headings, *rows)
    header_line = template.format(*headings)
    click.echo(header_line)
    click.echo("-" * len(header_line))
    for row in rows:
        click.echo(template.format(*row))
74
75
76
@cli.command("show-project")
@cli_util.project_id
@cli_util.common_options
def run_show_project(project_id):
    """
    Show information about a project.
    """

    # Resolve the project; cli_util.get_project reports unknown IDs itself.
    project = cli_util.get_project(project_id)
    click.echo(f"Project ID:        {project.project_id}")
    click.echo(f"Project Name:      {project.name}")
    click.echo(f"Language:          {project.language}")
    click.echo(f"Vocabulary:        {project.vocab.vocab_id}")
    click.echo(f"Vocab language:    {project.vocab_lang}")
    click.echo(f"Access:            {project.access.name}")
    click.echo(f"Backend:           {project.backend.name}")
    click.echo(f"Trained:           {project.is_trained}")
    click.echo(f"Modification time: {cli_util.format_datetime(project.modification_time)}")
94
95
96
@cli.command("clear")
@cli_util.project_id
@cli_util.common_options
def run_clear_project(project_id):
    """
    Initialize the project to its original, untrained state.
    """
    # Removing the stored model data resets the project to untrained.
    project = cli_util.get_project(project_id)
    project.remove_model_data()
105
106
107
@cli.command("list-vocabs")
@cli_util.common_options
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_list_vocabs():
    """
    List available vocabularies.
    """

    headings = ("Vocabulary ID", "Languages", "Size", "Loaded")
    rows = []
    for vocab in annif.registry.get_vocabs(min_access=Access.private).values():
        # A vocabulary that has not been loaded yet cannot report its
        # languages or size; show placeholders for those columns instead.
        try:
            langs = ",".join(sorted(vocab.languages))
            size = len(vocab)
            loaded = True
        except NotInitializedException:
            langs, size, loaded = "-", "-", False
        rows.append((vocab.vocab_id, langs, str(size), str(loaded)))

    template = cli_util.make_list_template(headings, *rows)
    header_line = template.format(*headings)
    click.echo(header_line)
    click.echo("-" * len(header_line))
    for row in rows:
        click.echo(template.format(*row))
135
136
137
@cli.command("load-vocab")
@click.argument("vocab_id", shell_complete=cli_util.complete_param)
@click.argument("subjectfile", type=click.Path(exists=True, dir_okay=False))
@click.option("--language", "-L", help="Language of subject file")
@click.option(
    "--force",
    "-f",
    default=False,
    is_flag=True,
    help="Replace existing vocabulary completely instead of updating it",
)
@cli_util.common_options
def run_load_vocab(vocab_id, language, force, subjectfile):
    """
    Load a vocabulary from a subject file.
    """
    vocab = cli_util.get_vocab(vocab_id)
    # Choose a corpus reader based on the detected file format.
    if annif.corpus.SubjectFileSKOS.is_rdf_file(subjectfile):
        # SKOS/RDF file supported by rdflib
        subjects = annif.corpus.SubjectFileSKOS(subjectfile)
        click.echo(f"Loading vocabulary from SKOS file {subjectfile}...")
    elif annif.corpus.SubjectFileCSV.is_csv_file(subjectfile):
        # CSV file
        subjects = annif.corpus.SubjectFileCSV(subjectfile)
        click.echo(f"Loading vocabulary from CSV file {subjectfile}...")
    else:
        # probably a TSV file - we need to know its language
        if not language:
            click.echo(
                "Please use --language option to set the language of a TSV vocabulary.",
                err=True,
            )
            sys.exit(1)
        click.echo(f"Loading vocabulary from TSV file {subjectfile}...")
        subjects = annif.corpus.SubjectFileTSV(subjectfile, language)
    # With --force the existing vocabulary is replaced instead of updated.
    vocab.load_vocabulary(subjects, force=force)
173
174
175
@cli.command("train")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--cached/--no-cached",
    "-c/-C",
    default=False,
    help="Reuse preprocessed training data from previous run",
)
@click.option(
    "--jobs",
    "-j",
    default=0,
    help="Number of parallel jobs (0 means choose automatically)",
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_train(project_id, paths, cached, docs_limit, jobs, backend_param):
    """
    Train a project on a collection of documents.
    \f
    This will train the project using the documents from ``PATHS`` (directories
    or possibly gzipped TSV files) in a single batch operation. If ``--cached``
    is set, preprocessed training data from the previous run is reused instead
    of documents input; see `Reusing preprocessed training data
    <https://github.com/NatLibFi/Annif/wiki/
    Reusing-preprocessed-training-data>`_.
    """
    project = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, project)
    if not cached:
        documents = cli_util.open_documents(
            paths, project.subjects, project.vocab_lang, docs_limit
        )
    else:
        # --cached is mutually exclusive with explicit corpus paths
        if paths:
            raise click.UsageError(
                "Corpus paths cannot be given when using --cached option."
            )
        documents = "cached"
    project.train(documents, backend_params, jobs)
217
218
219
@cli.command("learn")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_learn(project_id, paths, docs_limit, backend_param):
    """
    Further train an existing project on a collection of documents.
    \f
    Similar to the ``train`` command. This will continue training an already
    trained project using the documents given by ``PATHS`` in a single batch
    operation. Not supported by all backends.
    """
    project = cli_util.get_project(project_id)
    params = cli_util.parse_backend_params(backend_param, project)
    # Open the corpus and continue training on it (incremental learning).
    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )
    project.learn(corpus, params)
239
240
241
@cli.command("suggest")
@cli_util.project_id
@click.argument(
    "paths", type=click.Path(dir_okay=False, exists=True, allow_dash=True), nargs=-1
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_suggest(
    project_id, paths, limit, threshold, language, backend_param, docs_limit
):
    """
    Suggest subjects for a single document from standard input or for one or more
    document file(s) given its/their path(s).
    \f
    This will read a text document from standard input and suggest subjects for
    it, or if given path(s) to file(s), suggest subjects for it/them.
    """
    project = cli_util.get_project(project_id)
    # Subject labels are shown in the requested language, defaulting to the
    # vocabulary language of the project.
    lang = language or project.vocab_lang
    if lang not in project.vocab.languages:
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
    backend_params = cli_util.parse_backend_params(backend_param, project)

    # No paths, or a single "-", means: read one document from standard input.
    use_stdin = not paths or (len(paths) == 1 and paths[0] == "-")
    if use_stdin:
        text = sys.stdin.read()
        suggestions = project.suggest([text], backend_params).filter(limit, threshold)[
            0
        ]
        cli_util.show_hits(suggestions, project, lang)
    else:
        docs = cli_util.open_text_documents(paths, docs_limit)
        results = project.suggest_corpus(docs, backend_params).filter(limit, threshold)
        for suggestions, path in zip(results, paths):
            click.echo(f"Suggestions for {path}")
            cli_util.show_hits(suggestions, project, lang)
283
284
285
@cli.command("index")
@cli_util.project_id
@click.argument("directory", type=click.Path(exists=True, file_okay=False))
@click.option(
    "--suffix", "-s", default=".annif", help="File name suffix for result files"
)
@click.option(
    "--force/--no-force",
    "-f/-F",
    default=False,
    help="Force overwriting of existing result files",
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@cli_util.backend_param_option
@cli_util.common_options
def run_index(
    project_id, directory, suffix, force, limit, threshold, language, backend_param
):
    """
    Index a directory with documents, suggesting subjects for each document.
    Write the results in TSV files with the given suffix (``.annif`` by
    default).
    """
    project = cli_util.get_project(project_id)
    # Labels are written in the requested language, defaulting to the
    # vocabulary language of the project.
    lang = language or project.vocab_lang
    if lang not in project.vocab.languages:
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
    backend_params = cli_util.parse_backend_params(backend_param, project)

    documents = annif.corpus.DocumentDirectory(directory, require_subjects=False)
    results = project.suggest_corpus(documents, backend_params).filter(limit, threshold)

    for (docfilename, _), suggestions in zip(documents, results):
        # The result file name replaces the .txt extension with the suffix.
        subjectfilename = re.sub(r"\.txt$", suffix, docfilename)
        if not force and os.path.exists(subjectfilename):
            click.echo(
                "Not overwriting {} (use --force to override)".format(subjectfilename)
            )
            continue
        with open(subjectfilename, "w", encoding="utf-8") as subjfile:
            cli_util.show_hits(suggestions, project, lang, file=subjfile)
328
329
330
@cli.command("eval")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option(
    "--metric",
    "-m",
    default=[],
    multiple=True,
    help="Metric to calculate (default: all)",
)
@click.option(
    "--metrics-file",
    "-M",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write evaluation metrics in JSON format.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write non-aggregated results per subject.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_eval(
    project_id,
    paths,
    limit,
    threshold,
    docs_limit,
    metric,
    metrics_file,
    results_file,
    jobs,
    backend_param,
):
    """
    Suggest subjects for documents and evaluate the results by comparing
    against a gold standard.
    \f
    With this command the documents from ``PATHS`` (directories or possibly
    gzipped TSV files) will be assigned subject suggestions and then
    statistical measures are calculated that quantify how well the suggested
    subjects match the gold-standard subjects in the documents.

    Normally the output is the list of the metrics calculated across documents.
    If ``--results-file <FILENAME>`` option is given, the metrics are
    calculated separately for each subject, and written to the given file.
    """

    project = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, project)

    # Imported lazily: annif.eval pulls in heavy dependencies that are not
    # needed by the other CLI commands.
    import annif.eval

    eval_batch = annif.eval.EvaluationBatch(project.subjects)

    if results_file:
        # The file is opened lazily by Click; force it open now so that an
        # unwritable path fails fast, before the expensive evaluation runs.
        try:
            print("", end="", file=results_file)
            click.echo(
                "Writing per subject evaluation results to {!s}".format(
                    results_file.name
                )
            )
        except Exception as e:
            raise NotSupportedException(
                "cannot open results-file for writing: " + str(e)
            ) from e
    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )
    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry, [project_id], backend_params, limit, threshold
    )

    # Suggest subjects for document batches in parallel and evaluate each
    # batch against its gold-standard subject sets.
    with pool_class(jobs) as pool:
        for hit_sets, subject_sets in pool.imap_unordered(
            psmap.suggest_batch, corpus.doc_batches
        ):
            eval_batch.evaluate_many(hit_sets[project_id], subject_sets)

    template = "{0:<30}\t{1:{fmt_spec}}"
    metrics = eval_batch.results(
        metrics=metric, results_file=results_file, language=project.vocab_lang
    )
    for metric, score in metrics.items():
        if isinstance(score, int):
            fmt_spec = "d"
        elif isinstance(score, float):
            fmt_spec = ".04f"
        else:
            # Bug fix: fmt_spec used to be undefined here for non-numeric
            # scores; an empty format spec falls back to str() formatting.
            fmt_spec = ""
        click.echo(template.format(metric + ":", score, fmt_spec=fmt_spec))
    if metrics_file:
        json.dump(
            {metric_code(mname): val for mname, val in metrics.items()},
            metrics_file,
            indent=2,
        )
439
440
441
@cli.command("run")
@click.option("--port", type=int)
@click.option("--log-level")
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_app(**kwargs):
    """
    Run Annif in server mode for development.
    \f
    The server is for development purposes only.
    """
    # Drop options the user did not set so the app's own defaults apply.
    options = {key: value for key, value in kwargs.items() if value is not None}
    cxapp = annif.create_cx_app()
    cxapp.run(**options)
454
455
456
FILTER_BATCH_MAX_LIMIT = 15
457
OPTIMIZE_METRICS = ["Precision (doc avg)", "Recall (doc avg)", "F1 score (doc avg)"]
458
459
460
@cli.command("optimize")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_optimize(project_id, paths, jobs, docs_limit, backend_param):
    """
    Suggest subjects for documents, testing multiple limits and thresholds.
    \f
    This command will use different limit (maximum number of subjects) and
    score threshold values when assigning subjects to each document given by
    ``PATHS`` and compare the results against the gold standard subjects in the
    documents. The output is a list of parameter combinations and their scores.
    From the output, you can determine the optimum limit and threshold
    parameters depending on which measure you want to target.
    """
    project = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, project)
    filter_params = cli_util.generate_filter_params(FILTER_BATCH_MAX_LIMIT)

    # Imported lazily: annif.eval pulls in heavy dependencies that are not
    # needed by the other CLI commands.
    import annif.eval

    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )

    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    # Request suggestions once, with the maximum limit and no threshold; each
    # (limit, threshold) combination is then evaluated by filtering these
    # results instead of re-running the expensive suggestion step.
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry,
        [project_id],
        backend_params,
        limit=FILTER_BATCH_MAX_LIMIT,
        threshold=0.0,
    )

    ndocs = 0
    suggestion_batches = []
    subject_set_batches = []
    with pool_class(jobs) as pool:
        for suggestion_batch, subject_sets in pool.imap_unordered(
            psmap.suggest_batch, corpus.doc_batches
        ):
            ndocs += len(suggestion_batch[project_id])
            suggestion_batches.append(suggestion_batch[project_id])
            subject_set_batches.append(subject_sets)

    from annif.suggestion import SuggestionResults

    orig_suggestion_results = SuggestionResults(suggestion_batches)

    click.echo("\t".join(("Limit", "Thresh.", "Prec.", "Rec.", "F1")))

    # Track the best score (and the parameters that produced it) per metric.
    best_scores = collections.defaultdict(float)
    best_params = {}

    template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"
    # NOTE: a redundant second "import annif.eval" that used to sit here has
    # been removed; the module is already imported above.
    for limit, threshold in filter_params:
        eval_batch = annif.eval.EvaluationBatch(project.subjects)
        filtered_results = orig_suggestion_results.filter(limit, threshold)
        for batch, subject_sets in zip(filtered_results.batches, subject_set_batches):
            eval_batch.evaluate_many(batch, subject_sets)
        results = eval_batch.results(metrics=OPTIMIZE_METRICS)
        for metric, score in results.items():
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = (limit, threshold)
        click.echo(
            template.format(
                limit,
                threshold,
                results["Precision (doc avg)"],
                results["Recall (doc avg)"],
                results["F1 score (doc avg)"],
            )
        )

    click.echo()
    template2 = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    for metric in OPTIMIZE_METRICS:
        click.echo(
            template2.format(
                metric,
                best_scores[metric],
                best_params[metric][0],
                best_params[metric][1],
            )
        )
    click.echo("Documents evaluated:\t{}".format(ndocs))
556
557
558
@cli.command("hyperopt")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--trials", "-T", default=10, help="Number of trials")
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel runs (0 means all CPUs)"
)
@click.option(
    "--metric", "-m", default="NDCG", help="Metric to optimize (default: NDCG)"
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file path to write trial results as CSV.
    File directory must exist, existing file will be overwritten.""",
)
@cli_util.docs_limit_option
@cli_util.common_options
def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric, results_file):
    """
    Optimize the hyperparameters of a project using validation documents from
    ``PATHS``. Not supported by all backends. Output is a list of trial results
    and a report of the best performing parameters.
    """
    project = cli_util.get_project(project_id)
    documents = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )
    click.echo(f"Looking for optimal hyperparameters using {trials} trials")
    # Delegate the search to the backend; it returns the best trial found.
    recommendation = project.hyperopt(documents, trials, jobs, metric, results_file)
    click.echo(f"Got best {metric} score {recommendation.score:.4f} with:")
    click.echo("---")
    for line in recommendation.lines:
        click.echo(line)
    click.echo("---")
594
595
596
@cli.command("completion")
@click.option("--bash", "shell", flag_value="bash")
@click.option("--zsh", "shell", flag_value="zsh")
@click.option("--fish", "shell", flag_value="fish")
def run_completion(shell):
    """Generate the script for tab-key autocompletion for the given shell. To enable the
    completion support in your current bash terminal session run\n
        source <(annif completion --bash)
    """
    # Bug fix: importlib.metadata is a submodule that plain "import importlib"
    # does not load; import it explicitly so the attribute access below cannot
    # fail depending on what other modules happen to have imported it first.
    import importlib.metadata

    if shell is None:
        raise click.UsageError("Shell not given, try --bash, --zsh or --fish")

    # Ask Click (via the _ANNIF_COMPLETE environment variable) to emit the
    # completion script for the requested shell. The shell name is restricted
    # to the flag values declared above, so interpolating it is safe.
    script = os.popen(f"_ANNIF_COMPLETE={shell}_source annif").read()
    click.echo(f"# Generated by Annif {importlib.metadata.version('annif')}")
    click.echo(script)
612
613
614
# Allow invoking the CLI directly (e.g. "python cli.py") in addition to the
# installed "annif" entry point.
if __name__ == "__main__":
    cli()
616