Passed
Push — lazy-imports ( f5a695...70018e )
by Juho
16:14 queued 08:48
created

annif.cli   B

Complexity

Total Complexity 46

Size/Duplication

Total Lines 582
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
wmc 46
eloc 381
dl 0
loc 582
rs 8.72
c 0
b 0
f 0

13 Functions

Rating   Name   Duplication   Size   Complexity  
A run_hyperopt() 0 36 2
A run_train() 0 42 3
C run_optimize() 0 96 8
A run_load_vocab() 0 36 4
B run_suggest() 0 42 6
A run_learn() 0 20 1
A completion() 0 16 2
A run_list_vocabs() 0 22 3
A run_list_projects() 0 22 2
B run_index() 0 45 6
A run_clear_project() 0 9 1
C run_eval() 0 104 7
A run_show_project() 0 17 1

How to fix   Complexity   

Complexity

Complex classes like annif.cli often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
"""Definitions for command-line (Click) commands for invoking Annif
2
operations and printing the results to console."""
3
4
5
import collections
6
import importlib
7
import json
8
import os.path
9
import re
10
import sys
11
12
import click
13
import click_log
14
from flask.cli import FlaskGroup
15
16
import annif
17
import annif.corpus
18
import annif.parallel
19
import annif.project
20
import annif.registry
21
from annif import cli_util
22
from annif.exception import NotInitializedException, NotSupportedException
23
from annif.project import Access
24
from annif.util import metric_code
25
26
# Shared logger for all CLI commands; click_log wires it up so that the
# verbosity options on individual commands control its output level.
logger = annif.logger
click_log.basic_config(logger)


# Choose the Flask app factory based on the invoked subcommand: only the
# "run" server command needs the Connexion-based app, so all other CLI
# commands avoid the cost of importing/initializing Connexion.
if len(sys.argv) > 1 and sys.argv[1] == "run":
    create_app = annif.create_app  # Use Flask with Connexion
else:
    # Connexion is not needed for most CLI commands, use plain Flask
    create_app = annif.create_flask_app

# FlaskGroup turns the app factory into the root click command group; the
# version option is attached manually so its message can be customized.
cli = FlaskGroup(create_app=create_app, add_version_option=False)
cli = click.version_option(message="%(version)s")(cli)
38
39
40
@cli.command("list-projects")
@cli_util.common_options
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_list_projects():
    """
    List available projects.
    \f
    Show a list of currently defined projects. Projects are defined in a
    configuration file, normally called ``projects.cfg``. See `Project
    configuration
    <https://github.com/NatLibFi/Annif/wiki/Project-configuration>`_
    for details.
    """

    # Fixed-width columns: project ID, name, language and trained status
    template = "{0: <25}{1: <45}{2: <10}{3: <7}"
    header = template.format("Project ID", "Project Name", "Language", "Trained")
    click.echo(header)
    click.echo("-" * len(header))
    projects = annif.registry.get_projects(min_access=Access.private)
    for project in projects.values():
        row = template.format(
            project.project_id,
            project.name,
            project.language,
            str(project.is_trained),
        )
        click.echo(row)
64
65
66
@cli.command("show-project")
@cli_util.project_id
@cli_util.common_options
def run_show_project(project_id):
    """
    Show information about a project.
    """

    project = cli_util.get_project(project_id)
    # Labels carry their own padding so every value starts in the same column
    rows = (
        ("Project ID:        ", project.project_id),
        ("Project Name:      ", project.name),
        ("Language:          ", project.language),
        ("Vocabulary:        ", project.vocab.vocab_id),
        ("Vocab language:    ", project.vocab_lang),
        ("Access:            ", project.access.name),
        ("Trained:           ", project.is_trained),
        ("Modification time: ", project.modification_time),
    )
    for label, value in rows:
        click.echo(f"{label}{value}")
83
84
85
@cli.command("clear")
@cli_util.project_id
@cli_util.common_options
def run_clear_project(project_id):
    """
    Initialize the project to its original, untrained state.
    """
    # Dropping the model data resets the project to its untrained state
    cli_util.get_project(project_id).remove_model_data()
94
95
96
@cli.command("list-vocabs")
@cli_util.common_options
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_list_vocabs():
    """
    List available vocabularies.
    """

    template = "{0: <20}{1: <20}{2: >10}  {3: <6}"
    header = template.format("Vocabulary ID", "Languages", "Size", "Loaded")
    click.echo(header)
    click.echo("-" * len(header))
    for vocab in annif.registry.get_vocabs(min_access=Access.private).values():
        try:
            # Reading languages/size raises if the vocabulary isn't loaded yet
            row = (",".join(sorted(vocab.languages)), len(vocab), True)
        except NotInitializedException:
            row = ("-", "-", False)
        languages, size, loaded = row
        click.echo(template.format(vocab.vocab_id, languages, size, str(loaded)))
118
119
120
@cli.command("load-vocab")
@click.argument("vocab_id", shell_complete=cli_util.complete_param)
@click.argument("subjectfile", type=click.Path(exists=True, dir_okay=False))
@click.option("--language", "-L", help="Language of subject file")
@click.option(
    "--force",
    "-f",
    default=False,
    is_flag=True,
    help="Replace existing vocabulary completely instead of updating it",
)
@cli_util.common_options
def run_load_vocab(vocab_id, language, force, subjectfile):
    """
    Load a vocabulary from a subject file.
    """
    vocab = cli_util.get_vocab(vocab_id)
    # Detect the file format: SKOS/RDF first, then CSV, falling back to TSV
    if annif.corpus.SubjectFileSKOS.is_rdf_file(subjectfile):
        # SKOS/RDF file supported by rdflib
        corpus = annif.corpus.SubjectFileSKOS(subjectfile)
        click.echo(f"Loading vocabulary from SKOS file {subjectfile}...")
    elif annif.corpus.SubjectFileCSV.is_csv_file(subjectfile):
        # CSV file
        corpus = annif.corpus.SubjectFileCSV(subjectfile)
        click.echo(f"Loading vocabulary from CSV file {subjectfile}...")
    else:
        # probably a TSV file - we need to know its language
        if not language:
            click.echo(
                "Please use --language option to set the language of a TSV vocabulary.",
                err=True,
            )
            sys.exit(1)
        click.echo(f"Loading vocabulary from TSV file {subjectfile}...")
        corpus = annif.corpus.SubjectFileTSV(subjectfile, language)
    vocab.load_vocabulary(corpus, force=force)
156
157
158
@cli.command("train")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--cached/--no-cached",
    "-c/-C",
    default=False,
    help="Reuse preprocessed training data from previous run",
)
@click.option(
    "--jobs",
    "-j",
    default=0,
    help="Number of parallel jobs (0 means choose automatically)",
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_train(project_id, paths, cached, docs_limit, jobs, backend_param):
    """
    Train a project on a collection of documents.
    \f
    This will train the project using the documents from ``PATHS`` (directories
    or possibly gzipped TSV files) in a single batch operation. If ``--cached``
    is set, preprocessed training data from the previous run is reused instead
    of documents input; see `Reusing preprocessed training data
    <https://github.com/NatLibFi/Annif/wiki/
    Reusing-preprocessed-training-data>`_.
    """
    project = cli_util.get_project(project_id)
    params = cli_util.parse_backend_params(backend_param, project)
    if not cached:
        # Normal training reads the documents from the given corpus paths
        documents = cli_util.open_documents(
            paths, project.subjects, project.vocab_lang, docs_limit
        )
    else:
        # --cached reuses preprocessed data, so corpus paths are disallowed
        if paths:
            raise click.UsageError(
                "Corpus paths cannot be given when using --cached option."
            )
        documents = "cached"
    project.train(documents, params, jobs)
200
201
202
@cli.command("learn")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_learn(project_id, paths, docs_limit, backend_param):
    """
    Further train an existing project on a collection of documents.
    \f
    Similar to the ``train`` command. This will continue training an already
    trained project using the documents given by ``PATHS`` in a single batch
    operation. Not supported by all backends.
    """
    project = cli_util.get_project(project_id)
    params = cli_util.parse_backend_params(backend_param, project)
    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )
    project.learn(corpus, params)
222
223
224
@cli.command("suggest")
@cli_util.project_id
@click.argument(
    "paths", type=click.Path(dir_okay=False, exists=True, allow_dash=True), nargs=-1
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_suggest(
    project_id, paths, limit, threshold, language, backend_param, docs_limit
):
    """
    Suggest subjects for a single document from standard input or for one or more
    document file(s) given its/their path(s).
    \f
    This will read a text document from standard input and suggest subjects for
    it, or if given path(s) to file(s), suggest subjects for it/them.
    """
    project = cli_util.get_project(project_id)
    lang = language or project.vocab_lang
    if lang not in project.vocab.languages:
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
    backend_params = cli_util.parse_backend_params(backend_param, project)

    # No paths at all, or a single "-", means reading from standard input
    from_stdin = not paths or (len(paths) == 1 and paths[0] == "-")
    if from_stdin:
        text = sys.stdin.read()
        filtered = project.suggest([text], backend_params).filter(limit, threshold)
        cli_util.show_hits(filtered[0], project, lang)
    else:
        docs = cli_util.open_text_documents(paths, docs_limit)
        results = project.suggest_corpus(docs, backend_params).filter(limit, threshold)
        for suggestions, path in zip(results, paths):
            click.echo(f"Suggestions for {path}")
            cli_util.show_hits(suggestions, project, lang)
266
267
268
@cli.command("index")
@cli_util.project_id
@click.argument("directory", type=click.Path(exists=True, file_okay=False))
@click.option(
    "--suffix", "-s", default=".annif", help="File name suffix for result files"
)
@click.option(
    "--force/--no-force",
    "-f/-F",
    default=False,
    help="Force overwriting of existing result files",
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@cli_util.backend_param_option
@cli_util.common_options
def run_index(
    project_id, directory, suffix, force, limit, threshold, language, backend_param
):
    """
    Index a directory with documents, suggesting subjects for each document.
    Write the results in TSV files with the given suffix (``.annif`` by
    default).
    """
    project = cli_util.get_project(project_id)
    lang = language or project.vocab_lang
    if lang not in project.vocab.languages:
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
    backend_params = cli_util.parse_backend_params(backend_param, project)

    # Read documents without gold-standard subjects; we only want suggestions
    documents = annif.corpus.DocumentDirectory(
        directory, None, None, require_subjects=False
    )
    results = project.suggest_corpus(documents, backend_params).filter(limit, threshold)

    for (docfilename, _), suggestions in zip(documents, results):
        # Result file name: replace the ".txt" extension with the suffix
        subjectfilename = re.sub(r"\.txt$", suffix, docfilename)
        if os.path.exists(subjectfilename) and not force:
            # f-string for consistency with the rest of this module
            click.echo(f"Not overwriting {subjectfilename} (use --force to override)")
            continue
        with open(subjectfilename, "w", encoding="utf-8") as subjfile:
            cli_util.show_hits(suggestions, project, lang, file=subjfile)
313
314
315
@cli.command("eval")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option(
    "--metric",
    "-m",
    default=[],
    multiple=True,
    help="Metric to calculate (default: all)",
)
@click.option(
    "--metrics-file",
    "-M",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write evaluation metrics in JSON format.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write non-aggregated results per subject.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_eval(
    project_id,
    paths,
    limit,
    threshold,
    docs_limit,
    metric,
    metrics_file,
    results_file,
    jobs,
    backend_param,
):
    """
    Suggest subjects for documents and evaluate the results by comparing
    against a gold standard.
    \f
    With this command the documents from ``PATHS`` (directories or possibly
    gzipped TSV files) will be assigned subject suggestions and then
    statistical measures are calculated that quantify how well the suggested
    subjects match the gold-standard subjects in the documents.

    Normally the output is the list of the metrics calculated across documents.
    If ``--results-file <FILENAME>`` option is given, the metrics are
    calculated separately for each subject, and written to the given file.
    """

    project = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, project)

    # Imported lazily: only the eval-related commands need this module
    import annif.eval

    eval_batch = annif.eval.EvaluationBatch(project.subjects)

    if results_file:
        try:
            # Probe the lazily-opened file for writability before doing the
            # expensive evaluation work, so failures surface early.
            print("", end="", file=results_file)
            click.echo(
                "Writing per subject evaluation results to {!s}".format(
                    results_file.name
                )
            )
        except Exception as e:
            # Chain the original error so the root cause stays visible
            raise NotSupportedException(
                "cannot open results-file for writing: " + str(e)
            ) from e
    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )
    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry, [project_id], backend_params, limit, threshold
    )

    # Evaluate document batches in parallel; order does not matter because
    # the evaluation batch only accumulates statistics.
    with pool_class(jobs) as pool:
        for hit_sets, subject_sets in pool.imap_unordered(
            psmap.suggest_batch, corpus.doc_batches
        ):
            eval_batch.evaluate_many(hit_sets[project_id], subject_sets)

    template = "{0:<30}\t{1}"
    metrics = eval_batch.results(
        metrics=metric, results_file=results_file, language=project.vocab_lang
    )
    # Distinct loop variable: "metric" is already bound to the --metric option
    for metric_name, score in metrics.items():
        click.echo(template.format(metric_name + ":", score))
    if metrics_file:
        json.dump(
            {metric_code(mname): val for mname, val in metrics.items()},
            metrics_file,
            indent=2,
        )
420
421
422
FILTER_BATCH_MAX_LIMIT = 15
423
OPTIMIZE_METRICS = ["Precision (doc avg)", "Recall (doc avg)", "F1 score (doc avg)"]
424
425
426
@cli.command("optimize")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_optimize(project_id, paths, jobs, docs_limit, backend_param):
    """
    Suggest subjects for documents, testing multiple limits and thresholds.
    \f
    This command will use different limit (maximum number of subjects) and
    score threshold values when assigning subjects to each document given by
    ``PATHS`` and compare the results against the gold standard subjects in the
    documents. The output is a list of parameter combinations and their scores.
    From the output, you can determine the optimum limit and threshold
    parameters depending on which measure you want to target.
    """
    project = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, project)
    filter_params = cli_util.generate_filter_params(FILTER_BATCH_MAX_LIMIT)

    # Imported lazily: only the eval-related commands need this module.
    # (Previously this import was duplicated further down; once is enough.)
    import annif.eval

    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )

    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry,
        [project_id],
        backend_params,
        limit=FILTER_BATCH_MAX_LIMIT,
        threshold=0.0,
    )

    # Collect unfiltered suggestions once; each limit/threshold combination
    # is later evaluated by filtering these cached results.
    ndocs = 0
    suggestion_batches = []
    subject_set_batches = []
    with pool_class(jobs) as pool:
        for suggestion_batch, subject_sets in pool.imap_unordered(
            psmap.suggest_batch, corpus.doc_batches
        ):
            ndocs += len(suggestion_batch[project_id])
            suggestion_batches.append(suggestion_batch[project_id])
            subject_set_batches.append(subject_sets)

    from annif.suggestion import SuggestionResults

    orig_suggestion_results = SuggestionResults(suggestion_batches)

    click.echo("\t".join(("Limit", "Thresh.", "Prec.", "Rec.", "F1")))

    best_scores = collections.defaultdict(float)
    best_params = {}

    template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"
    for limit, threshold in filter_params:
        eval_batch = annif.eval.EvaluationBatch(project.subjects)
        filtered_results = orig_suggestion_results.filter(limit, threshold)
        for batch, subject_sets in zip(filtered_results.batches, subject_set_batches):
            eval_batch.evaluate_many(batch, subject_sets)
        results = eval_batch.results(metrics=OPTIMIZE_METRICS)
        # Track the best score seen so far for each metric (>= keeps the
        # parameters of the last combination reaching the best score)
        for metric, score in results.items():
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = (limit, threshold)
        click.echo(
            template.format(
                limit,
                threshold,
                results["Precision (doc avg)"],
                results["Recall (doc avg)"],
                results["F1 score (doc avg)"],
            )
        )

    click.echo()
    template2 = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    for metric in OPTIMIZE_METRICS:
        click.echo(
            template2.format(
                metric,
                best_scores[metric],
                best_params[metric][0],
                best_params[metric][1],
            )
        )
    click.echo(f"Documents evaluated:\t{ndocs}")
522
523
524
@cli.command("hyperopt")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--trials", "-T", default=10, help="Number of trials")
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel runs (0 means all CPUs)"
)
@click.option(
    "--metric", "-m", default="NDCG", help="Metric to optimize (default: NDCG)"
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file path to write trial results as CSV.
    File directory must exist, existing file will be overwritten.""",
)
@cli_util.docs_limit_option
@cli_util.common_options
def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric, results_file):
    """
    Optimize the hyperparameters of a project using validation documents from
    ``PATHS``. Not supported by all backends. Output is a list of trial results
    and a report of the best performing parameters.
    """
    project = cli_util.get_project(project_id)
    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )
    click.echo(f"Looking for optimal hyperparameters using {trials} trials")
    # The backend runs the trials and returns the best-scoring configuration
    recommendation = project.hyperopt(corpus, trials, jobs, metric, results_file)
    click.echo(f"Got best {metric} score {recommendation.score:.4f} with:")
    click.echo("---")
    for line in recommendation.lines:
        click.echo(line)
    click.echo("---")
560
561
562
@cli.command("completion")
@click.option("--bash", "shell", flag_value="bash")
@click.option("--zsh", "shell", flag_value="zsh")
@click.option("--fish", "shell", flag_value="fish")
def completion(shell):
    """Generate the script for tab-key autocompletion for the given shell. To enable the
    completion support in your current bash terminal session run\n
        source <(annif completion --bash)
    """
    # "import importlib" at the top of the file does NOT guarantee that the
    # importlib.metadata submodule is importable as an attribute; import it
    # explicitly here to avoid a potential AttributeError.
    import importlib.metadata

    if shell is None:
        raise click.UsageError("Shell not given, try --bash, --zsh or --fish")

    # Click's completion machinery generates the script when the entry point
    # is re-invoked with the _ANNIF_COMPLETE environment variable set
    script = os.popen(f"_ANNIF_COMPLETE={shell}_source annif").read()
    click.echo(f"# Generated by Annif {importlib.metadata.version('annif')}")
    click.echo(script)
578
579
580
# Allow running this module directly (python -m / script execution); the
# normal entry point is the installed "annif" console script.
if __name__ == "__main__":
    cli()
582