Passed
Push — issue678-refactor-suggestionre... (1844ad...818ba2)
created by Osma, 02:55

annif.cli — rating B

Complexity
    Total Complexity    44

Size/Duplication
    Total Lines         555
    Duplicated Lines    0 %

Importance
    Changes             0

Metric    Value
eloc      367
dl        0
loc       555
rs        8.8798
c         0
b         0
f         0
wmc       44

12 Functions

Rating  Name                 Duplication  Size  Complexity
A       run_train()          0            42    3
A       run_load_vocab()     0            36    4
A       run_learn()          0            20    1
A       run_list_vocabs()    0            22    3
A       run_list_projects()  0            22    2
A       run_clear_project()  0            9     1
A       run_show_project()   0            17    1
B       run_suggest()        0            42    6
B       run_index()          0            45    6
C       run_eval()           0            104   7
A       run_hyperopt()       0            36    2
C       run_optimize()       0            94    8

(The per-function complexities sum to 3 + 4 + 1 + 3 + 2 + 1 + 1 + 6 + 6 + 7 + 2 + 8 = 44, matching the module's Total Complexity and its wmc metric.)

How to fix: Complexity

Complex modules like annif.cli often do a lot of different things. To break such a module down, we need to identify a cohesive component within it. A common approach to finding such a component is to look for functions that share the same prefixes or suffixes.

Once you have determined the functions that belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often faster.
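In this module, for example, run_eval and run_optimize both spell out the same open-corpus / worker-pool / suggest_batch plumbing inline. A minimal sketch of extracting that into one cohesive component, assuming the class name ParallelSuggestRunner (hypothetical, not part of Annif):

# Hypothetical sketch only: ParallelSuggestRunner is not part of Annif.
# It gathers the corpus/pool plumbing that run_eval and run_optimize
# below both duplicate inline.
import annif.parallel
from annif import cli_util


class ParallelSuggestRunner:
    """Run batched parallel suggestions for one project over a corpus."""

    def __init__(self, project, backend_params, limit, threshold):
        self.project = project
        self.psmap = annif.parallel.ProjectSuggestMap(
            project.registry, [project.project_id], backend_params, limit, threshold
        )

    def run(self, paths, docs_limit, jobs):
        """Yield (hit_sets, subject_sets) pairs, as the commands consume now."""
        corpus = cli_util.open_documents(
            paths, self.project.subjects, self.project.vocab_lang, docs_limit
        )
        n_jobs, pool_class = annif.parallel.get_pool(jobs)
        self.project.initialize(parallel=True)
        with pool_class(n_jobs) as pool:
            yield from pool.imap_unordered(
                self.psmap.suggest_batch, corpus.doc_batches
            )

Both commands would then reduce to a loop over runner.run(...), which directly lowers the wmc figure this report is flagging.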

"""Definitions for command-line (Click) commands for invoking Annif
operations and printing the results to console."""


import collections
import json
import os.path
import re
import sys

import click
import click_log
from flask.cli import FlaskGroup

import annif
import annif.corpus
import annif.parallel
import annif.project
import annif.registry
from annif import cli_util
from annif.exception import NotInitializedException, NotSupportedException
from annif.project import Access
from annif.suggestion import SuggestionResults
from annif.util import metric_code

logger = annif.logger
click_log.basic_config(logger)

cli = FlaskGroup(create_app=annif.create_app, add_version_option=False)
cli = click.version_option(message="%(version)s")(cli)

@cli.command("list-projects")
@cli_util.common_options
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_list_projects():
    """
    List available projects.
    \f
    Show a list of currently defined projects. Projects are defined in a
    configuration file, normally called ``projects.cfg``. See `Project
    configuration
    <https://github.com/NatLibFi/Annif/wiki/Project-configuration>`_
    for details.
    """

    template = "{0: <25}{1: <45}{2: <10}{3: <7}"
    header = template.format("Project ID", "Project Name", "Language", "Trained")
    click.echo(header)
    click.echo("-" * len(header))
    for proj in annif.registry.get_projects(min_access=Access.private).values():
        click.echo(
            template.format(
                proj.project_id, proj.name, proj.language, str(proj.is_trained)
            )
        )


@cli.command("show-project")
@click.argument("project_id")
@cli_util.common_options
def run_show_project(project_id):
    """
    Show information about a project.
    """

    proj = cli_util.get_project(project_id)
    click.echo(f"Project ID:        {proj.project_id}")
    click.echo(f"Project Name:      {proj.name}")
    click.echo(f"Language:          {proj.language}")
    click.echo(f"Vocabulary:        {proj.vocab.vocab_id}")
    click.echo(f"Vocab language:    {proj.vocab_lang}")
    click.echo(f"Access:            {proj.access.name}")
    click.echo(f"Trained:           {proj.is_trained}")
    click.echo(f"Modification time: {proj.modification_time}")


@cli.command("clear")
@click.argument("project_id")
@cli_util.common_options
def run_clear_project(project_id):
    """
    Initialize the project to its original, untrained state.
    """
    proj = cli_util.get_project(project_id)
    proj.remove_model_data()


@cli.command("list-vocabs")
@cli_util.common_options
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_list_vocabs():
    """
    List available vocabularies.
    """

    template = "{0: <20}{1: <20}{2: >10}  {3: <6}"
    header = template.format("Vocabulary ID", "Languages", "Size", "Loaded")
    click.echo(header)
    click.echo("-" * len(header))
    for vocab in annif.registry.get_vocabs(min_access=Access.private).values():
        try:
            languages = ",".join(sorted(vocab.languages))
            size = len(vocab)
            loaded = True
        except NotInitializedException:
            languages = "-"
            size = "-"
            loaded = False
        click.echo(template.format(vocab.vocab_id, languages, size, str(loaded)))


@cli.command("load-vocab")
@click.argument("vocab_id")
@click.argument("subjectfile", type=click.Path(exists=True, dir_okay=False))
@click.option("--language", "-L", help="Language of subject file")
@click.option(
    "--force",
    "-f",
    default=False,
    is_flag=True,
    help="Replace existing vocabulary completely instead of updating it",
)
@cli_util.common_options
def run_load_vocab(vocab_id, language, force, subjectfile):
    """
    Load a vocabulary from a subject file.
    """
    vocab = cli_util.get_vocab(vocab_id)
    if annif.corpus.SubjectFileSKOS.is_rdf_file(subjectfile):
        # SKOS/RDF file supported by rdflib
        subjects = annif.corpus.SubjectFileSKOS(subjectfile)
        click.echo(f"Loading vocabulary from SKOS file {subjectfile}...")
    elif annif.corpus.SubjectFileCSV.is_csv_file(subjectfile):
        # CSV file
        subjects = annif.corpus.SubjectFileCSV(subjectfile)
        click.echo(f"Loading vocabulary from CSV file {subjectfile}...")
    else:
        # probably a TSV file - we need to know its language
        if not language:
            click.echo(
                "Please use --language option to set the language of a TSV vocabulary.",
                err=True,
            )
            sys.exit(1)
        click.echo(f"Loading vocabulary from TSV file {subjectfile}...")
        subjects = annif.corpus.SubjectFileTSV(subjectfile, language)
    vocab.load_vocabulary(subjects, force=force)


@cli.command("train")
@click.argument("project_id")
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--cached/--no-cached",
    "-c/-C",
    default=False,
    help="Reuse preprocessed training data from previous run",
)
@click.option(
    "--jobs",
    "-j",
    default=0,
    help="Number of parallel jobs (0 means choose automatically)",
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_train(project_id, paths, cached, docs_limit, jobs, backend_param):
    """
    Train a project on a collection of documents.
    \f
    This will train the project using the documents from ``PATHS`` (directories
    or possibly gzipped TSV files) in a single batch operation. If ``--cached``
    is set, preprocessed training data from the previous run is reused instead
    of the documents input; see `Reusing preprocessed training data
    <https://github.com/NatLibFi/Annif/wiki/
    Reusing-preprocessed-training-data>`_.
    """
    proj = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, proj)
    if cached:
        if len(paths) > 0:
            raise click.UsageError(
                "Corpus paths cannot be given when using --cached option."
            )
        documents = "cached"
    else:
        documents = cli_util.open_documents(
            paths, proj.subjects, proj.vocab_lang, docs_limit
        )
    proj.train(documents, backend_params, jobs)


@cli.command("learn")
@click.argument("project_id")
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_learn(project_id, paths, docs_limit, backend_param):
    """
    Further train an existing project on a collection of documents.
    \f
    Similar to the ``train`` command. This will continue training an already
    trained project using the documents given by ``PATHS`` in a single batch
    operation. Not supported by all backends.
    """
    proj = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, proj)
    documents = cli_util.open_documents(
        paths, proj.subjects, proj.vocab_lang, docs_limit
    )
    proj.learn(documents, backend_params)


@cli.command("suggest")
@click.argument("project_id")
@click.argument(
    "paths", type=click.Path(dir_okay=False, exists=True, allow_dash=True), nargs=-1
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_suggest(
    project_id, paths, limit, threshold, language, backend_param, docs_limit
):
    """
    Suggest subjects for a single document read from standard input, or for
    one or more document files given their paths.
    \f
    This will read a text document from standard input and suggest subjects
    for it, or, if given paths to files, suggest subjects for each of them.
    """
    project = cli_util.get_project(project_id)
    lang = language or project.vocab_lang
    if lang not in project.vocab.languages:
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
    backend_params = cli_util.parse_backend_params(backend_param, project)

    if paths and not (len(paths) == 1 and paths[0] == "-"):
        docs = cli_util.open_text_documents(paths, docs_limit)
        results = project.suggest_corpus(docs, backend_params).filter(limit, threshold)
        for suggestions, path in zip(results, paths):
            click.echo(f"Suggestions for {path}")
            cli_util.show_hits(suggestions, project, lang)
    else:
        text = sys.stdin.read()
        suggestions = project.suggest([text], backend_params).filter(
            limit, threshold
        )[0]
        cli_util.show_hits(suggestions, project, lang)


@cli.command("index")
@click.argument("project_id")
@click.argument("directory", type=click.Path(exists=True, file_okay=False))
@click.option(
    "--suffix", "-s", default=".annif", help="File name suffix for result files"
)
@click.option(
    "--force/--no-force",
    "-f/-F",
    default=False,
    help="Force overwriting of existing result files",
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@cli_util.backend_param_option
@cli_util.common_options
def run_index(
    project_id, directory, suffix, force, limit, threshold, language, backend_param
):
    """
    Index a directory with documents, suggesting subjects for each document.
    Write the results in TSV files with the given suffix (``.annif`` by
    default).
    """
    project = cli_util.get_project(project_id)
    lang = language or project.vocab_lang
    if lang not in project.vocab.languages:
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
    backend_params = cli_util.parse_backend_params(backend_param, project)

    documents = annif.corpus.DocumentDirectory(
        directory, None, None, require_subjects=False
    )
    results = project.suggest_corpus(documents, backend_params).filter(limit, threshold)

    for (docfilename, _), suggestions in zip(documents, results):
        subjectfilename = re.sub(r"\.txt$", suffix, docfilename)
        if os.path.exists(subjectfilename) and not force:
            click.echo(
                "Not overwriting {} (use --force to override)".format(subjectfilename)
            )
            continue
        with open(subjectfilename, "w", encoding="utf-8") as subjfile:
            cli_util.show_hits(suggestions, project, lang, file=subjfile)

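run_suggest and run_index above (both rated B, complexity 6) open with the same prologue: resolve the project, choose the label language, validate it against the vocabulary, and parse backend parameters. A minimal sketch of hoisting that into a shared helper; resolve_project_language is a hypothetical name, not an existing cli_util function:

# Hypothetical helper; not part of annif.cli_util. It hoists the prologue
# duplicated by run_suggest and run_index above into one place.
import click

from annif import cli_util


def resolve_project_language(project_id, language, backend_param):
    """Return (project, label language, backend params) for a CLI command."""
    project = cli_util.get_project(project_id)
    lang = language or project.vocab_lang
    if lang not in project.vocab.languages:
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
    backend_params = cli_util.parse_backend_params(backend_param, project)
    return project, lang, backend_params
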
@cli.command("eval")
@click.argument("project_id")
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option(
    "--metric",
    "-m",
    default=[],
    multiple=True,
    help="Metric to calculate (default: all)",
)
@click.option(
    "--metrics-file",
    "-M",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write evaluation metrics in JSON format.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write non-aggregated results per subject.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_eval(
    project_id,
    paths,
    limit,
    threshold,
    docs_limit,
    metric,
    metrics_file,
    results_file,
    jobs,
    backend_param,
):
    """
    Suggest subjects for documents and evaluate the results by comparing
    against a gold standard.
    \f
    With this command the documents from ``PATHS`` (directories or possibly
    gzipped TSV files) are assigned subject suggestions, and statistical
    measures are then calculated that quantify how well the suggested
    subjects match the gold-standard subjects in the documents.

    Normally the output is the list of the metrics calculated across documents.
    If the ``--results-file <FILENAME>`` option is given, the metrics are
    calculated separately for each subject, and written to the given file.
    """

    project = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, project)

    import annif.eval

    eval_batch = annif.eval.EvaluationBatch(project.subjects)

    if results_file:
        try:
            print("", end="", file=results_file)
            click.echo(
                "Writing per subject evaluation results to {!s}".format(
                    results_file.name
                )
            )
        except Exception as e:
            raise NotSupportedException(
                "cannot open results-file for writing: " + str(e)
            )
    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )
    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry, [project_id], backend_params, limit, threshold
    )

    with pool_class(jobs) as pool:
        for hit_sets, subject_sets in pool.imap_unordered(
            psmap.suggest_batch, corpus.doc_batches
        ):
            eval_batch.evaluate_many(hit_sets[project_id], subject_sets)

    template = "{0:<30}\t{1}"
    metrics = eval_batch.results(
        metrics=metric, results_file=results_file, language=project.vocab_lang
    )
    for metric, score in metrics.items():
        click.echo(template.format(metric + ":", score))
    if metrics_file:
        json.dump(
            {metric_code(mname): val for mname, val in metrics.items()},
            metrics_file,
            indent=2,
        )

FILTER_BATCH_MAX_LIMIT = 15
OPTIMIZE_METRICS = ["Precision (doc avg)", "Recall (doc avg)", "F1 score (doc avg)"]


@cli.command("optimize")
@click.argument("project_id")
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_optimize(project_id, paths, jobs, docs_limit, backend_param):
    """
    Suggest subjects for documents, testing multiple limits and thresholds.
    \f
    This command will use different limit (maximum number of subjects) and
    score threshold values when assigning subjects to each document given by
    ``PATHS`` and compare the results against the gold standard subjects in the
    documents. The output is a list of parameter combinations and their scores.
    From the output, you can determine the optimum limit and threshold
    parameters depending on which measure you want to target.
    """
    project = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, project)
    filter_params = cli_util.generate_filter_params(FILTER_BATCH_MAX_LIMIT)

    import annif.eval

    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )

    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry,
        [project_id],
        backend_params,
        limit=FILTER_BATCH_MAX_LIMIT,
        threshold=0.0,
    )

    ndocs = 0
    suggestion_batches = []
    subject_set_batches = []
    with pool_class(jobs) as pool:
        for suggestion_batch, subject_sets in pool.imap_unordered(
            psmap.suggest_batch, corpus.doc_batches
        ):
            ndocs += len(suggestion_batch[project_id])
            suggestion_batches.append(suggestion_batch[project_id])
            subject_set_batches.append(subject_sets)

    orig_suggestion_results = SuggestionResults(suggestion_batches)

    click.echo("\t".join(("Limit", "Thresh.", "Prec.", "Rec.", "F1")))

    best_scores = collections.defaultdict(float)
    best_params = {}

    template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"

    for limit, threshold in filter_params:
        eval_batch = annif.eval.EvaluationBatch(project.subjects)
        filtered_results = orig_suggestion_results.filter(limit, threshold)
        for batch, subject_sets in zip(filtered_results.batches, subject_set_batches):
            eval_batch.evaluate_many(batch, subject_sets)
        results = eval_batch.results(metrics=OPTIMIZE_METRICS)
        for metric, score in results.items():
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = (limit, threshold)
        click.echo(
            template.format(
                limit,
                threshold,
                results["Precision (doc avg)"],
                results["Recall (doc avg)"],
                results["F1 score (doc avg)"],
            )
        )

    click.echo()
    template2 = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    for metric in OPTIMIZE_METRICS:
        click.echo(
            template2.format(
                metric,
                best_scores[metric],
                best_params[metric][0],
                best_params[metric][1],
            )
        )
    click.echo("Documents evaluated:\t{}".format(ndocs))

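run_optimize above carries the module's highest complexity (C, 8), much of it from the sweep over (limit, threshold) combinations. A sketch of extracting one sweep step into a helper; evaluate_filtered is a hypothetical name, not part of Annif:

# Hypothetical helper extracted from run_optimize's sweep loop above.
import annif.eval


def evaluate_filtered(
    project, suggestion_results, subject_set_batches, limit, threshold, metrics
):
    """Score one (limit, threshold) combination against the gold standard."""
    eval_batch = annif.eval.EvaluationBatch(project.subjects)
    filtered = suggestion_results.filter(limit, threshold)
    for batch, subject_sets in zip(filtered.batches, subject_set_batches):
        eval_batch.evaluate_many(batch, subject_sets)
    return eval_batch.results(metrics=metrics)

The loop body of run_optimize would then reduce to the best-score bookkeeping plus one call to evaluate_filtered(project, orig_suggestion_results, subject_set_batches, limit, threshold, OPTIMIZE_METRICS).
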
@cli.command("hyperopt")
@click.argument("project_id")
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--trials", "-T", default=10, help="Number of trials")
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel runs (0 means all CPUs)"
)
@click.option(
    "--metric", "-m", default="NDCG", help="Metric to optimize (default: NDCG)"
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file path to write trial results as CSV.
    File directory must exist, existing file will be overwritten.""",
)
@cli_util.docs_limit_option
@cli_util.common_options
def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric, results_file):
    """
    Optimize the hyperparameters of a project using validation documents from
    ``PATHS``. Not supported by all backends. Output is a list of trial results
    and a report of the best performing parameters.
    """
    proj = cli_util.get_project(project_id)
    documents = cli_util.open_documents(
        paths, proj.subjects, proj.vocab_lang, docs_limit
    )
    click.echo(f"Looking for optimal hyperparameters using {trials} trials")
    rec = proj.hyperopt(documents, trials, jobs, metric, results_file)
    click.echo(f"Got best {metric} score {rec.score:.4f} with:")
    click.echo("---")
    for line in rec.lines:
        click.echo(line)
    click.echo("---")


if __name__ == "__main__":
    cli()