Passed
Pull Request — main (#681)
by Osma
10:12 queued 07:10
created

annif.cli.run_optimize()   C

Complexity

Conditions 8

Size

Total Lines 96
Code Lines 66

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 8
eloc 66
nop 5
dl 0
loc 96
rs 6.246
c 0
b 0
f 0

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

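Commonly applied refactorings include Extract Method: move a cohesive group of statements out of the long method into a new, well-named method.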

1
"""Definitions for command-line (Click) commands for invoking Annif
2
operations and printing the results to console."""
3
4
5
import collections
6
import json
7
import os.path
8
import re
9
import sys
10
11
import click
12
import click_log
13
from flask.cli import FlaskGroup
14
15
import annif
16
import annif.corpus
17
import annif.parallel
18
import annif.project
19
import annif.registry
20
from annif import cli_util
21
from annif.exception import NotInitializedException, NotSupportedException
22
from annif.project import Access
23
from annif.util import metric_code
24
25
# Reuse the logger object exposed by the annif package and let click_log
# configure it for console output.
logger = annif.logger
26
click_log.basic_config(logger)
27
28
# Root command group; every ``@cli.command(...)`` below registers a subcommand
# on it. The Flask application is created through ``annif.create_app``.
cli = FlaskGroup(create_app=annif.create_app, add_version_option=False)
29
# FlaskGroup's own version option is disabled above; attach click's version
# option instead, printing only the version string.
cli = click.version_option(message="%(version)s")(cli)
30
31
32
@cli.command("list-projects")
@cli_util.common_options
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_list_projects():
    """
    List available projects.
    \f
    Show a list of currently defined projects. Projects are defined in a
    configuration file, normally called ``projects.cfg``. See `Project
    configuration
    <https://github.com/NatLibFi/Annif/wiki/Project-configuration>`_
    for details.
    """
    # NOTE: the docstring above doubles as the command's --help text.

    # Left-aligned, fixed-width columns: ID, name, language, trained flag.
    row_format = "{0: <25}{1: <45}{2: <10}{3: <7}"
    heading = row_format.format("Project ID", "Project Name", "Language", "Trained")
    click.echo(heading)
    click.echo("-" * len(heading))

    # Print one table row per project returned by the registry.
    projects = annif.registry.get_projects(min_access=Access.private)
    for project in projects.values():
        row = row_format.format(
            project.project_id,
            project.name,
            project.language,
            str(project.is_trained),
        )
        click.echo(row)
56
57
58
@cli.command("show-project")
@click.argument("project_id")
@cli_util.common_options
def run_show_project(project_id):
    """
    Show information about a project.
    """
    # NOTE: the docstring above doubles as the command's --help text.

    proj = cli_util.get_project(project_id)

    # (label, getter) pairs in output order. The getters are evaluated lazily,
    # one per printed line, so a failing attribute lookup still leaves the
    # preceding lines printed.
    fields = (
        ("Project ID:", lambda: proj.project_id),
        ("Project Name:", lambda: proj.name),
        ("Language:", lambda: proj.language),
        ("Vocabulary:", lambda: proj.vocab.vocab_id),
        ("Vocab language:", lambda: proj.vocab_lang),
        ("Access:", lambda: proj.access.name),
        ("Trained:", lambda: proj.is_trained),
        ("Modification time:", lambda: proj.modification_time),
    )
    for label, getter in fields:
        # Labels are padded to 19 columns so that all values line up.
        click.echo(f"{label:<19}{getter()}")
75
76
77
@cli.command("clear")
@click.argument("project_id")
@cli_util.common_options
def run_clear_project(project_id):
    """
    Initialize the project to its original, untrained state.
    """
    # NOTE: the docstring above doubles as the command's --help text.
    # Look the project up by ID and drop its model data.
    cli_util.get_project(project_id).remove_model_data()
86
87
88
@cli.command("list-vocabs")
@cli_util.common_options
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_list_vocabs():
    """
    List available vocabularies.
    """
    # NOTE: the docstring above doubles as the command's --help text.

    # Columns: vocabulary ID, languages, size (right-aligned), loaded flag.
    row_format = "{0: <20}{1: <20}{2: >10}  {3: <6}"
    heading = row_format.format("Vocabulary ID", "Languages", "Size", "Loaded")
    click.echo(heading)
    click.echo("-" * len(heading))

    vocabs = annif.registry.get_vocabs(min_access=Access.private)
    for vocab in vocabs.values():
        try:
            languages = ",".join(sorted(vocab.languages))
            size = len(vocab)
        except NotInitializedException:
            # The vocabulary raised NotInitializedException: show placeholders.
            languages = size = "-"
            loaded = False
        else:
            loaded = True
        click.echo(row_format.format(vocab.vocab_id, languages, size, str(loaded)))
110
111
112
@cli.command("load-vocab")
@click.argument("vocab_id")
@click.argument("subjectfile", type=click.Path(exists=True, dir_okay=False))
@click.option("--language", "-L", help="Language of subject file")
@click.option(
    "--force",
    "-f",
    default=False,
    is_flag=True,
    help="Replace existing vocabulary completely instead of updating it",
)
@cli_util.common_options
def run_load_vocab(vocab_id, language, force, subjectfile):
    """
    Load a vocabulary from a subject file.
    """
    # NOTE: the docstring above doubles as the command's --help text.
    target_vocab = cli_util.get_vocab(vocab_id)
    announce = "Loading vocabulary from {} file {}...".format

    if annif.corpus.SubjectFileSKOS.is_rdf_file(subjectfile):
        # SKOS/RDF input
        subject_source = annif.corpus.SubjectFileSKOS(subjectfile)
        click.echo(announce("SKOS", subjectfile))
    elif annif.corpus.SubjectFileCSV.is_csv_file(subjectfile):
        # CSV input
        subject_source = annif.corpus.SubjectFileCSV(subjectfile)
        click.echo(announce("CSV", subjectfile))
    else:
        # Neither RDF nor CSV: treated as TSV, which requires an explicit
        # language because the reader is constructed with it.
        if not language:
            click.echo(
                "Please use --language option to set the language of a TSV vocabulary.",
                err=True,
            )
            sys.exit(1)
        click.echo(announce("TSV", subjectfile))
        subject_source = annif.corpus.SubjectFileTSV(subjectfile, language)

    target_vocab.load_vocabulary(subject_source, force=force)
148
149
150
@cli.command("train")
@click.argument("project_id")
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--cached/--no-cached",
    "-c/-C",
    default=False,
    help="Reuse preprocessed training data from previous run",
)
@click.option(
    "--jobs",
    "-j",
    default=0,
    help="Number of parallel jobs (0 means choose automatically)",
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_train(project_id, paths, cached, docs_limit, jobs, backend_param):
    """
    Train a project on a collection of documents.
    \f
    This will train the project using the documents from ``PATHS`` (directories
    or possibly gzipped TSV files) in a single batch operation. If ``--cached``
    is set, preprocessed training data from the previous run is reused instead
    of documents input; see `Reusing preprocessed training data
    <https://github.com/NatLibFi/Annif/wiki/
    Reusing-preprocessed-training-data>`_.
    """
    # NOTE: the docstring above doubles as the command's --help text.
    project = cli_util.get_project(project_id)
    params = cli_util.parse_backend_params(backend_param, project)

    if not cached:
        training_data = cli_util.open_documents(
            paths, project.subjects, project.vocab_lang, docs_limit
        )
    elif paths:
        # --cached and explicit corpus paths are mutually exclusive.
        raise click.UsageError(
            "Corpus paths cannot be given when using --cached option."
        )
    else:
        # The literal string "cached" is passed to train() in place of a corpus.
        training_data = "cached"

    project.train(training_data, params, jobs)
192
193
194
@cli.command("learn")
@click.argument("project_id")
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_learn(project_id, paths, docs_limit, backend_param):
    """
    Further train an existing project on a collection of documents.
    \f
    Similar to the ``train`` command. This will continue training an already
    trained project using the documents given by ``PATHS`` in a single batch
    operation. Not supported by all backends.
    """
    # NOTE: the docstring above doubles as the command's --help text.
    project = cli_util.get_project(project_id)
    params = cli_util.parse_backend_params(backend_param, project)
    # Open the corpus, then hand it to the project for incremental learning.
    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )
    project.learn(corpus, params)
214
215
216
@cli.command("suggest")
@click.argument("project_id")
@click.argument(
    "paths", type=click.Path(dir_okay=False, exists=True, allow_dash=True), nargs=-1
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_suggest(
    project_id, paths, limit, threshold, language, backend_param, docs_limit
):
    """
    Suggest subjects for a single document from standard input or for one or more
    document file(s) given its/their path(s).
    \f
    This will read a text document from standard input and suggest subjects for
    it, or if given path(s) to file(s), suggest subjects for it/them.
    """
    # NOTE: the docstring above doubles as the command's --help text.
    project = cli_util.get_project(project_id)

    # Fall back to the project's vocabulary language when none is given.
    label_lang = language or project.vocab_lang
    if label_lang not in project.vocab.languages:
        raise click.BadParameter(
            f'language "{label_lang}" not supported by vocabulary'
        )
    params = cli_util.parse_backend_params(backend_param, project)

    # No paths at all, or a lone "-", both mean "read the document from stdin".
    use_stdin = not paths or (len(paths) == 1 and paths[0] == "-")
    if use_stdin:
        document_text = sys.stdin.read()
        filtered = project.suggest([document_text], params).filter(limit, threshold)
        cli_util.show_hits(filtered[0], project, label_lang)
        return

    documents = cli_util.open_text_documents(paths, docs_limit)
    all_results = project.suggest_corpus(documents, params).filter(limit, threshold)
    # Results are paired with paths positionally.
    for doc_suggestions, doc_path in zip(all_results, paths):
        click.echo(f"Suggestions for {doc_path}")
        cli_util.show_hits(doc_suggestions, project, label_lang)
258
259
260
@cli.command("index")
@click.argument("project_id")
@click.argument("directory", type=click.Path(exists=True, file_okay=False))
@click.option(
    "--suffix", "-s", default=".annif", help="File name suffix for result files"
)
@click.option(
    "--force/--no-force",
    "-f/-F",
    default=False,
    help="Force overwriting of existing result files",
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@cli_util.backend_param_option
@cli_util.common_options
def run_index(
    project_id, directory, suffix, force, limit, threshold, language, backend_param
):
    """
    Index a directory with documents, suggesting subjects for each document.
    Write the results in TSV files with the given suffix (``.annif`` by
    default).
    """
    # NOTE: the docstring above doubles as the command's --help text.
    project = cli_util.get_project(project_id)

    # Fall back to the project's vocabulary language when none is given.
    lang = language or project.vocab_lang
    if lang not in project.vocab.languages:
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
    backend_params = cli_util.parse_backend_params(backend_param, project)

    documents = annif.corpus.DocumentDirectory(
        directory, None, None, require_subjects=False
    )
    results = project.suggest_corpus(documents, backend_params).filter(limit, threshold)

    for (docfilename, _), suggestions in zip(documents, results):
        # Replace a trailing ".txt" with the requested suffix. A callable
        # replacement is used so that the user-supplied suffix is inserted
        # literally; passed as a plain string, re.sub would interpret any
        # backslash in it as an escape or group reference.
        subjectfilename = re.sub(r"\.txt$", lambda _match: suffix, docfilename)
        if os.path.exists(subjectfilename) and not force:
            click.echo(
                "Not overwriting {} (use --force to override)".format(subjectfilename)
            )
            continue
        with open(subjectfilename, "w", encoding="utf-8") as subjfile:
            cli_util.show_hits(suggestions, project, lang, file=subjfile)
305
306
307
@cli.command("eval")
@click.argument("project_id")
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option(
    "--metric",
    "-m",
    default=[],
    multiple=True,
    help="Metric to calculate (default: all)",
)
@click.option(
    "--metrics-file",
    "-M",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write evaluation metrics in JSON format.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write non-aggregated results per subject.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_eval(
    project_id,
    paths,
    limit,
    threshold,
    docs_limit,
    metric,
    metrics_file,
    results_file,
    jobs,
    backend_param,
):
    """
    Suggest subjects for documents and evaluate the results by comparing
    against a gold standard.
    \f
    With this command the documents from ``PATHS`` (directories or possibly
    gzipped TSV files) will be assigned subject suggestions and then
    statistical measures are calculated that quantify how well the suggested
    subjects match the gold-standard subjects in the documents.

    Normally the output is the list of the metrics calculated across documents.
    If ``--results-file <FILENAME>`` option is given, the metrics are
    calculated separately for each subject, and written to the given file.
    """
    # NOTE: the docstring above doubles as the command's --help text.

    project = cli_util.get_project(project_id)
    params = cli_util.parse_backend_params(backend_param, project)

    # Function-scope import. It also binds ``annif`` as a local name, so it
    # must stay ahead of every other use of ``annif`` in this function.
    import annif.eval

    evaluator = annif.eval.EvaluationBatch(project.subjects)

    if results_file:
        # The option is declared lazy, so write an empty string to force the
        # file open now and report an unwritable path before any evaluation.
        try:
            print("", end="", file=results_file)
            click.echo(
                f"Writing per subject evaluation results to {results_file.name!s}"
            )
        except Exception as e:
            raise NotSupportedException(
                f"cannot open results-file for writing: {e!s}"
            )

    documents = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )
    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    suggest_map = annif.parallel.ProjectSuggestMap(
        project.registry, [project_id], params, limit, threshold
    )

    # Batches arrive in arbitrary order; each carries its own gold subjects.
    with pool_class(jobs) as pool:
        batch_results = pool.imap_unordered(
            suggest_map.suggest_batch, documents.doc_batches
        )
        for hit_sets, subject_sets in batch_results:
            evaluator.evaluate_many(hit_sets[project_id], subject_sets)

    scores = evaluator.results(
        metrics=metric, results_file=results_file, language=project.vocab_lang
    )

    line_format = "{0:<30}\t{1}"
    for metric_name, value in scores.items():
        click.echo(line_format.format(metric_name + ":", value))

    if metrics_file:
        # Keys in the JSON output are the metric names mapped through metric_code.
        coded_scores = dict(
            (metric_code(metric_name), value) for metric_name, value in scores.items()
        )
        json.dump(coded_scores, metrics_file, indent=2)
412
413
414
# Used by the "optimize" command: passed to cli_util.generate_filter_params()
# and used as the ``limit`` when the unfiltered suggestions are produced.
FILTER_BATCH_MAX_LIMIT = 15
415
# Metric names the "optimize" command requests from the evaluation results and
# reports best values for.
OPTIMIZE_METRICS = ["Precision (doc avg)", "Recall (doc avg)", "F1 score (doc avg)"]
416
417
418
def _collect_suggestion_batches(project_id, psmap, corpus, pool_class, jobs):
    """Run the suggest map over all document batches of the corpus in a pool.

    Returns a tuple ``(ndocs, suggestion_batches, subject_set_batches)`` where
    the two lists are index-aligned (one entry per processed batch) and
    ``ndocs`` is the summed length of the project's suggestion batches.
    """
    ndocs = 0
    suggestion_batches = []
    subject_set_batches = []
    with pool_class(jobs) as pool:
        # Batches arrive in arbitrary order; appending both lists in the same
        # iteration keeps suggestions and gold subjects paired.
        for suggestion_batch, subject_sets in pool.imap_unordered(
            psmap.suggest_batch, corpus.doc_batches
        ):
            project_suggestions = suggestion_batch[project_id]
            ndocs += len(project_suggestions)
            suggestion_batches.append(project_suggestions)
            subject_set_batches.append(subject_sets)
    return ndocs, suggestion_batches, subject_set_batches


def _evaluate_batches(eval_batch, filtered_results, subject_set_batches):
    """Feed filtered suggestion batches and their gold subject sets into
    ``eval_batch`` and return its results for ``OPTIMIZE_METRICS``."""
    for batch, subject_sets in zip(filtered_results.batches, subject_set_batches):
        eval_batch.evaluate_many(batch, subject_sets)
    return eval_batch.results(metrics=OPTIMIZE_METRICS)


def _echo_best_params(best_scores, best_params, ndocs):
    """Print the best score and its (limit, threshold) for each metric in
    ``OPTIMIZE_METRICS``, followed by the number of documents evaluated."""
    click.echo()
    template = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    for metric in OPTIMIZE_METRICS:
        limit, threshold = best_params[metric]
        click.echo(template.format(metric, best_scores[metric], limit, threshold))
    click.echo("Documents evaluated:\t{}".format(ndocs))


@cli.command("optimize")
@click.argument("project_id")
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_optimize(project_id, paths, jobs, docs_limit, backend_param):
    """
    Suggest subjects for documents, testing multiple limits and thresholds.
    \f
    This command will use different limit (maximum number of subjects) and
    score threshold values when assigning subjects to each document given by
    ``PATHS`` and compare the results against the gold standard subjects in the
    documents. The output is a list of parameter combinations and their scores.
    From the output, you can determine the optimum limit and threshold
    parameters depending on which measure you want to target.
    """
    # NOTE: the docstring above doubles as the command's --help text.
    project = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, project)
    filter_params = cli_util.generate_filter_params(FILTER_BATCH_MAX_LIMIT)

    # Function-scope import. It also binds ``annif`` as a local name, so it
    # must stay ahead of every other use of ``annif`` in this function.
    import annif.eval

    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )

    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    # Suggestions are produced once with the widest limit and a zero threshold;
    # every tested (limit, threshold) combination is then applied by filtering
    # these stored results.
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry,
        [project_id],
        backend_params,
        limit=FILTER_BATCH_MAX_LIMIT,
        threshold=0.0,
    )

    ndocs, suggestion_batches, subject_set_batches = _collect_suggestion_batches(
        project_id, psmap, corpus, pool_class, jobs
    )

    from annif.suggestion import SuggestionResults

    orig_suggestion_results = SuggestionResults(suggestion_batches)

    click.echo("\t".join(("Limit", "Thresh.", "Prec.", "Rec.", "F1")))

    # Best score seen so far per metric (0.0 until a score is recorded) and the
    # (limit, threshold) pair that produced it.
    best_scores = collections.defaultdict(float)
    best_params = {}

    template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"
    for limit, threshold in filter_params:
        eval_batch = annif.eval.EvaluationBatch(project.subjects)
        filtered_results = orig_suggestion_results.filter(limit, threshold)
        results = _evaluate_batches(eval_batch, filtered_results, subject_set_batches)
        for metric, score in results.items():
            # ">=" means that on ties the later combination wins.
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = (limit, threshold)
        # Columns follow OPTIMIZE_METRICS order: precision, recall, F1.
        click.echo(
            template.format(
                limit, threshold, *(results[name] for name in OPTIMIZE_METRICS)
            )
        )

    _echo_best_params(best_scores, best_params, ndocs)
514
515
516
@cli.command("hyperopt")
@click.argument("project_id")
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--trials", "-T", default=10, help="Number of trials")
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel runs (0 means all CPUs)"
)
@click.option(
    "--metric", "-m", default="NDCG", help="Metric to optimize (default: NDCG)"
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file path to write trial results as CSV.
    File directory must exist, existing file will be overwritten.""",
)
@cli_util.docs_limit_option
@cli_util.common_options
def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric, results_file):
    """
    Optimize the hyperparameters of a project using validation documents from
    ``PATHS``. Not supported by all backends. Output is a list of trial results
    and a report of the best performing parameters.
    """
    # NOTE: the docstring above doubles as the command's --help text.
    project = cli_util.get_project(project_id)
    validation_docs = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )

    click.echo(f"Looking for optimal hyperparameters using {trials} trials")
    recommendation = project.hyperopt(
        validation_docs, trials, jobs, metric, results_file
    )

    # Report the best score, then the recommendation lines between "---" rules.
    separator = "---"
    click.echo(f"Got best {metric} score {recommendation.score:.4f} with:")
    click.echo(separator)
    for report_line in recommendation.lines:
        click.echo(report_line)
    click.echo(separator)
551
    click.echo("---")
552
553
554
# Invoke the command group when this module is executed as a script.
if __name__ == "__main__":
555
    cli()
556