Passed
Pull Request — master (#675)
by Juho
08:52
created

annif.cli.run_optimize()   B

Complexity

Conditions 7

Size

Total Lines 79
Code Lines 55

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 7
eloc 55
nop 4
dl 0
loc 79
rs 7.0727
c 0
b 0
f 0

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, especially when combined with a good name. Moreover, when a method is small, finding a good name for it is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

"""Definitions for command-line (Click) commands for invoking Annif
operations and printing the results to console."""


import collections
import json
import os.path
import re
import sys

import click
import click_log
from flask.cli import FlaskGroup

import annif
import annif.corpus
import annif.parallel
import annif.project
import annif.registry
from annif.cli_util import (
    backend_param_option,
    common_options,
    docs_limit_option,
    generate_filter_batches,
    get_project,
    get_vocab,
    open_documents,
    open_text_documents,
    parse_backend_params,
    show_hits,
)
from annif.exception import NotInitializedException, NotSupportedException
from annif.project import Access
from annif.suggestion import ListSuggestionResult, SuggestionFilter
from annif.util import metric_code

logger = annif.logger
click_log.basic_config(logger)

# The CLI is a Flask application group so commands run inside an app context.
cli = FlaskGroup(create_app=annif.create_app, add_version_option=False)
cli = click.version_option(message="%(version)s")(cli)
@cli.command("list-projects")
@common_options
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_list_projects():
    """
    List available projects.
    \f
    Show a list of currently defined projects. Projects are defined in a
    configuration file, normally called ``projects.cfg``. See `Project
    configuration
    <https://github.com/NatLibFi/Annif/wiki/Project-configuration>`_
    for details.
    """

    # Fixed-width columns keep the table aligned for typical ID/name lengths.
    template = "{0: <25}{1: <45}{2: <10}{3: <7}"
    header = template.format("Project ID", "Project Name", "Language", "Trained")
    click.echo(header)
    click.echo("-" * len(header))
    for proj in annif.registry.get_projects(min_access=Access.private).values():
        click.echo(
            template.format(
                proj.project_id, proj.name, proj.language, str(proj.is_trained)
            )
        )
@cli.command("show-project")
@click.argument("project_id")
@common_options
def run_show_project(project_id):
    """
    Show information about a project.
    """

    proj = get_project(project_id)
    click.echo(f"Project ID:        {proj.project_id}")
    click.echo(f"Project Name:      {proj.name}")
    click.echo(f"Language:          {proj.language}")
    click.echo(f"Vocabulary:        {proj.vocab.vocab_id}")
    click.echo(f"Vocab language:    {proj.vocab_lang}")
    click.echo(f"Access:            {proj.access.name}")
    click.echo(f"Trained:           {proj.is_trained}")
    click.echo(f"Modification time: {proj.modification_time}")
@cli.command("clear")
@click.argument("project_id")
@common_options
def run_clear_project(project_id):
    """
    Initialize the project to its original, untrained state.
    """
    proj = get_project(project_id)
    proj.remove_model_data()
@cli.command("list-vocabs")
@common_options
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_list_vocabs():
    """
    List available vocabularies.
    """

    template = "{0: <20}{1: <20}{2: >10}  {3: <6}"
    header = template.format("Vocabulary ID", "Languages", "Size", "Loaded")
    click.echo(header)
    click.echo("-" * len(header))
    for vocab in annif.registry.get_vocabs(min_access=Access.private).values():
        try:
            languages = ",".join(sorted(vocab.languages))
            size = len(vocab)
            loaded = True
        except NotInitializedException:
            # Vocabulary data has not been loaded yet; show placeholders
            # instead of failing the whole listing.
            languages = "-"
            size = "-"
            loaded = False
        click.echo(template.format(vocab.vocab_id, languages, size, str(loaded)))
@cli.command("load-vocab")
@click.argument("vocab_id")
@click.argument("subjectfile", type=click.Path(exists=True, dir_okay=False))
@click.option("--language", "-L", help="Language of subject file")
@click.option(
    "--force",
    "-f",
    default=False,
    is_flag=True,
    help="Replace existing vocabulary completely instead of updating it",
)
@common_options
def run_load_vocab(vocab_id, language, force, subjectfile):
    """
    Load a vocabulary from a subject file.
    """
    vocab = get_vocab(vocab_id)
    if annif.corpus.SubjectFileSKOS.is_rdf_file(subjectfile):
        # SKOS/RDF file supported by rdflib
        subjects = annif.corpus.SubjectFileSKOS(subjectfile)
        click.echo(f"Loading vocabulary from SKOS file {subjectfile}...")
    elif annif.corpus.SubjectFileCSV.is_csv_file(subjectfile):
        # CSV file
        subjects = annif.corpus.SubjectFileCSV(subjectfile)
        click.echo(f"Loading vocabulary from CSV file {subjectfile}...")
    else:
        # probably a TSV file - we need to know its language
        if not language:
            click.echo(
                "Please use --language option to set the language of a TSV vocabulary.",
                err=True,
            )
            sys.exit(1)
        click.echo(f"Loading vocabulary from TSV file {subjectfile}...")
        subjects = annif.corpus.SubjectFileTSV(subjectfile, language)
    vocab.load_vocabulary(subjects, force=force)
@cli.command("train")
@click.argument("project_id")
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--cached/--no-cached",
    "-c/-C",
    default=False,
    help="Reuse preprocessed training data from previous run",
)
@click.option(
    "--jobs",
    "-j",
    default=0,
    help="Number of parallel jobs (0 means choose automatically)",
)
@docs_limit_option
@backend_param_option
@common_options
def run_train(project_id, paths, cached, docs_limit, jobs, backend_param):
    """
    Train a project on a collection of documents.
    \f
    This will train the project using the documents from ``PATHS`` (directories
    or possibly gzipped TSV files) in a single batch operation. If ``--cached``
    is set, preprocessed training data from the previous run is reused instead
    of documents input; see `Reusing preprocessed training data
    <https://github.com/NatLibFi/Annif/wiki/
    Reusing-preprocessed-training-data>`_.
    """
    proj = get_project(project_id)
    backend_params = parse_backend_params(backend_param, proj)
    if cached:
        # --cached and explicit corpus paths are mutually exclusive.
        if len(paths) > 0:
            raise click.UsageError(
                "Corpus paths cannot be given when using --cached option."
            )
        documents = "cached"
    else:
        documents = open_documents(paths, proj.subjects, proj.vocab_lang, docs_limit)
    proj.train(documents, backend_params, jobs)
@cli.command("learn")
@click.argument("project_id")
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@docs_limit_option
@backend_param_option
@common_options
def run_learn(project_id, paths, docs_limit, backend_param):
    """
    Further train an existing project on a collection of documents.
    \f
    Similar to the ``train`` command. This will continue training an already
    trained project using the documents given by ``PATHS`` in a single batch
    operation. Not supported by all backends.
    """
    proj = get_project(project_id)
    backend_params = parse_backend_params(backend_param, proj)
    documents = open_documents(paths, proj.subjects, proj.vocab_lang, docs_limit)
    proj.learn(documents, backend_params)
@cli.command("suggest")
@click.argument("project_id")
@click.argument(
    "paths", type=click.Path(dir_okay=False, exists=True, allow_dash=True), nargs=-1
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@docs_limit_option
@backend_param_option
@common_options
def run_suggest(
    project_id, paths, limit, threshold, language, backend_param, docs_limit
):
    """
    Suggest subjects for a single document from standard input or for one or more
    document file(s) given its/their path(s).
    \f
    This will read a text document from standard input and suggest subjects for
    it, or if given path(s) to file(s), suggest subjects for it/them.
    """
    project = get_project(project_id)
    lang = language or project.vocab_lang
    if lang not in project.vocab.languages:
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
    backend_params = parse_backend_params(backend_param, project)
    hit_filter = SuggestionFilter(project.subjects, limit, threshold)

    # A single "-" path means "read from standard input", like no paths at all.
    if paths and not (len(paths) == 1 and paths[0] == "-"):
        docs = open_text_documents(paths, docs_limit)
        subject_sets = project.suggest_corpus(docs, backend_params)
        for subjects, path in zip(subject_sets, paths):
            click.echo(f"Suggestions for {path}")
            hits = hit_filter(subjects)
            show_hits(hits, project, lang)
    else:
        text = sys.stdin.read()
        hits = hit_filter(project.suggest([text], backend_params)[0])
        show_hits(hits, project, lang)
@cli.command("index")
@click.argument("project_id")
@click.argument("directory", type=click.Path(exists=True, file_okay=False))
@click.option(
    "--suffix", "-s", default=".annif", help="File name suffix for result files"
)
@click.option(
    "--force/--no-force",
    "-f/-F",
    default=False,
    help="Force overwriting of existing result files",
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@backend_param_option
@common_options
def run_index(
    project_id, directory, suffix, force, limit, threshold, language, backend_param
):
    """
    Index a directory with documents, suggesting subjects for each document.
    Write the results in TSV files with the given suffix (``.annif`` by
    default).
    """
    project = get_project(project_id)
    lang = language or project.vocab_lang
    if lang not in project.vocab.languages:
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
    backend_params = parse_backend_params(backend_param, project)
    hit_filter = SuggestionFilter(project.subjects, limit, threshold)

    documents = annif.corpus.DocumentDirectory(
        directory, None, None, require_subjects=False
    )
    subject_sets = project.suggest_corpus(documents, backend_params)

    for (docfilename, _), subjects in zip(documents, subject_sets):
        # Result file name: replace the .txt extension with the given suffix.
        subjectfilename = re.sub(r"\.txt$", suffix, docfilename)
        if os.path.exists(subjectfilename) and not force:
            click.echo(
                "Not overwriting {} (use --force to override)".format(subjectfilename)
            )
            continue
        hits = hit_filter(subjects)
        with open(subjectfilename, "w", encoding="utf-8") as subjfile:
            show_hits(hits, project, lang, file=subjfile)
@cli.command("eval")
@click.argument("project_id")
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option(
    "--metric",
    "-m",
    default=[],
    multiple=True,
    help="Metric to calculate (default: all)",
)
@click.option(
    "--metrics-file",
    "-M",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write evaluation metrics in JSON format.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write non-aggregated results per subject.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@docs_limit_option
@backend_param_option
@common_options
def run_eval(
    project_id,
    paths,
    limit,
    threshold,
    docs_limit,
    metric,
    metrics_file,
    results_file,
    jobs,
    backend_param,
):
    """
    Suggest subjects for documents and evaluate the results by comparing
    against a gold standard.
    \f
    With this command the documents from ``PATHS`` (directories or possibly
    gzipped TSV files) will be assigned subject suggestions and then
    statistical measures are calculated that quantify how well the suggested
    subjects match the gold-standard subjects in the documents.

    Normally the output is the list of the metrics calculated across documents.
    If ``--results-file <FILENAME>`` option is given, the metrics are
    calculated separately for each subject, and written to the given file.
    """

    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)

    # Imported lazily: only this command needs the evaluation machinery.
    import annif.eval

    eval_batch = annif.eval.EvaluationBatch(project.subjects)

    if results_file:
        # Probe the (lazy) file object now so a bad path fails early with a
        # clear error instead of after the whole evaluation has run.
        try:
            print("", end="", file=results_file)
            click.echo(
                "Writing per subject evaluation results to {!s}".format(
                    results_file.name
                )
            )
        except Exception as e:
            raise NotSupportedException(
                "cannot open results-file for writing: " + str(e)
            )
    corpus = open_documents(paths, project.subjects, project.vocab_lang, docs_limit)
    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry, [project_id], backend_params, limit, threshold
    )

    with pool_class(jobs) as pool:
        for hit_sets, subject_sets in pool.imap_unordered(
            psmap.suggest_batch, corpus.doc_batches
        ):
            eval_batch.evaluate_many(hit_sets[project_id], subject_sets)

    template = "{0:<30}\t{1}"
    metrics = eval_batch.results(
        metrics=metric, results_file=results_file, language=project.vocab_lang
    )
    # Distinct loop variable so the --metric parameter is not rebound here.
    for metric_name, score in metrics.items():
        click.echo(template.format(metric_name + ":", score))
    if metrics_file:
        json.dump(
            {metric_code(mname): val for mname, val in metrics.items()},
            metrics_file,
            indent=2,
        )
# Highest "limit" value tried by the optimize command; also caps how many
# suggestions per document are kept before filtering.
FILTER_BATCH_MAX_LIMIT = 15
@cli.command("optimize")
@click.argument("project_id")
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@docs_limit_option
@backend_param_option
@common_options
def run_optimize(project_id, paths, docs_limit, backend_param):
    """
    Suggest subjects for documents, testing multiple limits and thresholds.
    \f
    This command will use different limit (maximum number of subjects) and
    score threshold values when assigning subjects to each document given by
    ``PATHS`` and compare the results against the gold standard subjects in the
    documents. The output is a list of parameter combinations and their scores.
    From the output, you can determine the optimum limit and threshold
    parameters depending on which measure you want to target.
    """
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)

    filter_batches = generate_filter_batches(project.subjects, FILTER_BATCH_MAX_LIMIT)

    ndocs = 0
    corpus = open_documents(paths, project.subjects, project.vocab_lang, docs_limit)
    for docs_batch in corpus.doc_batches:
        texts, subject_sets = zip(*[(doc.text, doc.subject_set) for doc in docs_batch])
        raw_hit_sets = project.suggest(texts, backend_params)
        hit_sets = [
            raw_hits.filter(project.subjects, limit=FILTER_BATCH_MAX_LIMIT)
            for raw_hits in raw_hit_sets
        ]
        assert isinstance(hit_sets[0], ListSuggestionResult), (
            "Optimize should only be done with ListSuggestionResult "
            + "as it would be very slow with VectorSuggestionResult."
        )
        for hit_filter, filter_batch in filter_batches.values():
            filtered_hits = [hit_filter(hits) for hits in hit_sets]
            filter_batch.evaluate_many(filtered_hits, subject_sets)
        ndocs += len(texts)

    click.echo("\t".join(("Limit", "Thresh.", "Prec.", "Rec.", "F1")))

    best_scores = collections.defaultdict(float)
    best_params = {}

    # Defined before the loop so the summary below works even if there are
    # no filter batches to consume (previously a potential NameError).
    metrics = ["Precision (doc avg)", "Recall (doc avg)", "F1 score (doc avg)"]

    template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"
    # Store the batches in a list that gets consumed along the way
    # This way GC will have a chance to reclaim the memory
    filter_batches = list(filter_batches.items())
    while filter_batches:
        params, filter_batch = filter_batches.pop(0)
        results = filter_batch[1].results(metrics=metrics)
        for metric, score in results.items():
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = params
        click.echo(
            template.format(
                params[0],
                params[1],
                results["Precision (doc avg)"],
                results["Recall (doc avg)"],
                results["F1 score (doc avg)"],
            )
        )

    click.echo()
    template2 = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    for metric in metrics:
        click.echo(
            template2.format(
                metric,
                best_scores[metric],
                best_params[metric][0],
                best_params[metric][1],
            )
        )
    click.echo("Documents evaluated:\t{}".format(ndocs))
@cli.command("hyperopt")
@click.argument("project_id")
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--trials", "-T", default=10, help="Number of trials")
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel runs (0 means all CPUs)"
)
@click.option(
    "--metric", "-m", default="NDCG", help="Metric to optimize (default: NDCG)"
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file path to write trial results as CSV.
    File directory must exist, existing file will be overwritten.""",
)
@docs_limit_option
@common_options
def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric, results_file):
    """
    Optimize the hyperparameters of a project using validation documents from
    ``PATHS``. Not supported by all backends. Output is a list of trial results
    and a report of the best performing parameters.
    """
    proj = get_project(project_id)
    documents = open_documents(paths, proj.subjects, proj.vocab_lang, docs_limit)
    click.echo(f"Looking for optimal hyperparameters using {trials} trials")
    rec = proj.hyperopt(documents, trials, jobs, metric, results_file)
    click.echo(f"Got best {metric} score {rec.score:.4f} with:")
    click.echo("---")
    for line in rec.lines:
        click.echo(line)
    click.echo("---")
if __name__ == "__main__":
    cli()