Passed
Pull Request — master (#663)
by Juho
03:24
created

annif.cli.run_suggest()   B

Complexity

Conditions 6

Size

Total Lines 48
Code Lines 36

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 6
eloc 36
nop 7
dl 0
loc 48
rs 8.0826
c 0
b 0
f 0
1
"""Definitions for command-line (Click) commands for invoking Annif
2
operations and printing the results to console."""
3
4
5
import collections
6
import json
7
import os.path
8
import re
9
import sys
10
11
import click
12
import click_log
13
from flask import current_app
14
from flask.cli import FlaskGroup, ScriptInfo
15
16
import annif
17
import annif.corpus
18
import annif.parallel
19
import annif.project
20
import annif.registry
21
from annif.exception import (
22
    ConfigurationException,
23
    NotInitializedException,
24
    NotSupportedException,
25
)
26
from annif.project import Access
27
from annif.suggestion import ListSuggestionResult, SuggestionFilter
28
from annif.util import metric_code
29
30
# Module-wide logger: reuse Annif's own logger and let click_log configure it.
logger = annif.logger
click_log.basic_config(logger)

# Root command group. Flask's built-in version option is disabled and replaced
# by a Click version option whose message is only the version string.
cli = click.version_option(message="%(version)s")(
    FlaskGroup(create_app=annif.create_app, add_version_option=False)
)
35
36
37
def get_project(project_id):
    """Return the project with the given ID, or terminate the program.

    The project is looked up in the registry with ``Access.private`` as the
    minimum access level. If the registry raises ``ValueError``, an error
    message is written to stderr and the process exits with status 1.
    """
    try:
        project = annif.registry.get_project(project_id, min_access=Access.private)
    except ValueError:
        click.echo("No projects found with id '{0}'.".format(project_id), err=True)
        sys.exit(1)
    return project
45
46
47
def get_vocab(vocab_id):
    """Return the vocabulary with the given ID, or terminate the program.

    The vocabulary is looked up in the registry with ``Access.private`` as
    the minimum access level. If the registry raises ``ValueError``, an error
    message is written to stderr and the process exits with status 1.
    """
    try:
        vocab = annif.registry.get_vocab(vocab_id, min_access=Access.private)
    except ValueError:
        click.echo(f"No vocabularies found with the id '{vocab_id}'.", err=True)
        sys.exit(1)
    return vocab
56
57
58
def open_documents(paths, subject_index, vocab_lang, docs_limit):
    """Open a document corpus from a sequence of pathnames.

    Every path that is a directory is opened as a ``DocumentDirectory``
    (using ``vocab_lang`` and requiring subjects); any other path is opened
    as a ``DocumentFile``. With no paths, a warning is logged and the null
    device is opened instead. Several paths are merged into a
    ``CombinedCorpus``. When ``docs_limit`` is not None, the result is wrapped
    in a ``LimitingDocumentCorpus`` before being returned.
    """

    def _open_one(path):
        """Open one path as a directory corpus or a file corpus."""
        if os.path.isdir(path):
            return annif.corpus.DocumentDirectory(
                path, subject_index, vocab_lang, require_subjects=True
            )
        return annif.corpus.DocumentFile(path, subject_index)

    if not paths:
        logger.warning("Reading empty file")
        corpus = _open_one(os.path.devnull)
    elif len(paths) == 1:
        corpus = _open_one(paths[0])
    else:
        corpus = annif.corpus.CombinedCorpus([_open_one(path) for path in paths])

    if docs_limit is None:
        return corpus
    return annif.corpus.LimitingDocumentCorpus(corpus, docs_limit)
84
85
86
def open_text_documents(paths, docs_limit):
    """Wrap plain text files as a ``DocumentList`` of subject-less documents.

    Only the first ``docs_limit`` paths are used (all of them when it is
    None). The path ``"-"`` stands for standard input; other paths are read
    as UTF-8 (a leading BOM is accepted) with undecodable bytes replaced.
    Documents are produced lazily, one per path.
    """

    def _read_text(path):
        """Return the full text behind a single path."""
        if path == "-":
            return sys.stdin.read()
        with open(path, errors="replace", encoding="utf-8-sig") as docfile:
            return docfile.read()

    selected_paths = paths[:docs_limit]
    return annif.corpus.DocumentList(
        annif.corpus.Document(text=_read_text(path), subject_set=None)
        for path in selected_paths
    )
97
98
99
def show_hits(hits, project, lang, file=None):
    """Print one tab-separated line per hit.

    Each line holds the subject URI in angle brackets, the subject label in
    ``lang``, the notation (label or notation is left out when falsy) and the
    hit score. Output goes through ``click.echo`` to ``file``.
    """
    for hit in hits.as_list():
        subject = project.subjects[hit.subject_id]
        uri = subject.uri
        label_fields = "\t".join(
            field for field in (subject.labels[lang], subject.notation) if field
        )
        click.echo(f"<{uri}>\t{label_fields}\t{hit.score}", file=file)
108
109
110
def parse_backend_params(backend_param, project):
    """Parse ``--backend-param`` values into a nested dict structure.

    Each item of ``backend_param`` must have the form
    ``<backend>.<parameter>=<value>``. The result maps each backend name to a
    dict of its parameter names and (string) values.

    Raises ConfigurationException when an item does not follow that syntax,
    or (via validate_backend_params) when the backend named in an item is not
    the backend of ``project``.
    """
    backend_params = collections.defaultdict(dict)
    for beparam in backend_param:
        # partition() splits on the first separator only, like split(sep, 1),
        # but lets a missing separator be reported as a configuration error
        # instead of an unpacking ValueError.
        backend, dot, param = beparam.partition(".")
        key, equals, val = param.partition("=")
        if not dot or not equals:
            raise ConfigurationException(
                'Malformed CLI option "-b {}": expected syntax '
                "<backend>.<parameter>=<value>.".format(beparam)
            )
        validate_backend_params(backend, beparam, project)
        backend_params[backend][key] = val
    return backend_params
120
121
122
def validate_backend_params(backend, beparam, project):
    """Check that a ``-b`` option refers to the backend of the project.

    ``backend`` is the backend name taken from the option and ``beparam`` is
    the complete option value (used only in the error message). Raises
    ConfigurationException when ``backend`` differs from the ``"backend"``
    entry of ``project.config``; otherwise returns None.
    """
    expected_backend = project.config["backend"]
    if backend == expected_backend:
        return
    raise ConfigurationException(
        'The backend {} in CLI option "-b {}" not matching the project'
        " backend {}.".format(backend, beparam, expected_backend)
    )
128
129
130
# Largest "limit" value (maximum number of subjects per document) tried by
# generate_filter_batches(); run_optimize() also cuts the raw hits down to this
# many before applying the individual filters.
BATCH_MAX_LIMIT = 15
131
132
133
def generate_filter_batches(subjects):
    """Build a filter and an evaluation batch for each limit/threshold pair.

    Limits run from 1 to BATCH_MAX_LIMIT and thresholds from 0 in 20 steps of
    0.05. Returns an ordered mapping from ``(limit, threshold)`` to a
    ``(SuggestionFilter, EvaluationBatch)`` tuple, limits varying slowest.
    """
    # Imported here rather than at module level, as in the original code.
    import annif.eval

    thresholds = [step * 0.05 for step in range(20)]
    batches = collections.OrderedDict()
    for limit in range(1, BATCH_MAX_LIMIT + 1):
        for threshold in thresholds:
            batches[(limit, threshold)] = (
                SuggestionFilter(subjects, limit, threshold),
                annif.eval.EvaluationBatch(subjects),
            )
    return batches
143
144
145
def set_project_config_file_path(ctx, param, value):
    """Click callback storing the ``--projects`` path in the app config.

    The application is loaded and its context entered in every case; the
    ``PROJECTS_CONFIG_PATH`` setting is overwritten only when ``value`` is
    truthy. ``param`` is not used.
    """
    script_info = ctx.ensure_object(ScriptInfo)
    app = script_info.load_app()
    with app.app_context():
        if not value:
            return
        current_app.config["PROJECTS_CONFIG_PATH"] = value
150
151
152
def common_options(f):
    """Decorate a command with the options shared by all CLI commands.

    Adds the eager ``-p/--projects`` option, whose value is handled by
    set_project_config_file_path and not passed to the command, followed by
    the click_log verbosity option bound to the module logger.
    """
    projects_option = click.option(
        "-p",
        "--projects",
        help="Set path to project configuration file or directory",
        type=click.Path(dir_okay=True, exists=True),
        callback=set_project_config_file_path,
        expose_value=False,
        is_eager=True,
    )
    verbosity_option = click_log.simple_verbosity_option(logger)
    return verbosity_option(projects_option(f))
164
165
166
def backend_param_option(f):
    """Decorate a command with the repeatable ``-b/--backend-param`` option
    used for overriding backend parameters."""
    add_option = click.option(
        "--backend-param",
        "-b",
        multiple=True,
        help="Override backend parameter of the config file. "
        "Syntax: `-b <backend>.<parameter>=<value>`.",
    )
    return add_option(f)
175
176
177
@cli.command("list-projects")
@common_options
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_list_projects():
    """
    List available projects.
    \f
    Show a list of currently defined projects. Projects are defined in a
    configuration file, normally called ``projects.cfg``. See `Project
    configuration
    <https://github.com/NatLibFi/Annif/wiki/Project-configuration>`_
    for details.
    """

    # Fixed-width, left-aligned columns shared by the header and every row.
    row_format = "{0: <25}{1: <45}{2: <10}{3: <7}"
    header = row_format.format("Project ID", "Project Name", "Language", "Trained")
    click.echo(header)
    click.echo("-" * len(header))

    projects = annif.registry.get_projects(min_access=Access.private)
    for proj in projects.values():
        row = row_format.format(
            proj.project_id, proj.name, proj.language, str(proj.is_trained)
        )
        click.echo(row)
201
202
203
@cli.command("show-project")
@click.argument("project_id")
@common_options
def run_show_project(project_id):
    """
    Show information about a project.
    """

    proj = get_project(project_id)

    def _info_lines():
        # A generator keeps evaluation lazy: each attribute is read only
        # right before its own line is printed.
        yield f"Project ID:        {proj.project_id}"
        yield f"Project Name:      {proj.name}"
        yield f"Language:          {proj.language}"
        yield f"Vocabulary:        {proj.vocab.vocab_id}"
        yield f"Vocab language:    {proj.vocab_lang}"
        yield f"Access:            {proj.access.name}"
        yield f"Trained:           {proj.is_trained}"
        yield f"Modification time: {proj.modification_time}"

    for info_line in _info_lines():
        click.echo(info_line)
220
221
222
@cli.command("clear")
@click.argument("project_id")
@common_options
def run_clear_project(project_id):
    """
    Initialize the project to its original, untrained state.
    """
    # Look the project up (exits on an unknown ID) and drop its model data.
    get_project(project_id).remove_model_data()
231
232
233
@cli.command("list-vocabs")
@common_options
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_list_vocabs():
    """
    List available vocabularies.
    """

    row_format = "{0: <20}{1: <20}{2: >10}  {3: <6}"
    header = row_format.format("Vocabulary ID", "Languages", "Size", "Loaded")
    click.echo(header)
    click.echo("-" * len(header))

    vocabs = annif.registry.get_vocabs(min_access=Access.private)
    for vocab in vocabs.values():
        try:
            languages = ",".join(sorted(vocab.languages))
            size = len(vocab)
        except NotInitializedException:
            # Placeholders are shown when the vocabulary raises
            # NotInitializedException while being inspected.
            languages, size, loaded = "-", "-", False
        else:
            loaded = True
        click.echo(row_format.format(vocab.vocab_id, languages, size, str(loaded)))
255
256
257
@cli.command("load-vocab")
@click.argument("vocab_id")
@click.argument("subjectfile", type=click.Path(exists=True, dir_okay=False))
@click.option("--language", "-L", help="Language of subject file")
@click.option(
    "--force",
    "-f",
    default=False,
    is_flag=True,
    help="Replace existing vocabulary completely " + "instead of updating it",
)
@common_options
def run_load_vocab(vocab_id, language, force, subjectfile):
    """
    Load a vocabulary from a subject file.
    """
    vocab = get_vocab(vocab_id)

    # The file format is detected in this order: RDF (SKOS), then CSV;
    # anything else is treated as TSV, which needs an explicit language.
    if annif.corpus.SubjectFileSKOS.is_rdf_file(subjectfile):
        subjects = annif.corpus.SubjectFileSKOS(subjectfile)
        click.echo(f"Loading vocabulary from SKOS file {subjectfile}...")
    elif annif.corpus.SubjectFileCSV.is_csv_file(subjectfile):
        subjects = annif.corpus.SubjectFileCSV(subjectfile)
        click.echo(f"Loading vocabulary from CSV file {subjectfile}...")
    elif language:
        click.echo(f"Loading vocabulary from TSV file {subjectfile}...")
        subjects = annif.corpus.SubjectFileTSV(subjectfile, language)
    else:
        click.echo(
            "Please use --language option to set the language of "
            "a TSV vocabulary.",
            err=True,
        )
        sys.exit(1)

    vocab.load_vocabulary(subjects, force=force)
294
295
296
@cli.command("train")
@click.argument("project_id")
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--cached/--no-cached",
    "-c/-C",
    default=False,
    help="Reuse preprocessed training data from previous run",
)
@click.option(
    "--docs-limit",
    "-d",
    default=None,
    type=click.IntRange(0, None),
    help="Maximum number of documents to use",
)
@click.option(
    "--jobs",
    "-j",
    default=0,
    help="Number of parallel jobs (0 means choose automatically)",
)
@backend_param_option
@common_options
def run_train(project_id, paths, cached, docs_limit, jobs, backend_param):
    """
    Train a project on a collection of documents.
    \f
    This will train the project using the documents from ``PATHS`` (directories
    or possibly gzipped TSV files) in a single batch operation. If ``--cached``
    is set, preprocessed training data from the previous run is reused instead
    of documents input; see `Reusing preprocessed training data
    <https://github.com/NatLibFi/Annif/wiki/
    Reusing-preprocessed-training-data>`_.
    """
    proj = get_project(project_id)
    backend_params = parse_backend_params(backend_param, proj)

    if not cached:
        documents = open_documents(paths, proj.subjects, proj.vocab_lang, docs_limit)
    elif paths:
        # --cached and explicit corpus paths are mutually exclusive.
        raise click.UsageError(
            "Corpus paths cannot be given when using --cached option."
        )
    else:
        # The literal string "cached" is passed in place of a corpus.
        documents = "cached"

    proj.train(documents, backend_params, jobs)
342
343
344
@cli.command("learn")
@click.argument("project_id")
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--docs-limit",
    "-d",
    default=None,
    type=click.IntRange(0, None),
    help="Maximum number of documents to use",
)
@backend_param_option
@common_options
def run_learn(project_id, paths, docs_limit, backend_param):
    """
    Further train an existing project on a collection of documents.
    \f
    Similar to the ``train`` command. This will continue training an already
    trained project using the documents given by ``PATHS`` in a single batch
    operation. Not supported by all backends.
    """
    proj = get_project(project_id)
    # Parameter overrides are parsed before the corpus is opened, so an
    # invalid -b option is reported first.
    overrides = parse_backend_params(backend_param, proj)
    corpus = open_documents(paths, proj.subjects, proj.vocab_lang, docs_limit)
    proj.learn(corpus, overrides)
368
369
370
@cli.command("suggest")
@click.argument("project_id")
@click.argument(
    "paths", type=click.Path(dir_okay=False, exists=True, allow_dash=True), nargs=-1
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@click.option(
    "--docs-limit",
    "-d",
    default=None,
    type=click.IntRange(0, None),
    help="Maximum number of documents to use",
)
@backend_param_option
@common_options
def run_suggest(
    project_id, paths, limit, threshold, language, backend_param, docs_limit
):
    """
    Suggest subjects for a single document from standard input or for one or more
    document file(s) given its/their path(s).
    \f
    This will read a text document from standard input and suggest subjects for
    it, or if given path(s) to file(s), suggest subjects for it/them.
    """
    project = get_project(project_id)
    lang = language or project.vocab_lang
    if lang not in project.vocab.languages:
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
    backend_params = parse_backend_params(backend_param, project)
    hit_filter = SuggestionFilter(project.subjects, limit, threshold)

    # No paths at all, or a lone "-", means: read one document from stdin.
    stdin_only = not paths or (len(paths) == 1 and paths[0] == "-")
    if stdin_only:
        text = sys.stdin.read()
        show_hits(hit_filter(project.suggest(text, backend_params)), project, lang)
        return

    docs = open_text_documents(paths, docs_limit)
    subject_sets = project.suggest_batch(docs, backend_params)
    for subjects, path in zip(subject_sets, paths):
        click.echo(f"Suggestions for {path}")
        show_hits(hit_filter(subjects), project, lang)
418
419
420
@cli.command("index")
@click.argument("project_id")
@click.argument("directory", type=click.Path(exists=True, file_okay=False))
@click.option(
    "--suffix", "-s", default=".annif", help="File name suffix for result files"
)
@click.option(
    "--force/--no-force",
    "-f/-F",
    default=False,
    help="Force overwriting of existing result files",
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@backend_param_option
@common_options
def run_index(
    project_id, directory, suffix, force, limit, threshold, language, backend_param
):
    """
    Index a directory with documents, suggesting subjects for each document.
    Write the results in TSV files with the given suffix (``.annif`` by
    default).
    """
    project = get_project(project_id)
    lang = language or project.vocab_lang
    if lang not in project.vocab.languages:
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
    backend_params = parse_backend_params(backend_param, project)
    hit_filter = SuggestionFilter(project.subjects, limit, threshold)

    documents = annif.corpus.DocumentDirectory(
        directory, None, None, require_subjects=False
    )
    subject_sets = project.suggest_batch(documents, backend_params)

    for (docfilename, _), subjects in zip(documents, subject_sets):
        # The replacement is given as a function so that the user-supplied
        # suffix is inserted literally; as a plain string it would be read as
        # a replacement template, where backslashes and group references in
        # the suffix are interpreted or rejected.
        subjectfilename = re.sub(r"\.txt$", lambda _match: suffix, docfilename)
        if os.path.exists(subjectfilename) and not force:
            click.echo(
                "Not overwriting {} (use --force to override)".format(subjectfilename)
            )
            continue
        hits = hit_filter(subjects)
        with open(subjectfilename, "w", encoding="utf-8") as subjfile:
            show_hits(hits, project, lang, file=subjfile)
467
468
469
@cli.command("eval")
@click.argument("project_id")
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option(
    "--docs-limit",
    "-d",
    default=None,
    type=click.IntRange(0, None),
    help="Maximum number of documents to use",
)
@click.option(
    "--metric",
    "-m",
    default=[],
    multiple=True,
    help="Metric to calculate (default: all)",
)
@click.option(
    "--metrics-file",
    "-M",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write evaluation metrics in JSON format.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write non-aggregated results per subject.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@backend_param_option
@common_options
def run_eval(
    project_id,
    paths,
    limit,
    threshold,
    docs_limit,
    metric,
    metrics_file,
    results_file,
    jobs,
    backend_param,
):
    """
    Suggest subjects for documents and evaluate the results by comparing
    against a gold standard.
    \f
    With this command the documents from ``PATHS`` (directories or possibly
    gzipped TSV files) will be assigned subject suggestions and then
    statistical measures are calculated that quantify how well the suggested
    subjects match the gold-standard subjects in the documents.

    Normally the output is the list of the metrics calculated across documents.
    If ``--results-file <FILENAME>`` option is given, the metrics are
    calculated separately for each subject, and written to the given file.
    """

    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)

    import annif.eval

    eval_batch = annif.eval.EvaluationBatch(project.subjects)

    if results_file:
        try:
            # The option is declared with lazy=True; writing an empty string
            # here is presumably meant to make an unwritable path fail now
            # rather than after the whole evaluation has run.
            print("", end="", file=results_file)
        except Exception as e:
            # Chain explicitly so the underlying cause stays attached.
            raise NotSupportedException(
                "cannot open results-file for writing: " + str(e)
            ) from e
        click.echo(
            "Writing per subject evaluation results to {!s}".format(
                results_file.name
            )
        )
    docs = open_documents(paths, project.subjects, project.vocab_lang, docs_limit)

    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry, [project_id], backend_params, limit, threshold
    )

    with pool_class(jobs) as pool:
        # Results arrive in arbitrary order; each one is added to the batch
        # as soon as it is available.
        for hits, subject_set in pool.imap_unordered(psmap.suggest, docs.documents):
            eval_batch.evaluate(hits[project_id], subject_set)

    template = "{0:<30}\t{1}"
    metrics = eval_batch.results(
        metrics=metric, results_file=results_file, language=project.vocab_lang
    )
    # A separate loop name avoids shadowing the ``metric`` option value.
    for metric_name, score in metrics.items():
        click.echo(template.format(metric_name + ":", score))
    if metrics_file:
        json.dump(
            {metric_code(mname): val for mname, val in metrics.items()},
            metrics_file,
            indent=2,
        )
577
578
579
@cli.command("optimize")
@click.argument("project_id")
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--docs-limit",
    "-d",
    default=None,
    type=click.IntRange(0, None),
    help="Maximum number of documents to use",
)
@backend_param_option
@common_options
def run_optimize(project_id, paths, docs_limit, backend_param):
    """
    Suggest subjects for documents, testing multiple limits and thresholds.
    \f
    This command will use different limit (maximum number of subjects) and
    score threshold values when assigning subjects to each document given by
    ``PATHS`` and compare the results against the gold standard subjects in the
    documents. The output is a list of parameter combinations and their scores.
    From the output, you can determine the optimum limit and threshold
    parameters depending on which measure you want to target.
    """
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)

    filter_batches = generate_filter_batches(project.subjects)

    ndocs = 0
    docs = open_documents(paths, project.subjects, project.vocab_lang, docs_limit)
    for doc in docs.documents:
        raw_hits = project.suggest(doc.text, backend_params)
        # No filter uses a limit above BATCH_MAX_LIMIT, so the hits are cut
        # down once here before being passed to every filter.
        hits = raw_hits.filter(project.subjects, limit=BATCH_MAX_LIMIT)
        assert isinstance(hits, ListSuggestionResult), (
            "Optimize should only be done with ListSuggestionResult "
            + "as it would be very slow with VectorSuggestionResult."
        )
        for hit_filter, batch in filter_batches.values():
            batch.evaluate(hit_filter(hits), doc.subject_set)
        ndocs += 1

    click.echo("\t".join(("Limit", "Thresh.", "Prec.", "Rec.", "F1")))

    # Defined before the loop so that the final report below can never refer
    # to an unbound name, even if there were no batches to process.
    metrics = ["Precision (doc avg)", "Recall (doc avg)", "F1 score (doc avg)"]
    best_scores = collections.defaultdict(float)
    best_params = {}

    template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"
    # Store the batches in a queue that gets consumed along the way
    # This way GC will have a chance to reclaim the memory
    filter_batches = collections.deque(filter_batches.items())
    while filter_batches:
        params, (_, batch) = filter_batches.popleft()
        results = batch.results(metrics=metrics)
        for metric, score in results.items():
            # ">=" lets a later combination win ties.
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = params
        click.echo(
            template.format(
                params[0],
                params[1],
                results["Precision (doc avg)"],
                results["Recall (doc avg)"],
                results["F1 score (doc avg)"],
            )
        )

    click.echo()
    template2 = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    for metric in metrics:
        if metric not in best_params:
            # Nothing was recorded for this metric; skip it rather than fail.
            continue
        click.echo(
            template2.format(
                metric,
                best_scores[metric],
                best_params[metric][0],
                best_params[metric][1],
            )
        )
    click.echo("Documents evaluated:\t{}".format(ndocs))
659
660
661
@cli.command("hyperopt")
@click.argument("project_id")
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--docs-limit",
    "-d",
    default=None,
    type=click.IntRange(0, None),
    help="Maximum number of documents to use",
)
@click.option("--trials", "-T", default=10, help="Number of trials")
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel runs (0 means all CPUs)"
)
@click.option(
    "--metric", "-m", default="NDCG", help="Metric to optimize (default: NDCG)"
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file path to write trial results as CSV.
    File directory must exist, existing file will be overwritten.""",
)
@common_options
def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric, results_file):
    """
    Optimize the hyperparameters of a project using validation documents from
    ``PATHS``. Not supported by all backends. Output is a list of trial results
    and a report of the best performing parameters.
    """
    proj = get_project(project_id)
    corpus = open_documents(paths, proj.subjects, proj.vocab_lang, docs_limit)
    click.echo(f"Looking for optimal hyperparameters using {trials} trials")

    recommendation = proj.hyperopt(corpus, trials, jobs, metric, results_file)
    click.echo(f"Got best {metric} score {recommendation.score:.4f} with:")

    # The recommended parameter lines are framed by a divider on both sides.
    divider = "---"
    click.echo(divider)
    for param_line in recommendation.lines:
        click.echo(param_line)
    click.echo(divider)
701
702
703
# Run the command group when this module is executed directly as a script.
if __name__ == "__main__":
    cli()
705