Passed
Pull Request — master (#663)
by Juho
03:15
created

annif.cli.run_learn()   A

Complexity

Conditions 1

Size

Total Lines 24
Code Lines 16

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 16
nop 4
dl 0
loc 24
rs 9.6
c 0
b 0
f 0
1
"""Definitions for command-line (Click) commands for invoking Annif
2
operations and printing the results to console."""
3
4
5
import collections
6
import json
7
import os.path
8
import re
9
import sys
10
11
import click
12
import click_log
13
from flask import current_app
14
from flask.cli import FlaskGroup, ScriptInfo
15
16
import annif
17
import annif.corpus
18
import annif.parallel
19
import annif.project
20
import annif.registry
21
from annif.exception import (
22
    ConfigurationException,
23
    NotInitializedException,
24
    NotSupportedException,
25
)
26
from annif.project import Access
27
from annif.suggestion import ListSuggestionResult, SuggestionFilter
28
from annif.util import metric_code
29
30
# Reuse the package-level Annif logger and let click_log configure its output.
logger = annif.logger
click_log.basic_config(logger)

# Flask's own --version option is disabled and replaced by a Click version
# option whose message consists of the version string alone.
cli = click.version_option(message="%(version)s")(
    FlaskGroup(create_app=annif.create_app, add_version_option=False)
)
35
36
37
def get_project(project_id):
    """Return the project registered under ``project_id``.

    The registry is queried with ``min_access=Access.private``. If the lookup
    raises ValueError, an error message is written to stderr and the process
    exits with status 1."""
    try:
        project = annif.registry.get_project(project_id, min_access=Access.private)
    except ValueError:
        message = "No projects found with id '{0}'.".format(project_id)
        click.echo(message, err=True)
        sys.exit(1)
    return project
45
46
47
def get_vocab(vocab_id):
    """Return the vocabulary registered under ``vocab_id``.

    The registry is queried with ``min_access=Access.private``. If the lookup
    raises ValueError, an error message is written to stderr and the process
    exits with status 1."""
    try:
        vocab = annif.registry.get_vocab(vocab_id, min_access=Access.private)
    except ValueError:
        click.echo(f"No vocabularies found with the id '{vocab_id}'.", err=True)
        sys.exit(1)
    return vocab
56
57
58
def open_documents(paths, subject_index, vocab_lang, docs_limit):
    """Open a document corpus from a sequence of pathnames.

    Every path is either a directory (opened as a DocumentDirectory with
    ``require_subjects=True``, using ``vocab_lang`` for its subjects) or a
    file (opened as a DocumentFile). Several paths are merged into a
    CombinedCorpus; with no paths at all a warning is logged and the null
    device is opened instead. When ``docs_limit`` is not None the result is
    wrapped in a LimitingDocumentCorpus."""

    def load_corpus(location):
        """Open one path as a corpus, choosing the class by path type."""
        if not os.path.isdir(location):
            return annif.corpus.DocumentFile(location, subject_index)
        return annif.corpus.DocumentDirectory(
            location, subject_index, vocab_lang, require_subjects=True
        )

    path_count = len(paths)
    if path_count == 0:
        logger.warning("Reading empty file")
        corpus = load_corpus(os.path.devnull)
    elif path_count == 1:
        corpus = load_corpus(paths[0])
    else:
        corpus = annif.corpus.CombinedCorpus([load_corpus(item) for item in paths])

    if docs_limit is None:
        return corpus
    return annif.corpus.LimitingDocumentCorpus(corpus, docs_limit)
84
85
86
def open_text_documents(paths, docs_limit):
    """Read text files into a DocumentList of subject-less documents.

    Only the first ``docs_limit`` paths are read (all of them when
    ``docs_limit`` is None). Files are decoded as UTF-8 with an optional BOM;
    undecodable bytes are replaced rather than raising."""

    def read_document(path):
        """Load a single file as a Document without a subject set."""
        with open(path, errors="replace", encoding="utf-8-sig") as handle:
            return annif.corpus.Document(text=handle.read(), subject_set=None)

    selected = paths[:docs_limit]
    return annif.corpus.DocumentList([read_document(path) for path in selected])
92
93
94
def show_hits(hits, project, lang, file=None):
    """Echo one tab-separated line per hit.

    Each line holds the subject URI in angle brackets, the subject label in
    ``lang``, the notation (left out when it is empty) and the hit score.
    Output goes to ``file``, or to Click's default stream when it is None."""
    for hit in hits.as_list():
        subject = project.subjects[hit.subject_id]
        # Skip falsy fields so that a missing notation leaves no empty column.
        described = "\t".join(
            part for part in (subject.labels[lang], subject.notation) if part
        )
        click.echo("<{}>\t{}\t{}".format(subject.uri, described, hit.score), file=file)
103
104
105
def parse_backend_params(backend_param, project):
    """Turn ``<backend>.<parameter>=<value>`` strings into a nested dict.

    The result maps backend -> parameter -> value (values stay strings). Each
    entry is checked with validate_backend_params before it is stored. An
    entry lacking the "." or "=" separator makes the tuple unpacking fail
    with ValueError."""
    parsed = collections.defaultdict(dict)
    for option in backend_param:
        # Split only on the first "." and the first "=" so that the value
        # itself may contain either character.
        backend, assignment = option.split(".", 1)
        name, value = assignment.split("=", 1)
        validate_backend_params(backend, option, project)
        parsed[backend][name] = value
    return parsed
115
116
117
def validate_backend_params(backend, beparam, project):
    """Check that ``backend`` equals the backend configured for ``project``.

    Raises ConfigurationException when it differs from
    ``project.config["backend"]``; ``beparam`` is only quoted in the error
    message."""
    configured = project.config["backend"]
    if backend == configured:
        return
    raise ConfigurationException(
        'The backend {} in CLI option "-b {}" not matching the project'
        " backend {}.".format(backend, beparam, configured)
    )
123
124
125
# Largest suggestion limit covered by the limit/threshold sweep.
BATCH_MAX_LIMIT = 15


def generate_filter_batches(subjects):
    """Create one (SuggestionFilter, EvaluationBatch) pair per parameter pair.

    Returns an OrderedDict keyed by ``(limit, threshold)`` covering limits
    1..BATCH_MAX_LIMIT and twenty thresholds computed as ``i * 0.05`` for
    i = 0..19, in that nesting order."""
    # Imported here rather than at module level, as in the original code.
    import annif.eval

    thresholds = [step * 0.05 for step in range(20)]
    combos = collections.OrderedDict()
    for limit in range(1, BATCH_MAX_LIMIT + 1):
        for threshold in thresholds:
            combos[(limit, threshold)] = (
                SuggestionFilter(subjects, limit, threshold),
                annif.eval.EvaluationBatch(subjects),
            )
    return combos
138
139
140
def set_project_config_file_path(ctx, param, value):
    """Click callback storing the --projects value in the Flask app config.

    The application is loaded and its context entered even when ``value`` is
    empty; ``PROJECTS_CONFIG_PATH`` is only overwritten for a truthy value.
    ``param`` is unused but required by Click's callback signature."""
    app = ctx.ensure_object(ScriptInfo).load_app()
    with app.app_context():
        if not value:
            return
        current_app.config["PROJECTS_CONFIG_PATH"] = value
145
146
147
def common_options(f):
    """Decorate a command with the options shared by all CLI commands.

    Adds the eager ``-p/--projects`` option (handled by
    set_project_config_file_path and not passed to the command) and
    click_log's verbosity option for the module logger."""
    projects_option = click.option(
        "-p",
        "--projects",
        help="Set path to project configuration file or directory",
        type=click.Path(dir_okay=True, exists=True),
        callback=set_project_config_file_path,
        expose_value=False,
        is_eager=True,
    )
    verbosity_option = click_log.simple_verbosity_option(logger)
    # The projects option is applied first, the verbosity option wraps it.
    return verbosity_option(projects_option(f))
159
160
161
def backend_param_option(f):
    """Decorate a command with the repeatable ``--backend-param/-b`` option."""
    decorate = click.option(
        "--backend-param",
        "-b",
        multiple=True,
        help="Override backend parameter of the config file. "
        "Syntax: `-b <backend>.<parameter>=<value>`.",
    )
    return decorate(f)
170
171
172
@cli.command("list-projects")
@common_options
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_list_projects():
    """
    List available projects.
    \f
    Show a list of currently defined projects. Projects are defined in a
    configuration file, normally called ``projects.cfg``. See `Project
    configuration
    <https://github.com/NatLibFi/Annif/wiki/Project-configuration>`_
    for details.
    """

    # Fixed-width columns; the rule under the header is as wide as the header.
    row_format = "{0: <25}{1: <45}{2: <10}{3: <7}"
    heading = row_format.format("Project ID", "Project Name", "Language", "Trained")
    click.echo(heading)
    click.echo("-" * len(heading))

    projects = annif.registry.get_projects(min_access=Access.private)
    for project in projects.values():
        columns = (
            project.project_id,
            project.name,
            project.language,
            str(project.is_trained),
        )
        click.echo(row_format.format(*columns))
196
197
198
@cli.command("show-project")
@click.argument("project_id")
@common_options
def run_show_project(project_id):
    """
    Show information about a project.
    """

    project = get_project(project_id)
    # Each attribute is read right before its line is printed, so a failing
    # attribute still leaves the preceding lines on the console.
    click.echo(f"Project ID:        {project.project_id}")
    click.echo(f"Project Name:      {project.name}")
    click.echo(f"Language:          {project.language}")
    click.echo(f"Vocabulary:        {project.vocab.vocab_id}")
    click.echo(f"Vocab language:    {project.vocab_lang}")
    click.echo(f"Access:            {project.access.name}")
    click.echo(f"Trained:           {project.is_trained}")
    click.echo(f"Modification time: {project.modification_time}")
215
216
217
@cli.command("clear")
@click.argument("project_id")
@common_options
def run_clear_project(project_id):
    """
    Initialize the project to its original, untrained state.
    """
    # get_project exits the process itself when the ID is unknown.
    get_project(project_id).remove_model_data()
226
227
228
@cli.command("list-vocabs")
@common_options
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_list_vocabs():
    """
    List available vocabularies.
    """

    row_format = "{0: <20}{1: <20}{2: >10}  {3: <6}"
    heading = row_format.format("Vocabulary ID", "Languages", "Size", "Loaded")
    click.echo(heading)
    click.echo("-" * len(heading))

    vocabs = annif.registry.get_vocabs(min_access=Access.private)
    for vocab in vocabs.values():
        try:
            languages = ",".join(sorted(vocab.languages))
            size = len(vocab)
        except NotInitializedException:
            # Show placeholders when the vocabulary reports not initialized.
            languages, size, loaded = "-", "-", False
        else:
            loaded = True
        click.echo(row_format.format(vocab.vocab_id, languages, size, str(loaded)))
250
251
252
@cli.command("load-vocab")
@click.argument("vocab_id")
@click.argument("subjectfile", type=click.Path(exists=True, dir_okay=False))
@click.option("--language", "-L", help="Language of subject file")
@click.option(
    "--force",
    "-f",
    default=False,
    is_flag=True,
    help="Replace existing vocabulary completely " "instead of updating it",
)
@common_options
def run_load_vocab(vocab_id, language, force, subjectfile):
    """
    Load a vocabulary from a subject file.
    """
    vocab = get_vocab(vocab_id)

    # The format is detected in a fixed order: RDF first, then CSV; anything
    # else is treated as TSV.
    skos_reader = annif.corpus.SubjectFileSKOS
    csv_reader = annif.corpus.SubjectFileCSV
    if skos_reader.is_rdf_file(subjectfile):
        subjects = skos_reader(subjectfile)
        click.echo(f"Loading vocabulary from SKOS file {subjectfile}...")
    elif csv_reader.is_csv_file(subjectfile):
        subjects = csv_reader(subjectfile)
        click.echo(f"Loading vocabulary from CSV file {subjectfile}...")
    else:
        # A TSV file cannot be read without knowing its language.
        if not language:
            message = (
                "Please use --language option to set the language of "
                "a TSV vocabulary."
            )
            click.echo(message, err=True)
            sys.exit(1)
        click.echo(f"Loading vocabulary from TSV file {subjectfile}...")
        subjects = annif.corpus.SubjectFileTSV(subjectfile, language)

    vocab.load_vocabulary(subjects, force=force)
289
290
291
@cli.command("train")
@click.argument("project_id")
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--cached/--no-cached",
    "-c/-C",
    default=False,
    help="Reuse preprocessed training data from previous run",
)
@click.option(
    "--docs-limit",
    "-d",
    default=None,
    type=click.IntRange(0, None),
    help="Maximum number of documents to use",
)
@click.option(
    "--jobs",
    "-j",
    default=0,
    help="Number of parallel jobs (0 means choose automatically)",
)
@backend_param_option
@common_options
def run_train(project_id, paths, cached, docs_limit, jobs, backend_param):
    """
    Train a project on a collection of documents.
    \f
    This will train the project using the documents from ``PATHS`` (directories
    or possibly gzipped TSV files) in a single batch operation. If ``--cached``
    is set, preprocessed training data from the previous run is reused instead
    of documents input; see `Reusing preprocessed training data
    <https://github.com/NatLibFi/Annif/wiki/
    Reusing-preprocessed-training-data>`_.
    """
    project = get_project(project_id)
    params = parse_backend_params(backend_param, project)

    # --cached and explicit corpus paths are mutually exclusive.
    if cached and len(paths) > 0:
        raise click.UsageError(
            "Corpus paths cannot be given when using --cached option."
        )

    if cached:
        # The literal string "cached" is passed to train() in place of a corpus.
        corpus = "cached"
    else:
        corpus = open_documents(
            paths, project.subjects, project.vocab_lang, docs_limit
        )
    project.train(corpus, params, jobs)
337
338
339
@cli.command("learn")
@click.argument("project_id")
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--docs-limit",
    "-d",
    default=None,
    type=click.IntRange(0, None),
    help="Maximum number of documents to use",
)
@backend_param_option
@common_options
def run_learn(project_id, paths, docs_limit, backend_param):
    """
    Further train an existing project on a collection of documents.
    \f
    Similar to the ``train`` command. This will continue training an already
    trained project using the documents given by ``PATHS`` in a single batch
    operation. Not supported by all backends.
    """
    project = get_project(project_id)
    # Backend parameters are parsed (and validated) before any corpus is opened.
    params = parse_backend_params(backend_param, project)
    corpus = open_documents(paths, project.subjects, project.vocab_lang, docs_limit)
    project.learn(corpus, params)
363
364
365
@cli.command("suggest")
@click.argument("project_id")
@click.argument("paths", type=click.Path(dir_okay=False, exists=True), nargs=-1)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@click.option(
    "--docs-limit",
    "-d",
    default=None,
    type=click.IntRange(0, None),
    help="Maximum number of documents to use",
)
@backend_param_option
@common_options
def run_suggest(
    project_id, paths, limit, threshold, language, backend_param, docs_limit
):
    """
    Suggest subjects for a single document from standard input or for one or more
    document file(s) given its/their path(s).
    \f
    This will read a text document from standard input and suggest subjects for
    it, or if given path(s) to file(s), suggest subjects for it/them.
    """
    project = get_project(project_id)
    # Fall back to the project's vocabulary language when -L is not given.
    lang = language or project.vocab_lang
    if lang not in project.vocab.languages:
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
    backend_params = parse_backend_params(backend_param, project)
    hit_filter = SuggestionFilter(project.subjects, limit, threshold)

    # Explicitly given paths take precedence over standard input. Checking
    # only isatty() would silently ignore the paths whenever stdin is not a
    # terminal (cron jobs, CI, subprocess calls) and read stdin instead.
    # With no paths and an interactive terminal the file branch is still
    # taken, with an empty path list, as before.
    if paths or sys.stdin.isatty():
        docs = open_text_documents(paths, docs_limit)
        subject_sets = project.suggest_batch(docs, backend_params)
        # zip() stops at the shorter sequence, so paths cut off by
        # --docs-limit are not reported.
        for subjects, path in zip(subject_sets, paths):
            click.echo(f"Suggestions for {path}")
            show_hits(hit_filter(subjects), project, lang)
    else:
        text = sys.stdin.read()
        hits = hit_filter(project.suggest(text, backend_params))
        show_hits(hits, project, lang)
411
412
413
@cli.command("index")
@click.argument("project_id")
@click.argument("directory", type=click.Path(exists=True, file_okay=False))
@click.option(
    "--suffix", "-s", default=".annif", help="File name suffix for result files"
)
@click.option(
    "--force/--no-force",
    "-f/-F",
    default=False,
    help="Force overwriting of existing result files",
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@backend_param_option
@common_options
def run_index(
    project_id, directory, suffix, force, limit, threshold, language, backend_param
):
    """
    Index a directory with documents, suggesting subjects for each document.
    Write the results in TSV files with the given suffix (``.annif`` by
    default).
    """
    project = get_project(project_id)
    # Fall back to the project's vocabulary language when -L is not given.
    lang = language or project.vocab_lang
    if lang not in project.vocab.languages:
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
    backend_params = parse_backend_params(backend_param, project)
    hit_filter = SuggestionFilter(project.subjects, limit, threshold)

    documents = annif.corpus.DocumentDirectory(
        directory, None, None, require_subjects=False
    )
    subject_sets = project.suggest_batch(documents, backend_params)

    for (docfilename, dummy_subjectfn), subjects in zip(documents, subject_sets):
        # Use a callable replacement so the user-supplied suffix is inserted
        # literally; as a plain replacement string, backslashes in it would be
        # interpreted as escapes or group references by re.sub.
        subjectfilename = re.sub(r"\.txt$", lambda _match: suffix, docfilename)
        if os.path.exists(subjectfilename) and not force:
            click.echo(
                "Not overwriting {} (use --force to override)".format(subjectfilename)
            )
            continue
        hits = hit_filter(subjects)
        with open(subjectfilename, "w", encoding="utf-8") as subjfile:
            show_hits(hits, project, lang, file=subjfile)
460
461
462
@cli.command("eval")
@click.argument("project_id")
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option(
    "--docs-limit",
    "-d",
    default=None,
    type=click.IntRange(0, None),
    help="Maximum number of documents to use",
)
@click.option(
    "--metric",
    "-m",
    default=[],
    multiple=True,
    help="Metric to calculate (default: all)",
)
@click.option(
    "--metrics-file",
    "-M",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write evaluation metrics in JSON format.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write non-aggregated results per subject.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@backend_param_option
@common_options
def run_eval(
    project_id,
    paths,
    limit,
    threshold,
    docs_limit,
    metric,
    metrics_file,
    results_file,
    jobs,
    backend_param,
):
    """
    Suggest subjects for documents and evaluate the results by comparing
    against a gold standard.
    \f
    With this command the documents from ``PATHS`` (directories or possibly
    gzipped TSV files) will be assigned subject suggestions and then
    statistical measures are calculated that quantify how well the suggested
    subjects match the gold-standard subjects in the documents.

    Normally the output is the list of the metrics calculated across documents.
    If ``--results-file <FILENAME>`` option is given, the metrics are
    calculated separately for each subject, and written to the given file.
    """

    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)

    # Function-level import, kept from the original; it must stay ahead of
    # every other use of ``annif`` in this function because it binds the name
    # locally.
    import annif.eval

    eval_batch = annif.eval.EvaluationBatch(project.subjects)

    if results_file:
        try:
            # The option is declared lazy, so write an empty string now to
            # surface an unwritable path before the evaluation starts.
            print("", end="", file=results_file)
            click.echo(
                "Writing per subject evaluation results to {!s}".format(
                    results_file.name
                )
            )
        except Exception as e:
            # Chain explicitly so the original cause stays attached.
            raise NotSupportedException(
                "cannot open results-file for writing: " + str(e)
            ) from e
    docs = open_documents(paths, project.subjects, project.vocab_lang, docs_limit)

    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry, [project_id], backend_params, limit, threshold
    )

    with pool_class(jobs) as pool:
        # Results arrive in arbitrary order; each carries its own gold
        # standard subject set, so the order does not matter here.
        for hits, subject_set in pool.imap_unordered(psmap.suggest, docs.documents):
            eval_batch.evaluate(hits[project_id], subject_set)

    template = "{0:<30}\t{1}"
    metrics = eval_batch.results(
        metrics=metric, results_file=results_file, language=project.vocab_lang
    )
    # A distinct loop name avoids shadowing the ``metric`` option value.
    for metric_name, score in metrics.items():
        click.echo(template.format(metric_name + ":", score))
    if metrics_file:
        json.dump(
            {metric_code(mname): val for mname, val in metrics.items()},
            metrics_file,
            indent=2,
        )
570
571
572
@cli.command("optimize")
@click.argument("project_id")
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--docs-limit",
    "-d",
    default=None,
    type=click.IntRange(0, None),
    help="Maximum number of documents to use",
)
@backend_param_option
@common_options
def run_optimize(project_id, paths, docs_limit, backend_param):
    """
    Suggest subjects for documents, testing multiple limits and thresholds.
    \f
    This command will use different limit (maximum number of subjects) and
    score threshold values when assigning subjects to each document given by
    ``PATHS`` and compare the results against the gold standard subjects in the
    documents. The output is a list of parameter combinations and their scores.
    From the output, you can determine the optimum limit and threshold
    parameters depending on which measure you want to target.
    """
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)

    filter_batches = generate_filter_batches(project.subjects)

    ndocs = 0
    docs = open_documents(paths, project.subjects, project.vocab_lang, docs_limit)
    for doc in docs.documents:
        raw_hits = project.suggest(doc.text, backend_params)
        # Pre-filter once to the largest limit; every per-combination filter
        # below only narrows this result further.
        hits = raw_hits.filter(project.subjects, limit=BATCH_MAX_LIMIT)
        assert isinstance(hits, ListSuggestionResult), (
            "Optimize should only be done with ListSuggestionResult "
            + "as it would be very slow with VectorSuggestionResult."
        )
        for hit_filter, batch in filter_batches.values():
            batch.evaluate(hit_filter(hits), doc.subject_set)
        ndocs += 1

    click.echo("\t".join(("Limit", "Thresh.", "Prec.", "Rec.", "F1")))

    # Defined once, ahead of the loop: the summary after the loop reads this
    # list too, so it must exist even if the loop body never runs.
    metrics = ["Precision (doc avg)", "Recall (doc avg)", "F1 score (doc avg)"]
    best_scores = collections.defaultdict(float)
    best_params = {}

    template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"
    # Store the batches in a queue that gets consumed along the way
    # This way GC will have a chance to reclaim the memory
    filter_batches = collections.deque(filter_batches.items())
    while filter_batches:
        params, (_hit_filter, batch) = filter_batches.popleft()
        results = batch.results(metrics=metrics)
        for metric, score in results.items():
            # ">=" means that on a tie the later combination wins.
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = params
        click.echo(
            template.format(
                params[0],
                params[1],
                results["Precision (doc avg)"],
                results["Recall (doc avg)"],
                results["F1 score (doc avg)"],
            )
        )

    click.echo()
    template2 = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    for metric in metrics:
        click.echo(
            template2.format(
                metric,
                best_scores[metric],
                best_params[metric][0],
                best_params[metric][1],
            )
        )
    click.echo("Documents evaluated:\t{}".format(ndocs))
652
653
654
@cli.command("hyperopt")
@click.argument("project_id")
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--docs-limit",
    "-d",
    default=None,
    type=click.IntRange(0, None),
    help="Maximum number of documents to use",
)
@click.option("--trials", "-T", default=10, help="Number of trials")
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel runs (0 means all CPUs)"
)
@click.option(
    "--metric", "-m", default="NDCG", help="Metric to optimize (default: NDCG)"
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file path to write trial results as CSV.
    File directory must exist, existing file will be overwritten.""",
)
@common_options
def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric, results_file):
    """
    Optimize the hyperparameters of a project using validation documents from
    ``PATHS``. Not supported by all backends. Output is a list of trial results
    and a report of the best performing parameters.
    """
    project = get_project(project_id)
    corpus = open_documents(paths, project.subjects, project.vocab_lang, docs_limit)
    click.echo(f"Looking for optimal hyperparameters using {trials} trials")
    recommendation = project.hyperopt(corpus, trials, jobs, metric, results_file)
    click.echo(f"Got best {metric} score {recommendation.score:.4f} with:")

    # The recommended settings are printed between two "---" separator lines.
    separator = "---"
    click.echo(separator)
    for line in recommendation.lines:
        click.echo(line)
    click.echo(separator)
694
695
696
# Invoke the Click command group when this module is run as a script.
if __name__ == "__main__":
    cli()
698