Passed
Pull Request — master (#614)
by Osma
02:50
created

annif.cli.get_vocab()   A

Complexity

Conditions 2

Size

Total Lines 12
Code Lines 9

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 2
eloc 9
nop 1
dl 0
loc 12
rs 9.95
c 0
b 0
f 0
1
"""Definitions for command-line (Click) commands for invoking Annif
2
operations and printing the results to console."""
3
4
5
import collections
6
import os.path
7
import re
8
import sys
9
import json
10
import click
11
import click_log
12
from flask import current_app
13
from flask.cli import FlaskGroup, ScriptInfo
14
import annif
15
import annif.corpus
16
import annif.parallel
17
import annif.project
18
import annif.registry
19
from annif.project import Access
20
from annif.suggestion import SuggestionFilter, ListSuggestionResult
21
from annif.exception import ConfigurationException, NotSupportedException
22
from annif.exception import NotInitializedException
23
from annif.util import metric_code
24
25
# Reuse the package-level Annif logger and hand it to click_log, which sets
# up its handler/formatter for console output.
logger = annif.logger
26
click_log.basic_config(logger)
27
28
# Root command group, built around the Annif application factory. Flask's
# built-in version option is switched off and replaced by a Click version
# option whose message template is just the version string.
cli = FlaskGroup(create_app=annif.create_app, add_version_option=False)
29
cli = click.version_option(message='%(version)s')(cli)
30
31
32
def get_project(project_id):
    """Look up a project by ID, or terminate the CLI if it is unknown.

    The registry is queried with ``Access.private`` as the minimum access
    level. When the registry raises ``ValueError``, an error message is
    written to standard error and the process exits with status 1.
    """
    try:
        project = annif.registry.get_project(
            project_id, min_access=Access.private)
    except ValueError:
        message = "No projects found with id '{0}'.".format(project_id)
        click.echo(message, err=True)
        sys.exit(1)
    return project
43
44
45
def get_vocab(vocab_id):
    """Look up a vocabulary by ID, or terminate the CLI if it is unknown.

    The registry is queried with ``Access.private`` as the minimum access
    level. When the registry raises ``ValueError``, an error message is
    written to standard error and the process exits with status 1.
    """
    try:
        vocab = annif.registry.get_vocab(
            vocab_id, min_access=Access.private)
    except ValueError:
        message = f"No vocabularies found with the id '{vocab_id}'."
        click.echo(message, err=True)
        sys.exit(1)
    return vocab
57
58
59 View Code Duplication
def open_documents(paths, subject_index, vocab_lang, docs_limit):
    """Open a document corpus from a sequence of pathnames.

    Each path is either a directory (opened as a ``DocumentDirectory`` with
    ``require_subjects=True`` and the given vocabulary language) or a file
    (opened as a ``DocumentFile``). Several paths are merged into a
    ``CombinedCorpus``; no paths at all yields a corpus read from the null
    device, with a warning. If ``docs_limit`` is not None, the result is
    wrapped in a ``LimitingDocumentCorpus``.
    """

    def open_doc_path(path, subject_index):
        """Open one path as a corpus, choosing the type by path kind."""
        if not os.path.isdir(path):
            return annif.corpus.DocumentFile(path, subject_index)
        return annif.corpus.DocumentDirectory(
            path, subject_index, vocab_lang, require_subjects=True)

    path_count = len(paths)
    if path_count > 1:
        docs = annif.corpus.CombinedCorpus(
            [open_doc_path(path, subject_index) for path in paths])
    elif path_count == 1:
        docs = open_doc_path(paths[0], subject_index)
    else:
        # Nothing to read: fall back to the null device as an empty input
        logger.warning('Reading empty file')
        docs = open_doc_path(os.path.devnull, subject_index)

    if docs_limit is None:
        return docs
    return annif.corpus.LimitingDocumentCorpus(docs, docs_limit)
85
86
87 View Code Duplication
def parse_backend_params(backend_param, project):
    """Parse a list of backend parameters given with the --backend-param
    option into a nested dict structure.

    Each item must have the form ``<backend>.<parameter>=<value>``. The
    string is split at the first ``.`` and the remainder at the first ``=``,
    so the value itself may contain further ``.`` or ``=`` characters.

    Returns a ``defaultdict`` mapping backend name to a dict of
    ``{parameter: value}`` (values are kept as strings).

    Raises ``ConfigurationException`` if an item does not follow the expected
    syntax, or (via ``validate_backend_params``) if the backend named in the
    item is not the backend of ``project``.
    """
    backend_params = collections.defaultdict(dict)
    for beparam in backend_param:
        backend, dot, param = beparam.partition('.')
        key, equals, val = param.partition('=')
        if not dot or not equals:
            # Without this check a malformed option surfaces as an opaque
            # tuple-unpacking ValueError instead of a usable message.
            raise ConfigurationException(
                'Invalid backend parameter "-b {}": expected the syntax '
                '<backend>.<parameter>=<value>.'.format(beparam))
        validate_backend_params(backend, beparam, project)
        backend_params[backend][key] = val
    return backend_params
97
98
99
def validate_backend_params(backend, beparam, project):
    """Check that a ``-b`` option targets the backend of the given project.

    ``backend`` is the backend name taken from the option, ``beparam`` the
    complete option value (used only in the error message). Raises
    ``ConfigurationException`` when ``backend`` differs from the ``backend``
    entry of the project configuration; returns None otherwise.
    """
    project_backend = project.config['backend']
    if backend == project_backend:
        return
    raise ConfigurationException(
        'The backend {} in CLI option "-b {}" not matching the project'
        ' backend {}.'
        .format(backend, beparam, project_backend))
105
106
107
# Upper bound (inclusive) of the suggestion limits explored by the optimize
# command; raw suggestions are also cut to this many hits before filtering.
BATCH_MAX_LIMIT = 15
108
109
110
def generate_filter_batches(subjects):
    """Build one (filter, evaluation batch) pair per limit/threshold combo.

    Limits run from 1 to ``BATCH_MAX_LIMIT`` inclusive and thresholds are
    ``i * 0.05`` for ``i`` in 0..19. Returns an ``OrderedDict`` keyed by
    ``(limit, threshold)`` whose values are tuples of a ``SuggestionFilter``
    and a fresh ``EvaluationBatch``, in limit-major order.
    """
    # Imported here rather than at module level, as in the rest of this file
    import annif.eval

    thresholds = [step * 0.05 for step in range(20)]
    filter_batches = collections.OrderedDict()
    for limit in range(1, BATCH_MAX_LIMIT + 1):
        for threshold in thresholds:
            key = (limit, threshold)
            filter_batches[key] = (
                SuggestionFilter(subjects, limit, threshold),
                annif.eval.EvaluationBatch(subjects))
    return filter_batches
119
120
121
def set_project_config_file_path(ctx, param, value):
    """Click option callback: override the projects configuration path.

    The application is loaded through the context's ``ScriptInfo`` object
    in every case; ``PROJECTS_CONFIG_PATH`` is only overwritten when a value
    was actually given on the command line.
    """
    app = ctx.ensure_object(ScriptInfo).load_app()
    with app.app_context():
        if not value:
            return
        current_app.config['PROJECTS_CONFIG_PATH'] = value
126
127
128
def common_options(f):
    """Decorator adding the options shared by all CLI commands.

    Adds the eager ``-p/--projects`` option (handled entirely by its
    callback, so no value is passed to the command) and then the click_log
    verbosity option bound to the module logger.
    """
    projects_option = click.option(
        '-p', '--projects',
        help='Set path to project configuration file or directory',
        type=click.Path(dir_okay=True, exists=True),
        callback=set_project_config_file_path,
        expose_value=False,
        is_eager=True)
    with_projects = projects_option(f)
    return click_log.simple_verbosity_option(logger)(with_projects)
137
138
139
def backend_param_option(f):
    """Decorator adding the repeatable ``-b/--backend-param`` option, which
    lets a CLI command override backend parameters."""
    option = click.option(
        '--backend-param', '-b', multiple=True,
        help='Override backend parameter of the config file. '
             'Syntax: `-b <backend>.<parameter>=<value>`.')
    return option(f)
145
146
147 View Code Duplication
@cli.command('list-projects')
@common_options
@click_log.simple_verbosity_option(logger, default='ERROR')
def run_list_projects():
    """
    List available projects.
    \f
    Show a list of currently defined projects. Projects are defined in a
    configuration file, normally called ``projects.cfg``. See `Project
    configuration
    <https://github.com/NatLibFi/Annif/wiki/Project-configuration>`_
    for details.
    """
    # The docstring above doubles as the command's help text.

    # Fixed-width, left-aligned columns; the rule under the header is as
    # wide as the formatted header itself.
    template = "{0: <25}{1: <45}{2: <10}{3: <7}"
    header = template.format(
        "Project ID", "Project Name", "Language", "Trained")
    click.echo(header)
    click.echo("-" * len(header))

    projects = annif.registry.get_projects(min_access=Access.private)
    for proj in projects.values():
        row = template.format(
            proj.project_id, proj.name, proj.language, str(proj.is_trained))
        click.echo(row)
170
171
172 View Code Duplication
@cli.command('show-project')
@click.argument('project_id')
@common_options
def run_show_project(project_id):
    """
    Show information about a project.
    """
    # The docstring above doubles as the command's help text.

    project = get_project(project_id)
    # Each attribute is read and printed in turn, so output produced before
    # a failing attribute access is still shown.
    click.echo(f'Project ID:        {project.project_id}')
    click.echo(f'Project Name:      {project.name}')
    click.echo(f'Language:          {project.language}')
    click.echo(f'Vocabulary:        {project.vocab.vocab_id}')
    click.echo(f'Vocab language:    {project.vocab_lang}')
    click.echo(f'Access:            {project.access.name}')
    click.echo(f'Trained:           {project.is_trained}')
    click.echo(f'Modification time: {project.modification_time}')
189
190
191
@cli.command('clear')
@click.argument('project_id')
@common_options
def run_clear_project(project_id):
    """
    Initialize the project to its original, untrained state.
    """
    # The docstring above doubles as the command's help text.
    get_project(project_id).remove_model_data()
200
201
202 View Code Duplication
@cli.command('list-vocabs')
@common_options
@click_log.simple_verbosity_option(logger, default='ERROR')
def run_list_vocabs():
    """
    List available vocabularies.
    """
    # The docstring above doubles as the command's help text.

    template = "{0: <20}{1: <20}{2: >10}  {3: <6}"
    header = template.format(
        "Vocabulary ID", "Languages", "Size", "Loaded")
    click.echo(header)
    click.echo("-" * len(header))

    vocabs = annif.registry.get_vocabs(min_access=Access.private)
    for vocab in vocabs.values():
        try:
            details = (','.join(sorted(vocab.languages)), len(vocab), True)
        except NotInitializedException:
            # Vocabulary not usable yet: show placeholders instead
            details = ('-', '-', False)
        languages, size, loaded = details
        click.echo(template.format(
            vocab.vocab_id, languages, size, str(loaded)))
227
228
229 View Code Duplication
@cli.command('loadvoc', deprecated=True)
@click.argument('project_id')
@click.argument('subjectfile', type=click.Path(exists=True, dir_okay=False))
@click.option('--force', '-f', default=False, is_flag=True,
              help='Replace existing vocabulary completely ' +
                   'instead of updating it')
@common_options
def run_loadvoc(project_id, force, subjectfile):
    """
    Load a vocabulary for a project.
    \f
    This will load the vocabulary to be used in subject indexing. Note that
    although ``PROJECT_ID`` is a parameter of the command, the vocabulary is
    shared by all the projects with the same vocab identifier in the project
    configuration, and the vocabulary only needs to be loaded for one of those
    projects.

    If a vocabulary has already been loaded, reinvoking loadvoc with a new
    subject file will update the Annif’s internal vocabulary: label names are
    updated and any subject not appearing in the new subject file is removed.
    Note that new subjects will not be suggested before the project is
    retrained with the updated vocabulary. The update behavior can be
    overridden with the ``--force`` option.
    """
    # The docstring above doubles as the command's help text.
    project = get_project(project_id)
    corpus = annif.corpus

    # Pick the reader by sniffing the file: RDF first, then CSV; anything
    # else is treated as TSV in the project's vocabulary language.
    if corpus.SubjectFileSKOS.is_rdf_file(subjectfile):
        subjects = corpus.SubjectFileSKOS(subjectfile)
    elif corpus.SubjectFileCSV.is_csv_file(subjectfile):
        subjects = corpus.SubjectFileCSV(subjectfile)
    else:
        subjects = corpus.SubjectFileTSV(subjectfile, project.vocab_lang)

    project.vocab.load_vocabulary(subjects, force=force)
264
265
266 View Code Duplication
@cli.command('load-vocab')
@click.argument('vocab_id')
@click.argument('subjectfile', type=click.Path(exists=True, dir_okay=False))
@click.option('--language', '-L', help='Language of subject file')
@click.option('--force', '-f', default=False, is_flag=True,
              help='Replace existing vocabulary completely ' +
                   'instead of updating it')
@common_options
def run_load_vocab(vocab_id, language, force, subjectfile):
    """
    Load a vocabulary from a subject file.
    """
    # The docstring above doubles as the command's help text.
    vocab = get_vocab(vocab_id)
    corpus = annif.corpus

    # Pick the reader by sniffing the file: RDF first, then CSV; anything
    # else is treated as TSV, which needs an explicit language.
    if corpus.SubjectFileSKOS.is_rdf_file(subjectfile):
        subjects = corpus.SubjectFileSKOS(subjectfile)
        click.echo(f"Loading vocabulary from SKOS file {subjectfile}...")
    elif corpus.SubjectFileCSV.is_csv_file(subjectfile):
        subjects = corpus.SubjectFileCSV(subjectfile)
        click.echo(f"Loading vocabulary from CSV file {subjectfile}...")
    elif language:
        click.echo(f"Loading vocabulary from TSV file {subjectfile}...")
        subjects = corpus.SubjectFileTSV(subjectfile, language)
    else:
        click.echo("Please use --language option to set the language of "
                   "a TSV vocabulary.", err=True)
        sys.exit(1)

    vocab.load_vocabulary(subjects, force=force)
296
297
298 View Code Duplication
@cli.command('train')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--cached/--no-cached', '-c/-C', default=False,
              help='Reuse preprocessed training data from previous run')
@click.option('--docs-limit', '-d', default=None,
              type=click.IntRange(0, None),
              help='Maximum number of documents to use')
@click.option('--jobs',
              '-j',
              default=0,
              help='Number of parallel jobs (0 means choose automatically)')
@backend_param_option
@common_options
def run_train(project_id, paths, cached, docs_limit, jobs, backend_param):
    """
    Train a project on a collection of documents.
    \f
    This will train the project using the documents from ``PATHS`` (directories
    or possibly gzipped TSV files) in a single batch operation. If ``--cached``
    is set, preprocessed training data from the previous run is reused instead
    of documents input; see `Reusing preprocessed training data
    <https://github.com/NatLibFi/Annif/wiki/
    Reusing-preprocessed-training-data>`_.
    """
    # The docstring above doubles as the command's help text.
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)

    if not cached:
        documents = open_documents(paths, project.subjects,
                                   project.vocab_lang, docs_limit)
    elif paths:
        raise click.UsageError(
            "Corpus paths cannot be given when using --cached option.")
    else:
        # The literal string 'cached' is what gets handed to train()
        # in place of a corpus.
        documents = 'cached'

    project.train(documents, backend_params, jobs)
334
335
336 View Code Duplication
@cli.command('learn')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--docs-limit', '-d', default=None,
              type=click.IntRange(0, None),
              help='Maximum number of documents to use')
@backend_param_option
@common_options
def run_learn(project_id, paths, docs_limit, backend_param):
    """
    Further train an existing project on a collection of documents.
    \f
    Similar to the ``train`` command. This will continue training an already
    trained project using the documents given by ``PATHS`` in a single batch
    operation. Not supported by all backends.
    """
    # The docstring above doubles as the command's help text.
    project = get_project(project_id)
    # Backend overrides are validated before any corpus is opened
    backend_params = parse_backend_params(backend_param, project)
    corpus = open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit)
    project.learn(corpus, backend_params)
357
358
359 View Code Duplication
@cli.command('suggest')
@click.argument('project_id')
@click.option('--limit', '-l', default=10, help='Maximum number of subjects')
@click.option('--threshold', '-t', default=0.0, help='Minimum score threshold')
@backend_param_option
@common_options
def run_suggest(project_id, limit, threshold, backend_param):
    """
    Suggest subjects for a single document from standard input.
    \f
    This will read a text document from standard input and suggest subjects for
    it.
    """
    # The docstring above doubles as the command's help text.
    project = get_project(project_id)
    # Standard input is consumed before the backend parameters are parsed
    text = sys.stdin.read()
    backend_params = parse_backend_params(backend_param, project)
    hit_filter = SuggestionFilter(project.subjects, limit, threshold)
    suggestions = project.suggest(text, backend_params)

    # One line per hit: <URI>, label and/or notation, score (tab-separated)
    for hit in hit_filter(suggestions).as_list():
        subj = project.subjects[hit.subject_id]
        # A falsy label or notation is left out of the line altogether
        columns = [
            column
            for column in (subj.labels[project.vocab_lang], subj.notation)
            if column]
        click.echo(
            "<{}>\t{}\t{}".format(subj.uri, '\t'.join(columns), hit.score))
386
387
388 View Code Duplication
@cli.command('index')
@click.argument('project_id')
@click.argument('directory', type=click.Path(exists=True, file_okay=False))
@click.option(
    '--suffix',
    '-s',
    default='.annif',
    help='File name suffix for result files')
@click.option('--force/--no-force', '-f/-F', default=False,
              help='Force overwriting of existing result files')
@click.option('--limit', '-l', default=10, help='Maximum number of subjects')
@click.option('--threshold', '-t', default=0.0, help='Minimum score threshold')
@backend_param_option
@common_options
def run_index(project_id, directory, suffix, force,
              limit, threshold, backend_param):
    """
    Index a directory with documents, suggesting subjects for each document.
    Write the results in TSV files with the given suffix (``.annif`` by
    default).
    """
    # The docstring above doubles as the command's help text.
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)
    hit_filter = SuggestionFilter(project.subjects, limit, threshold)

    for docfilename, dummy_subjectfn in annif.corpus.DocumentDirectory(
            directory, project.subjects, project.vocab_lang,
            require_subjects=False):
        # A callable replacement inserts the suffix verbatim; a plain string
        # would be treated as a template, so backslashes in a user-supplied
        # suffix would be read as escapes or group references.
        subjectfilename = re.sub(
            r'\.txt$', lambda _match: suffix, docfilename)

        # Decide whether to skip before doing any work on the document
        if os.path.exists(subjectfilename) and not force:
            click.echo(
                "Not overwriting {} (use --force to override)".format(
                    subjectfilename))
            continue

        # utf-8-sig transparently drops a leading byte order mark
        with open(docfilename, encoding='utf-8-sig') as docfile:
            text = docfile.read()

        # Compute the suggestions before opening the result file, so that a
        # failure here cannot leave a truncated result file behind.
        hits = hit_filter(project.suggest(text, backend_params)).as_list()

        with open(subjectfilename, 'w', encoding='utf-8') as subjfile:
            for hit in hits:
                subj = project.subjects[hit.subject_id]
                # A falsy label or notation is left out of the line
                line = "<{}>\t{}\t{}".format(
                    subj.uri,
                    '\t'.join(filter(None, (subj.labels[project.vocab_lang],
                                            subj.notation))),
                    hit.score)
                click.echo(line, file=subjfile)
434
435
436 View Code Duplication
@cli.command('eval')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--limit', '-l', default=10, help='Maximum number of subjects')
@click.option('--threshold', '-t', default=0.0, help='Minimum score threshold')
@click.option('--docs-limit', '-d', default=None,
              type=click.IntRange(0, None),
              help='Maximum number of documents to use')
@click.option('--metric', '-m', default=[], multiple=True,
              help='Metric to calculate (default: all)')
@click.option(
    '--metrics-file',
    '-M',
    type=click.File(
        'w',
        encoding='utf-8',
        errors='ignore',
        lazy=True),
    help="""Specify file in order to write evaluation metrics in JSON format.
    File directory must exist, existing file will be overwritten.""")
@click.option(
    '--results-file',
    '-r',
    type=click.File(
        'w',
        encoding='utf-8',
        errors='ignore',
        lazy=True),
    help="""Specify file in order to write non-aggregated results per subject.
    File directory must exist, existing file will be overwritten.""")
@click.option('--jobs',
              '-j',
              default=1,
              help='Number of parallel jobs (0 means all CPUs)')
@backend_param_option
@common_options
def run_eval(
        project_id,
        paths,
        limit,
        threshold,
        docs_limit,
        metric,
        metrics_file,
        results_file,
        jobs,
        backend_param):
    """
    Suggest subjects for documents and evaluate the results by comparing
    against a gold standard.
    \f
    With this command the documents from ``PATHS`` (directories or possibly
    gzipped TSV files) will be assigned subject suggestions and then
    statistical measures are calculated that quantify how well the suggested
    subjects match the gold-standard subjects in the documents.

    Normally the output is the list of the metrics calculated across documents.
    If ``--results-file <FILENAME>`` option is given, the metrics are
    calculated separately for each subject, and written to the given file.
    """
    # The docstring above doubles as the command's help text.

    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)

    # Function-level import; note that it makes ``annif`` a local name, so
    # it must stay ahead of every other ``annif.`` reference in this body.
    import annif.eval
    eval_batch = annif.eval.EvaluationBatch(project.subjects)

    if results_file:
        try:
            # The file is opened lazily; writing an empty string forces the
            # open now so that problems surface before the evaluation runs.
            print('', end='', file=results_file)
            click.echo('Writing per subject evaluation results to {!s}'.format(
                results_file.name))
        except Exception as e:
            raise NotSupportedException(
                "cannot open results-file for writing: " + str(e)) from e
    docs = open_documents(paths, project.subjects,
                          project.vocab_lang, docs_limit)

    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry, [project_id], backend_params, limit, threshold)

    with pool_class(jobs) as pool:
        # Results arrive in arbitrary order; each one carries its own
        # gold-standard subject set, so ordering does not matter here.
        for hits, subject_set in pool.imap_unordered(
                psmap.suggest, docs.documents):
            eval_batch.evaluate(hits[project_id],
                                subject_set)

    template = "{0:<30}\t{1}"
    metrics = eval_batch.results(metrics=metric,
                                 results_file=results_file,
                                 language=project.vocab_lang)
    # Distinct loop name so the ``metric`` option value is not shadowed
    for metric_name, score in metrics.items():
        click.echo(template.format(metric_name + ":", score))
    if metrics_file:
        json.dump(
            {metric_code(mname): val for mname, val in metrics.items()},
            metrics_file, indent=2)
536
537
538 View Code Duplication
@cli.command('optimize')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--docs-limit', '-d', default=None,
              type=click.IntRange(0, None),
              help='Maximum number of documents to use')
@backend_param_option
@common_options
def run_optimize(project_id, paths, docs_limit, backend_param):
    """
    Suggest subjects for documents, testing multiple limits and thresholds.
    \f
    This command will use different limit (maximum number of subjects) and
    score threshold values when assigning subjects to each document given by
    ``PATHS`` and compare the results against the gold standard subjects in the
    documents. The output is a list of parameter combinations and their scores.
    From the output, you can determine the optimum limit and threshold
    parameters depending on which measure you want to target.
    """
    # The docstring above doubles as the command's help text.
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)

    filter_batches = generate_filter_batches(project.subjects)

    ndocs = 0
    docs = open_documents(paths, project.subjects,
                          project.vocab_lang, docs_limit)
    for doc in docs.documents:
        raw_hits = project.suggest(doc.text, backend_params)
        # Cut to the largest limit once, so that every filter below works
        # on at most BATCH_MAX_LIMIT hits.
        hits = raw_hits.filter(project.subjects, limit=BATCH_MAX_LIMIT)
        assert isinstance(hits, ListSuggestionResult), \
            "Optimize should only be done with ListSuggestionResult " + \
            "as it would be very slow with VectorSuggestionResult."
        for hit_filter, batch in filter_batches.values():
            batch.evaluate(hit_filter(hits), doc.subject_set)
        ndocs += 1

    click.echo("\t".join(('Limit', 'Thresh.', 'Prec.', 'Rec.', 'F1')))

    best_scores = collections.defaultdict(float)
    best_params = {}

    # The reported metrics are the same for every batch and are needed again
    # for the summary after the loop, so they are defined once up front.
    metrics = ['Precision (doc avg)',
               'Recall (doc avg)',
               'F1 score (doc avg)']

    template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"
    # Store the batches in a queue that gets consumed along the way
    # This way GC will have a chance to reclaim the memory
    # (a deque gives O(1) pops from the front, unlike list.pop(0))
    filter_batches = collections.deque(filter_batches.items())
    while filter_batches:
        params, (_hit_filter, batch) = filter_batches.popleft()
        results = batch.results(metrics=metrics)
        for metric, score in results.items():
            # ">=" means that on ties the later parameter combination wins
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = params
        click.echo(
            template.format(
                params[0],
                params[1],
                results['Precision (doc avg)'],
                results['Recall (doc avg)'],
                results['F1 score (doc avg)']))

    click.echo()
    template2 = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    for metric in metrics:
        click.echo(
            template2.format(
                metric,
                best_scores[metric],
                best_params[metric][0],
                best_params[metric][1]))
    click.echo("Documents evaluated:\t{}".format(ndocs))
612
613
614 View Code Duplication
@cli.command('hyperopt')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--docs-limit', '-d', default=None,
              type=click.IntRange(0, None),
              help='Maximum number of documents to use')
@click.option('--trials', '-T', default=10, help='Number of trials')
@click.option('--jobs',
              '-j',
              default=1,
              help='Number of parallel runs (0 means all CPUs)')
@click.option('--metric', '-m', default='NDCG',
              help='Metric to optimize (default: NDCG)')
@click.option(
    '--results-file',
    '-r',
    type=click.File(
        'w',
        encoding='utf-8',
        errors='ignore',
        lazy=True),
    help="""Specify file path to write trial results as CSV.
    File directory must exist, existing file will be overwritten.""")
@common_options
def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric,
                 results_file):
    """
    Optimize the hyperparameters of a project using validation documents from
    ``PATHS``. Not supported by all backends. Output is a list of trial results
    and a report of the best performing parameters.
    """
    # The docstring above doubles as the command's help text.
    project = get_project(project_id)
    corpus = open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit)

    click.echo(f"Looking for optimal hyperparameters using {trials} trials")
    best = project.hyperopt(corpus, trials, jobs, metric, results_file)

    # Report the best score, then the recommended lines between "---" rules
    click.echo(f"Got best {metric} score {best.score:.4f} with:")
    click.echo("---")
    for line in best.lines:
        click.echo(line)
    click.echo("---")
655
656
657
# Run the command group when this module is executed directly as a script.
if __name__ == '__main__':
658
    cli()
659