Passed
Pull Request — master (#414)
by Osma
02:14
created

annif.cli.run_hyperopt()   A

Complexity

Conditions 2

Size

Total Lines 15
Code Lines 12

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 12
dl 0
loc 15
rs 9.8
c 0
b 0
f 0
cc 2
nop 3
1
"""Definitions for command-line (Click) commands for invoking Annif
2
operations and printing the results to console."""
3
4
5
import collections
6
import os.path
7
import re
8
import sys
9
import click
10
import click_log
11
from flask import current_app
12
from flask.cli import FlaskGroup, ScriptInfo
13
import annif
14
import annif.corpus
15
import annif.eval
16
import annif.project
17
from annif.project import Access
18
from annif.suggestion import SuggestionFilter
19
from annif.exception import ConfigurationException, NotSupportedException
20
21
# Reuse the logger object exposed by the annif package for all CLI messages
# and hand it to click_log.basic_config() for setup (see the click-log
# documentation for what exactly gets configured on the logger).
logger = annif.logger
22
click_log.basic_config(logger)
23
24
# Root command group of the CLI; commands in this module attach to it with
# @cli.command(...). annif.create_app is passed as the application factory.
cli = FlaskGroup(create_app=annif.create_app)
25
26
27
def get_project(project_id):
    """Look up a project by its ID, terminating the program if that fails.

    The lookup is done with Access.hidden as the minimum access level. If it
    raises ValueError, an error message is written to stderr and the process
    exits with status 1; otherwise the project object is returned.
    """
    try:
        project = annif.project.get_project(
            project_id, min_access=Access.hidden)
    except ValueError:
        message = "No projects found with id \'{0}\'.".format(project_id)
        click.echo(message, err=True)
        sys.exit(1)
    else:
        return project
37
38
39
def open_documents(paths):
    """Open the document corpus located at the given pathnames.

    A directory path is opened as a DocumentDirectory (subjects required),
    anything else as a DocumentFile. With no paths a warning is logged and
    the null device is opened instead; with one path that single corpus is
    returned; several paths are wrapped in a CombinedCorpus.
    """

    def open_doc_path(path):
        """pick the corpus class matching the kind of path and open it"""
        if not os.path.isdir(path):
            return annif.corpus.DocumentFile(path)
        return annif.corpus.DocumentDirectory(path, require_subjects=True)

    count = len(paths)
    if count == 0:
        logger.warning('Reading empty file')
        return open_doc_path(os.path.devnull)
    if count == 1:
        return open_doc_path(paths[0])
    return annif.corpus.CombinedCorpus(
        [open_doc_path(path) for path in paths])
59
60
61
def parse_backend_params(backend_param, project):
    """Parse a list of backend parameters given with the --backend-param
    option into a nested dict structure.

    Each item must have the form "<backend>.<parameter>=<value>". The result
    maps backend -> {parameter: value}; values are kept as strings. Every
    item is also checked with validate_backend_params() against the project.

    Raises ConfigurationException when an item lacks the "." or "="
    separator, instead of failing with an opaque tuple-unpacking ValueError.
    """
    backend_params = collections.defaultdict(dict)
    for beparam in backend_param:
        # partition() splits on the first separator only, like split(sep, 1),
        # but also reports whether the separator was present at all
        backend, dot, param = beparam.partition('.')
        key, equals, val = param.partition('=')
        if not (dot and equals):
            raise ConfigurationException(
                'The backend parameter "-b {}" is malformed; expected '
                'syntax "-b <backend>.<parameter>=<value>".'.format(beparam))
        validate_backend_params(backend, beparam, project)
        backend_params[backend][key] = val
    return backend_params
71
72
73
def validate_backend_params(backend, beparam, project):
    """Check a single "-b <backend>.<parameter>=<value>" override.

    backend is the backend name already split off from beparam, beparam is
    the complete option value, and project is the project whose
    config['backend'] the override must match.

    Raises NotSupportedException if the parameter being overridden is
    'algorithm', and ConfigurationException if backend differs from the
    backend configured for the project. Returns None otherwise.
    """
    # Look only at the parameter name: a substring test on the whole option
    # would also reject values or other names that merely contain the word.
    param = beparam.partition('.')[2]
    key = param.partition('=')[0]
    if key == 'algorithm':
        raise NotSupportedException('Algorithm overriding not supported.')
    project_backend = project.config['backend']
    if backend != project_backend:
        raise ConfigurationException(
            'The backend {} in CLI option "-b {}" not matching the project'
            ' backend {}.'
            .format(backend, beparam, project_backend))
81
82
83
def generate_filter_batches(subjects):
    """Build a (filter, batch) pair for every limit/threshold combination.

    Limits run over range(1, 16) and thresholds over i * 0.05 for i in
    range(20). Returns an OrderedDict keyed by (limit, threshold) whose
    values are (SuggestionFilter, EvaluationBatch) tuples, with each batch
    created from the given subjects.
    """
    # the threshold grid is the same for every limit, so compute it once
    thresholds = [step * 0.05 for step in range(20)]
    filter_batches = collections.OrderedDict()
    for limit in range(1, 16):
        for threshold in thresholds:
            filter_batches[(limit, threshold)] = (
                SuggestionFilter(limit, threshold),
                annif.eval.EvaluationBatch(subjects))
    return filter_batches
91
92
93
def set_project_config_file_path(ctx, param, value):
    """Click option callback that applies the projects file given on the CLI.

    The application is loaded through the ScriptInfo object of the Click
    context, and inside its app context PROJECTS_FILE is set to value. When
    value is empty the configuration is left as it is.
    """
    app = ctx.ensure_object(ScriptInfo).load_app()
    with app.app_context():
        if not value:
            return
        current_app.config['PROJECTS_FILE'] = value
98
99
100
def common_options(f):
    """Decorator adding the options shared by all CLI commands.

    Adds -p/--projects (an eager option handled by
    set_project_config_file_path and not passed to the command function) and
    the verbosity option provided by click_log for the module logger.
    """
    projects_option = click.option(
        '-p', '--projects', help='Set path to projects.cfg',
        type=click.Path(dir_okay=False, exists=True),
        callback=set_project_config_file_path, expose_value=False,
        is_eager=True)
    verbosity_option = click_log.simple_verbosity_option(logger)
    return verbosity_option(projects_option(f))
108
109
110
def backend_param_option(f):
    """Decorator adding the repeatable --backend-param/-b option, which lets
    a CLI command override backend parameters of the config file"""
    help_text = ('Override backend parameter of the config file. '
                 'Syntax: "-b <backend>.<parameter>=<value>".')
    add_option = click.option(
        '--backend-param', '-b', multiple=True, help=help_text)
    return add_option(f)
116
117
118
@cli.command('list-projects')
@common_options
def run_list_projects():
    """
    List available projects.
    """

    # The docstring above doubles as the command's help text.
    # Columns are left-aligned and padded to 25, 45 and 8 characters.
    row_format = "{0: <25}{1: <45}{2: <8}"
    heading = row_format.format("Project ID", "Project Name", "Language")
    click.echo(heading)
    click.echo("-" * len(heading))
    projects = annif.project.get_projects(min_access=Access.private)
    for project in projects.values():
        click.echo(row_format.format(
            project.project_id, project.name, project.language))
131
132
133
@cli.command('show-project')
@click.argument('project_id')
@common_options
def run_show_project(project_id):
    """
    Show information about a project.
    """

    def show(label, value):
        # labels are left-aligned in a 20-character column
        click.echo("{0:<20}{1}".format(label, value))

    project = get_project(project_id)
    show('Project ID:', project.project_id)
    show('Project Name:', project.name)
    show('Language:', project.language)
    show('Access:', project.access.name)
147
148
149
@cli.command('clear')
@click.argument('project_id')
@common_options
def run_clear_project(project_id):
    """
    Initialize the project to its original, untrained state.
    """
    # get_project() terminates the program itself when the ID is unknown
    get_project(project_id).remove_model_data()
158
159
160
@cli.command('loadvoc')
@click.argument('project_id')
@click.argument('subjectfile', type=click.Path(exists=True, dir_okay=False))
@common_options
def run_loadvoc(project_id, subjectfile):
    """
    Load a vocabulary for a project.
    """
    project = get_project(project_id)
    skos_file = annif.corpus.SubjectFileSKOS
    if not skos_file.is_rdf_file(subjectfile):
        # not recognized as RDF, so it is probably a TSV file
        vocabulary = annif.corpus.SubjectFileTSV(subjectfile)
    else:
        # SKOS/RDF file supported by rdflib
        vocabulary = skos_file(subjectfile, project.language)
    project.vocab.load_vocabulary(vocabulary, project.language)
176
177
178
@cli.command('train')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--cached/--no-cached', default=False,
              help='Reuse preprocessed training data from previous run')
@backend_param_option
@common_options
def run_train(project_id, paths, cached, backend_param):
    """
    Train a project on a collection of documents.
    """
    project = get_project(project_id)
    params = parse_backend_params(backend_param, project)
    if not cached:
        documents = open_documents(paths)
    elif len(paths) > 0:
        # --cached reuses earlier data, so new corpus paths make no sense
        raise click.UsageError(
            "Corpus paths cannot be given when using --cached option.")
    else:
        # the literal string 'cached' is passed in place of a corpus object
        documents = 'cached'
    project.train(documents, params)
199
200
201
@cli.command('learn')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@backend_param_option
@common_options
def run_learn(project_id, paths, backend_param):
    """
    Further train an existing project on a collection of documents.
    """
    project = get_project(project_id)
    # overrides are parsed (and validated) before any corpus is opened
    params = parse_backend_params(backend_param, project)
    project.learn(open_documents(paths), params)
214
215
216
@cli.command('suggest')
@click.argument('project_id')
@click.option('--limit', default=10, help='Maximum number of subjects')
@click.option('--threshold', default=0.0, help='Minimum score threshold')
@backend_param_option
@common_options
def run_suggest(project_id, limit, threshold, backend_param):
    """
    Suggest subjects for a single document from standard input.
    """
    project = get_project(project_id)
    text = sys.stdin.read()
    params = parse_backend_params(backend_param, project)
    hit_filter = SuggestionFilter(limit, threshold)
    for hit in hit_filter(project.suggest(text, params)):
        # label and notation are optional: empty ones are left out so that
        # no blank tab-separated column is printed for them
        names = '\t'.join(
            part for part in (hit.label, hit.notation) if part)
        click.echo("<{}>\t{}\t{}".format(hit.uri, names, hit.score))
237
238
239
@cli.command('index')
@click.argument('project_id')
@click.argument('directory', type=click.Path(exists=True, file_okay=False))
@click.option(
    '--suffix',
    default='.annif',
    help='File name suffix for result files')
@click.option('--force/--no-force', default=False,
              help='Force overwriting of existing result files')
@click.option('--limit', default=10, help='Maximum number of subjects')
@click.option('--threshold', default=0.0, help='Minimum score threshold')
@backend_param_option
@common_options
def run_index(project_id, directory, suffix, force,
              limit, threshold, backend_param):
    """
    Index a directory with documents, suggesting subjects for each document.
    Write the results in TSV files with the given suffix.
    """
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)
    hit_filter = SuggestionFilter(limit, threshold)

    documents = annif.corpus.DocumentDirectory(
        directory, require_subjects=False)
    for docfilename, dummy_subjectfn in documents:
        # Use a callable replacement so the user-supplied suffix is inserted
        # literally; a plain replacement string would have its backslash
        # escapes and group references interpreted by re.sub().
        subjectfilename = re.sub(
            r'\.txt$', lambda match: suffix, docfilename)
        # Decide about skipping before reading the document at all.
        if os.path.exists(subjectfilename) and not force:
            click.echo(
                "Not overwriting {} (use --force to override)".format(
                    subjectfilename))
            continue
        # utf-8-sig transparently drops a leading byte order mark
        with open(docfilename, encoding='utf-8-sig') as docfile:
            text = docfile.read()
        # Produce all result lines before opening the result file for
        # writing, so that a failing suggest() call neither leaves an empty
        # file behind nor truncates an existing one.
        results = project.suggest(text, backend_params)
        lines = [
            "<{}>\t{}\t{}".format(
                hit.uri,
                '\t'.join(filter(None, (hit.label, hit.notation))),
                hit.score)
            for hit in hit_filter(results)]
        with open(subjectfilename, 'w', encoding='utf-8') as subjfile:
            for line in lines:
                click.echo(line, file=subjfile)
280
281
282
@cli.command('eval')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--limit', default=10, help='Maximum number of subjects')
@click.option('--threshold', default=0.0, help='Minimum score threshold')
@click.option(
    '--results-file',
    type=click.File(
        'w',
        encoding='utf-8',
        errors='ignore',
        lazy=True),
    help="""Specify file in order to write non-aggregated results per subject.
    File directory must exists, existing file will be overwritten.""")
@backend_param_option
@common_options
def run_eval(project_id, paths, limit, threshold, results_file, backend_param):
    """
    Analyze documents and evaluate the result.

    Compare the results of automated indexing against a gold standard. The
    path may be either a TSV file with short documents or a directory with
    documents in separate files.
    """

    project = get_project(project_id)
    params = parse_backend_params(backend_param, project)

    hit_filter = SuggestionFilter(limit=limit, threshold=threshold)
    eval_batch = annif.eval.EvaluationBatch(project.subjects)

    if results_file:
        try:
            # The option is declared lazy, so write an empty string here;
            # presumably this forces the file open so that an unwritable
            # path is reported before the evaluation work starts.
            print('', end='', file=results_file)
            click.echo('Writing per subject evaluation results to {!s}'.format(
                results_file.name))
        except Exception as err:
            raise NotSupportedException(
                "cannot open results-file for writing: " + str(err))

    for doc in open_documents(paths).documents:
        suggestions = project.suggest(doc.text, params)
        hits = hit_filter(suggestions)
        gold_subjects = annif.corpus.SubjectSet((doc.uris, doc.labels))
        eval_batch.evaluate(hits, gold_subjects)

    # one "metric: score" row per metric, metric names padded to 30 columns
    scores = eval_batch.results(results_file=results_file)
    for metric, score in scores.items():
        click.echo("{0:<30}\t{1}".format(metric + ":", score))
331
332
333
@cli.command('optimize')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@backend_param_option
@common_options
def run_optimize(project_id, paths, backend_param):
    """
    Analyze documents, testing multiple limits and thresholds.

    Evaluate the analysis results for a directory with documents against a
    gold standard given in subject files. Test different limit/threshold
    values and report the precision, recall and F-measure of each combination
    of settings.
    """
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)

    filter_batches = generate_filter_batches(project.subjects)

    # Suggest once per document, then score that one result under every
    # limit/threshold combination.
    ndocs = 0
    for ndocs, doc in enumerate(open_documents(paths).documents, start=1):
        suggestions = project.suggest(doc.text, backend_params)
        gold_subjects = annif.corpus.SubjectSet((doc.uris, doc.labels))
        for hit_filter, batch in filter_batches.values():
            batch.evaluate(hit_filter(suggestions), gold_subjects)

    click.echo("\t".join(('Limit', 'Thresh.', 'Prec.', 'Rec.', 'F1')))

    best_scores = collections.defaultdict(float)
    best_params = {}

    row_template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"
    # Move the batches into a queue that is consumed from the front, so that
    # each batch can be garbage collected once its row has been printed.
    filter_batches = collections.deque(filter_batches.items())
    while filter_batches:
        combination, (dummy_filter, batch) = filter_batches.popleft()
        results = batch.results(metrics='simple')
        for metric, score in results.items():
            # ">=" lets a later combination win when scores are tied
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = combination
        limit, threshold = combination
        click.echo(
            row_template.format(
                limit,
                threshold,
                results['Precision (doc avg)'],
                results['Recall (doc avg)'],
                results['F1 score (doc avg)']))

    click.echo()
    summary_template = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    reported_metrics = (
        'Precision (doc avg)',
        'Recall (doc avg)',
        'F1 score (doc avg)',
        'NDCG@5',
        'NDCG@10',
    )
    for metric in reported_metrics:
        best_limit, best_threshold = best_params[metric]
        click.echo(
            summary_template.format(
                metric,
                best_scores[metric],
                best_limit,
                best_threshold))
    click.echo("Documents evaluated:\t{}".format(ndocs))
399
400
401
@cli.command('hyperopt')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--trials', default=10, help='Number of trials')
@common_options
def run_hyperopt(project_id, paths, trials):
    """
    Optimize the hyperparameters of a project using a validation corpus.
    """
    project = get_project(project_id)
    # hyperopt() hands back the winning parameters together with their score
    best, score = project.hyperopt(open_documents(paths), trials)
    click.echo(f"Best NDCG score {score} with the following hyperparameters:")
    # one "name:<TAB>value" line per hyperparameter
    for param, value in best.items():
        click.echo(f"{param}:\t{value}")
416
417
418
# Invoke the command group when this module is executed as a script.
if __name__ == '__main__':
419
    cli()
420