Completed
Push — master (1b4762...49aa5d) by Osma

annif.cli.run_optimize()   B

Complexity
    Conditions: 7

Size
    Total Lines: 72
    Code Lines: 56

Duplication
    Lines: 72
    Ratio: 100 %

Importance
    Changes: 0
Metric    Value
cc        7
eloc      56
nop       4
dl        72
loc       72
rs        7.0399
c         0
b         0
f         0

How to fix: Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method: pull a coherent, nameable piece of the method body out into its own method, as sketched below.
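As a rough illustration of Extract Method applied here, the body of run_optimize() (shown in the listing below) could be split so that the per-document evaluation loop and the per-combination reporting each become a small, separately named function. This is only a sketch: the helper names evaluate_optimize_documents and report_optimize_results and the OPTIMIZE_METRICS constant are hypothetical and not part of Annif, the Click decorators are unchanged and omitted, and details such as the ListSuggestionResult assertion are left out for brevity.

# Hypothetical Extract Method sketch -- helper names are illustrative only.
OPTIMIZE_METRICS = ['Precision (doc avg)',
                    'Recall (doc avg)',
                    'F1 score (doc avg)']


def evaluate_optimize_documents(project, docs, backend_params, filter_batches):
    """Feed each document's suggestions into every limit/threshold batch
    and return the number of documents processed."""
    ndocs = 0
    for doc in docs.documents:
        hits = project.suggest(doc.text, backend_params).filter(
            project.subjects, limit=BATCH_MAX_LIMIT)
        gold_subjects = annif.corpus.SubjectSet((doc.uris, doc.labels))
        for hit_filter, batch in filter_batches.values():
            batch.evaluate(hit_filter(hits), gold_subjects)
        ndocs += 1
    return ndocs


def report_optimize_results(filter_batches):
    """Print one result line per limit/threshold combination and return the
    best score and parameters seen for each metric."""
    click.echo("\t".join(('Limit', 'Thresh.', 'Prec.', 'Rec.', 'F1')))
    best_scores = collections.defaultdict(float)
    best_params = {}
    template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"
    for params, (_hit_filter, batch) in filter_batches.items():
        results = batch.results(metrics=OPTIMIZE_METRICS)
        for metric, score in results.items():
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = params
        click.echo(template.format(params[0], params[1],
                                   *(results[m] for m in OPTIMIZE_METRICS)))
    return best_scores, best_params


def run_optimize(project_id, paths, docs_limit, backend_param):
    """Analyze documents, testing multiple limits and thresholds."""
    # (Click decorators as in the original; omitted here.)
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)
    filter_batches = generate_filter_batches(project.subjects)
    docs = open_documents(paths, docs_limit)

    ndocs = evaluate_optimize_documents(
        project, docs, backend_params, filter_batches)
    best_scores, best_params = report_optimize_results(filter_batches)

    click.echo()
    template2 = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    for metric in OPTIMIZE_METRICS:
        click.echo(template2.format(metric, best_scores[metric],
                                    best_params[metric][0],
                                    best_params[metric][1]))
    click.echo("Documents evaluated:\t{}".format(ndocs))

The full listing of the analyzed module, annif.cli, follows.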

"""Definitions for command-line (Click) commands for invoking Annif
operations and printing the results to console."""


import collections
import os.path
import re
import sys
import json
import click
import click_log
from flask import current_app
from flask.cli import FlaskGroup, ScriptInfo
import annif
import annif.corpus
import annif.parallel
import annif.project
import annif.registry
from annif.project import Access
from annif.suggestion import SuggestionFilter, ListSuggestionResult
from annif.exception import ConfigurationException, NotSupportedException
from annif.util import metric_code

logger = annif.logger
click_log.basic_config(logger)

cli = FlaskGroup(create_app=annif.create_app, add_version_option=False)
cli = click.version_option(message='%(version)s')(cli)


def get_project(project_id):
    """
    Helper function to get a project by ID and bail out if it doesn't exist"""
    try:
        return annif.registry.get_project(project_id,
                                          min_access=Access.private)
    except ValueError:
        click.echo(
            "No projects found with id \'{0}\'.".format(project_id),
            err=True)
        sys.exit(1)


def open_documents(paths, docs_limit):
    """Helper function to open a document corpus from a list of pathnames,
    each of which is either a TSV file or a directory of TXT files. The
    corpus will be returned as an instance of DocumentCorpus or
    LimitingDocumentCorpus."""

    def open_doc_path(path):
        """open a single path and return it as a DocumentCorpus"""
        if os.path.isdir(path):
            return annif.corpus.DocumentDirectory(path, require_subjects=True)
        return annif.corpus.DocumentFile(path)

    if len(paths) == 0:
        logger.warning('Reading empty file')
        docs = open_doc_path(os.path.devnull)
    elif len(paths) == 1:
        docs = open_doc_path(paths[0])
    else:
        corpora = [open_doc_path(path) for path in paths]
        docs = annif.corpus.CombinedCorpus(corpora)
    if docs_limit is not None:
        docs = annif.corpus.LimitingDocumentCorpus(docs, docs_limit)
    return docs


def parse_backend_params(backend_param, project):
    """Parse a list of backend parameters given with the --backend-param
    option into a nested dict structure"""
    backend_params = collections.defaultdict(dict)
    for beparam in backend_param:
        backend, param = beparam.split('.', 1)
        key, val = param.split('=', 1)
        validate_backend_params(backend, beparam, project)
        backend_params[backend][key] = val
    return backend_params


def validate_backend_params(backend, beparam, project):
    if backend != project.config['backend']:
        raise ConfigurationException(
            'The backend {} in CLI option "-b {}" not matching the project'
            ' backend {}.'
            .format(backend, beparam, project.config['backend']))


BATCH_MAX_LIMIT = 15


def generate_filter_batches(subjects):
    import annif.eval
    filter_batches = collections.OrderedDict()
    for limit in range(1, BATCH_MAX_LIMIT + 1):
        for threshold in [i * 0.05 for i in range(20)]:
            hit_filter = SuggestionFilter(subjects, limit, threshold)
            batch = annif.eval.EvaluationBatch(subjects)
            filter_batches[(limit, threshold)] = (hit_filter, batch)
    return filter_batches


def set_project_config_file_path(ctx, param, value):
    """Override the default path or the path given in env by CLI option"""
    with ctx.ensure_object(ScriptInfo).load_app().app_context():
        if value:
            current_app.config['PROJECTS_FILE'] = value


def common_options(f):
    """Decorator to add common options for all CLI commands"""
    f = click.option(
        '-p', '--projects', help='Set path to projects.cfg',
        type=click.Path(dir_okay=False, exists=True),
        callback=set_project_config_file_path, expose_value=False,
        is_eager=True)(f)
    return click_log.simple_verbosity_option(logger)(f)


def backend_param_option(f):
    """Decorator to add an option for CLI commands to override BE parameters"""
    return click.option(
        '--backend-param', '-b', multiple=True,
        help='Override backend parameter of the config file. ' +
        'Syntax: "-b <backend>.<parameter>=<value>".')(f)


@cli.command('list-projects')
@common_options
@click_log.simple_verbosity_option(logger, default='ERROR')
def run_list_projects():
    """
    List available projects.
    """

    template = "{0: <25}{1: <45}{2: <10}{3: <7}"
    header = template.format(
        "Project ID", "Project Name", "Language", "Trained")
    click.echo(header)
    click.echo("-" * len(header))
    for proj in annif.registry.get_projects(
            min_access=Access.private).values():
        click.echo(template.format(
            proj.project_id, proj.name, proj.language, str(proj.is_trained)))


@cli.command('show-project')
@click.argument('project_id')
@common_options
def run_show_project(project_id):
    """
    Show information about a project.
    """

    proj = get_project(project_id)
    click.echo(f'Project ID:        {proj.project_id}')
    click.echo(f'Project Name:      {proj.name}')
    click.echo(f'Language:          {proj.language}')
    click.echo(f'Access:            {proj.access.name}')
    click.echo(f'Trained:           {proj.is_trained}')
    click.echo(f'Modification time: {proj.modification_time}')


@cli.command('clear')
@click.argument('project_id')
@common_options
def run_clear_project(project_id):
    """
    Initialize the project to its original, untrained state.
    """
    proj = get_project(project_id)
    proj.remove_model_data()


@cli.command('loadvoc')
@click.argument('project_id')
@click.argument('subjectfile', type=click.Path(exists=True, dir_okay=False))
@common_options
def run_loadvoc(project_id, subjectfile):
    """
    Load a vocabulary for a project.
    """
    proj = get_project(project_id)
    if annif.corpus.SubjectFileSKOS.is_rdf_file(subjectfile):
        # SKOS/RDF file supported by rdflib
        subjects = annif.corpus.SubjectFileSKOS(subjectfile, proj.language)
    else:
        # probably a TSV file
        subjects = annif.corpus.SubjectFileTSV(subjectfile)
    proj.vocab.load_vocabulary(subjects, proj.language)


@cli.command('train')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--cached/--no-cached', '-c/-C', default=False,
              help='Reuse preprocessed training data from previous run')
@click.option('--docs-limit', '-d', default=None,
              type=click.IntRange(0, None),
              help='Maximum number of documents to use')
@click.option('--jobs',
              '-j',
              default=0,
              help='Number of parallel jobs (0 means choose automatically)')
@backend_param_option
@common_options
def run_train(project_id, paths, cached, docs_limit, jobs, backend_param):
    """
    Train a project on a collection of documents.
    """
    proj = get_project(project_id)
    backend_params = parse_backend_params(backend_param, proj)
    if cached:
        if len(paths) > 0:
            raise click.UsageError(
                "Corpus paths cannot be given when using --cached option.")
        documents = 'cached'
    else:
        documents = open_documents(paths, docs_limit)
    proj.train(documents, backend_params, jobs)


@cli.command('learn')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--docs-limit', '-d', default=None,
              type=click.IntRange(0, None),
              help='Maximum number of documents to use')
@backend_param_option
@common_options
def run_learn(project_id, paths, docs_limit, backend_param):
    """
    Further train an existing project on a collection of documents.
    """
    proj = get_project(project_id)
    backend_params = parse_backend_params(backend_param, proj)
    documents = open_documents(paths, docs_limit)
    proj.learn(documents, backend_params)


@cli.command('suggest')
@click.argument('project_id')
@click.option('--limit', '-l', default=10, help='Maximum number of subjects')
@click.option('--threshold', '-t', default=0.0, help='Minimum score threshold')
@backend_param_option
@common_options
def run_suggest(project_id, limit, threshold, backend_param):
    """
    Suggest subjects for a single document from standard input.
    """
    project = get_project(project_id)
    text = sys.stdin.read()
    backend_params = parse_backend_params(backend_param, project)
    hit_filter = SuggestionFilter(project.subjects, limit, threshold)
    hits = hit_filter(project.suggest(text, backend_params))
    for hit in hits.as_list(project.subjects):
        click.echo(
            "<{}>\t{}\t{}".format(
                hit.uri,
                '\t'.join(filter(None, (hit.label, hit.notation))),
                hit.score))


@cli.command('index')
@click.argument('project_id')
@click.argument('directory', type=click.Path(exists=True, file_okay=False))
@click.option(
    '--suffix',
    '-s',
    default='.annif',
    help='File name suffix for result files')
@click.option('--force/--no-force', '-f/-F', default=False,
              help='Force overwriting of existing result files')
@click.option('--limit', '-l', default=10, help='Maximum number of subjects')
@click.option('--threshold', '-t', default=0.0, help='Minimum score threshold')
@backend_param_option
@common_options
def run_index(project_id, directory, suffix, force,
              limit, threshold, backend_param):
    """
    Index a directory with documents, suggesting subjects for each document.
    Write the results in TSV files with the given suffix.
    """
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)
    hit_filter = SuggestionFilter(project.subjects, limit, threshold)

    for docfilename, dummy_subjectfn in annif.corpus.DocumentDirectory(
            directory, require_subjects=False):
        with open(docfilename, encoding='utf-8-sig') as docfile:
            text = docfile.read()
        subjectfilename = re.sub(r'\.txt$', suffix, docfilename)
        if os.path.exists(subjectfilename) and not force:
            click.echo(
                "Not overwriting {} (use --force to override)".format(
                    subjectfilename))
            continue
        with open(subjectfilename, 'w', encoding='utf-8') as subjfile:
            results = project.suggest(text, backend_params)
            for hit in hit_filter(results).as_list(project.subjects):
                line = "<{}>\t{}\t{}".format(
                    hit.uri,
                    '\t'.join(filter(None, (hit.label, hit.notation))),
                    hit.score)
                click.echo(line, file=subjfile)


@cli.command('eval')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--limit', '-l', default=10, help='Maximum number of subjects')
@click.option('--threshold', '-t', default=0.0, help='Minimum score threshold')
@click.option('--docs-limit', '-d', default=None,
              type=click.IntRange(0, None),
              help='Maximum number of documents to use')
@click.option(
    '--metrics-file',
    '-M',
    type=click.File(
        'w',
        encoding='utf-8',
        errors='ignore',
        lazy=True),
    help="""Specify file in order to write evaluation metrics in JSON format.
    File directory must exist, existing file will be overwritten.""")
@click.option(
    '--results-file',
    '-r',
    type=click.File(
        'w',
        encoding='utf-8',
        errors='ignore',
        lazy=True),
    help="""Specify file in order to write non-aggregated results per subject.
    File directory must exist, existing file will be overwritten.""")
@click.option('--jobs',
              '-j',
              default=1,
              help='Number of parallel jobs (0 means all CPUs)')
@backend_param_option
@common_options
def run_eval(
        project_id,
        paths,
        limit,
        threshold,
        docs_limit,
        metrics_file,
        results_file,
        jobs,
        backend_param):
    """
    Analyze documents and evaluate the result.

    Compare the results of automated indexing against a gold standard. The
    path may be either a TSV file with short documents or a directory with
    documents in separate files.
    """

    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)

    import annif.eval
    eval_batch = annif.eval.EvaluationBatch(project.subjects)

    if results_file:
        try:
            print('', end='', file=results_file)
            click.echo('Writing per subject evaluation results to {!s}'.format(
                results_file.name))
        except Exception as e:
            raise NotSupportedException(
                "cannot open results-file for writing: " + str(e))
    docs = open_documents(paths, docs_limit)

    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry, [project_id], backend_params, limit, threshold)

    with pool_class(jobs) as pool:
        for hits, uris, labels in pool.imap_unordered(
                psmap.suggest, docs.documents):
            eval_batch.evaluate(hits[project_id],
                                annif.corpus.SubjectSet((uris, labels)))

    template = "{0:<30}\t{1}"
    metrics = eval_batch.results(results_file=results_file)
    for metric, score in metrics.items():
        click.echo(template.format(metric + ":", score))
    if metrics_file:
        json.dump(
            {metric_code(metric): val for metric, val in metrics.items()},
            metrics_file, indent=2)


@cli.command('optimize')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--docs-limit', '-d', default=None,
              type=click.IntRange(0, None),
              help='Maximum number of documents to use')
@backend_param_option
@common_options
def run_optimize(project_id, paths, docs_limit, backend_param):
    """
    Analyze documents, testing multiple limits and thresholds.

    Evaluate the analysis results for a directory with documents against a
    gold standard given in subject files. Test different limit/threshold
    values and report the precision, recall and F-measure of each combination
    of settings.
    """
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)

    filter_batches = generate_filter_batches(project.subjects)

    ndocs = 0
    docs = open_documents(paths, docs_limit)
    for doc in docs.documents:
        raw_hits = project.suggest(doc.text, backend_params)
        hits = raw_hits.filter(project.subjects, limit=BATCH_MAX_LIMIT)
        assert isinstance(hits, ListSuggestionResult), \
            "Optimize should only be done with ListSuggestionResult " + \
            "as it would be very slow with VectorSuggestionResult."
        gold_subjects = annif.corpus.SubjectSet((doc.uris, doc.labels))
        for hit_filter, batch in filter_batches.values():
            batch.evaluate(hit_filter(hits), gold_subjects)
        ndocs += 1

    click.echo("\t".join(('Limit', 'Thresh.', 'Prec.', 'Rec.', 'F1')))

    best_scores = collections.defaultdict(float)
    best_params = {}

    template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"
    # Store the batches in a list that gets consumed along the way
    # This way GC will have a chance to reclaim the memory
    filter_batches = list(filter_batches.items())
    while filter_batches:
        params, filter_batch = filter_batches.pop(0)
        metrics = ['Precision (doc avg)',
                   'Recall (doc avg)',
                   'F1 score (doc avg)']
        results = filter_batch[1].results(metrics=metrics)
        for metric, score in results.items():
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = params
        click.echo(
            template.format(
                params[0],
                params[1],
                results['Precision (doc avg)'],
                results['Recall (doc avg)'],
                results['F1 score (doc avg)']))

    click.echo()
    template2 = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    for metric in metrics:
        click.echo(
            template2.format(
                metric,
                best_scores[metric],
                best_params[metric][0],
                best_params[metric][1]))
    click.echo("Documents evaluated:\t{}".format(ndocs))


@cli.command('hyperopt')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--docs-limit', '-d', default=None,
              type=click.IntRange(0, None),
              help='Maximum number of documents to use')
@click.option('--trials', '-T', default=10, help='Number of trials')
@click.option('--jobs',
              '-j',
              default=1,
              help='Number of parallel runs (0 means all CPUs)')
@click.option('--metric', '-m', default='NDCG',
              help='Metric to optimize (default: NDCG)')
@click.option(
    '--results-file',
    '-r',
    type=click.File(
        'w',
        encoding='utf-8',
        errors='ignore',
        lazy=True),
    help="""Specify file path to write trial results as CSV.
    File directory must exist, existing file will be overwritten.""")
@common_options
def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric,
                 results_file):
    """
    Optimize the hyperparameters of a project using a validation corpus.
    """
    proj = get_project(project_id)
    documents = open_documents(paths, docs_limit)
    click.echo(f"Looking for optimal hyperparameters using {trials} trials")
    rec = proj.hyperopt(documents, trials, jobs, metric, results_file)
    click.echo(f"Got best {metric} score {rec.score:.4f} with:")
    click.echo("---")
    for line in rec.lines:
        click.echo(line)
    click.echo("---")


if __name__ == '__main__':
    cli()
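For context, here is a hypothetical usage sketch of the command group defined above, exercised through Click's test runner. The project ID 'my-project' and the corpus path 'validation-docs.tsv' are placeholders, not names from the Annif repository, and the sketch assumes a locally configured Annif installation.

# Hypothetical usage sketch -- 'my-project' and 'validation-docs.tsv' are
# placeholders.
from click.testing import CliRunner

from annif.cli import cli

runner = CliRunner()

# List the configured projects.
result = runner.invoke(cli, ['list-projects'])
print(result.output)

# Run the optimize command analyzed in this report against a validation corpus.
result = runner.invoke(cli, ['optimize', 'my-project', 'validation-docs.tsv'])
print(result.output)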