Passed
Pull Request — master (#614)
by Osma
04:36
created

annif.cli.run_learn()   A

Complexity

Conditions 1

Size

Total Lines 17
Code Lines 14

Duplication

Lines 17
Ratio 100 %

Importance

Changes 0
Metric Value
cc 1
eloc 14
nop 4
dl 17
loc 17
rs 9.7
c 0
b 0
f 0
1
"""Definitions for command-line (Click) commands for invoking Annif
2
operations and printing the results to console."""
3
4
5
import collections
6
import os.path
7
import re
8
import sys
9
import json
10
import click
11
import click_log
12
from flask import current_app
13
from flask.cli import FlaskGroup, ScriptInfo
14
import annif
15
import annif.corpus
16
import annif.parallel
17
import annif.project
18
import annif.registry
19
from annif.project import Access
20
from annif.suggestion import SuggestionFilter, ListSuggestionResult
21
from annif.exception import ConfigurationException, NotSupportedException
22
from annif.exception import NotInitializedException
23
from annif.util import metric_code
24
25
# Reuse the package-level Annif logger and let click_log configure it
logger = annif.logger
click_log.basic_config(logger)

# The command group is a FlaskGroup created without Flask's own version
# option; a Click version option with a custom message is attached instead
cli = click.version_option(message='%(version)s')(
    FlaskGroup(create_app=annif.create_app, add_version_option=False))
30
31
32
def get_project(project_id):
    """Return the project with the given ID, or exit if the lookup fails.

    The project is requested from the registry with Access.private as the
    minimum access level. If the registry raises ValueError, a message is
    written to standard error and the process exits with status 1."""
    try:
        project = annif.registry.get_project(
            project_id, min_access=Access.private)
    except ValueError:
        message = "No projects found with id '{0}'.".format(project_id)
        click.echo(message, err=True)
        sys.exit(1)
    return project
43
44
45
def get_vocab(vocab_id):
    """Return the vocabulary with the given ID, or exit if the lookup fails.

    The vocabulary is requested from the registry with Access.private as
    the minimum access level. If the registry raises ValueError, a message
    is written to standard error and the process exits with status 1."""
    try:
        vocab = annif.registry.get_vocab(
            vocab_id, min_access=Access.private)
    except ValueError:
        message = f"No vocabularies found with the id '{vocab_id}'."
        click.echo(message, err=True)
        sys.exit(1)
    return vocab
57
58
59 View Code Duplication
def open_documents(paths, subject_index, vocab_lang, docs_limit):
    """Open a document corpus from a sequence of pathnames.

    A path that is a directory is opened as a DocumentDirectory (with
    subjects required, using vocab_lang); any other path is opened as a
    DocumentFile. With no paths at all, a warning is logged and the null
    device is opened instead. Several paths are merged into a
    CombinedCorpus. When docs_limit is not None the result is wrapped in a
    LimitingDocumentCorpus."""

    def load_corpus(path):
        """open one path as a corpus, choosing the class by path type"""
        if os.path.isdir(path):
            return annif.corpus.DocumentDirectory(
                path, subject_index, vocab_lang, require_subjects=True)
        return annif.corpus.DocumentFile(path, subject_index)

    if not paths:
        logger.warning('Reading empty file')
        corpus = load_corpus(os.path.devnull)
    elif len(paths) == 1:
        corpus = load_corpus(paths[0])
    else:
        corpus = annif.corpus.CombinedCorpus(
            [load_corpus(path) for path in paths])

    if docs_limit is None:
        return corpus
    return annif.corpus.LimitingDocumentCorpus(corpus, docs_limit)
85
86
87 View Code Duplication
def parse_backend_params(backend_param, project):
    """Parse a list of backend parameters given with the --backend-param
    option into a nested dict structure.

    Each item must have the form "<backend>.<parameter>=<value>". The
    text before the first "." is the backend, the text between that and
    the following "=" is the parameter name, and the rest is the value
    (kept as a string). Every backend is checked against the project with
    validate_backend_params().

    Returns a defaultdict mapping backend -> {parameter: value}.
    Raises ConfigurationException if an item does not follow the syntax,
    or if validate_backend_params() rejects the backend."""
    backend_params = collections.defaultdict(dict)
    for beparam in backend_param:
        backend, dot, param = beparam.partition('.')
        key, equals, val = param.partition('=')
        # partition() returns an empty separator when it was not found;
        # report that as a configuration error instead of letting a bare
        # ValueError from tuple unpacking reach the user
        if not dot or not equals:
            raise ConfigurationException(
                'Invalid backend parameter "{}": expected syntax '
                '"<backend>.<parameter>=<value>".'.format(beparam))
        validate_backend_params(backend, beparam, project)
        backend_params[backend][key] = val
    return backend_params
97
98
99
def validate_backend_params(backend, beparam, project):
    """Check that a backend named in a -b option is the project's backend.

    backend is compared with project.config['backend']; beparam is the
    complete option value and is only used in the error message.
    Returns None when they match, otherwise raises
    ConfigurationException."""
    if backend == project.config['backend']:
        return
    raise ConfigurationException(
        'The backend {} in CLI option "-b {}" not matching the project'
        ' backend {}.'
        .format(backend, beparam, project.config['backend']))
105
106
107
# Largest "limit" value tried when generating filter batches
BATCH_MAX_LIMIT = 15


def generate_filter_batches(subjects):
    """Build one (SuggestionFilter, EvaluationBatch) pair for every
    combination of limit (1..BATCH_MAX_LIMIT) and threshold (0.05 steps,
    twenty values starting from 0).

    Returns an OrderedDict keyed by (limit, threshold) tuples, ordered by
    limit first and threshold second."""
    import annif.eval
    thresholds = [step * 0.05 for step in range(20)]
    filter_batches = collections.OrderedDict()
    for limit in range(1, BATCH_MAX_LIMIT + 1):
        for threshold in thresholds:
            filter_batches[(limit, threshold)] = (
                SuggestionFilter(subjects, limit, threshold),
                annif.eval.EvaluationBatch(subjects))
    return filter_batches
119
120
121
def set_project_config_file_path(ctx, param, value):
    """Click option callback that stores the given path in the Flask
    application config under PROJECTS_CONFIG_PATH.

    The application is loaded and its context is entered whether or not a
    value was given; the config is only modified for a truthy value.
    The param argument is not used."""
    app = ctx.ensure_object(ScriptInfo).load_app()
    with app.app_context():
        if not value:
            return
        current_app.config['PROJECTS_CONFIG_PATH'] = value
126
127
128
def common_options(f):
    """Decorator that adds the options shared by the CLI commands: the
    -p/--projects option (handled eagerly by
    set_project_config_file_path and not passed on to the command) and
    the click_log verbosity option."""
    projects_option = click.option(
        '-p', '--projects',
        help='Set path to project configuration file or directory',
        type=click.Path(dir_okay=True, exists=True),
        callback=set_project_config_file_path, expose_value=False,
        is_eager=True)
    verbosity_option = click_log.simple_verbosity_option(logger)
    return verbosity_option(projects_option(f))
137
138
139
def backend_param_option(f):
    """Decorator that adds the repeatable --backend-param/-b option, used
    for overriding backend parameters, to a CLI command"""
    help_text = ('Override backend parameter of the config file. '
                 'Syntax: "-b <backend>.<parameter>=<value>".')
    add_option = click.option(
        '--backend-param', '-b', multiple=True, help=help_text)
    return add_option(f)
145
146
147 View Code Duplication
@cli.command('list-projects')
@common_options
@click_log.simple_verbosity_option(logger, default='ERROR')
def run_list_projects():
    """
    List available projects.
    """

    # Fixed-width columns; the separator line is as long as the header
    row_format = "{0: <25}{1: <45}{2: <10}{3: <7}"
    heading = row_format.format(
        "Project ID", "Project Name", "Language", "Trained")
    click.echo(heading)
    click.echo(len(heading) * "-")
    projects = annif.registry.get_projects(min_access=Access.private)
    for project in projects.values():
        row = row_format.format(project.project_id, project.name,
                                project.language, str(project.is_trained))
        click.echo(row)
164
165
166 View Code Duplication
@cli.command('show-project')
@click.argument('project_id')
@common_options
def run_show_project(project_id):
    """
    Show information about a project.
    """

    proj = get_project(project_id)
    # Each value is read only when its row is about to be printed, so the
    # rows are produced one at a time in this order
    rows = (
        ('Project ID:        ', lambda: proj.project_id),
        ('Project Name:      ', lambda: proj.name),
        ('Language:          ', lambda: proj.language),
        ('Vocabulary:        ', lambda: proj.vocab.vocab_id),
        ('Vocab language:    ', lambda: proj.vocab_lang),
        ('Access:            ', lambda: proj.access.name),
        ('Trained:           ', lambda: proj.is_trained),
        ('Modification time: ', lambda: proj.modification_time),
    )
    for label, read_value in rows:
        click.echo(f'{label}{read_value()}')
183
184
185
@cli.command('clear')
@click.argument('project_id')
@common_options
def run_clear_project(project_id):
    """
    Initialize the project to its original, untrained state.
    """
    # get_project() exits the program itself when the ID is unknown
    get_project(project_id).remove_model_data()
194
195
196 View Code Duplication
@cli.command('list-vocabs')
@common_options
@click_log.simple_verbosity_option(logger, default='ERROR')
def run_list_vocabs():
    """
    List available vocabularies.
    """

    row_format = "{0: <20}{1: <20}{2: >10}  {3: <6}"
    heading = row_format.format(
        "Vocabulary ID", "Languages", "Size", "Loaded")
    click.echo(heading)
    click.echo(len(heading) * "-")
    vocabs = annif.registry.get_vocabs(min_access=Access.private)
    for vocab in vocabs.values():
        try:
            languages = ','.join(sorted(vocab.languages))
            size = len(vocab)
        except NotInitializedException:
            # Not loaded: show placeholders instead of languages and size
            languages, size, loaded = '-', '-', False
        else:
            loaded = True
        click.echo(row_format.format(
            vocab.vocab_id, languages, size, str(loaded)))
221
222
223 View Code Duplication
@cli.command('loadvoc', deprecated=True)
@click.argument('project_id')
@click.argument('subjectfile', type=click.Path(exists=True, dir_okay=False))
@click.option('--force', '-f', default=False, is_flag=True,
              help='Replace existing vocabulary completely ' +
                   'instead of updating it')
@common_options
def run_loadvoc(project_id, force, subjectfile):
    """
    Load a vocabulary for a project.
    """
    project = get_project(project_id)
    corpus = annif.corpus
    if corpus.SubjectFileSKOS.is_rdf_file(subjectfile):
        # recognized as an RDF file: read it as SKOS
        subject_source = corpus.SubjectFileSKOS(subjectfile)
    elif corpus.SubjectFileCSV.is_csv_file(subjectfile):
        # recognized as a CSV file
        subject_source = corpus.SubjectFileCSV(subjectfile)
    else:
        # anything else is treated as TSV, in the project's vocabulary
        # language
        subject_source = corpus.SubjectFileTSV(subjectfile,
                                               project.vocab_lang)
    project.vocab.load_vocabulary(subject_source, force=force)
245
246
247 View Code Duplication
@cli.command('load-vocab')
@click.argument('vocab_id')
@click.argument('subjectfile', type=click.Path(exists=True, dir_okay=False))
@click.option('--language', '-L', help='Language of subject file')
@click.option('--force', '-f', default=False, is_flag=True,
              help='Replace existing vocabulary completely ' +
                   'instead of updating it')
@common_options
def run_load_vocab(vocab_id, language, force, subjectfile):
    """
    Load a vocabulary from a subject file.
    """
    vocab = get_vocab(vocab_id)
    corpus = annif.corpus
    if corpus.SubjectFileSKOS.is_rdf_file(subjectfile):
        # recognized as an RDF file: read it as SKOS
        subject_source = corpus.SubjectFileSKOS(subjectfile)
    elif corpus.SubjectFileCSV.is_csv_file(subjectfile):
        # recognized as a CSV file
        subject_source = corpus.SubjectFileCSV(subjectfile)
    elif language:
        # anything else is treated as TSV, which needs an explicit language
        subject_source = corpus.SubjectFileTSV(subjectfile, language)
    else:
        click.echo("Please use --language option to set the language of " +
                   "a TSV vocabulary.", err=True)
        sys.exit(1)
    vocab.load_vocabulary(subject_source, force=force)
274
275
276 View Code Duplication
@cli.command('train')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--cached/--no-cached', '-c/-C', default=False,
              help='Reuse preprocessed training data from previous run')
@click.option('--docs-limit', '-d', default=None,
              type=click.IntRange(0, None),
              help='Maximum number of documents to use')
@click.option('--jobs',
              '-j',
              default=0,
              help='Number of parallel jobs (0 means choose automatically)')
@backend_param_option
@common_options
def run_train(project_id, paths, cached, docs_limit, jobs, backend_param):
    """
    Train a project on a collection of documents.
    """
    project = get_project(project_id)
    overrides = parse_backend_params(backend_param, project)
    # --cached and explicit corpus paths are mutually exclusive
    if cached and paths:
        raise click.UsageError(
            "Corpus paths cannot be given when using --cached option.")
    if cached:
        # the string 'cached' is passed to train() in place of a corpus
        corpus = 'cached'
    else:
        corpus = open_documents(paths, project.subjects,
                                project.vocab_lang, docs_limit)
    project.train(corpus, overrides, jobs)
305
306
307 View Code Duplication
@cli.command('learn')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--docs-limit', '-d', default=None,
              type=click.IntRange(0, None),
              help='Maximum number of documents to use')
@backend_param_option
@common_options
def run_learn(project_id, paths, docs_limit, backend_param):
    """
    Further train an existing project on a collection of documents.
    """
    project = get_project(project_id)
    overrides = parse_backend_params(backend_param, project)
    corpus = open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit)
    project.learn(corpus, overrides)
324
325
326 View Code Duplication
@cli.command('suggest')
@click.argument('project_id')
@click.option('--limit', '-l', default=10, help='Maximum number of subjects')
@click.option('--threshold', '-t', default=0.0, help='Minimum score threshold')
@backend_param_option
@common_options
def run_suggest(project_id, limit, threshold, backend_param):
    """
    Suggest subjects for a single document from standard input.
    """
    project = get_project(project_id)
    text = sys.stdin.read()
    backend_params = parse_backend_params(backend_param, project)
    hit_filter = SuggestionFilter(project.subjects, limit, threshold)
    suggestions = project.suggest(text, backend_params)
    for hit in hit_filter(suggestions).as_list():
        subject = project.subjects[hit.subject_id]
        uri = subject.uri
        # label and notation share the middle column(s); falsy ones are
        # left out
        middle = [part
                  for part in (subject.labels[project.vocab_lang],
                               subject.notation)
                  if part]
        click.echo("<{}>\t{}\t{}".format(uri, '\t'.join(middle), hit.score))
350
351
352 View Code Duplication
@cli.command('index')
@click.argument('project_id')
@click.argument('directory', type=click.Path(exists=True, file_okay=False))
@click.option(
    '--suffix',
    '-s',
    default='.annif',
    help='File name suffix for result files')
@click.option('--force/--no-force', '-f/-F', default=False,
              help='Force overwriting of existing result files')
@click.option('--limit', '-l', default=10, help='Maximum number of subjects')
@click.option('--threshold', '-t', default=0.0, help='Minimum score threshold')
@backend_param_option
@common_options
def run_index(project_id, directory, suffix, force,
              limit, threshold, backend_param):
    """
    Index a directory with documents, suggesting subjects for each document.
    Write the results in TSV files with the given suffix.
    """
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)
    hit_filter = SuggestionFilter(project.subjects, limit, threshold)

    for docfilename, dummy_subjectfn in annif.corpus.DocumentDirectory(
            directory, project.subjects, project.vocab_lang,
            require_subjects=False):
        # A callable replacement makes re.sub() insert the suffix verbatim;
        # given as a plain string, backslashes in a user-supplied suffix
        # would be processed as escape sequences
        subjectfilename = re.sub(
            r'\.txt$', lambda _match: suffix, docfilename)
        # Decide whether to skip before doing any work on the document
        if os.path.exists(subjectfilename) and not force:
            click.echo(
                "Not overwriting {} (use --force to override)".format(
                    subjectfilename))
            continue
        with open(docfilename, encoding='utf-8-sig') as docfile:
            text = docfile.read()
        # Compute the suggestions before opening the result file, so that
        # a failure here cannot leave a truncated or empty result behind
        hits = hit_filter(project.suggest(text, backend_params)).as_list()
        with open(subjectfilename, 'w', encoding='utf-8') as subjfile:
            for hit in hits:
                subj = project.subjects[hit.subject_id]
                # label and notation are included only when non-empty
                line = "<{}>\t{}\t{}".format(
                    subj.uri,
                    '\t'.join(filter(None, (subj.labels[project.vocab_lang],
                                            subj.notation))),
                    hit.score)
                click.echo(line, file=subjfile)
397
398
399 View Code Duplication
@cli.command('eval')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--limit', '-l', default=10, help='Maximum number of subjects')
@click.option('--threshold', '-t', default=0.0, help='Minimum score threshold')
@click.option('--docs-limit', '-d', default=None,
              type=click.IntRange(0, None),
              help='Maximum number of documents to use')
@click.option('--metric', '-m', default=[], multiple=True,
              help='Metric to calculate (default: all)')
@click.option(
    '--metrics-file',
    '-M',
    type=click.File(
        'w',
        encoding='utf-8',
        errors='ignore',
        lazy=True),
    help="""Specify file in order to write evaluation metrics in JSON format.
    File directory must exist, existing file will be overwritten.""")
@click.option(
    '--results-file',
    '-r',
    type=click.File(
        'w',
        encoding='utf-8',
        errors='ignore',
        lazy=True),
    help="""Specify file in order to write non-aggregated results per subject.
    File directory must exist, existing file will be overwritten.""")
@click.option('--jobs',
              '-j',
              default=1,
              help='Number of parallel jobs (0 means all CPUs)')
@backend_param_option
@common_options
def run_eval(
        project_id,
        paths,
        limit,
        threshold,
        docs_limit,
        metric,
        metrics_file,
        results_file,
        jobs,
        backend_param):
    """
    Analyze documents and evaluate the result.

    Compare the results of automated indexing against a gold standard. The
    path may be either a TSV file with short documents or a directory with
    documents in separate files.
    """

    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)

    import annif.eval
    eval_batch = annif.eval.EvaluationBatch(project.subjects)

    if results_file:
        # The option is declared with lazy=True, so write an empty string
        # now to surface any problem with the file before the evaluation
        # starts. Only this write is guarded; the original exception is
        # kept as the cause of the one raised here.
        try:
            print('', end='', file=results_file)
        except Exception as e:
            raise NotSupportedException(
                "cannot open results-file for writing: " + str(e)) from e
        click.echo('Writing per subject evaluation results to {!s}'.format(
            results_file.name))
    docs = open_documents(paths, project.subjects,
                          project.vocab_lang, docs_limit)

    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry, [project_id], backend_params, limit, threshold)

    with pool_class(jobs) as pool:
        # Results arrive in arbitrary order; each one is a pair of the
        # suggestions (keyed by project ID) and the gold standard subjects
        for hits, subject_set in pool.imap_unordered(
                psmap.suggest, docs.documents):
            eval_batch.evaluate(hits[project_id],
                                subject_set)

    template = "{0:<30}\t{1}"
    metrics = eval_batch.results(metrics=metric,
                                 results_file=results_file,
                                 language=project.vocab_lang)
    # Use a distinct loop name so the "metric" option value is not shadowed
    for metric_name, score in metrics.items():
        click.echo(template.format(metric_name + ":", score))
    if metrics_file:
        json.dump(
            {metric_code(mname): val for mname, val in metrics.items()},
            metrics_file, indent=2)
493
494
495 View Code Duplication
@cli.command('optimize')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--docs-limit', '-d', default=None,
              type=click.IntRange(0, None),
              help='Maximum number of documents to use')
@backend_param_option
@common_options
def run_optimize(project_id, paths, docs_limit, backend_param):
    """
    Analyze documents, testing multiple limits and thresholds.

    Evaluate the analysis results for a directory with documents against a
    gold standard given in subject files. Test different limit/threshold
    values and report the precision, recall and F-measure of each combination
    of settings.
    """
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)

    filter_batches = generate_filter_batches(project.subjects)

    ndocs = 0
    docs = open_documents(paths, project.subjects,
                          project.vocab_lang, docs_limit)
    for doc in docs.documents:
        raw_hits = project.suggest(doc.text, backend_params)
        # Cut down to the largest limit that any of the filters will use
        hits = raw_hits.filter(project.subjects, limit=BATCH_MAX_LIMIT)
        assert isinstance(hits, ListSuggestionResult), \
            "Optimize should only be done with ListSuggestionResult " + \
            "as it would be very slow with VectorSuggestionResult."
        for hit_filter, batch in filter_batches.values():
            batch.evaluate(hit_filter(hits), doc.subject_set)
        ndocs += 1

    click.echo("\t".join(('Limit', 'Thresh.', 'Prec.', 'Rec.', 'F1')))

    best_scores = collections.defaultdict(float)
    best_params = {}

    # The reported metrics are the same for every batch, and the list is
    # needed again for the summary after the loop, so define it once here
    metrics = ['Precision (doc avg)',
               'Recall (doc avg)',
               'F1 score (doc avg)']

    template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"
    # Store the batches in a queue that gets consumed along the way
    # This way GC will have a chance to reclaim the memory
    # (a deque pops from the left in constant time, unlike list.pop(0))
    filter_batches = collections.deque(filter_batches.items())
    while filter_batches:
        params, (_hit_filter, batch) = filter_batches.popleft()
        results = batch.results(metrics=metrics)
        for metric, score in results.items():
            # ">=" means a later combination wins a tie
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = params
        click.echo(
            template.format(
                params[0],
                params[1],
                results['Precision (doc avg)'],
                results['Recall (doc avg)'],
                results['F1 score (doc avg)']))

    click.echo()
    template2 = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    for metric in metrics:
        click.echo(
            template2.format(
                metric,
                best_scores[metric],
                best_params[metric][0],
                best_params[metric][1]))
    click.echo("Documents evaluated:\t{}".format(ndocs))
567
568
569 View Code Duplication
@cli.command('hyperopt')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--docs-limit', '-d', default=None,
              type=click.IntRange(0, None),
              help='Maximum number of documents to use')
@click.option('--trials', '-T', default=10, help='Number of trials')
@click.option('--jobs',
              '-j',
              default=1,
              help='Number of parallel runs (0 means all CPUs)')
@click.option('--metric', '-m', default='NDCG',
              help='Metric to optimize (default: NDCG)')
@click.option(
    '--results-file',
    '-r',
    type=click.File(
        'w',
        encoding='utf-8',
        errors='ignore',
        lazy=True),
    help="""Specify file path to write trial results as CSV.
    File directory must exist, existing file will be overwritten.""")
@common_options
def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric,
                 results_file):
    """
    Optimize the hyperparameters of a project using a validation corpus.
    """
    project = get_project(project_id)
    corpus = open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit)
    click.echo(f"Looking for optimal hyperparameters using {trials} trials")
    best = project.hyperopt(corpus, trials, jobs, metric, results_file)
    click.echo(f"Got best {metric} score {best.score:.4f} with:")
    # The recommended settings are printed between two "---" separator lines
    separator = "---"
    click.echo(separator)
    for line in best.lines:
        click.echo(line)
    click.echo(separator)
608
609
610
# Run the command group when this module is executed as a script
if __name__ == '__main__':
    cli()
612